In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [11]:
df = pd.read_csv("Processed_ILPD.csv")
df

Unnamed: 0,Age,Gender,TB,DB,AP,SGPT,SGOT,TP,ALB,A/G,Target
0,65.0,0.0,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.90,1.0
1,62.0,1.0,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,1.0
2,62.0,1.0,7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,1.0
3,58.0,1.0,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.00,1.0
4,72.0,1.0,3.9,2.0,195.0,27.0,59.0,7.3,2.4,0.40,1.0
...,...,...,...,...,...,...,...,...,...,...,...
578,60.0,1.0,0.5,0.1,500.0,20.0,34.0,5.9,1.6,0.37,2.0
579,40.0,1.0,0.6,0.1,98.0,35.0,31.0,6.0,3.2,1.10,1.0
580,52.0,1.0,0.8,0.2,245.0,48.0,49.0,6.4,3.2,1.00,1.0
581,31.0,1.0,1.3,0.5,184.0,29.0,32.0,6.8,3.4,1.00,1.0


In [15]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=100)

In [16]:
from sklearn import preprocessing

standard_scaler = preprocessing.StandardScaler()
standard_data = pd.DataFrame(standard_scaler.fit_transform(X), columns=X.columns)
standardized_X_train = pd.DataFrame(standard_scaler.fit_transform(X_train), columns=X.columns)
standardized_X_test = pd.DataFrame(standard_scaler.fit_transform(X_test), columns=X.columns)

min_max_scaler = preprocessing.MinMaxScaler()
minmax_data = pd.DataFrame(min_max_scaler.fit_transform(X), columns=X.columns)
minmax_X_train = pd.DataFrame(min_max_scaler.fit_transform(X_train), columns=X.columns)
minmax_X_test = pd.DataFrame(min_max_scaler.fit_transform(X_test), columns=X.columns)

max_abs_scaler = preprocessing.MaxAbsScaler()
maxabs_data = pd.DataFrame(max_abs_scaler.fit_transform(X), columns=X.columns)
maxabs_X_train = pd.DataFrame(max_abs_scaler.fit_transform(X_train), columns=X.columns)
maxabs_X_test = pd.DataFrame(max_abs_scaler.fit_transform(X_test), columns=X.columns)

robust_scaler = preprocessing.RobustScaler()
robust_data = pd.DataFrame(robust_scaler.fit_transform(X), columns=X.columns)
robust_X_train = pd.DataFrame(robust_scaler.fit_transform(X_train), columns=X.columns)
robust_X_test = pd.DataFrame(robust_scaler.fit_transform(X_test), columns=X.columns)

scaled_data = {
    'Standardized' : 
    {
        'data': standard_data,
        'train':standardized_X_train,
        'test':standardized_X_test
    },
    'Min_Max': 
    {
        'data':minmax_data,
        'train':minmax_X_train,
        'test':minmax_X_test
    },
    'Absolute_max': 
    {
        'data':maxabs_data,
        'train':maxabs_X_train,
        'test':maxabs_X_test
    },
    'Robust':
    {
        'data':robust_data,
        'train':robust_X_train,
        'test':robust_X_test
    },
}


In [17]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_selection import f_classif

In [22]:
feature_selection = []
for name in scaled_data :
    if name!='Standardized' and name!='Robust':
        d1 = {}
        d1['Processing'] = name
        d1['Method'] = "Univariate Selection"
        chi_selector = SelectKBest(chi2, k='all')
        fit = chi_selector.fit(scaled_data[name]['data'], y)
        d1['Features'] = dict(sorted(zip(scaled_data[name]['data'].columns, fit.scores_), key = lambda x: x[1],reverse=True))
        feature_selection.append(d1)
    
    ###############################

    d2 = {}
    d2['Processing'] = name
    d2['Method'] = "Extra Tree Classifier"
    etc = ExtraTreesClassifier()
    etc.fit(scaled_data[name]['data'],y)
    d2['Features'] = dict(sorted(zip(scaled_data[name]['data'].columns, etc.feature_importances_), key = lambda x: x[1],reverse=True))
    feature_selection.append(d2)

    ###############################

    d3 = {}
    d3['Processing'] = name
    d3['Method'] = "Random Forest Classifier"
    rf = RandomForestClassifier(n_estimators=500, random_state=7)
    rf.fit(scaled_data[name]['data'], y)
    d3['Features'] = dict(sorted(zip(scaled_data[name]['data'].columns, rf.feature_importances_), key = lambda x: x[1],reverse=True))
    feature_selection.append(d3)

    ###############################

    d4 = {}
    d4['Processing'] = name
    d4['Method'] = "LGBM Classifier"
    lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.0001,
                    num_leaves=32, colsample_bytree=0.2,                                           
                    reg_alpha=3, reg_lambda=1, min_split_gain=0.01,    
                    min_child_weight=40)
    lgbc.fit(scaled_data[name]['data'], y)
    d4['Features'] = dict(sorted(zip(scaled_data[name]['data'].columns, lgbc.feature_importances_), key = lambda x: x[1],reverse=True))
    feature_selection.append(d4)

    ###############################

    d5 = {}
    d5['Processing'] = name
    d5['Method'] = "Correlation Matrix"
    t = pd.concat([scaled_data[name]['data'], y], axis=1)
    corrmat = t.corr()
    d5['Features'] = dict(sorted(zip(corrmat.iloc[:-1,:]['Target'].index,abs(corrmat.iloc[:-1,:]['Target'].values)),key = lambda x: x[1],reverse=True)) 
    feature_selection.append(d5) 

    ###############################

    d6 = {}
    d6['Processing'] = name
    d6['Method'] = "ANOVA F-SCORES"
    anov_selector = SelectKBest(f_classif, k='all')
    fit = anov_selector.fit(scaled_data[name]['data'], y)
    d6['Features'] = dict(sorted(zip(scaled_data[name]['data'].columns, fit.scores_), key = lambda x: x[1], reverse=True))
    feature_selection.append(d6) 
    

In [23]:
feature_selection

[{'Processing': 'Standardized',
  'Method': 'Extra Tree Classifier',
  'Features': {'SGOT': 0.12156047734695095,
   'Age': 0.11819216852760422,
   'AP': 0.11460792822144186,
   'SGPT': 0.1127015368518351,
   'TB': 0.10934361990469309,
   'DB': 0.10416406902016756,
   'A/G': 0.10100578430359738,
   'ALB': 0.0974174136898441,
   'TP': 0.09523342602664678,
   'Gender': 0.025773576107219025}},
 {'Processing': 'Standardized',
  'Method': 'Random Forest Classifier',
  'Features': {'AP': 0.14899588733880265,
   'SGOT': 0.1309650405764519,
   'Age': 0.1300275973900262,
   'SGPT': 0.1288801113689817,
   'TB': 0.10280180724381034,
   'ALB': 0.09301895513979797,
   'TP': 0.08762906314052428,
   'DB': 0.08250370599508108,
   'A/G': 0.07545695218452728,
   'Gender': 0.019720879621996545}},
 {'Processing': 'Standardized',
  'Method': 'LGBM Classifier',
  'Features': {'TB': 151,
   'AP': 115,
   'SGOT': 91,
   'DB': 48,
   'ALB': 43,
   'Age': 20,
   'SGPT': 17,
   'A/G': 15,
   'Gender': 0,
   'TP':

In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from chefboost import Chefboost as chef

In [None]:
results = []
best_result = {}
cross_best_result = {}
max_accuracy = 0
max_cross_acc = 0
for method in feature_selection:
    data = scaled_data[method['Processing']]
    for i in range(3,11):
        info = {}
        info['Processing'] = method['Processing']
        info['Method'] = method['Method']
        info['Features'] = list(method['Features'].keys())[:i]
        
        data = scaled_data[method['Processing']]['data']
        X_train = scaled_data[method['Processing']]['train']
        X_test = scaled_data[method['Processing']]['test']
        
        model = XGBClassifier()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)*100

        # K-Fold Cross Validation
        kfold = StratifiedKFold(n_splits=10)
        crossval_results = cross_val_score(model, data, y, cv=kfold)
        cross_acc = crossval_results.mean()*100
        info['Cross Validation Accuracy'] = cross_acc
        info['Accuracy'] = accuracy
        if accuracy>max_accuracy:
            max_accuracy = accuracy
            best_result = info
        if cross_acc>max_cross_acc:
            max_cross_acc = cross_acc
            cross_best_result = info

In [36]:
best_result

{'Processing': 'Absolute_max',
 'Method': 'Univariate Selection',
 'Features': ['DB', 'TB', 'SGPT'],
 'Cross Validation Accuracy': 69.13208649912332,
 'Accuracy': 74.35897435897436}

In [37]:
cross_best_result

{'Processing': 'Absolute_max',
 'Method': 'Univariate Selection',
 'Features': ['DB', 'TB', 'SGPT'],
 'Cross Validation Accuracy': 69.13208649912332,
 'Accuracy': 74.35897435897436}

In [49]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.20, random_state=100)

In [52]:
config = {'algorithm': 'C4.5'}
model = chef.fit(X_train, config = config, target_label = 'Target')

[INFO]:  4 CPU cores will be allocated in parallel running
That's why, the algorithm is set to Regression to handle the data set.
Regression  tree is going to be built...
-------------------------
finished in  3.701164484024048  seconds
-------------------------
Evaluate  train set
-------------------------
MAE:  0.24213161659513593
MSE:  0.2390915593705293
RMSE:  0.48896989618025494
RAE:  0.3564308480601514
RRSE:  1.0732720171988586
Mean:  1.293991416309013
MAE / Mean:  18.71199557766722 %
RMSE / Mean:  37.787723320066135 %


In [53]:
y_pred = []
for i in range(X_test.shape[0]):
    test_input = X_test.iloc[i,:-1]
    y_pred.append(chef.predict(model, test_input))
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)*100
print(accuracy)

70.08547008547008
