In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("Processed_ILPD.csv")
df

Unnamed: 0,Age,Gender,TB,DB,AP,SGPT,SGOT,TP,ALB,A/G,Target
0,65.0,0.0,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.90,1.0
1,62.0,1.0,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,1.0
2,62.0,1.0,7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,1.0
3,58.0,1.0,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.00,1.0
4,72.0,1.0,3.9,2.0,195.0,27.0,59.0,7.3,2.4,0.40,1.0
...,...,...,...,...,...,...,...,...,...,...,...
578,60.0,1.0,0.5,0.1,500.0,20.0,34.0,5.9,1.6,0.37,2.0
579,40.0,1.0,0.6,0.1,98.0,35.0,31.0,6.0,3.2,1.10,1.0
580,52.0,1.0,0.8,0.2,245.0,48.0,49.0,6.4,3.2,1.00,1.0
581,31.0,1.0,1.3,0.5,184.0,29.0,32.0,6.8,3.4,1.00,1.0


In [4]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [21]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler().fit(X)
scaled_X = scaler.transform(X)
scaled_X = pd.DataFrame(scaled_X, columns=X.columns)

min_max_scaler = preprocessing.MinMaxScaler()
minmax_X = min_max_scaler.fit_transform(X)
minmax_X = pd.DataFrame(minmax_X, columns=X.columns)

max_abs_scaler = preprocessing.MaxAbsScaler()
maxabs_X = max_abs_scaler.fit_transform(X)
maxabs_X = pd.DataFrame(maxabs_X, columns=X.columns)

scaled_data = {
    'Standardized' : scaled_X,
    'Min_Max': minmax_X,
    'Absolute_max': maxabs_X
}


In [52]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_selection import f_classif

In [75]:
feature_selection = []
for name in scaled_data :
    if name!='Standardized':
        d1 = {}
        d1['Processing'] = name
        d1['Method'] = "Univariate Selection"
        chi_selector = SelectKBest(chi2, k='all')
        fit = chi_selector.fit(scaled_data[name], y)
        d1['Features'] = dict(sorted(zip(scaled_data[name].columns, fit.scores_), key = lambda x: x[1],reverse=True))
        feature_selection.append(d1)
    
    ###############################

    d2 = {}
    d2['Processing'] = name
    d2['Method'] = "Extra Tree Classifier"
    etc = ExtraTreesClassifier()
    etc.fit(scaled_data[name],y)
    d2['Features'] = dict(sorted(zip(scaled_data[name].columns, etc.feature_importances_), key = lambda x: x[1],reverse=True))
    feature_selection.append(d2)

    ###############################

    d3 = {}
    d3['Processing'] = name
    d3['Method'] = "Random Forest Classifier"
    rf = RandomForestClassifier(n_estimators=500, random_state=7)
    rf.fit(scaled_data[name], y)
    d3['Features'] = dict(sorted(zip(scaled_data[name].columns, rf.feature_importances_), key = lambda x: x[1],reverse=True))
    feature_selection.append(d3)

    ###############################

    d4 = {}
    d4['Processing'] = name
    d4['Method'] = "LGBM Classifier"
    lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.0001,
                    num_leaves=32, colsample_bytree=0.2,                                           
                    reg_alpha=3, reg_lambda=1, min_split_gain=0.01,    
                    min_child_weight=40)
    lgbc.fit(scaled_data[name], y)
    d4['Features'] = dict(sorted(zip(X.columns, lgbc.feature_importances_), key = lambda x: x[1],reverse=True))
    feature_selection.append(d4)

    ###############################

    d5 = {}
    d5['Processing'] = name
    d5['Method'] = "Correlation Matrix"
    t = pd.concat([scaled_data[name], y], axis=1)
    corrmat = t.corr()
    d5['Features'] = dict(sorted(zip(corrmat.iloc[:-1,:]['Target'].index,abs(corrmat.iloc[:-1,:]['Target'].values)),key = lambda x: x[1],reverse=True)) 
    feature_selection.append(d5) 

    ###############################

    d6 = {}
    d6['Processing'] = name
    d6['Method'] = "ANOVA F-SCORES"
    anov_selector = SelectKBest(f_classif, k='all')
    fit = anov_selector.fit(scaled_data[name], y)
    d6['Features'] = dict(sorted(zip(scaled_data[name].columns, fit.scores_), key = lambda x: x[1], reverse=True))
    feature_selection.append(d6) 
    

In [154]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

In [155]:
results = []
best_result = {}
cross_best_result = {}
max_accuracy = 0
max_cross_acc = 0
for method in feature_selection:
    data = scaled_data[method['Processing']]
    for i in range(3,11):
        info = {}
        info['Processing'] = method['Processing']
        info['Method'] = method['Method']
        info['Features'] = list(method['Features'].keys())[:i]
        
        sub_data = data[list(method['Features'].keys())[:i]]
        X_train, X_test, y_train, y_test = train_test_split(sub_data, y, test_size=0.30, random_state=100)
        
        model = XGBClassifier()
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)*100

        # K-Fold Cross Validation
        kfold = StratifiedKFold(n_splits=10)
        crossval_results = cross_val_score(model, sub_data, y, cv=kfold)
        cross_acc = crossval_results.mean()*100
        info['Cross Validation Accuracy'] = cross_acc
        info['Accuracy'] = accuracy
        if accuracy>max_accuracy:
            max_accuracy = accuracy
            best_result = info
        if cross_acc>max_cross_acc:
            max_cross_acc = cross_acc
            cross_best_result = info



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [156]:
best_result

{'Processing': 'Min_Max',
 'Method': 'Random Forest Classifier',
 'Features': ['AP', 'SGOT', 'Age', 'SGPT'],
 'Cross Validation Accuracy': 66.71537112799533,
 'Accuracy': 75.42857142857143}

In [157]:
cross_best_result

{'Processing': 'Absolute max',
 'Method': 'Extra Tree Classifier',
 'Features': ['AP', 'Age', 'SGOT', 'SGPT', 'TB'],
 'Cross Validation Accuracy': 70.67212156633549,
 'Accuracy': 72.57142857142857}