In [233]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics


In [234]:
# Read the ILPD dataset from the working directory and discard every row
# that contains at least one missing value.
data = pd.read_csv("ILPD.csv").dropna()
data

Unnamed: 0,Age,Gender,TB,DB,AP,SGPT,SGOT,TP,ALB,A/G,Target
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37,2
579,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,Male,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [235]:
# Replace the textual Gender column with integer category codes so the
# estimators receive purely numeric features. In the displayed frame the
# "Female" row becomes 0 and the "Male" rows become 1.
data["Gender"] = data["Gender"].astype("category").cat.codes
data

Unnamed: 0,Age,Gender,TB,DB,AP,SGPT,SGOT,TP,ALB,A/G,Target
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,1,0.5,0.1,500,20,34,5.9,1.6,0.37,2
579,40,1,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,1,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,1,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [236]:
# AdaBoost ensembles to compare, keyed by the display name of the wrapped
# base learner. Insertion order is the order in which results are printed.
#
# The base learner is always passed positionally: the keyword for it was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2, so the
# positional form is the only spelling that works across versions.
#
# `random_state=7` (the same seed used for the data splits) makes the
# boosting runs repeatable under Restart & Run All.
models = {
    'Logistic Regression': AdaBoostClassifier(
        LogisticRegression(max_iter=200),
        n_estimators=10, learning_rate=0.1, random_state=7),
    'Decision Tree': AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=1),
        n_estimators=10, learning_rate=0.1, random_state=7),
    'Extra Tree': AdaBoostClassifier(
        ExtraTreeClassifier(max_depth=1),
        n_estimators=10, learning_rate=0.1, random_state=7),
    # The forest variant uses more boosting rounds and a larger step size
    # than the other entries.
    'Random Forest': AdaBoostClassifier(
        RandomForestClassifier(max_depth=1),
        n_estimators=20, learning_rate=0.2, random_state=7),
    'Naive Bayes': AdaBoostClassifier(
        GaussianNB(),
        n_estimators=10, learning_rate=0.1, random_state=7),
}

# Linear-kernel SVM with probability estimates enabled, kept as a named
# object so it stays available to other cells.
svc = SVC(probability=True, kernel='linear')

models['SVM'] = AdaBoostClassifier(
    svc, n_estimators=10, learning_rate=0.1, random_state=7)


In [237]:
# Features are every column except the last; the last column is the label.
# NOTE(review): the displayed frame lists an "Unnamed: 0" column first. If
# that is a real column in ILPD.csv (a saved row index) rather than the
# rendered DataFrame index, it is being fed to the models as a feature —
# confirm against the CSV.
X = data.iloc[:, :-1]
Y = data.iloc[:, -1]

## Without StratifiedKFold cross-validation

In [238]:
# Single hold-out split: 33% of the rows are reserved for testing. Rows are
# shuffled before splitting, with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    shuffle=True,
    random_state=7,
)

In [239]:
# Fit every ensemble on the training portion and report its accuracy on the
# held-out test portion. The accuracies and their model names are collected
# in parallel lists (same order as `models`) so they can be reused after the
# loop instead of only being printed.
acc_scores = []
model_names = []
for name, model in models.items():
    # Train on the training rows, then predict labels for the test rows.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    # Fraction of test rows whose predicted label matches the true label.
    accuracy = metrics.accuracy_score(y_test, y_pred)
    acc_scores.append(accuracy)
    model_names.append(name)
    print('%s : %f' % (name, accuracy))

Logistic Regression : 0.776042
Decision Tree : 0.755208
Extra Tree : 0.755208
Random Forest : 0.718750
Naive Bayes : 0.500000
SVM : 0.755208


## With StratifiedKFold cross-validation

In [240]:
# Evaluate every ensemble with 3-fold stratified cross-validation over the
# full dataset and report the mean fold accuracy. Per-fold score arrays and
# model names are kept in parallel lists, in the same order as `models`.
acc_scores = []
model_names = []

# The splitter does not depend on the model, so it is built once. With an
# integer seed it produces the same shuffled folds on every use, so each
# model is scored on identical partitions.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=7)

for name, model in models.items():
    scores = cross_val_score(model, X, Y, scoring='accuracy', cv=cv)
    acc_scores.append(scores)
    model_names.append(name)
    print('%s : %f' % (name, np.mean(scores)))


Logistic Regression : 0.711572
Decision Tree : 0.715026
Extra Tree : 0.715026
Random Forest : 0.701209
Naive Bayes : 0.580311
SVM : 0.716753


In [241]:
# yhat = model.predict(row)
# print('Predicted Class: %d' % yhat[0])