In [1]:
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
import xgboost as xgb

import matplotlib.pyplot as plt
%matplotlib inline
seed = 1142

In [3]:
df = pd.read_csv("data/diabetes.csv")
df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
X = df.drop("class", axis = 1)
y = df["class"]

In [5]:
sc = StandardScaler()
X = sc.fit_transform(X)
X.shape

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


(768, 8)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=seed)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((576, 8), (576,), (192, 8), (192,))

In [9]:
# Radial Support Vector Machines(rbf-SVM)
model=svm.SVC(kernel='rbf',C=1,gamma=0.1)
model.fit(X_train,y_train)
prediction1=model.predict(X_test)
print('Accuracy for rbf SVM is ',accuracy_score(prediction1,y_test))

Accuracy for rbf SVM is  0.7604166666666666


In [10]:
# Linear Support Vector Machine(linear-SVM)
model=svm.SVC(kernel='linear',C=0.1,gamma=0.1)
model.fit(X_train,y_train)
prediction2=model.predict(X_test)
print('Accuracy for linear SVM is',accuracy_score(prediction2,y_test))

Accuracy for linear SVM is 0.7708333333333334


In [11]:
# Logistic Regression
model=LogisticRegression()
model.fit(X_train,y_train)
prediction3=model.predict(X_test)
print('The accuracy of the Logistic Regression is',accuracy_score(prediction3,y_test))

The accuracy of the Logistic Regression is 0.7708333333333334


In [12]:
# Decision Tree
model=DecisionTreeClassifier()
model.fit(X_train,y_train)
prediction4=model.predict(X_test)
print('The accuracy of the Decision Tree is',accuracy_score(prediction4,y_test))

The accuracy of the Decision Tree is 0.6510416666666666


In [13]:
# Perceptron
model=Perceptron()
model.fit(X_train,y_train)
prediction5=model.predict(X_test)
print('The accuracy of the Perceptron is',accuracy_score(prediction5,y_test))

The accuracy of the Perceptron is 0.671875


In [14]:
# kNN
model=KNeighborsClassifier(n_neighbors=8) 
model.fit(X_train,y_train)
prediction6=model.predict(X_test)
print('The accuracy of the KNN is',accuracy_score(prediction6,y_test))

The accuracy of the KNN is 0.7135416666666666


In [15]:
# Gaussian Naive Bayes
model=GaussianNB()
model.fit(X_train,y_train)
prediction7=model.predict(X_test)
print('The accuracy of the NaiveBayes is',accuracy_score(prediction7,y_test))

The accuracy of the NaiveBayes is 0.7916666666666666


In [16]:
# Random Forest
model=RandomForestClassifier(n_estimators=100)
model.fit(X_train,y_train)
prediction8=model.predict(X_test)
print('The accuracy of the Random Forests is',accuracy_score(prediction8,y_test))

The accuracy of the Random Forests is 0.7447916666666666


In [17]:
# Gradient Boosting Classifier
model=GradientBoostingClassifier(n_estimators=500,random_state=0,learning_rate=0.1)
model.fit(X_train,y_train)
prediction9=model.predict(X_test)
print('The accuracy of the Gradient Boosting Classifier is',accuracy_score(prediction9,y_test))

The accuracy of the Gradient Boosting Classifier is 0.7708333333333334


In [18]:
# Ada Boosting Classifier
model=AdaBoostClassifier(n_estimators=200,random_state=0,learning_rate=0.1)
model.fit(X_train,y_train)
prediction10=model.predict(X_test)
print('The accuracy of the Adaboost is',accuracy_score(prediction10,y_test))

The accuracy of the Adaboost is 0.7760416666666666


In [19]:
# XGBoost
model=xgb.XGBClassifier(n_estimators=900,learning_rate=0.1)
model.fit(X_train,y_train)
prediction10=model.predict(X_test)
print('The accuracy of the XGBoost is',accuracy_score(prediction10,y_test))

The accuracy of the XGBoost is 0.7552083333333334


In [21]:
# Cross Validation
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
kfold = KFold(n_splits=10, random_state=22) # k=10, split the data into 10 equal parts
xyz=[]
accuracy=[]
std=[]
classifiers=['Linear Svm','Radial Svm','Logistic Regression','KNN','Decision Tree','Naive Bayes','Random Forest', 'Gradient Boosting', 'Adaboost', 'XGBoost']
models=[svm.SVC(kernel='linear'),
        svm.SVC(kernel='rbf'),
        LogisticRegression(),
        KNeighborsClassifier(n_neighbors=9),
        DecisionTreeClassifier(),
        GaussianNB(),
        RandomForestClassifier(n_estimators=100),
        GradientBoostingClassifier(n_estimators=500,random_state=0,learning_rate=0.1),
        AdaBoostClassifier(n_estimators=200,random_state=0,learning_rate=0.1),
        xgb.XGBClassifier(n_estimators=900,learning_rate=0.1)]
for i in models:
    model = i
    cv_result = cross_val_score(model, X, y, cv = kfold ,scoring = "accuracy")
    cv_result=cv_result
    xyz.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)
new_models_dataframe2=pd.DataFrame({'CV Mean':xyz,'Std':std},index=classifiers)       
print(new_models_dataframe2)

                      CV Mean       Std
Linear Svm           0.773411  0.045568
Radial Svm           0.764286  0.056962
Logistic Regression  0.779956  0.050088
KNN                  0.743506  0.071099
Decision Tree        0.697830  0.059355
Naive Bayes          0.755178  0.042766
Random Forest        0.761705  0.062110
Gradient Boosting    0.744771  0.052108
Adaboost             0.766900  0.058356
XGBoost              0.738295  0.049746


In [23]:
# Hyper Parameter Tuning
from sklearn.model_selection import GridSearchCV
n_estimators=list(range(100,1100,100))
learn_rate=[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
hyper={'n_estimators':n_estimators,'learning_rate':learn_rate}
gd=GridSearchCV(estimator=AdaBoostClassifier(),param_grid=hyper,verbose=True)
gd.fit(X, y)

# Output Submission file
PassengerId = test['PassengerId']
test_data = test.drop(['PassengerId'], axis=1)

#model = AdaBoostClassifier(n_estimators=200,random_state=0,learning_rate=0.1)
#model.fit(predictors, target)
#y_pred = model.predict(test_data)
y_pred = gd.predict(test_data)

# FILENAME = "../result/adaboost.csv"
# sub = pd.DataFrame({'PassengerId' : PassengerId, 'Survived' : y_pred})
# sub.to_csv(FILENAME, index=False)

Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:  4.7min finished


NameError: name 'test' is not defined