In [58]:

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,auc
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
import warnings
warnings.filterwarnings('ignore')

In [62]:
!unzip '/content/wine.zip'

Archive:  /content/wine.zip
  inflating: wine.data               


In [65]:
data= pd.read_csv('/content/wine.data',names=['cultivator','alcohol','malic_acid','ash',
                                    'alcalinity','magnesium','total_phenols','falvanoids',
                                    'nonflavanoid_phenols','proanthocyanins','color',
                                    'hue','od280','proline'])
data

Unnamed: 0,cultivator,alcohol,malic_acid,ash,alcalinity,magnesium,total_phenols,falvanoids,nonflavanoid_phenols,proanthocyanins,color,hue,od280,proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [66]:
x = data.drop('cultivator',axis=1)
y = data.cultivator

In [67]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size =0.30,random_state=0)

In [69]:
LR = LogisticRegression(solver='liblinear')

In [70]:

LR.fit(xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [71]:
metrics.accuracy_score(ytest,LR.predict(xtest))

0.9444444444444444

In [72]:
cm_LR = metrics.confusion_matrix(ytest,LR.predict(xtest))
cm_LR

array([[18,  1,  0],
       [ 1, 20,  1],
       [ 0,  0, 13]])

In [73]:
print(metrics.classification_report(ytest,LR.predict(xtest)))

              precision    recall  f1-score   support

           1       0.95      0.95      0.95        19
           2       0.95      0.91      0.93        22
           3       0.93      1.00      0.96        13

    accuracy                           0.94        54
   macro avg       0.94      0.95      0.95        54
weighted avg       0.94      0.94      0.94        54



In [74]:
knn = KNeighborsClassifier()

In [75]:

parameter = {'n_neighbors':np.arange(1,15),'weights':['uniform','distance']}
GS=GridSearchCV(knn,parameter,cv=5)
GS.fit(x,y)

GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [76]:
GS.best_params_

{'n_neighbors': 1, 'weights': 'uniform'}

In [77]:
knn = KNeighborsClassifier(n_neighbors=1,weights='uniform',metric='euclidean')

In [78]:
knn.fit(xtrain,ytrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [79]:
metrics.accuracy_score(ytest,knn.predict(xtest))

0.7592592592592593

In [80]:
cm_knn = metrics.confusion_matrix(ytest,knn.predict(xtest))

In [81]:
print(metrics.classification_report(ytest,knn.predict(xtest)))

              precision    recall  f1-score   support

           1       0.89      0.84      0.86        19
           2       0.75      0.82      0.78        22
           3       0.58      0.54      0.56        13

    accuracy                           0.76        54
   macro avg       0.74      0.73      0.74        54
weighted avg       0.76      0.76      0.76        54



In [82]:
gnb = GaussianNB()

In [83]:
gnb.fit(xtrain,ytrain)

GaussianNB(priors=None, var_smoothing=1e-09)

In [84]:
metrics.accuracy_score(ytest,gnb.predict(xtest))

0.9444444444444444

In [85]:
cm_nb = metrics.confusion_matrix(ytest,gnb.predict(xtest))

In [86]:
print(metrics.classification_report(ytest,gnb.predict(xtest)))

              precision    recall  f1-score   support

           1       0.90      1.00      0.95        19
           2       1.00      0.86      0.93        22
           3       0.93      1.00      0.96        13

    accuracy                           0.94        54
   macro avg       0.94      0.95      0.95        54
weighted avg       0.95      0.94      0.94        54



In [87]:

dt = DecisionTreeClassifier()

In [88]:
parameter ={'max_depth':np.arange(1,10),'criterion':['entropy','gini']}
GS=GridSearchCV(dt,parameter,cv=3)
GS.fit(x,y)

GridSearchCV(cv=3, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['entropy', 'gini'],
                   

In [89]:
GS.best_params_

{'criterion': 'entropy', 'max_depth': 7}

In [90]:
dt = DecisionTreeClassifier(criterion='gini',max_depth=4)

In [91]:
dt.fit(xtrain,ytrain)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [92]:
metrics.accuracy_score(ytest,dt.predict(xtest))

0.9444444444444444

In [93]:
cm_dt=metrics.confusion_matrix(ytest,dt.predict(xtest))

In [94]:
print(metrics.classification_report(ytest,dt.predict(xtest)))

              precision    recall  f1-score   support

           1       1.00      0.89      0.94        19
           2       0.91      0.95      0.93        22
           3       0.93      1.00      0.96        13

    accuracy                           0.94        54
   macro avg       0.95      0.95      0.95        54
weighted avg       0.95      0.94      0.94        54



In [95]:

rt = RandomForestClassifier(random_state=0)

In [96]:

parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10),'criterion':['gini','entropy']}
GS=GridSearchCV(rt,parameter,cv=3)
GS.fit(x,y)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=0,
                                   

In [97]:
GS.best_params_

{'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 8}

In [98]:
rf = RandomForestClassifier(n_estimators=8,criterion='entropy',max_depth=5,random_state=0)

In [99]:
rf.fit(xtrain,ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=8,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [100]:
cm_rf = metrics.confusion_matrix(ytest,rf.predict(xtest))

In [101]:

print(metrics.classification_report(ytest,rf.predict(xtest)))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        19
           2       1.00      1.00      1.00        22
           3       1.00      1.00      1.00        13

    accuracy                           1.00        54
   macro avg       1.00      1.00      1.00        54
weighted avg       1.00      1.00      1.00        54



In [103]:
bt_LR = AdaBoostClassifier(base_estimator=LR,n_estimators=50,random_state=0)

In [104]:
bt_gnb = AdaBoostClassifier(base_estimator=gnb,n_estimators=50,random_state=0)

In [105]:
bt_dt = AdaBoostClassifier(base_estimator=dt,n_estimators=50,random_state=0)

In [106]:
bt_rt = AdaBoostClassifier(base_estimator=rt,n_estimators=50,random_state=0)

In [107]:
models = []
models.append(('Base LR',LR))
models.append(('Boost LR',bt_LR))
models.append(('Base KNN',knn))
models.append(('Base gnb',gnb))
models.append(('Boost gnb',bt_gnb))
models.append(('Base dt',dt))
models.append(('Boost dt',bt_dt))
models.append(('Base RT',rt))
models.append(('Boost RT',bt_rt))

In [108]:

from sklearn.model_selection import KFold
kf = KFold(n_splits=3,shuffle=True,random_state=2)
for model,name in zip([LR,bt_LR,knn,gnb,bt_gnb,dt,bt_dt,rt,bt_rt],
                      ['Base LR','Boost LR','KNN','gnb','boost_gnb','dt','boost_dt','Base RT','Boost RT']):
    k=0
    recall=np.zeros((3,3))
    prec  =np.zeros((3,3))
    fscore=np.zeros((3,3))
    for train,test in kf.split(x,y):
        xtrain,xtest=x.iloc[train,:],x.iloc[test,:]
        ytrain,ytest=y[train],y[test]
        model.fit(xtrain,ytrain)
        y_predict = model.predict(xtest)
        cm=metrics.confusion_matrix(ytest,y_predict)
        for i in np.arange(0,3):
            recall[i,k]=cm[i,i]/cm[i,:].sum()
            prec[i,k]=cm[i,i]/cm[:,i].sum()
        k=k+1
    for row in np.arange(0,3):
        for col in np.arange(0,3):
            fscore[row,col]=2*(recall[row,col]*prec[row,col])/(recall[row,col]+prec[row,col])
    mean =(np.mean(fscore[0,:])+np.mean(fscore[1,:])+np.mean(fscore[2,:]))/3
    var  =((np.var(fscore[0,:],ddof=1)+np.var(fscore[1,:],ddof=1)+np.var(fscore[2,:],ddof=1))/3)
    print('f1_weighted Score: %0.02f (+/- %0.5f) [%s]' %(mean,var,name))

f1_weighted Score: 0.95 (+/- 0.00033) [Base LR]
f1_weighted Score: 0.92 (+/- 0.00149) [Boost LR]
f1_weighted Score: 0.72 (+/- 0.00096) [KNN]
f1_weighted Score: 0.97 (+/- 0.00112) [gnb]
f1_weighted Score: 0.97 (+/- 0.00019) [boost_gnb]
f1_weighted Score: 0.88 (+/- 0.00228) [dt]
f1_weighted Score: 0.86 (+/- 0.00556) [boost_dt]
f1_weighted Score: 0.98 (+/- 0.00034) [Base RT]
f1_weighted Score: 0.98 (+/- 0.00070) [Boost RT]


From the above scores we can say that Boosting the models has no use in this dataset.
For Decision tree alone there is an increase in f1 with a trade off of variance error.
Furthur it can be seen boosting can reduce bias with some little tradeoff of variance.