# Module 11 - Model Selection and Boosting

### Case Study I

In [21]:
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split # to split the data into two parts
from sklearn.model_selection import cross_val_score
from sklearn import metrics # for the check the error and accuracy of the model
from sklearn.linear_model import LogisticRegression # to apply the Logistic regression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.model_selection import ShuffleSplit
from sklearn.pipeline import make_pipeline

In [22]:
bio_df = pd.read_csv('bio-degradable-data.csv', sep=";", header=None)
print(bio_df.shape)
bio_df.head()

(1055, 42)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,3.919,2.6909,0,0,0,0,0,31.4,2,0,...,0,0,0,2.949,1.591,0,7.253,0,0,RB
1,4.17,2.1144,0,0,0,0,0,30.8,1,1,...,0,0,0,3.315,1.967,0,7.257,0,0,RB
2,3.932,3.2512,0,0,0,0,0,26.7,2,4,...,0,0,1,3.076,2.417,0,7.601,0,0,RB
3,3.0,2.7098,0,0,0,0,0,20.0,0,2,...,0,0,1,3.046,5.0,0,6.69,0,0,RB
4,4.236,3.3944,0,0,0,0,0,29.4,2,4,...,0,0,0,3.351,2.405,0,8.003,0,0,RB


In [23]:
prediction_var = bio_df.columns
prediction_var = prediction_var.drop(41)
prediction_var

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40],
           dtype='int64')

In [24]:
le = preprocessing.LabelEncoder()
bio_df[41] = le.fit_transform(bio_df[41])
bio_df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
1050,5.431,2.8955,0,0,0,2,0,32.1,4,1,...,0,6,1,3.573,2.242,1,8.088,0,0,0
1051,5.287,3.3732,0,0,9,0,0,35.3,0,9,...,0,3,0,3.787,3.083,3,9.278,0,0,0
1052,4.869,1.767,0,1,9,0,5,44.4,0,4,...,4,13,0,3.848,2.576,5,9.537,1,0,0
1053,5.158,1.6914,2,0,36,0,9,56.1,0,0,...,1,16,0,5.808,2.055,8,11.055,0,1,0
1054,5.076,2.6588,2,0,0,0,4,54.5,0,0,...,2,0,0,4.009,2.206,0,9.13,0,2,0


In [25]:
X = bio_df[prediction_var]
Y = bio_df[41]
kfold = model_selection.KFold(n_splits=10, random_state=7)
abc = AdaBoostClassifier(n_estimators=30, random_state=7)
results = model_selection.cross_val_score(abc, X, Y, cv=kfold)
print(results.mean())

0.8133423180592991


### Case Study II

In [26]:
glass_df = pd.read_csv('glass.csv', sep=",")
print(glass_df.shape)
glass_df.head()

(214, 10)


Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [27]:
prediction_var = glass_df.columns
prediction_var = prediction_var.drop('Type')
prediction_var

Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe'], dtype='object')

In [28]:
train2, test2 = train_test_split(glass_df, test_size = 0.2)# in this our main data is splitted into train and test
# we can check their dimension
print(train2.shape)
print(test2.shape)
train2_X = train2[prediction_var]# taking the training data input
train2_y=train2['Type']# This is output of our training data
# same we have to do for test
test2_X= test2[prediction_var] # taking test data inputs
test2_y =test2['Type']   #output value of test dat

(171, 10)
(43, 10)


In [29]:
#Decision Tree classifier
model = DecisionTreeClassifier()
model.fit(train2_X,train2_y)# now fit our model for traiing data
prediction=model.predict(test2_X)
print(metrics.accuracy_score(prediction,test2_y)) # to check the accuracy

0.6511627906976745


In [30]:
model2 = DecisionTreeClassifier()
clf = make_pipeline(preprocessing.StandardScaler(), model2)
X = glass_df[prediction_var]
Y = glass_df['Type']
#kfold = model_selection.KFold(n_splits=3, random_state=7)
ss = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
results = model_selection.cross_val_score(clf, X, Y, cv=ss)
print(results)

[0.64615385 0.70769231 0.64615385]


In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score

parameters = {'n_estimators': [10, 100, 200, 500]}

ss = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
clf = GridSearchCV(RandomForestClassifier(), parameters, cv=ss, scoring='precision_macro')
clf.fit(X, Y)
print(clf.best_params_)
clf.cv_results_
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'n_estimators': 200}
0.709 (+/-0.195) for {'n_estimators': 10}
0.774 (+/-0.151) for {'n_estimators': 100}
0.795 (+/-0.143) for {'n_estimators': 200}
0.775 (+/-0.136) for {'n_estimators': 500}


### Case Study III

In [33]:
letter_df = pd.read_csv('letterCG.data', sep=" ")
print(letter_df.shape)
letter_df.head()

(1509, 18)


Unnamed: 0,Class,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx,Unnamed: 17
0,C,4,10,5,8,2,5,7,7,10,7,6,13,1,8,4,9,
1,C,3,5,4,4,2,4,8,5,7,12,9,11,1,10,2,7,
2,G,5,10,6,7,3,8,7,8,8,6,7,9,2,7,5,10,
3,C,5,9,6,7,4,3,8,6,7,12,10,13,1,9,3,7,
4,G,4,8,5,6,3,6,6,7,6,10,7,12,2,9,4,9,


In [34]:
prediction_var = letter_df.columns
prediction_var = prediction_var.drop('Class')
prediction_var = prediction_var.drop('Unnamed: 17')
prediction_var

Index(['x-box', 'y-box', 'width', 'high', 'onpix', 'x-bar', 'y-bar', 'x2bar',
       'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy', 'y-ege', 'yegvx'],
      dtype='object')

In [35]:
train3, test3 = train_test_split(letter_df, test_size = 0.2)# in this our main data is splitted into train and test
# we can check their dimension
print(train3.shape)
print(test3.shape)
train3_X = train3[prediction_var]# taking the training data input
train3_y=train3['Class']# This is output of our training data
# same we have to do for test
test3_X= test3[prediction_var] # taking test data inputs
test3_y =test3['Class']   #output value of test dat

(1207, 18)
(302, 18)


In [36]:
dtc_model = DecisionTreeClassifier(max_depth=1)
for l_rate in [1, 2, 3, 4, 8, 12, 16]:
    ada_clf = AdaBoostClassifier(dtc_model, n_estimators=30, random_state=7, learning_rate=l_rate)
    ada_clf.fit(train3_X, train3_y)
    prediction = ada_clf.predict(test3_X)
    print(metrics.accuracy_score(prediction,test3_y)) # to check the accuracy

0.9337748344370861
0.7682119205298014
0.2052980132450331
0.2052980132450331
0.2052980132450331
0.49337748344370863
0.2052980132450331


In [37]:
dtc_model = DecisionTreeClassifier(max_depth=2)
for l_rate in [1, 2, 3, 4, 8, 12, 16]:
    ada_clf = AdaBoostClassifier(dtc_model, n_estimators=30, random_state=7, learning_rate=l_rate)
    ada_clf.fit(train3_X, train3_y)
    prediction = ada_clf.predict(test3_X)
    print(metrics.accuracy_score(prediction,test3_y)) # to check the accuracy

0.956953642384106
0.9139072847682119
0.8013245033112583
0.2052980132450331
0.49337748344370863
0.20198675496688742
0.20198675496688742
