In [16]:
import pandas as pd
import numpy as np
import os 

In [32]:
mnist_train=pd.read_csv('mnist/mnist_train.csv',header=None)
mnist_test=pd.read_csv('mnist/mnist_test.csv',header=None)
print(mnist_train.shape)
mnist_test.shape

(60000, 785)


(10000, 785)

In [52]:
mnist_X_train=mnist_train.drop([0],axis=1)
mnist_X_test=mnist_test.drop([0],axis=1)
mnist_y_train=mnist_train.iloc[:,0]
mnist_y_test=mnist_test.iloc[:,0]

In [53]:
print(mnist_X_train.shape,mnist_y_train.shape)
print(mnist_X_test.shape,mnist_y_test.shape)

(60000, 784) (60000,)
(10000, 784) (10000,)


In [54]:
from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val=train_test_split(mnist_X_train,mnist_y_train,test_size=10000,random_state=42)

In [62]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

In [63]:
random_forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)
extra_trees_clf = ExtraTreesClassifier(n_estimators=10, random_state=42)
svm_clf = LinearSVC(random_state=42)
mlp_clf = MLPClassifier(random_state=42)

In [64]:
for clf in [random_forest_clf,extra_trees_clf,svm_clf,mlp_clf]:
    clf.fit(X_train,y_train)
    print('For {} estimator confusion matrix of train is {}'.format(clf,pd.crosstab(y_train,clf.predict(X_train),rownames=['actual'],colnames=['predicted'])))

For RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False) estimator confusion matrix of train is col_0     0     1     2     3     4     5     6     7     8     9
0                                                                
0      4938     0     0     0     0     0     1     0     0     0
1         0  5649     0     0     0     0     0     0     0     0
2         1     0  4962     0     0     0     0     1     0     0
3         0     0     0  5130     0     0     0     1     0     0
4         0     0     1     0  4860     0     0     0     0     1
5         0     0     0     3     0  4499     0     0     0     0
6         0     0     0     0     0     1  4936 

In [65]:
from sklearn.metrics import accuracy_score
[print(mod.score(X_val,y_val)) for mod in [random_forest_clf,extra_trees_clf,svm_clf,mlp_clf]]

0.9446
0.9507
0.8765
0.951


[None, None, None, None]

In [67]:
#removing svm classifier
estimators=[random_forest_clf,extra_trees_clf,svm_clf,mlp_clf]
#creating a voting classifier
voting_class=[('random_forest',random_forest_clf),
             ('extra_trees',extra_trees_clf),
             ('svm',svm_clf),('mlp',mlp_clf)]
from sklearn.ensemble import VotingClassifier
vot_mod=VotingClassifier(voting_class)
vot_mod.fit(X_train,y_train)

VotingClassifier(estimators=[('random_forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
   ...       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False))],
         n_jobs=1, voting='hard', weights=None)

In [68]:
vot_mod.score(X_val,y_val)

0.95850000000000002

In [73]:
#removing the svm classifier from the voting classifier
del vot_mod.estimators_[2]

In [75]:
vot_mod.score(X_val,y_val)# A much better as svm is decreasing the performance. 

0.96150000000000002

In [81]:
#now trying with soft_classifier 
# vot_mod.set_params()
vot_mod.voting='soft'

In [82]:
vot_mod.score(X_val,y_val)#On using soft classifier this accuracy is better to some extent

0.96540000000000004

In [91]:
vot_mod.score(mnist_X_test,mnist_y_test)

0.96550000000000002

In [94]:
from sklearn.metrics import classification_report
pd.crosstab(mnist_y_test,vot_mod.predict(mnist_X_test))
print(classification_report(mnist_y_test,vot_mod.predict(mnist_X_test)))

             precision    recall  f1-score   support

          0       0.97      0.99      0.98       980
          1       0.99      0.99      0.99      1135
          2       0.95      0.97      0.96      1032
          3       0.94      0.97      0.95      1010
          4       0.97      0.96      0.96       982
          5       0.99      0.95      0.97       892
          6       0.98      0.97      0.98       958
          7       0.98      0.96      0.97      1028
          8       0.95      0.95      0.95       974
          9       0.95      0.95      0.95      1009

avg / total       0.97      0.97      0.97     10000



In [143]:
# np.empty()
val_pred_3=pd.DataFrame(np.empty((X_val.shape[0],3)),columns=['random_forest','extra_trees','mlp'])
val_pred_3.iloc[:,0]=random_forest_clf.predict(X_val)
val_pred_3.iloc[:,1]=extra_trees_clf.predict(X_val)
val_pred_3.iloc[:,2]=mlp_clf.predict(X_val)


In [145]:
val_pred_3['Original_target']=y_val.values
val_pred_3[val_pred_3.random_forest!=val_pred_3.Original_target]

Unnamed: 0,random_forest,extra_trees,mlp,Original_target
15,9,9,9,7
17,4,5,8,8
31,2,2,8,8
34,3,5,5,5
47,7,7,7,2
114,8,3,3,3
142,5,8,8,6
152,3,3,3,9
163,3,2,2,2
167,9,7,9,7


In [146]:
val_pred_3.head()
X_stacked=val_pred_3.drop('Original_target',axis=1)
y_stacked=val_pred_3['Original_target']
from sklearn.linear_model import LogisticRegression
mod=LogisticRegression()
mod.fit(X_stacked,y_stacked)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [147]:
from sklearn.metrics import classification_report
mod_stacked=RandomForestClassifier(n_estimators=100,oob_score=True)
from sklearn.model_selection import cross_val_score
cross_val_score(mod_stacked,X_stacked,y_stacked,scoring='accuracy',cv=5)

array([ 0.96353646,  0.95952024,  0.95252374,  0.9535    ,  0.96092184])

In [148]:
mod_stacked.fit(X_stacked,y_stacked)
mod_stacked.oob_score_#This is how we create a blender using stacking models concept 

0.95689999999999997

In [149]:
X_stacked.head()

Unnamed: 0,random_forest,extra_trees,mlp
0,7,7,7
1,3,3,3
2,8,8,8
3,9,9,9
4,3,3,3


In [154]:
test_pred_df=pd.DataFrame(np.empty((mnist_X_test.shape[0],3)),columns=['rand_for','extra','mlps'])
for i,j in zip([random_forest_clf,extra_trees_clf,mlp_clf],list(test_pred_df.columns)):
    test_pred_df.loc[:,j]=i.predict(mnist_X_test)

In [159]:
accuracy_score(mnist_y_test,mod_stacked.predict(test_pred_df).reshape([-1]))#Test accuracy of blending models 
#relatively voting classing classifier has done better that too soft voting classifier 

0.96030000000000004

In [2]:
# Staged Predict
import numpy as np
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=49)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)