In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the training and test CSV files into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

# Report the dimensions of both frames
print("shape of training data :", train_df.shape)
print("shape of test data :", test_df.shape)

# Preview the first five training rows as the cell output
train_df.head(n=5)

In [None]:
# Target: the 'label' column; features: every remaining column
y = train_df['label']
X = train_df.drop(columns='label')

# Hold-out split: 70% of the rows go to training, the rest to evaluation.
# random_state=0 makes the shuffle reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=0,
)

print("Train shape:", Xtrain.shape)
print("Evaluation shape:", Xtest.shape)

In [None]:
#Create 4 different classifiers to choose the best final model

#1. Basic DecisionTreeClassifier
#2. DecisionTreeClassifier with Hyperparameter tuning
#3. Basic RandomForestClassifier
#4. RandomForestClassifier with Hyperparameter tuning

In [None]:
# Min-max scale the features.
# The scaler is fitted on the training split ONLY. The evaluation split and the
# test dataset are then transformed with those same training statistics, so
# nothing from unseen data leaks into preprocessing and all three sets share
# one scale. (Re-fitting on Xtest would scale it with its own min/max and
# would also make the transform of test_df below depend on Xtest.)
sclr = MinMaxScaler()
Xtrain = sclr.fit_transform(Xtrain)
Xtest = sclr.transform(Xtest)

# Apply the same fitted scaler to the test dataset as well
test_df = sclr.transform(test_df)

In [None]:
#Generic method to check the accuracy score and confusion matrix

from sklearn.metrics import confusion_matrix,accuracy_score
def evaluate_model(dt_classifier):
    """Print accuracy and confusion matrix for the train and evaluation splits.

    Uses the notebook-level ``Xtrain``/``ytrain`` and ``Xtest``/``ytest``
    variables as the data to score against.

    Parameters
    ----------
    dt_classifier
        A fitted estimator exposing ``predict``.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    # Predict once per split and reuse the result for both metrics instead of
    # calling predict() separately for the accuracy and the confusion matrix.
    train_pred = dt_classifier.predict(Xtrain)
    print("Train Accuracy :", accuracy_score(ytrain, train_pred))
    print("Train Confusion Matrix:")
    print(confusion_matrix(ytrain, train_pred))
    print("-"*50)
    test_pred = dt_classifier.predict(Xtest)
    print("Test Accuracy :", accuracy_score(ytest, test_pred))
    print("Test Confusion Matrix:")
    print(confusion_matrix(ytest, test_pred))

In [None]:
##### 1. DecisionTreeClassifier

In [None]:
# Baseline decision tree: depth limited to 5, fixed seed for reproducibility.
# fit() returns the estimator itself, so construction and training are chained.
dt = DecisionTreeClassifier(max_depth=5, random_state=0).fit(Xtrain, ytrain)

# Report train / evaluation accuracy and confusion matrices
evaluate_model(dt)

In [None]:
###### 2. DecisionTreeClassifier with hyperparameter tuning

from sklearn.model_selection import GridSearchCV

# Hyperparameter grid handed to GridSearchCV: every combination of the
# values below is tried.
params = dict(
    max_depth=[2, 3, 5, 10, 20],
    min_samples_leaf=[5, 10, 20, 50, 100],
    criterion=['gini', 'entropy'],
)

In [None]:
# Grid search over `params` for the decision tree `dt`:
# 4-fold cross-validation, scored by accuracy, run on all available cores,
# with progress messages enabled (verbose=1).
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=params,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
    verbose=1,
)

In [None]:
%%time
# Run the grid search on the scaled training split; %%time reports how long the cell took
grid_search.fit(Xtrain,ytrain)

In [None]:
# Keep the best estimator found by the grid search, then print its
# train / evaluation accuracy and confusion matrices
bestdt = grid_search.best_estimator_
evaluate_model(bestdt)

In [None]:
##### 3. RandomForestClassifier
# Baseline forest: 100 trees, fixed seed, trained on all available cores.
# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=0,
).fit(Xtrain, ytrain)

# Report train / evaluation accuracy and confusion matrices
evaluate_model(rf)

In [None]:
##### 4. RandomForestClassifier with Hyperparameter tuning

# The parameter grid defined in section 2 is reused here for the random forest.
# Note: this rebinds `grid_search`, replacing the decision-tree search object,
# so cells that read `grid_search` depend on execution order.
# NOTE(review): both `rf` and the search use n_jobs=-1; nested parallelism may
# oversubscribe the CPU - confirm this is intended.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
)

In [None]:
%%time
# Run the random forest grid search on the scaled training split; %%time reports how long the cell took
grid_search.fit(Xtrain,ytrain)

In [None]:
# Keep the best estimator found by the grid search and show it as the cell output
bestrf = grid_search.best_estimator_
bestrf

In [None]:
# Print train / evaluation accuracy and confusion matrices for the tuned random forest
evaluate_model(bestrf)

In [None]:
#Let's check some classification reports
from sklearn.metrics import classification_report

# Print the evaluation-split classification report for each of the four
# models, in the same order and with the same headings as before.
report_targets = (
    ("Classification report for a simple RandomForestClassifier:", rf),
    ("Classification report for a simple DecisionTreeClassifier:", dt),
    ("Classification report for the RandomForestClassifier with hyperparameters tuning:", bestrf),
    ("Classification report for the DecisionTreeClassifier with hyperparameters tuning:", bestdt),
)

for report_title, fitted_model in report_targets:
    print(report_title)
    print("")
    print(classification_report(ytest, fitted_model.predict(Xtest)))

In [None]:
#Sometimes, the simplest of the models gives us the best results.
#The initial RandomForestClassifier that we used gave us an accuracy of 0.9616 with a precision and recall of 0.96 as well.

#Thus, we will consider the simple RandomForestClassifier as our final model

In [None]:
# Predict labels for the scaled test dataset with the final model (the simple random forest).
# Despite the name, these are predictions for test_df, not for the ytest hold-out labels.
ytest_pred = rf.predict(test_df)

In [None]:
# Create the submission file from the final model's predictions.
# The index is a 1-based row id written out under the 'ImageId' header. Its
# length is derived from the predictions instead of a hard-coded 28000, so the
# cell keeps working when the test file has a different number of rows.
df_result = pd.DataFrame(
    ytest_pred,
    columns=['Label'],
    index=np.arange(1, len(ytest_pred) + 1),
)
df_result.to_csv('submission.csv',index_label='ImageId')