In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score,confusion_matrix, cohen_kappa_score, classification_report
from sklearn.model_selection import validation_curve, GridSearchCV
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8" and 3.8
# removed the bare "seaborn" alias, so pick whichever name this install ships
# and fall back to the default style if neither is present.
_seaborn_style = next(
    (name for name in ("seaborn-v0_8", "seaborn") if name in plt.style.available),
    "default",
)
plt.style.use(_seaborn_style)

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


In [None]:
# Load the training and test CSVs into DataFrames, in that order.
train_file, test_file = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/mnist-in-csv/mnist_train.csv',
        '/kaggle/input/mnist-in-csv/mnist_test.csv',
    )
]

In [None]:
# Peek at the first five rows of the training frame.
train_file.head(n=5)

In [None]:
# Peek at the first five rows of the test frame.
test_file.head(n=5)

In [None]:
# Sorted array of the distinct values in the training `label` column,
# i.e. the classes the model will be asked to predict.
np.sort(train_file["label"].unique())

In [None]:
# Split sizes for the hold-out validation set: the first TRAIN_FRACTION of the
# rows in train_file are used for training, every remaining row for validation.
TRAIN_FRACTION = 0.9
num_train = int(len(train_file) * TRAIN_FRACTION)
# Derive the validation count from what is left over so the two counts always
# sum to len(train_file); truncating a separate 10% could lose a row and would
# then disagree with a validation slice taken as iloc[num_train:].
num_validation = len(train_file) - num_train

In [None]:
# Show the (training, validation) row counts as a tuple.
(num_train, num_validation)

In [None]:
# Training arrays from the first num_train rows of train_file:
# features are every column after the first, targets are the first column.
x_train = train_file.iloc[:num_train, 1:].values
y_train = train_file.iloc[:num_train, 0].values

In [None]:
# Validation arrays from the rows of train_file after the first num_train:
# features are every column after the first, targets are the first column.
x_validation = train_file.iloc[num_train:, 1:].values
y_validation = train_file.iloc[num_train:, 0].values

In [None]:
# Print the shape of each split, one per line, as a sanity check on the slicing.
for split_array in (x_train, y_train, x_validation, y_validation):
    print(split_array.shape)

In [None]:

# Disabled hyperparameter search, kept as a bare string literal so that none of
# it is executed. Each grid entry holds a single value, and those values match
# the settings passed to RandomForestClassifier in the following cell.
# NOTE(review): get_best_score is not defined in any visible cell, so running
# this block as-is would presumably raise NameError — confirm before re-enabling.
"""
clf_RF = RandomForestClassifier(random_state=0)
param_grid = {'max_depth': [15], 'max_features': [100],  
              'min_samples_split': [5],'n_estimators' : [50] }
GridCV_RF = GridSearchCV(clf_RF, param_grid, verbose=1, cv=5)
GridCV_RF.fit(x_train,y_train)
score_grid_RF = get_best_score(GridCV_RF)
"""

In [None]:
# Random forest with the tuned hyperparameters; every argument not listed here
# is left at its scikit-learn default (the values that were previously spelled
# out were all defaults).
# `min_impurity_split` is deliberately not passed: it was deprecated and then
# removed in scikit-learn 1.0, where supplying it raises TypeError.
classifier = RandomForestClassifier(
    n_estimators=50,
    max_depth=15,
    max_features=100,
    min_samples_split=5,
    random_state=0,  # fixed seed so repeated runs build the same forest
)
# Fit on the training split; as the cell's last expression this also displays
# the fitted estimator.
classifier.fit(x_train, y_train)

In [None]:
# Predicted labels for the held-out validation features.
prediction_validation = classifier.predict(X=x_validation)

In [None]:
# Fraction of validation samples whose predicted label equals the true label.
validation_accuracy = accuracy_score(y_validation, prediction_validation)
print("Validation Accuracy: " + str(validation_accuracy))

In [None]:
# Feature matrix for the test file: every row, all columns after the first.
# All rows are taken explicitly; num_train is a count of training rows and must
# not bound the test slice (it would silently truncate a larger test file).
x_test = test_file.iloc[:, 1:].values

In [None]:
# Predicted labels for the test features.
# NOTE(review): prediction_test is not read by any later visible cell.
prediction_test = classifier.predict(X=x_test)

In [None]:
# Summary metrics for the validation predictions: overall accuracy, Cohen's
# kappa, and the per-class classification report.
report_accuracy = metrics.accuracy_score(y_validation, prediction_validation)
report_kappa = cohen_kappa_score(y_validation, prediction_validation)
report_text = classification_report(y_validation, prediction_validation)

print("Accuracy is " + str(report_accuracy))
print("Cohen Kappa score is " + str(report_kappa))
print(report_text)

In [None]:
# Confusion matrix of the validation predictions, drawn as an annotated heatmap.
# Rows are actual labels and columns are predicted labels.
cm = metrics.confusion_matrix(y_validation, prediction_validation)

# confusion_matrix orders classes as the sorted union of the true and predicted
# labels, so derive the tick labels the same way instead of hard-coding ten
# digits; the ticks then always line up with the matrix.
class_labels = np.unique(np.concatenate([y_validation, prediction_validation]))
tick_marks = np.arange(len(class_labels))

fig, ax = plt.subplots(figsize=(9, 9))
heatmap = ax.imshow(cm, interpolation='nearest', cmap='Pastel1')
fig.colorbar(heatmap, ax=ax)
ax.set_title('Confusion matrix', size=15)
ax.set_xticks(tick_marks)
ax.set_xticklabels(class_labels, rotation=45, size=10)
ax.set_yticks(tick_marks)
ax.set_yticklabels(class_labels, size=10)
ax.set_ylabel('Actual label', size=15)
ax.set_xlabel('Predicted label', size=15)

# Write each count inside its cell; imshow puts columns on x and rows on y,
# hence xy=(col, row).
for row, col in np.ndindex(*cm.shape):
    ax.annotate(str(cm[row, col]), xy=(col, row),
                horizontalalignment='center',
                verticalalignment='center')

# Lay out last so the axis labels and rotated tick labels are included.
fig.tight_layout()

In [None]:

# Validation curve: how training and cross-validated accuracy change with the
# number of trees in the forest.
# The grid is kept deliberately small. A range such as np.arange(1, 40000, 500)
# requests 80 settings of up to 39,501 trees, each refit once per CV fold on
# the training data, which is not feasible to run in a notebook session.
param_range = np.array([1, 5, 10, 25, 50, 100, 200])

train_scores, test_scores = validation_curve(
    RandomForestClassifier(random_state=0),  # seeded so the curve is reproducible
    x_train, y_train,
    param_name="n_estimators", param_range=param_range,
    scoring="accuracy", n_jobs=-1)

# Mean and standard deviation across CV folds for the training scores
# (axis 1 indexes the folds, axis 0 the parameter values).
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

# Mean and standard deviation across CV folds for the held-out scores.
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Plot mean accuracy for the training and cross-validation folds.
plt.plot(param_range, train_mean, label="Training score", color="r")
plt.plot(param_range, test_mean, label="Cross-validation score", color="g")

# Shade +/- one standard deviation around each curve; the bands reuse the line
# colours and are translucent so they stay attributable and do not hide overlap.
plt.fill_between(param_range, train_mean - train_std, train_mean + train_std,
                 color="r", alpha=0.2)
plt.fill_between(param_range, test_mean - test_std, test_mean + test_std,
                 color="g", alpha=0.2)

# Titles, labels and legend; lay out after the legend exists so it is included.
plt.title("Validation Curve With Random Forest")
plt.xlabel("Number Of Trees")
plt.ylabel("Accuracy Score")
plt.legend(loc="best")
plt.tight_layout()
plt.show()