In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, cohen_kappa_score
from sklearn.model_selection import train_test_split,validation_curve
from sklearn import tree


import os
# Show the full path of every file found under the Kaggle input directory,
# one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    if filenames:
        print("\n".join(os.path.join(dirname, filename) for filename in filenames))



In [None]:
# Locations of the MNIST CSV exports inside the Kaggle input directory.
TRAIN_CSV = "/kaggle/input/mnist-in-csv/mnist_train.csv"
TEST_CSV = "/kaggle/input/mnist-in-csv/mnist_test.csv"

# Load both files; later cells treat the first column as the digit label
# and the remaining columns as the image data.
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

# Print (rows, columns) of each frame as a quick sanity check.
print(df_train.shape, df_test.shape, sep="\n")

In [None]:
# Split the training frame into a feature matrix X and a label vector y.
#
# Column 0 holds the label (the number visible in the image); every other
# column holds image information. Slicing the whole frame at once replaces the
# former row-by-row iterrows() loop, which did per-row Python work for the
# entire frame and relied on positional Series indexing (row[1][0]) that pandas
# has deprecated for label-indexed Series.
y = df_train.iloc[:, 0].to_numpy()

# Dividing by 255 rescales the pixel columns (assumes 8-bit intensities in
# 0-255, as in the usual MNIST export -- TODO confirm) and yields float64.
X = df_train.iloc[:, 1:].to_numpy() / 255

# Every image must have exactly one label.
assert len(X) == len(y)
print(len(X))
print(len(y))

In [None]:
# Hold out 10% of the labelled data for evaluation; the fixed seed makes the
# split identical on every run.
split_parts = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Sanity check: matching sample counts and the shape of a single sample.
print(len(X_train), len(y_train))
print(np.shape(X_train[1]))


In [None]:
# Fit a decision tree on the training split and score it on the held-out split.
#
# random_state is fixed because scikit-learn permutes the candidate features at
# every split (even with splitter="best"), so an unseeded tree -- and therefore
# the reported accuracy -- can change from run to run. 42 matches the seed used
# for train_test_split.
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)  # fit() returns the estimator itself; no need to rebind
y_pred = clf.predict(X_test)

# Compare the first 20 predictions with the true labels by eye.
print(y_pred[:20], ".....")
print(y_test[:20], ".....")

# Fraction of held-out samples classified correctly.
print(metrics.accuracy_score(y_test, y_pred))

In [None]:
# Build the feature matrix for the separate test file.
#
# The first column is skipped exactly as for the training frame, and the
# remaining columns are divided by 255 so the features are on the same scale the
# classifier was trained on. The vectorised slice replaces the former
# iterrows() loop (slow, and dependent on deprecated positional Series
# indexing) while producing the same float64 array.
X_new = df_test.iloc[:, 1:].to_numpy() / 255

# One feature row per row of the test frame.
print(len(X_new))
print(len(df_test))

In [None]:
# Run the fitted tree over the feature matrix built from the test file and
# display the predicted labels.
y_new_pred = clf.predict(X=X_new)
print(y_new_pred)

In [None]:
# Summarise performance on the held-out split and draw an annotated
# confusion matrix.
print("Cohen Kappa score is " + str(cohen_kappa_score(y_test, y_pred)))
print("Accuracy is " + str(metrics.accuracy_score(y_test, y_pred)))

# Pin the matrix to the ten digit classes so it is always 10x10 and lines up
# with the ten tick labels below, even if a digit were absent from y_test.
class_labels = np.arange(10)
cm = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(9, 9))
plt.imshow(cm, interpolation='nearest', cmap='Pastel1')
plt.title('Confusion matrix', size=15)
plt.colorbar()

tick_marks = np.arange(10)
tick_names = [str(digit) for digit in class_labels]
plt.xticks(tick_marks, tick_names, rotation=45, size=10)
plt.yticks(tick_marks, tick_names, size=10)
plt.ylabel('Actual label', size=15)
plt.xlabel('Predicted label', size=15)

# Write each count into its cell. Rows of cm are actual labels (image y axis),
# columns are predicted labels (image x axis), hence xy=(column, row).
# The loop variables are deliberately NOT called x / y: the previous version
# used `y`, which overwrote the global label array `y` and broke any later
# re-run of the train/test split cell.
n_actual, n_predicted = cm.shape
for actual in range(n_actual):
    for predicted in range(n_predicted):
        plt.annotate(str(cm[actual, predicted]), xy=(predicted, actual),
                     horizontalalignment='center',
                     verticalalignment='center')

# Call tight_layout last so it accounts for the axis labels and annotations.
plt.tight_layout()

In [None]:
# Validation curve: how training and cross-validated accuracy of a decision
# tree change with its maximum depth.
# (validation_curve is already imported in the notebook's first cell.)

# Depths 1, 6, 11, ..., 36. The previous grid (1, 5001, ..., 35001) jumped
# straight from a stump to limits far deeper than a tree on this data is
# expected to grow, so every point after the first described the same
# effectively unlimited tree and the curve carried no information.
param_range = np.arange(1, 41, 5)

# Seed the estimator so the curve is reproducible across runs.
train_scores, test_scores = validation_curve(
    tree.DecisionTreeClassifier(random_state=42),
    X_train, y_train,
    param_name="max_depth", param_range=param_range,
    scoring="accuracy", n_jobs=-1)

# Scores have one row per depth and one column per CV fold; aggregate over folds.
# Calculate mean and standard deviation for training set scores
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

# Calculate mean and standard deviation for test set scores
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Plot mean accuracy scores for training and test sets
plt.plot(param_range, train_mean, label="Training score", color="r")
plt.plot(param_range, test_mean, label="Cross-validation score", color="g")

# Plot accuracy bands (mean +/- one standard deviation) for both sets.
# Translucent, colour-matched bands keep the mean curves visible.
plt.fill_between(param_range, train_mean - train_std, train_mean + train_std,
                 color="r", alpha=0.2)
plt.fill_between(param_range, test_mean - test_std, test_mean + test_std,
                 color="g", alpha=0.2)

# Create plot
plt.title("Validation Curve With Decision Tree")
plt.xlabel("Max Depth")  # the swept parameter is max_depth, not a tree count
plt.ylabel("Accuracy Score")
plt.legend(loc="best")
plt.tight_layout()

# Save BEFORE show(): with the inline backend the figure is closed once shown,
# so saving afterwards writes an empty image. (plt.save() does not exist and
# raised AttributeError; savefig is the correct call.)
plt.savefig("validation_curve.png")
plt.show()