In [14]:
# 07/02/2023 <---- Started working on
# Author: Pushpraj Katiyar
# email: pk825@snu.edu.in <--- for any query, reach out to this email
# Roll no: 2220120001

#let's import all useful packages

# dataset is provided in form of a zip file, to extract it let's import zipfile 
import zipfile
#To read extracted dataset csv, let's import panda 
import pandas as pd

# Let's import required sklearn lib methods,
# Documentation can be found at https://scikit-learn.org/
from sklearn.model_selection import train_test_split # <----- train_test_split Split arrays into random train and test subsets.
from sklearn.preprocessing import StandardScaler     # <----- It removes the mean and scaling to unit variance.
from sklearn.metrics import accuracy_score

# Import all the required classifier from sklearn lib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# getting some upcoming deprication warning due to installed python version. bit of non essencial code
import warnings
warnings.filterwarnings('ignore')

# Extract the CSV file from the ZIP file
with zipfile.ZipFile("MNIST_Dataset.zip", "r") as zip_ref:
    zip_ref.extractall("MNIST_Dataset")

In [15]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv("MNIST_Dataset/mnist.csv")

# Split features and target
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

# Preprocess the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [16]:
# Apply KNN classifier and find the accuracy score
accuracy = []
for k in range(1, 11):
#     print("Calculating accuracy for: K =", k)
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    print("Accuracy for: K =",k, "is: ", accuracy_score(y_test, y_pred))
    accuracy.append(accuracy_score(y_test, y_pred))

# Determine the best K-value
best_k = accuracy.index(max(accuracy)) + 1
print("Best K-value:", best_k)

Accuracy for: K = 1 is:  0.9353968253968254
Accuracy for: K = 2 is:  0.9246825396825397
Accuracy for: K = 3 is:  0.9364285714285714
Accuracy for: K = 4 is:  0.9334920634920635
Accuracy for: K = 5 is:  0.935952380952381
Accuracy for: K = 6 is:  0.9343650793650794
Accuracy for: K = 7 is:  0.933968253968254
Accuracy for: K = 8 is:  0.9346031746031747
Accuracy for: K = 9 is:  0.9330952380952381
Accuracy for: K = 10 is:  0.9325396825396826
Best K-value: 3


In [21]:

# Apply KNN classifier and find the accuracy score
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred)
print("Final KNN accuracy: ", knn_accuracy)

# Apply Decision Tree classifier and pruning methods to improve the testing accuracy
dt = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred)

print("Final Decision tree accuracy: ", dt_accuracy)

Final KNN accuracy:  0.9364285714285714
Final Decision tree accuracy:  0.8646031746031746


In [29]:
#for best results, we need to identify best value of n_estimators for Random forest clasifier which is number of trees in the forest
rf_accuracy = []
for n in range(10, 100):
#     print("Calculating accuracy for: K =", k)
    rf = RandomForestClassifier(n_estimators=n, criterion='entropy', random_state=0)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    print("Accuracy for: n_estimators =",n, "is: ", accuracy_score(y_test, y_pred))
    rf_accuracy.append(accuracy_score(y_test, y_pred))
    
# Determine the best K-value
best_n = rf_accuracy.index(max(rf_accuracy)) + 1
print("Best n_estimators value:", best_n)


Accuracy for: n_estimators = 10 is:  0.9371428571428572
Accuracy for: n_estimators = 11 is:  0.9399206349206349
Accuracy for: n_estimators = 12 is:  0.943968253968254
Accuracy for: n_estimators = 13 is:  0.9448412698412698
Accuracy for: n_estimators = 14 is:  0.9452380952380952
Accuracy for: n_estimators = 15 is:  0.947936507936508
Accuracy for: n_estimators = 16 is:  0.9486507936507936
Accuracy for: n_estimators = 17 is:  0.9503968253968254
Accuracy for: n_estimators = 18 is:  0.9521428571428572
Accuracy for: n_estimators = 19 is:  0.9523015873015873
Accuracy for: n_estimators = 20 is:  0.9526984126984127
Accuracy for: n_estimators = 21 is:  0.9534126984126984
Accuracy for: n_estimators = 22 is:  0.952936507936508
Accuracy for: n_estimators = 23 is:  0.9538095238095238
Accuracy for: n_estimators = 24 is:  0.954047619047619
Accuracy for: n_estimators = 25 is:  0.9542063492063492
Accuracy for: n_estimators = 26 is:  0.9545238095238096
Accuracy for: n_estimators = 27 is:  0.9542857142857

In [30]:
# Apply Random Forest classifier
rf = RandomForestClassifier(n_estimators=best_n, criterion='entropy', random_state=0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print("Final Random forest accuracy: ", rf_accuracy)

Final Random forest accuracy:  0.9587301587301588


In [31]:
# Compare the performance of the three classifiers
print("KNN Accuracy:", knn_accuracy)
print("Decision Tree Accuracy:", dt_accuracy)
print("Random Forest Accuracy:", rf_accuracy)

# Suggest the best classifier for the given dataset
if knn_accuracy == max(knn_accuracy, dt_accuracy, rf_accuracy):
    print("KNN is the best classifier for the given dataset.")
elif dt_accuracy == max(knn_accuracy, dt_accuracy, rf_accuracy):
    print("Decision Tree is the best classifier for the given dataset.")
else:
    print("Random Forest is the best classifier for the given dataset.")

KNN Accuracy: 0.9364285714285714
Decision Tree Accuracy: 0.8646031746031746
Random Forest Accuracy: 0.9587301587301588
Random Forest is the best classifier for the given dataset.
