### Firstly, libraries are imported for our task. We might add libraries as and when needed in future also, but initially we chose this

In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


import warnings
warnings.filterwarnings("ignore")


### Let's have a look at dataset now. We use tensorflow library to load

In [None]:
import tensorflow as tf

# Example of loading the MNIST dataset
(trainX, trainy), (testX, testy) = tf.keras.datasets.mnist.load_data()

# Summarize the loaded dataset
print(f"Train: X={trainX.shape}, y={trainy.shape}")
print(f"Test: X={testX.shape}, y={testy.shape}")

# Plot the first few images
for i in range(9):
    # Define a subplot
    plt.subplot(330 + 1 + i)
    # Plot the raw pixel data
    plt.imshow(trainX[i], cmap=plt.get_cmap('gray'))

# Show the figure
plt.show()

For our project however, we use csv file. From this link [https://www.kaggle.com/c/digit-recognizer/data?select=train.csv]

In [53]:
df = pd.read_csv('/content/train.csv')

In [54]:
df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
output = df['label']
inputs = df.drop("label",axis=1)

In [56]:
print(output.shape, inputs.shape)

(42000,) (42000, 784)


### We are picking test_size randomly for our splitting of datsaset

In [57]:
import random
from sklearn.model_selection import train_test_split

test_values = [0.2, 0.5, 0.8, 0.01]

test4 = random.choice(test_values)

X_train, X_test, y_train, y_test = train_test_split(inputs, output, test_size=test4, random_state=42)

In [58]:
X_train.shape

(21000, 784)

In [None]:
#### Defining fit_predict

In [59]:
def fit_predict(classifier, X_train, y_train, X_test, y_test):
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    print(f'Accuracy Score is: {accuracy_score(y_test, y_pred)}')
    print('\nClassification Report: ')
    print(classification_report(y_test, y_pred))
    print('\nConfusion Matrix: ')
    print(confusion_matrix(y_test, y_pred))

In [None]:
## Fitting the various models and testing

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


# Initialize classifiers
nb = MultinomialNB()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
lr = LogisticRegression()
knn_ = KNeighborsClassifier(n_neighbors=5)

# Use the fit_predict_and_evaluate function for Naive Bayes
fit_predict(nb, X_train, y_train, X_test, y_test)

### Testing for various models

In [66]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Initialize classifiers
classifiers = {
    'Naive Bayes': MultinomialNB(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5)
}

# Use the fit_predict_and_evaluate function for each classifier
for name, clf in classifiers.items():
    print(f"Results for {name}:")
    fit_predict(clf, X_train, y_train, X_test, y_test)


Results for Naive Bayes:
Accuracy Score is: 0.8228571428571428

Classification Report: 
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      2052
           1       0.88      0.94      0.91      2330
           2       0.88      0.83      0.86      2096
           3       0.80      0.81      0.80      2222
           4       0.82      0.73      0.77      2053
           5       0.84      0.65      0.73      1833
           6       0.87      0.91      0.89      2079
           7       0.94      0.82      0.88      2191
           8       0.65      0.78      0.71      2062
           9       0.69      0.82      0.75      2082

    accuracy                           0.82     21000
   macro avg       0.83      0.82      0.82     21000
weighted avg       0.83      0.82      0.82     21000


Confusion Matrix: 
[[1879    0    9    3    2   25   34    0   99    1]
 [   0 2179   22   12    0    2   13    1   95    6]
 [  22   35 1747   45   22 