### Loading MNIST dataset using sklearn.datasets

In [49]:
import pandas as pd
from sklearn.datasets import fetch_openml

In [50]:
# Load MNIST from OpenML
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist["data"], mnist["target"]

In [51]:
print(X.shape)
print(y.shape)

(70000, 784)
(70000,)


In [52]:
X

Unnamed: 0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
y

Unnamed: 0,class
0,5
1,0
2,4
3,1
4,9
...,...
69995,2
69996,3
69997,4
69998,5


In [54]:
print(X.columns)

Index(['pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6', 'pixel7',
       'pixel8', 'pixel9', 'pixel10',
       ...
       'pixel775', 'pixel776', 'pixel777', 'pixel778', 'pixel779', 'pixel780',
       'pixel781', 'pixel782', 'pixel783', 'pixel784'],
      dtype='object', length=784)


In [55]:
y = y.astype(int)

In [56]:
from sklearn.model_selection import train_test_split

In [57]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [58]:
print(X_train.shape)
print(y_train.shape)

(56000, 784)
(56000,)


### Applying Log Reg and DT

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [60]:
clf1 = LogisticRegression()
clf2 = DecisionTreeClassifier()

In [61]:
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [62]:
y_pred1 = clf1.predict(X_test)
y_pred2 = clf2.predict(X_test)

### Accuracy and Confusion Matrix

In [63]:
from sklearn.metrics import accuracy_score,confusion_matrix

print("Accuracy of Logistic Regression",accuracy_score(y_test,y_pred1))
print("Accuracy of Decision Trees",accuracy_score(y_test,y_pred2))

Accuracy of Logistic Regression 0.9178571428571428
Accuracy of Decision Trees 0.8698571428571429


In [64]:
print("Logistic Regression Confusion Matrix\n")
pd.DataFrame(confusion_matrix(y_test,y_pred1))

Logistic Regression Confusion Matrix



Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1289,1,6,0,5,14,12,6,8,2
1,0,1557,5,9,2,7,0,3,15,2
2,4,20,1229,25,16,8,19,15,35,9
3,6,7,30,1298,1,38,4,14,19,16
4,4,0,8,5,1194,4,13,6,12,49
5,7,11,8,53,16,1087,20,2,54,15
6,7,3,20,0,14,14,1332,2,4,0
7,5,4,28,4,8,5,0,1408,2,39
8,11,24,17,38,7,39,12,10,1186,13
9,7,10,7,14,40,5,0,45,22,1270


In [65]:
print("Decision Tree Confusion Matrix\n")
pd.DataFrame(confusion_matrix(y_test,y_pred2))

Decision Tree Confusion Matrix



Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1235,2,20,10,9,13,32,5,12,5
1,2,1524,6,10,16,3,7,11,14,7
2,18,19,1143,41,23,15,27,28,44,22
3,11,10,37,1202,10,63,11,24,33,32
4,10,3,14,13,1121,14,20,14,24,62
5,22,17,14,63,17,1053,22,5,29,31
6,19,6,15,8,21,23,1258,3,36,7
7,3,15,16,18,18,6,1,1370,19,37
8,12,12,59,48,38,34,23,11,1089,31
9,9,6,16,30,68,27,9,36,36,1183


### Precision, Recall and F1-Score

In [66]:
from sklearn.metrics import precision_score,recall_score,f1_score

In [67]:
precision_score(y_test,y_pred1,average=None)

array([0.9619403 , 0.95113012, 0.90500736, 0.89764869, 0.91634689,
       0.89025389, 0.94334278, 0.93183322, 0.87398674, 0.8975265 ])

In [68]:
recall_score(y_test,y_pred1,average=None)

array([0.95979151, 0.973125  , 0.89057971, 0.90579204, 0.92200772,
       0.85388845, 0.95415473, 0.93679308, 0.87398674, 0.8943662 ])

In [69]:
f1_score(y_test,y_pred1,average=None)

array([0.9608647 , 0.96200185, 0.89773557, 0.90170198, 0.91916859,
       0.87169206, 0.94871795, 0.93430657, 0.87398674, 0.89594356])

### macro & weighted - Precision, Recall and F1

In [71]:
# macro precision
precision_score(y_test,y_pred1,average='macro')

0.916901648286388

In [72]:
# weighted precision
precision_score(y_test,y_pred1,average='weighted')

0.9176311889392749

In [73]:
# macro recall
recall_score(y_test,y_pred1,average='macro')

0.9164485181758281

In [74]:
# weighted recall
recall_score(y_test,y_pred1,average='weighted')

0.9178571428571428

In [76]:
# macro f1
f1_score(y_test,y_pred1,average='macro')

0.9166119578810752

In [77]:
# weighted f1
f1_score(y_test,y_pred1,average='weighted')

0.9176827499615795

### Classification Report

In [78]:
# For logistic regression
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred1))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1343
           1       0.95      0.97      0.96      1600
           2       0.91      0.89      0.90      1380
           3       0.90      0.91      0.90      1433
           4       0.92      0.92      0.92      1295
           5       0.89      0.85      0.87      1273
           6       0.94      0.95      0.95      1396
           7       0.93      0.94      0.93      1503
           8       0.87      0.87      0.87      1357
           9       0.90      0.89      0.90      1420

    accuracy                           0.92     14000
   macro avg       0.92      0.92      0.92     14000
weighted avg       0.92      0.92      0.92     14000



In [79]:
# For Decision Tree (DT)
print(classification_report(y_test,y_pred2))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1343
           1       0.94      0.95      0.95      1600
           2       0.85      0.83      0.84      1380
           3       0.83      0.84      0.84      1433
           4       0.84      0.87      0.85      1295
           5       0.84      0.83      0.83      1273
           6       0.89      0.90      0.90      1396
           7       0.91      0.91      0.91      1503
           8       0.82      0.80      0.81      1357
           9       0.83      0.83      0.83      1420

    accuracy                           0.87     14000
   macro avg       0.87      0.87      0.87     14000
weighted avg       0.87      0.87      0.87     14000

