# Handwritten Digits Recognition using Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, multilabel_confusion_matrix
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

In [2]:
# split dataset into 50/50 according to custom in Digits dataset
X_train = np.load('../dataset/X_train.npy')
y_train = np.load('../dataset/y_train.npy')

In [3]:
#specify instances of classes for each processing step
scaler = StandardScaler()
pca = PCA(svd_solver='auto')
classifier = LogisticRegression(max_iter=500, multi_class='multinomial')

#combine in pipeline
pipe = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('classifier', classifier)])

In [4]:
#specify parameter space to search
param_grid = {
    'pca__n_components': np.linspace(2, 10, 8, dtype=int),
}

In [5]:
search = GridSearchCV(pipe, param_grid=param_grid, n_jobs=2, cv=5, verbose=4, refit=True)
search.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 2/5] END ...............pca__n_components=2;, score=0.605 total time=   0.1s
[CV 1/5] END ...............pca__n_components=2;, score=0.570 total time=   0.3s
[CV 3/5] END ...............pca__n_components=2;, score=0.590 total time=   0.2s
[CV 4/5] END ...............pca__n_components=2;, score=0.570 total time=   0.1s
[CV 5/5] END ...............pca__n_components=2;, score=0.585 total time=   0.2s
[CV 1/5] END ...............pca__n_components=3;, score=0.640 total time=   0.2s
[CV 2/5] END ...............pca__n_components=3;, score=0.745 total time=   0.2s
[CV 3/5] END ...............pca__n_components=3;, score=0.710 total time=   0.1s
[CV 4/5] END ...............pca__n_components=3;, score=0.710 total time=   0.2s
[CV 5/5] END ...............pca__n_components=3;, score=0.770 total time=   0.2s
[CV 1/5] END ...............pca__n_components=4;, score=0.715 total time=   0.2s
[CV 2/5] END ...............pca__n_components=4;,

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('pca', PCA()),
                                       ('classifier',
                                        LogisticRegression(max_iter=500,
                                                           multi_class='multinomial'))]),
             n_jobs=2,
             param_grid={'pca__n_components': array([ 2,  3,  4,  5,  6,  7,  8, 10])},
             verbose=4)

In [6]:
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.930):
{'pca__n_components': 10}


In [7]:
search.best_estimator_.score(X_train, y_train)

0.969

In [8]:
joblib.dump(search.best_estimator_, 'lr_optimal.joblib')

['lr_optimal.joblib']

In [9]:
#print(search.cv_results_)
results = pd.DataFrame(search.cv_results_)
print(results)
results.to_csv('lr_optimal.csv', sep=',', header=True)

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.182121      0.043873         0.000888        0.000276   
1       0.172896      0.039012         0.001013        0.000259   
2       0.191999      0.041252         0.001022        0.000591   
3       0.225085      0.043715         0.000834        0.000118   
4       0.208262      0.035751         0.007945        0.003220   
5       0.244521      0.057697         0.005100        0.002544   
6       0.212782      0.050602         0.005297        0.001528   
7       0.199924      0.045999         0.008822        0.002816   

  param_pca__n_components                     params  split0_test_score  \
0                       2   {'pca__n_components': 2}              0.570   
1                       3   {'pca__n_components': 3}              0.640   
2                       4   {'pca__n_components': 4}              0.715   
3                       5   {'pca__n_components': 5}              0.740   
4                    