# Handwritten Digits Recognition using Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, multilabel_confusion_matrix
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

In [2]:
# split dataset into 50/50 according to custom in Digits dataset
X_train = np.load('/Users/noaschwensfeier/python_ml/Machine_learning_Project/dataset/X_train.npy')
y_train = np.load('/Users/noaschwensfeier/python_ml/Machine_learning_Project/dataset/y_train.npy')

In [3]:
#specify instances of classes for each processing step
scaler = StandardScaler()
pca = PCA(svd_solver='auto')
classifier = LogisticRegression(max_iter=500, multi_class='multinomial')

#combine in pipeline
pipe = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('classifier', classifier)])

In [4]:
#specify parameter space to search
param_grid = {
    'pca__n_components': np.linspace(2, 10, 8, dtype=int),
}

In [5]:
search = GridSearchCV(pipe, param_grid=param_grid, n_jobs=2, cv=5, verbose=4, refit=True)
search.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 2/5] END ...............pca__n_components=2;, score=0.605 total time=   0.2s
[CV 1/5] END ...............pca__n_components=2;, score=0.570 total time=   0.2s
[CV 3/5] END ...............pca__n_components=2;, score=0.590 total time=   0.5s
[CV 4/5] END ...............pca__n_components=2;, score=0.570 total time=   0.5s
[CV 5/5] END ...............pca__n_components=2;, score=0.585 total time=   0.5s
[CV 1/5] END ...............pca__n_components=3;, score=0.640 total time=   0.5s
[CV 3/5] END ...............pca__n_components=3;, score=0.710 total time=   0.4s
[CV 2/5] END ...............pca__n_components=3;, score=0.745 total time=   0.5s
[CV 4/5] END ...............pca__n_components=3;, score=0.710 total time=   0.2s
[CV 5/5] END ...............pca__n_components=3;, score=0.770 total time=   0.3s
[CV 1/5] END ...............pca__n_components=4;, score=0.715 total time=   0.4s
[CV 2/5] END ...............pca__n_components=4;,

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('pca', PCA()),
                                       ('classifier',
                                        LogisticRegression(max_iter=500,
                                                           multi_class='multinomial'))]),
             n_jobs=2,
             param_grid={'pca__n_components': array([ 2,  3,  4,  5,  6,  7,  8, 10])},
             verbose=4)

In [6]:
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.930):
{'pca__n_components': 10}


In [7]:
search.best_estimator_.score(X_train, y_train)

0.969

In [8]:
joblib.dump(search.best_estimator_, 'lr_optimal.joblib')

['lr_optimal.joblib']

In [9]:
#print(search.cv_results_)
results = pd.DataFrame(search.cv_results_)
print(results)
results.to_csv('lr_optimal.csv', sep=',', header=True)

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.382061      0.158283         0.002033        0.000476   
1       0.359876      0.116450         0.003088        0.001352   
2       0.340940      0.103788         0.002229        0.001304   
3       0.397264      0.090593         0.003976        0.004244   
4       0.649411      0.180984         0.003298        0.001896   
5       0.358652      0.029090         0.001926        0.000468   
6       0.348344      0.066294         0.002511        0.001009   
7       0.266402      0.022320         0.002726        0.001915   

  param_pca__n_components                     params  split0_test_score  \
0                       2   {'pca__n_components': 2}              0.570   
1                       3   {'pca__n_components': 3}              0.640   
2                       4   {'pca__n_components': 4}              0.715   
3                       5   {'pca__n_components': 5}              0.740   
4                    