## Load data

In [1]:
from sklearn import datasets
import pandas as pd

digits = datasets.load_digits()

In [2]:
print(digits.DESCR)

Optical Recognition of Handwritten Digits Data Set

Notes
-----
Data Set Characteristics:
    :Number of Instances: 5620
    :Number of Attributes: 64
    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
    :Missing Attribute Values: None
    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
    :Date: July; 1998

This is a copy of the test set of the UCI ML hand-written digits datasets
http://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

The data set contains images of hand-written digits: 10 classes where
each class refers to a digit.

Preprocessing programs made available by NIST were used to extract
normalized bitmaps of handwritten digits from a preprinted form. From a
total of 43 people, 30 contributed to the training set and different 13
to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of
4x4 and the number of on pixels are counted in each block. This generates
an input matrix of 8x8 where each element is a

In [3]:
dat = pd.DataFrame(digits.data)
dat.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0
5,0.0,0.0,12.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,9.0,16.0,16.0,10.0,0.0,0.0
6,0.0,0.0,0.0,12.0,13.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,0.0,0.0,1.0,9.0,15.0,11.0,3.0,0.0
7,0.0,0.0,7.0,8.0,13.0,16.0,15.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,13.0,5.0,0.0,0.0,0.0,0.0
8,0.0,0.0,9.0,14.0,8.0,1.0,0.0,0.0,0.0,0.0,...,8.0,0.0,0.0,0.0,11.0,16.0,15.0,11.0,1.0,0.0
9,0.0,0.0,11.0,12.0,0.0,0.0,0.0,0.0,0.0,2.0,...,4.0,0.0,0.0,0.0,9.0,12.0,13.0,3.0,0.0,0.0


In [4]:
target = pd.DataFrame(digits.target, columns = ['target'])
target.head()

Unnamed: 0,target
0,0
1,1
2,2
3,3
4,4


In [5]:
import matplotlib.pyplot as plt

images_and_labels = list(zip(digits.images, digits.target))
for index, (image, label) in enumerate(images_and_labels[:4]):
    plt.subplot(2, 4, index + 1)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Digit %i' % label)

## Analyse data

In [6]:
df = dat.copy()
df = pd.concat([df, target], axis=1)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1797 entries, 0 to 1796
Data columns (total 65 columns):
0         1797 non-null float64
1         1797 non-null float64
2         1797 non-null float64
3         1797 non-null float64
4         1797 non-null float64
5         1797 non-null float64
6         1797 non-null float64
7         1797 non-null float64
8         1797 non-null float64
9         1797 non-null float64
10        1797 non-null float64
11        1797 non-null float64
12        1797 non-null float64
13        1797 non-null float64
14        1797 non-null float64
15        1797 non-null float64
16        1797 non-null float64
17        1797 non-null float64
18        1797 non-null float64
19        1797 non-null float64
20        1797 non-null float64
21        1797 non-null float64
22        1797 non-null float64
23        1797 non-null float64
24        1797 non-null float64
25        1797 non-null float64
26        1797 non-null float64
27        1797 non-null float

In [8]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,target
count,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,...,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0
mean,0.0,0.30384,5.204786,11.835838,11.84808,5.781859,1.36227,0.129661,0.005565,1.993879,...,0.206455,0.000556,0.279354,5.557596,12.089037,11.809126,6.764051,2.067891,0.364496,4.490818
std,0.0,0.907192,4.754826,4.248842,4.287388,5.666418,3.325775,1.037383,0.094222,3.19616,...,0.984401,0.02359,0.934302,5.103019,4.374694,4.933947,5.900623,4.090548,1.860122,2.865304
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,11.0,10.0,0.0,0.0,0.0,2.0
50%,0.0,0.0,4.0,13.0,13.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,13.0,14.0,6.0,0.0,0.0,4.0
75%,0.0,0.0,9.0,15.0,15.0,11.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,10.0,16.0,16.0,12.0,2.0,0.0,7.0
max,0.0,8.0,16.0,16.0,16.0,16.0,16.0,15.0,2.0,16.0,...,13.0,1.0,9.0,16.0,16.0,16.0,16.0,16.0,16.0,9.0


In [9]:
corr_matrix = df.corr()
corr_matrix['target']

0              NaN
1        -0.051834
2        -0.011836
3        -0.011489
4         0.100801
5         0.193362
6         0.197343
7         0.101085
8         0.020813
9        -0.012439
10        0.120104
11       -0.129067
12       -0.244489
13        0.076821
14        0.209664
15        0.080925
16       -0.023225
17       -0.026982
18        0.097094
19       -0.140974
20       -0.135314
21        0.113816
22        0.078895
23        0.026063
24       -0.023199
25       -0.169231
26        0.072729
27        0.275468
28        0.234159
29        0.216130
            ...   
35        0.265880
36        0.162342
37        0.161855
38       -0.006153
39             NaN
40       -0.010511
41       -0.137694
42       -0.122766
43       -0.006342
44       -0.024336
45        0.014753
46        0.060820
47        0.046754
48       -0.011780
49       -0.039103
50       -0.142980
51       -0.181037
52       -0.390625
53       -0.189452
54        0.026682
55       -0.099312
56       -0.

### Remove less correlated variables

In [10]:
newCol = []
for i, col in enumerate(df.columns):
    if corr_matrix['target'][i] > 0.1 or corr_matrix['target'][i] < -0.1:
        newCol.append(col)

dat1 = df.loc[:, newCol]
print('Shape: {}'.format(dat1.shape))
dat1.head()

Shape: (1797, 29)


Unnamed: 0,4,5,6,7,10,11,12,14,19,20,...,41,42,50,51,52,53,60,62,63,target
0,9.0,1.0,0.0,0.0,13.0,15.0,10.0,5.0,2.0,0.0,...,4.0,11.0,14.0,5.0,10.0,12.0,10.0,0.0,0.0,0
1,13.0,5.0,0.0,0.0,0.0,11.0,16.0,0.0,15.0,16.0,...,0.0,1.0,1.0,16.0,16.0,6.0,16.0,0.0,0.0,1
2,15.0,12.0,0.0,0.0,3.0,16.0,15.0,0.0,13.0,8.0,...,9.0,16.0,13.0,16.0,16.0,11.0,11.0,9.0,0.0,2
3,13.0,1.0,0.0,0.0,13.0,6.0,15.0,0.0,13.0,13.0,...,0.0,0.0,8.0,4.0,5.0,14.0,13.0,0.0,0.0,3
4,11.0,0.0,0.0,0.0,0.0,7.0,8.0,0.0,13.0,6.0,...,4.0,15.0,0.0,3.0,15.0,10.0,16.0,0.0,0.0,4


# Apply PCA

In [11]:
from sklearn import decomposition

pca = decomposition.PCA(n_components=20, whiten = True)
pca.fit(dat1)
dat2 = pca.transform(dat1)

## Split into train and test sets

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

dat = StandardScaler().fit_transform(dat)
dat1 = StandardScaler().fit_transform(dat1)

X_train, X_test, y_train, y_test = train_test_split(dat, target, test_size = 0.2, random_state=42)
y_train = y_train.values.ravel()

X_train1, X_test1, y_train1, y_test1 = train_test_split(dat1, target, test_size = 0.2, random_state=42)
y_train1 = y_train1.values.ravel()

X_train2, X_test2, y_train2, y_test2 = train_test_split(dat2, target, test_size = 0.2, random_state=42)
y_train2 = y_train2.values.ravel()

## Cross validation to find best algorithm 

In [13]:
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [14]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('SVC', SVC()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('DT', DecisionTreeClassifier()))

In [15]:
# with complete dataset

import time

scoring = 'accuracy'

results = []
names = []
timer = []
print('Model | Mean of CV | Std. Dev. of CV | Time')
for name, model in models:
    start_time = time.time()
    kfold = model_selection.KFold(n_splits=5, random_state=42)
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    t = (time.time() - start_time)
    timer.append(t)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f) %f s" % (name, cv_results.mean(), cv_results.std(), t)
    print(msg)

Model | Mean of CV | Std. Dev. of CV | Time
LR: 0.956848 (0.014550) 2.792000 s
SVC: 0.979822 (0.008057) 1.515000 s
KNN: 0.972856 (0.010644) 0.642000 s
DT: 0.846917 (0.012666) 0.228000 s


In [16]:
# after removing less correlated features

scoring = 'accuracy'

results = []
names = []
timer = []
print('Model | Mean of CV | Std. Dev. of CV | Time')
for name, model in models:
    start_time = time.time()
    kfold = model_selection.KFold(n_splits=5, random_state=42)
    cv_results = model_selection.cross_val_score(model, X_train1, y_train1, cv=kfold, scoring=scoring)
    t = (time.time() - start_time)
    timer.append(t)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f) %f s" % (name, cv_results.mean(), cv_results.std(), t)
    print(msg)

Model | Mean of CV | Std. Dev. of CV | Time
LR: 0.961718 (0.006642) 1.163000 s
SVC: 0.988872 (0.008632) 0.852000 s
KNN: 0.972171 (0.012828) 0.340000 s
DT: 0.997213 (0.005575) 0.084000 s


In [17]:
# with PCA

scoring = 'accuracy'

results = []
names = []
timer = []
print('Model | Mean of CV | Std. Dev. of CV | Time')
for name, model in models:
    start_time = time.time()
    kfold = model_selection.KFold(n_splits=5, random_state=42)
    cv_results = model_selection.cross_val_score(model, X_train2, y_train2, cv=kfold, scoring=scoring)
    t = (time.time() - start_time)
    timer.append(t)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f) %f s" % (name, cv_results.mean(), cv_results.std(), t)
    print(msg)

Model | Mean of CV | Std. Dev. of CV | Time
LR: 0.919962 (0.012128) 0.607000 s
SVC: 0.970770 (0.006826) 1.048000 s
KNN: 0.954774 (0.005761) 0.311000 s
DT: 0.808641 (0.010094) 0.376000 s


## Evaluate 

In [18]:
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score

In [23]:
clf = DecisionTreeClassifier()
clf.fit(X_train1,y_train1);

pred = clf.predict(X_test1)
cm = confusion_matrix(y_test1, pred)  
print('Confusion matrix: \n{}'.format(cm))
print("Accuracy: {}". format(accuracy_score(y_test1, pred)))

Confusion matrix: 
[[33  0  0  0  0  0  0  0  0  0]
 [ 0 28  0  0  0  0  0  0  0  0]
 [ 0  0 33  0  0  0  0  0  0  0]
 [ 0  0  0 34  0  0  0  0  0  0]
 [ 0  0  0  0 46  0  0  0  0  0]
 [ 0  0  0  0  0 47  0  0  0  0]
 [ 0  0  0  0  0  0 35  0  0  0]
 [ 0  0  0  0  0  0  0 34  0  0]
 [ 0  0  0  0  0  0  0  0 30  0]
 [ 0  0  0  0  0  0  0  0  0 40]]
Accuracy: 1.0


In [25]:
# with complete dataset

results = []
names = []
timer = []
print('Model | Score (Time)')
for name, model in models:
    start_time = time.time()
    clf = model
    clf.fit(X_train,y_train);
    pred = clf.predict(X_test)
    result = accuracy_score(y_test, pred)
    t = (time.time() - start_time)
    timer.append(t)
    results.append(result)
    names.append(name)
    msg = "%s: %f (%f s)" % (name, result, t)
    print(msg)

Model | Score (Time)
LR: 0.966667 (0.863000 s)
SVC: 0.980556 (0.640000 s)
KNN: 0.975000 (0.259000 s)
DT: 0.838889 (0.079000 s)


In [26]:
# after removing less correlated features

results = []
names = []
timer = []
print('Model | Score (Time)')
for name, model in models:
    start_time = time.time()
    clf = model
    clf.fit(X_train1,y_train1);
    pred = clf.predict(X_test1)
    result = accuracy_score(y_test1, pred)
    t = (time.time() - start_time)
    timer.append(t)
    results.append(result)
    names.append(name)
    msg = "%s: %f (%f s)" % (name, result, t)
    print(msg)

Model | Score (Time)
LR: 0.969444 (0.466000 s)
SVC: 0.988889 (0.400000 s)
KNN: 0.977778 (0.104000 s)
DT: 1.000000 (0.016000 s)


In [29]:
# with PCA

results = []
names = []
timer = []
print('Model | Score (Time)')
for name, model in models:
    start_time = time.time()
    clf = model
    clf.fit(X_train2,y_train2);
    pred = clf.predict(X_test2)
    result = accuracy_score(y_test2, pred)
    t = (time.time() - start_time)
    timer.append(t)
    results.append(result)
    names.append(name)
    msg = "%s: %f (%f s)" % (name, result, t)
    print(msg)

Model | Score (Time)
LR: 0.947222 (0.145000 s)
SVC: 0.980556 (0.450000 s)
KNN: 0.963889 (0.094000 s)
DT: 0.797222 (0.105000 s)


## Save

In [None]:
from sklearn.externals import joblib 
joblib.dump(clf, 'digit_classifier.pkl')

In [None]:
clf = joblib.load('digit_classifier.pkl')