In [20]:
%matplotlib inline

In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import svm

In [22]:
# Load the letter dataset from a CSV file in the current working directory
letters_df = pd.read_csv(filepath_or_buffer="letterdata.csv")

In [8]:
# Preview the first ten rows. Every field other than the target ("letter") is
# numeric; the scale of the features is not known, so they should be normalized.
letters_df.head(n=10)

Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
5,S,4,11,5,8,3,8,8,6,9,5,6,6,0,8,9,7
6,B,4,2,5,4,4,8,7,6,6,7,6,6,2,8,7,10
7,A,1,1,3,2,1,8,2,2,2,8,2,8,1,6,2,7
8,J,2,2,4,4,2,10,6,2,6,12,4,8,1,6,1,7
9,M,11,15,13,9,7,13,2,6,2,12,1,9,8,1,1,8


In [23]:
# Split the frame into predictors (X) and the target column (y).
# The head() preview shows an "Unnamed: 0" column, which is the row number
# written into the CSV rather than a property of the letter image. Leaving it
# in X would feed a meaningless, large-valued feature to the classifier, so it
# is dropped as well. errors='ignore' keeps this cell working when the loaded
# file does not contain that column.
X = letters_df.drop(columns=['letter', 'Unnamed: 0'], errors='ignore')
y = letters_df.loc[:, 'letter']

In [25]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [26]:
# Support-vector classifier with hand-picked hyper-parameters.
#   gamma - how far the influence of a single data point reaches; it is the
#           inverse of the distance of influence.
#   C     - model complexity: a lower C yields a simpler hyper-surface, a
#           higher C yields a more complex one.
clf = svm.SVC(C=3, gamma=0.025)

In [27]:
# Train the classifier on the training partition
clf.fit(X=X_train, y=y_train)

SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.025, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [32]:
# Mean accuracy of the fitted classifier on the held-out partition
clf.score(X=X_test, y=y_test)

0.9708333333333333

In [28]:
# Predicted letter for every held-out row
y_pred = clf.predict(X=X_test)

In [33]:
# Two-column array: column 0 holds the actual label, column 1 the prediction
y_grid = np.column_stack((y_test, y_pred))

In [34]:
# Plain-text dump of the (actual, predicted) pairs
print(str(y_grid))

[['D' 'D']
 ['D' 'D']
 ['V' 'V']
 ...
 ['X' 'X']
 ['R' 'R']
 ['J' 'J']]


In [44]:
# Allow up to 26 columns to be displayed, one per letter
pd.set_option('display.max_columns', 26)

# Confusion matrix of predictions against true labels. Both axes are named
# explicitly: without rownames the unnamed prediction array is rendered as
# "row_0", which leaves the orientation of the table ambiguous.
pd.crosstab(y_pred, y_test, rownames=['predicted'], colnames=['actual'])

letter,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
A,233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
B,0,239,0,1,1,0,1,2,0,0,1,1,3,1,0,1,0,6,0,0,0,6,0,0,0,0
C,0,0,222,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
D,0,0,0,227,0,0,0,7,0,0,0,0,0,4,5,1,0,0,0,1,0,0,0,1,0,0
E,0,0,2,0,209,2,1,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0,1,0,1
F,0,0,0,0,0,213,0,0,1,0,0,0,0,0,0,12,0,0,1,1,0,0,0,0,0,0
G,0,0,0,1,1,0,222,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
H,0,0,0,0,0,0,0,213,0,0,1,3,0,2,0,2,0,1,0,0,0,1,0,0,0,0
I,0,0,0,0,0,1,0,0,192,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
J,0,0,0,0,0,0,0,0,5,189,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Misclassified samples

In [37]:
# Row positions in y_grid where the first column differs from the second.
# The comparison is vectorised over the whole array instead of looping over
# row indices in Python; .tolist() keeps `unmatched` a plain list of ints.
unmatched = np.flatnonzero(y_grid[:, 0] != y_grid[:, 1]).tolist()

In [41]:
# Show only the rows whose two labels disagree
y_grid[unmatched, :]

array([['R', 'E'],
       ['D', 'B'],
       ['P', 'F'],
       ['N', 'B'],
       ['U', 'M'],
       ['P', 'B'],
       ['P', 'F'],
       ['M', 'G'],
       ['K', 'X'],
       ['N', 'D'],
       ['P', 'F'],
       ['M', 'B'],
       ['V', 'B'],
       ['T', 'F'],
       ['I', 'J'],
       ['G', 'B'],
       ['K', 'R'],
       ['L', 'B'],
       ['R', 'H'],
       ['P', 'H'],
       ['F', 'E'],
       ['J', 'I'],
       ['I', 'J'],
       ['K', 'H'],
       ['D', 'R'],
       ['H', 'R'],
       ['N', 'H'],
       ['C', 'L'],
       ['T', 'D'],
       ['R', 'B'],
       ['G', 'C'],
       ['P', 'H'],
       ['D', 'O'],
       ['N', 'M'],
       ['O', 'D'],
       ['J', 'N'],
       ['S', 'F'],
       ['S', 'R'],
       ['N', 'V'],
       ['L', 'E'],
       ['P', 'D'],
       ['R', 'Q'],
       ['P', 'F'],
       ['V', 'B'],
       ['H', 'U'],
       ['R', 'B'],
       ['N', 'H'],
       ['K', 'R'],
       ['N', 'O'],
       ['O', 'C'],
       ['K', 'R'],
       ['F', 'I'],
       ['L',

In [162]:
# Write the label pairs to disk, one row per line.
# The path is a raw string so the Windows backslashes stay literal: "\g" and
# "\o" are not valid escape sequences, and a non-raw literal containing them
# triggers a warning on newer Python versions. The resulting path is unchanged.
# NOTE(review): np.savetxt separates columns with a space by default, so the
# file is space-delimited despite its .csv extension. The absolute path is
# also machine-specific; confirm both are intended.
np.savetxt(r"d:\greatlakes\ocr.csv", y_grid, fmt='%s')