In [1]:
#NAME-RISHI KUMAR
#CWID- 20015656
#importing library

import numpy as np
import pandas as pd

In [2]:
"""The objective is to identify each of a large number of black-and-white rectangular pixel displays as one of the 
26 capital letters in the English alphabet. The character images were based on 20 different fonts and each letter 
within these 20 fonts was randomly distorted to produce a file of 20,000 unique stimuli. Each stimulus was converted 
into 16 primitive numerical attributes (statistical moments and edge counts) which were then scaled to fit into a range
of integer values from 0 through 15. We typically train on the first 16000 items and then use the resulting model to 
predict the letter category for the remaining 4000. See the article cited above for more details.


Attribute Information:

1. lettr capital letter (26 values from A to Z)
2. x-box horizontal position of box (integer)
3. y-box vertical position of box (integer)
4. width width of box (integer)
5. high height of box (integer)
6. onpix total # on pixels (integer)
7. x-bar mean x of on pixels in box (integer)
8. y-bar mean y of on pixels in box (integer)
9. x2bar mean x variance (integer)
10. y2bar mean y variance (integer)
11. xybar mean x y correlation (integer)
12. x2ybr mean of x * x * y (integer)
13. xy2br mean of x * y * y (integer)
14. x-ege mean edge count left to right (integer)
15. xegvy correlation of x-ege with y (integer)
16. y-ege mean edge count bottom to top (integer)
17. yegvx correlation of y-ege with x (integer)"""

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data"

names = ['capital letter',' x-box horizontal position of box','y-box vertical position of box','width width of box','high height of box','onpix total # on pixels','x-bar mean x of on pixels in box',' y-bar mean y of on pixels in box ','x2bar mean x variance','y2bar mean y variance','xybar mean x y correlation','x2ybr mean of x * x * y','xy2br mean of x * y * y',' x-ege mean edge count left to right','xegvy correlation of x-ege with y',' y-ege mean edge count bottom to top','yegvx correlation of y-ege with x']

dataset = pd.read_csv(url, names=names)

In [3]:
# viewing the dataset
dataset.head()


Unnamed: 0,capital letter,x-box horizontal position of box,y-box vertical position of box,width width of box,high height of box,onpix total # on pixels,x-bar mean x of on pixels in box,y-bar mean y of on pixels in box,x2bar mean x variance,y2bar mean y variance,xybar mean x y correlation,x2ybr mean of x * x * y,xy2br mean of x * y * y,x-ege mean edge count left to right,xegvy correlation of x-ege with y,y-ege mean edge count bottom to top,yegvx correlation of y-ege with x
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [12]:
#splitting a dataset into its attributes and labels
X= dataset.iloc[:,1:16].values
y= dataset.iloc[:,0].values
X

array([[ 2,  8,  3, ...,  0,  8,  0],
       [ 5, 12,  3, ...,  2,  8,  4],
       [ 4, 11,  6, ...,  3,  7,  3],
       ...,
       [ 6,  9,  6, ...,  2, 12,  2],
       [ 2,  3,  4, ...,  1,  9,  5],
       [ 4,  9,  6, ...,  2,  7,  2]], dtype=int64)

In [11]:
#creating training and testing splits
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.30)
print(X_train)



[[ 3  2  3 ...  2  8  6]
 [ 5  9  7 ... 13 10  3]
 [ 6  9  6 ...  1 11  2]
 ...
 [ 4  6  4 ...  3  9  7]
 [ 2  3  3 ...  1 10  2]
 [ 6 11  8 ...  6  9  1]]


In [6]:
#for normalization
from sklearn.preprocessing import StandardScaler
scaler= StandardScaler()
scaler.fit(X_train)
X_train= scaler.transform(X_train)
X_test= scaler.transform(X_test)

X_train


array([[-2.10700231, -0.31333349, -2.54475037, ..., -1.31261804,
        -0.21939529, -1.44206618],
       [-0.01749959,  0.89661516,  0.43173163, ...,  1.26012189,
         0.42656351, -0.66500115],
       [-0.01749959, -0.01084633,  0.43173163, ...,  0.8313319 ,
         0.42656351, -0.66500115],
       ...,
       [ 0.5048761 ,  0.89661516,  0.43173163, ..., -0.02624807,
        -0.21939529,  0.11206388],
       [-1.58462663, -1.22079498, -1.05650937, ..., -0.88382805,
         0.42656351, -1.05353367],
       [ 1.02725178,  0.89661516,  1.42389229, ...,  2.11770187,
        -1.51131291, -0.66500115]])

In [7]:
#KNN algorithm
from sklearn.neighbors import KNeighborsClassifier
classifier=KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [8]:
#prediction
Y_pred= classifier.predict(X_test)

In [9]:
#evaluating the algorithm
from sklearn.metrics import classification_report, confusion_matrix
print("clasification report: \n ",classification_report(y_test,Y_pred))
print("confusion matrix: \n ",confusion_matrix(y_test, Y_pred))

clasification report: 
                precision    recall  f1-score   support

           A       0.97      0.96      0.96       241
           B       0.84      0.93      0.88       223
           C       0.94      0.95      0.95       229
           D       0.83      0.95      0.88       249
           E       0.91      0.92      0.91       242
           F       0.85      0.88      0.87       212
           G       0.90      0.89      0.89       234
           H       0.87      0.77      0.81       214
           I       0.97      0.95      0.96       235
           J       0.96      0.94      0.95       219
           K       0.89      0.93      0.91       219
           L       0.94      0.95      0.95       232
           M       0.98      0.94      0.96       234
           N       0.96      0.92      0.94       245
           O       0.86      0.93      0.89       217
           P       0.97      0.90      0.93       273
           Q       0.95      0.90      0.93       241
  