In [1]:
import numpy as np
import pandas as pd
import sys
import scipy.io as sio
import matplotlib
import matplotlib.pyplot as plt
from numpy.matlib import repmat
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

np.set_printoptions(threshold=sys.maxsize)

%matplotlib inline



### Data Loading

In [2]:
# load datas 
data = pd.read_csv("./chineseMNIST.csv")

In [3]:
data

Unnamed: 0,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,...,pixel_4088,pixel_4089,pixel_4090,pixel_4091,pixel_4092,pixel_4093,pixel_4094,pixel_4095,label,character
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9,九
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9,九
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9,九
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9,九
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9,九
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8,八
14996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8,八
14997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8,八
14998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8,八


In [4]:
# seperate data and label
# label 
infoData  = data.iloc[: , -2:]
labelData = infoData.iloc[: , 0]
charData = infoData.iloc[: , 1]

# data 
pixelData = data.iloc[: , :-2]

## Helper function

In [5]:
def viewdfrow(pdrow):
    rowvector = pd.to_numeric(pdrow.to_numpy())
    plt.imshow(rowvector.reshape([64, 64]))

In [6]:
def viewrow(rowvector):
    plt.imshow(rowvector.reshape([64, 64]))

In [7]:
def labelToNum(label):
    if label <= 10:
        return label
    elif label == 11:
        return 100
    elif label == 12:
        return 1000
    elif label == 13:
        return 10000
    elif label == 14:
        return 100000000

### To work in a smaller set 

In [8]:
# We use X_test and y_test as the whole data set 
X_train, X_test, y_train, y_test = train_test_split(pixelData, labelData, test_size=0.05)

In [9]:
print('shape of X_train: ' + str(X_train.shape))
print('shape of X_test: ' + str(X_test.shape))
print('shape of y_train: ' + str(y_train.shape))
print('shape of y_test: ' + str(y_test.shape))

shape of X_train: (14250, 4096)
shape of X_test: (750, 4096)
shape of y_train: (14250,)
shape of y_test: (750,)


### Scale data

In [11]:
scaled_test_data = preprocessing.scale(X_test)
scaled_train_data = preprocessing.scale(X_train)

### Use PCA to reduce Dimensionality

In [13]:
pca = PCA(0.95) 
pca.fit(scaled_train_data)  
low_train_data = pca.transform(scaled_train_data)  

In [23]:
low_test_data = pca.transform(scaled_test_data)  

In [15]:
print("origional image is a " + str(scaled_train_data.shape[1]) + " dimension image")
print("new image is a " + str(low_train_data.shape[1]) + " dimension image")

origional image is a 4096 dimension image
new image is a 1168 dimension image


## Run KNN to clustering the data 

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [20]:
def knnrunner(neighbors,X_train,y_train,X_test):
    knn = KNeighborsClassifier(n_neighbors=neighbors)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    return y_pred

In [21]:
def accuracyCalculator(y_test, y_pred,n):
    score = str(metrics.accuracy_score(y_test, y_pred))
    result = "Accuracy for neighbor = " + str(n) + " = " + score
    print(result)
    

In [26]:
for i in range(20):
    if i == 0: continue
    y_pred = knnrunner(i,low_train_data,y_train,low_test_data)
    accuracyCalculator(y_test, y_pred,i)


Accuracy for neighbor = 1 = 0.5986666666666667
Accuracy for neighbor = 2 = 0.5186666666666667
Accuracy for neighbor = 3 = 0.5386666666666666
Accuracy for neighbor = 4 = 0.536
Accuracy for neighbor = 5 = 0.536
Accuracy for neighbor = 6 = 0.5386666666666666
Accuracy for neighbor = 7 = 0.5373333333333333
Accuracy for neighbor = 8 = 0.5293333333333333
Accuracy for neighbor = 9 = 0.5186666666666667
Accuracy for neighbor = 10 = 0.5226666666666666
Accuracy for neighbor = 11 = 0.5186666666666667
Accuracy for neighbor = 12 = 0.512
Accuracy for neighbor = 13 = 0.5013333333333333
Accuracy for neighbor = 14 = 0.5013333333333333
Accuracy for neighbor = 15 = 0.492
Accuracy for neighbor = 16 = 0.4866666666666667
Accuracy for neighbor = 17 = 0.488
Accuracy for neighbor = 18 = 0.4866666666666667
Accuracy for neighbor = 19 = 0.4866666666666667
