In [1]:
import csv
import os
import cv2
import glob
import math
import time
import random
import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV



In [2]:
'''
Loading data functions
'''
PIXELS = 100
imageSize = PIXELS * PIXELS
num_features = imageSize 

def load_train_cv(encoder):
    X_train = []
    y_train = []
    print('Read train images')
    #Read train ids
    with open('id_train.csv', 'rb') as csvfile:
        trainreader = csv.reader(csvfile, delimiter=',')
        next(trainreader)
        for row in trainreader:
            #print(row[0])
            file_name = os.path.join('input', row[0] + '.jpg')
            img = cv2.imread(file_name,0)
            img = cv2.resize(img, (PIXELS, PIXELS))
            #img = img.transpose(2, 0, 1)
            img = np.reshape(img, (1, num_features))
            X_train.append(img)
            y_train.append(row[1])
    #print X_train.shape
    X_train = np.array(X_train)
    y_train = np.array(y_train).astype('int32')

    #y_train = encoder.fit_transform(y_train).astype('int32')

    X_train, y_train = shuffle(X_train, y_train)

    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3)

    X_train = X_train.reshape(X_train.shape[0], num_features).astype('float32') / 255.
    X_test = X_test.reshape(X_test.shape[0], num_features).astype('float32') / 255.
    
    #X_train = X_train.astype('float32') / 255.
    #X_test = X_test.astype('float32') / 255.

    return X_train, y_train, X_test, y_test, encoder

def load_test():
    print('Read test images')
    X_test = []
    X_test_id=[]
    with open('sample_submission4.csv', 'rb') as csvfile:
        testreader = csv.reader(csvfile, delimiter=',')
        next(testreader)
        for row in testreader:
            #print('Load folder c{}'.format(j))
            file_name = os.path.join('input', row[0] + '.jpg')
            img = cv2.imread(file_name,0)
            img = cv2.resize(img, (PIXELS, PIXELS))
            #img = img.transpose(2, 0, 1)
            img = np.reshape(img, (1, num_features))
            X_test.append(img)
            X_test_id.append(row[0])

    X_test = np.array(X_test)
    X_test_id = np.array(X_test_id)

    X_test = X_test.reshape(X_test.shape[0],num_features ).astype('float32') / 255.

    return X_test, X_test_id

In [3]:

'''
load training data and test data 
'''
encoder = LabelEncoder()

# load the training and validation data sets
train_X, train_y, valid_X, valid_y, encoder = load_train_cv(encoder)
print('Train shape:', train_X.shape, 'Dev (valid) shape:', valid_X.shape)
print('Train shape:', train_y.shape, 'Dev (valid) shape:', valid_y.shape)

# load data
X_test, X_test_id = load_test()

Read train images
('Train shape:', (5600L, 10000L), 'Dev (valid) shape:', (2400L, 10000L))
('Train shape:', (5600L,), 'Dev (valid) shape:', (2400L,))
Read test images


In [4]:
'''
KNN
'''
def knn(k_values):

    for i in range(0,len(k_values)):
        correct, total = 0, 0
        knc = KNeighborsClassifier(n_neighbors=k_values[i])
        knc.fit(train_X,train_y)
        preds = knc.predict(valid_X)
        cfm = confusion_matrix(preds,valid_y)
        #confusion matrix and print
        #print cfm
        for pred, label in zip(preds, valid_y):
            if pred == label: correct += 1
            total += 1
        print 'K-value: %3d total: %3d  correct: %3d  accuracy: %3.2f' %(k_values[i],total, correct, 1.0*correct/total)
        if k_values[i]==1:
            kncr = classification_report(preds,valid_y)
            print "For K=1, precision, recall and f1-score for all digits shown below"
            print kncr

    
k_values = [1, 3, 5, 7, 9]
knn(k_values)

K-value:   1 total: 2400  correct: 1220  accuracy: 0.51
For K=1, precision, recall and f1-score for all digits shown below
             precision    recall  f1-score   support

          1       0.67      0.65      0.66      1073
          2       0.52      0.47      0.49       621
          3       0.25      0.31      0.28       198
          4       0.30      0.33      0.32       508

avg / total       0.52      0.51      0.51      2400

K-value:   3 total: 2400  correct: 1315  accuracy: 0.55
K-value:   5 total: 2400  correct: 1344  accuracy: 0.56
K-value:   7 total: 2400  correct: 1388  accuracy: 0.58
K-value:   9 total: 2400  correct: 1376  accuracy: 0.57


In [5]:
#Let's go with n_neighbors=7

#Apply Gaussian Blurring

def GaussianBlur(data):
    
    blur = np.zeros(data.shape)
    for k in range(0,len(data)):
        data_mat = np.ndarray.reshape(data[k],(-1,PIXELS))

        for i in range(0,PIXELS):
            for j in range(0,PIXELS):
                if data_mat[i][j] >0 :
                    try:
                        
                        #Gather values from the 8 adjacent cells
                        val = np.array([data_mat[i-1][j-1],data_mat[i-1][j],data_mat[i-1][j+1] \
                                   ,data_mat[i][j-1],data_mat[i][j],data_mat[i][j+1] \
                                   ,data_mat[i+1][j-1],data_mat[i+1][j],data_mat[i+1][j+1]])
                        
                        #Compute standard deviation
                        stdev = np.std(val)
                        if stdev==0:
                            blur[k:k+1][0][PIXELS*i+j] = data_mat[i][j] #No weighting if all neighbors have same value
                            continue                           
                        
                        #Copmute neighbors weights and self weights using gaussian distance
                        neighbor_weight = (1/pow(2*np.pi*pow(stdev,2),0.5))*np.exp(-1/(2*pow(stdev,2)))
                        self_weight = (1/pow(2*np.pi*pow(stdev,2),0.5))*np.exp(0)
                        weights = [neighbor_weight,neighbor_weight,neighbor_weight \
                                   ,neighbor_weight,self_weight,neighbor_weight \
                                  ,neighbor_weight,neighbor_weight,neighbor_weight]
                        
                        #Apply Blurring weighted average
                        blur[k:k+1][0][PIXELS*i+j] = np.sum(val * weights) / sum(val) #weighted average
                    except IndexError:
                        blur[k:k+1][0][PIXELS*i+j] = data_mat[i][j] #No weighting for border cells
    
    return blur

blurred_valid_data = GaussianBlur(valid_X)
blurred_train_data = GaussianBlur(train_X)


KeyboardInterrupt: 

In [6]:
#Apply Blurring to only train and not validation
correct, total = 0, 0
knc = KNeighborsClassifier(n_neighbors=7)
knc.fit(blurred_train_data,train_y)
preds = knc.predict(valid_X)
for pred, label in zip(preds, valid_y):
    if pred == label: correct += 1
    total += 1
print 'K-value: %3d total: %3d  correct: %3d  accuracy: %3.2f' %(5,total, correct, 1.0*correct/total)

#Apply Blurring to only validation and not train
correct, total = 0, 0
knc = KNeighborsClassifier(n_neighbors=7)
knc.fit(train_X,train_y)
preds = knc.predict(blurred_valid_data)
for pred, label in zip(preds, valid_y):
    if pred == label: correct += 1
    total += 1
print 'K-value: %3d total: %3d  correct: %3d  accuracy: %3.2f' %(5,total, correct, 1.0*correct/total)

#Apply Blurring to both
correct, total = 0, 0
knc = KNeighborsClassifier(n_neighbors=7)
knc.fit(blurred_train_data,train_y)
preds = knc.predict(blurred_valid_data)
for pred, label in zip(preds, valid_y):
    if pred == label: correct += 1
    total += 1
print 'K-value: %3d total: %3d  correct: %3d  accuracy: %3.2f' %(5,total, correct, 1.0*correct/total)

K-value:   5 total: 2400  correct: 565  accuracy: 0.24
K-value:   5 total: 2400  correct: 513  accuracy: 0.21
K-value:   5 total: 2400  correct: 1278  accuracy: 0.53


In [7]:
'''
Let's try Gaussian Naive Bayes Classifier
'''
def alphaNB(alphas):

    #Grid Search
    clf = GridSearchCV(BernoulliNB(), alphas)
    clf.fit(train_X, train_y)
    preds = clf.predict(valid_X)
    
    #Check Accuracy
    correct,total=0,0
    for pred, label in zip(preds, valid_y):
        if pred == label: correct += 1
        total += 1
    accuracy = 1.0*correct/total
    print "data model accuracy %s" % accuracy
    print clf.best_estimator_



#alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}
alphas = {'alpha': [ 0.0001, 0.001, 0.01, 0.1]}
nb = alphaNB(alphas)


data model accuracy 0.420833333333
BernoulliNB(alpha=0.001, binarize=0.0, class_prior=None, fit_prior=True)


In [None]:
Naive Bayes performing poorly at alpha=0.01

In [13]:
'''
Check calibration level for Gaussian NB
'''

def NB_Calibration(buckets, correct, total):
    
    
    #Fit model and obtian posterior probabilities
    clf = BernoulliNB(alpha=0.01)
    clf.fit(train_X,train_y)
    prob = np.exp(clf.feature_log_prob_)
    preds = clf.predict(valid_X)
    preds_prob = clf.predict_proba(valid_X)
    
    #Bucket the posterior probs and check for accuracy
    for pred,label,posteriors in zip(preds, valid_y, preds_prob):
        max_prob = np.max(posteriors)
        bucket_index = np.digitize(max_prob, buckets) #based on max_prob
        if pred == label: correct[bucket_index-1] += 1.0
        total[bucket_index-1] += 1.0


buckets = [0.5, 0.9, 0.999, 0.99999, 0.9999999, 0.999999999, 0.99999999999, 0.9999999999999, 1.0]
correct = [0 for i in buckets]
total = [0 for i in buckets]

NB_Calibration(buckets, correct, total)

for i in range(len(buckets)):
    accuracy = 0.0
    if (total[i] > 0): accuracy = correct[i] / total[i]
    print 'p(pred) <= %.13f    total = %3d    accuracy = %.3f' %(buckets[i], total[i], accuracy)

p(pred) <= 0.5000000000000    total =   2    accuracy = 0.500
p(pred) <= 0.9000000000000    total =   8    accuracy = 0.500
p(pred) <= 0.9990000000000    total =   0    accuracy = 0.000
p(pred) <= 0.9999900000000    total =   0    accuracy = 0.000
p(pred) <= 0.9999999000000    total =   0    accuracy = 0.000
p(pred) <= 0.9999999990000    total =   0    accuracy = 0.000
p(pred) <= 0.9999999999900    total =   0    accuracy = 0.000
p(pred) <= 0.9999999999999    total =   0    accuracy = 0.000
p(pred) <= 1.0000000000000    total = 2390    accuracy = 0.440


In [None]:
NB classifier is weakly claibrated.

In [63]:
'''
Create Submissions - Best model so far KNN (n=7)
'''

import csv

#make predictions
print('Making predictions')
knc = KNeighborsClassifier(n_neighbors=7)
knc.fit(train_X,train_y)
preds = knc.predict(X_test)

def create_submission(predictions, test_id):
     with open('submit.csv', 'wb') as mycsvfile:
            thedatawriter = csv.writer(mycsvfile)
            thedatawriter.writerow(['Id','label'])
            for pred,testid  in zip(predictions,test_id):
                out = [testid,pred]
                thedatawriter.writerow(out)
    

print('Creating Submission')
create_submission(preds, X_test_id)

Creating Submission
