In [1]:
import csv
import os
from collections import defaultdict
from random import randint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import fetch_mldata
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
%matplotlib inline

In [2]:
def display_digit(digit):
    """
    Draw a flattened digit (784 values, viewed as a 28x28 grid) in a new
    matplotlib figure, using the inverted grayscale colormap and hiding
    both axes.
    """
    plt.figure()
    rendered = plt.imshow(digit.reshape(28, 28))
    rendered.set_cmap('gray_r')
    # Hide tick marks and labels on the x axis first, then the y axis.
    for axis in (rendered.axes.get_xaxis(), rendered.axes.get_yaxis()):
        axis.set_visible(False)
    

# MNIST

In [4]:
# Load MNIST through scikit-learn's mldata loader.
# The cache directory used to be fixed to one user's home folder; it can now be
# overridden with the MNIST_DATA_HOME environment variable, and the original
# path remains the fallback so an existing local cache keeps working.
# NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and removed in
# 0.22 (fetch_openml is its replacement) - confirm which version this notebook pins.
mnist_data_home = os.environ.get("MNIST_DATA_HOME", "/Users/sasankauppu/Desktop/Data Mining CS6220/")
mnist = fetch_mldata('MNIST original', data_home=mnist_data_home)

# View every sample as a 28x28 grid. The explicit 70000 doubles as a size check:
# reshape raises if the loaded data does not hold exactly 70000*28*28 values.
mn_X = mnist.data.reshape([70000, 28, 28])

# Labels of the first 60000 and the last 10000 samples.
# NOTE(review): this presumes the loader returns the training samples first - confirm.
mn_y_train = mnist.target[:60000]
mn_y_test = mnist.target[-10000:]

# Unbind the loader's result object; only the names assigned above are used afterwards.
del mnist

In [5]:
def black_pixel_count(images):
    """
    Build a cumulative count table of non-zero pixels for every image.

    Each image is indexed as image[i][j] and must expose ``.shape``. For an
    image the table is a dict keyed by the string "<i>-<j>"; its value is the
    number of entries greater than zero among image[0..i][0..j], both bounds
    inclusive. The tables are returned as a list in input order.
    """
    def cell(i, j):
        # Key format shared by every table entry, e.g. cell(3, 7) == "3-7".
        return str(i) + "-" + str(j)

    tables = []
    for image in images:
        counts = {cell(0, 0): 1 if image[0][0] > 0 else 0}

        # First column: running count going down the rows.
        for i in range(1, image.shape[0]):
            counts[cell(i, 0)] = counts[cell(i - 1, 0)] + (1 if image[i][0] > 0 else 0)

        # First row: running count going across the columns.
        for j in range(1, image.shape[1]):
            counts[cell(0, j)] = counts[cell(0, j - 1)] + (1 if image[0][j] > 0 else 0)

        # Interior: left + above - diagonal (counted twice), plus the pixel itself.
        for i in range(1, image.shape[0]):
            for j in range(1, image.shape[1]):
                running = counts[cell(i, j - 1)] + counts[cell(i - 1, j)] - counts[cell(i - 1, j - 1)]
                if image[i][j] > 0:
                    running += 1
                counts[cell(i, j)] = running

        tables.append(counts)
    return tables

In [6]:
# Build one cumulative non-zero-pixel table per image in mn_X.
bpcList=black_pixel_count(mn_X)

In [7]:
def generate_rectangles():
    rects=[]
    for i in range(100):
        x=0
        y=0
        l=0
        b=0
        while (True):
            if(100<l*b<250 and (x+l<28 or x-l>0) and (y+b<28 or y-b>0)):
                break
            x=randint(0,27)
            y=randint(0,27)
            l=randint(5,20)
            b=randint(5,20)
        rects.append((x,y,l,b))
    return rects

In [12]:
def get_features(bpcList, rects=None):
    """
    Turn cumulative pixel-count tables into two-rectangle difference features.

    Parameters
    ----------
    bpcList : iterable of dict
        One table per image, keyed "<i>-<j>" (the format black_pixel_count
        produces).
    rects : iterable of (x, y, l, b) tuples, optional
        Anchor and extents of each sampling rectangle. When omitted, a fresh
        random set is drawn with generate_rectangles(), as before. Passing the
        same set explicitly makes repeated calls produce comparable features.

    Returns
    -------
    list of list
        For every table, a flat list holding two values per rectangle, in
        rectangle order: f1 (first half minus second half when the rectangle
        is split at the midpoint of its second coordinate) and f2 (the same,
        split at the midpoint of its first coordinate).

    Raises
    ------
    KeyError
        If a rectangle refers to a coordinate that is missing from a table.
    """
    if rects is None:
        rects = generate_rectangles()

    grid = 28  # bound the anchors/extents are checked against (same constant as before)

    def key(i, j):
        return str(i) + "-" + str(j)

    def box(i1, j1, i2, j2):
        # Table keys for the corners of a box, ordered so that
        # t[k0] - t[k1] - t[k2] + t[k3] is the count between the corners.
        return (key(i2, j2), key(i2, j1), key(i1, j2), key(i1, j1))

    # Corner keys depend only on the rectangle, so build them once here
    # instead of re-concatenating strings for every image.
    plans = []
    for x, y, l, b in rects:
        # Extend forward when that stays inside the grid, otherwise backward.
        # The previous four-way strict comparison matched no branch when
        # x+l == 28 or y+b == 28, leaving the coordinates undefined or stale
        # from the previous rectangle.
        x1, x2 = (x, x + l) if x + l < grid else (x - l, x)
        y1, y2 = (y, y + b) if y + b < grid else (y - b, y)

        hx = (x1 + x2) // 2
        hy = (y1 + y2) // 2

        plans.append((
            box(x1, y1, x2, hy),   # first half, split along the second coordinate
            box(x1, hy, x2, y2),   # second half, split along the second coordinate
            box(x1, y1, hx, y2),   # first half, split along the first coordinate
            box(hx, y1, x2, y2),   # second half, split along the first coordinate
        ))

    def count(table, keys):
        k0, k1, k2, k3 = keys
        return table[k0] - table[k1] - table[k2] + table[k3]

    mn_harr_X = []
    for black_count in bpcList:
        feature = []
        for h_first, h_second, v_first, v_second in plans:
            feature.append(count(black_count, h_first) - count(black_count, h_second))
            feature.append(count(black_count, v_first) - count(black_count, v_second))
        mn_harr_X.append(feature)

    return mn_harr_X

In [13]:
# NOTE(review): this rebinds mn_X from the 28x28 image array to the per-image
# feature lists, so re-running the black_pixel_count cell after this one would
# no longer receive images.
mn_X=get_features(bpcList)

In [17]:
# Split the feature rows with the same 60000 / 10000 cut used for the labels.
mn_X_train = mn_X[:60000]
mn_X_test = mn_X[-10000:]

# Parenthesised single-argument print runs on Python 2 and Python 3 alike and
# prints the same text the old print statement did.
print(np.asarray(mn_X_train).shape)
print(np.asarray(mn_X_test).shape)

(60000, 200)
(10000, 200)


In [18]:
# L2-penalised logistic regression fitted with the lbfgs solver on the rectangle features.
logmodel = LogisticRegression(penalty='l2', verbose=1, solver='lbfgs')
logmodel.fit(mn_X_train, mn_y_train)

# %-formatting inside a single-argument print is valid on both Python 2 and 3;
# the double space reproduces what the old two-item print statement emitted.
print("Training accuracy:  %s" % logmodel.score(mn_X_train, mn_y_train))
print("Testing accuracy:  %s" % logmodel.score(mn_X_test, mn_y_test))

Training accuracy:  0.89975
Testing accuracy:  0.9065


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   17.1s finished


In [19]:
# Decision tree on the same features, with depth and minimum split/leaf sizes capped.
dtmodel = DecisionTreeClassifier(max_depth=32, min_samples_split=20, min_samples_leaf=10)
dtmodel.fit(mn_X_train, mn_y_train)

# %-formatting inside a single-argument print is valid on both Python 2 and 3;
# the double space reproduces what the old two-item print statement emitted.
print("Training accuracy:  %s" % dtmodel.score(mn_X_train, mn_y_train))
print("Testing accuracy:  %s" % dtmodel.score(mn_X_test, mn_y_test))

Training accuracy:  0.92425
Testing accuracy:  0.8825


In [None]:
# Rebind the MNIST train/test names to None so the notebook no longer references those objects.
mn_X_test = mn_X_train = mn_y_test = mn_y_train = None