In [11]:
import numpy as np

from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [12]:
def runLR(xtrain, ytrain, xtest, ytest):
    """Train a default-configured LogisticRegression and report its test score.

    Parameters
    ----------
    xtrain, ytrain
        Features and labels handed to ``fit``.
    xtest, ytest
        Features and labels handed to ``score``.

    Returns
    -------
    The fitted ``LogisticRegression`` instance.

    Side effects
    ------------
    Prints the value returned by ``score`` on the test split.
    """
    model = LogisticRegression()
    model.fit(xtrain, ytrain)
    accuracy = model.score(xtest, ytest)
    print('Accuracy: ', accuracy)
    return model

In [13]:
def runDTC(xtrain, ytrain, xtest, ytest, names=None, f=30):
    """Train a default-configured DecisionTreeClassifier and report its test score.

    Parameters
    ----------
    xtrain, ytrain
        Features and labels handed to ``fit``.
    xtest, ytest
        Features and labels handed to ``score``.
    names, f
        Accepted for interface compatibility; this function does not read them.

    Returns
    -------
    The fitted ``DecisionTreeClassifier`` instance.

    Side effects
    ------------
    Prints the value returned by ``score`` on the test split.
    """
    tree = DecisionTreeClassifier()
    tree.fit(xtrain, ytrain)
    accuracy = tree.score(xtest, ytest)
    print('Accuracy: ', accuracy)
    return tree

In [14]:
def generate_random_rectangles(N, LOW=0, HIGH=28, seed=None):
    """Sample N random rectangles as ``(x1, y1, x2, y2)`` tuples.

    The first corner ``(x1, y1)`` is drawn from ``[LOW, HIGH - 10)`` on each
    axis. The second corner is offset from it by 5 to 9 (inclusive) on each
    axis, so every rectangle is at least 5 wide and 5 tall and never reaches
    past ``HIGH - 2``.

    Parameters
    ----------
    N : int
        Number of rectangles to generate.
    LOW, HIGH : int
        Coordinate bounds used for the first corner (see above).
    seed : int or None, optional
        When given, rectangles are drawn from a private
        ``np.random.RandomState(seed)`` so the same seed always yields the
        same rectangles. When ``None`` (default) the global ``np.random``
        stream is used, exactly as before.

    Returns
    -------
    list of tuple
        ``N`` tuples ``(x1, y1, x2, y2)``.

    Raises
    ------
    ValueError
        Propagated from ``randint`` when ``HIGH - 10 <= LOW`` (empty range).
    """
    # A private generator keeps seeded calls from touching the global stream;
    # both objects expose the same ``randint(low, high)`` (high exclusive).
    rng = np.random if seed is None else np.random.RandomState(seed)

    points = []
    for _ in range(N):
        # First corner: leave room for the largest possible offset below.
        x1 = rng.randint(LOW, HIGH - 10)
        y1 = rng.randint(LOW, HIGH - 10)
        # Second corner: offset of 5..9 so the area is big enough to split
        # into halves when computing features.
        x2 = rng.randint(x1 + 5, x1 + 10)
        y2 = rng.randint(y1 + 5, y1 + 10)
        points.append((x1, y1, x2, y2))
    return points

In [15]:
def compute_black_values(data, image_dim=28):
    """Build the summed-area table (integral image) of one flattened image.

    Entry ``[i][j]`` of the result is the sum of all pixels in rows ``0..i``
    and columns ``0..j`` (inclusive), where pixel ``(i, j)`` is
    ``data[image_dim * i + j]``.

    Parameters
    ----------
    data : sequence or 1-D array
        Row-major pixel values; only the first ``image_dim * image_dim``
        entries are read.
    image_dim : int, optional
        Side length of the square image (default 28).

    Returns
    -------
    list of list
        ``image_dim`` rows of ``image_dim`` cumulative sums.

    Raises
    ------
    ValueError
        If ``data`` holds fewer than ``image_dim * image_dim`` values.
    """
    pixels = np.asarray(data).ravel()[: image_dim * image_dim]

    # Accumulate in a wide type: narrow integer pixels (e.g. uint8) would
    # otherwise wrap around long before a whole image has been summed.
    acc_dtype = np.int64 if pixels.dtype.kind in 'biu' else np.float64
    grid = pixels.astype(acc_dtype).reshape(image_dim, image_dim)

    # Prefix sums down the rows, then across the columns. This avoids the
    # negative-index wrap-around that an explicit recurrence starting at
    # index 0 runs into on the first row and first column.
    table = grid.cumsum(axis=0).cumsum(axis=1)

    # Callers index the result as table[i][j]; keep the list-of-lists shape.
    return table.tolist()

In [16]:
def extract_haar_features(rectangles, data):
    """Compute two half-difference features per rectangle for every sample.

    For each row of ``data`` the table from ``compute_black_values`` is built
    once, then every rectangle ``(x1, y1, x2, y2)`` contributes two values:

    * the box split in two along the second index (``y``): first half minus
      second half;
    * the box split in two along the first index (``x``): first half minus
      second half.

    Half sizes are ``int(side / 2)``; when a side length is odd the two
    halves are equally sized and share the middle line.

    Parameters
    ----------
    rectangles : sequence of (x1, y1, x2, y2)
        Corner indices into the table.
    data : array with a ``shape`` attribute
        One flattened image per row.

    Returns
    -------
    numpy.ndarray
        One row per sample, ``2 * len(rectangles)`` values per row.
    """
    def corner_diff(table, r0, c0, r1, c1):
        # Four-corner lookup on the table; assuming an inclusive summed-area
        # table this is the pixel sum over rows r0+1..r1, columns c0+1..c1.
        return table[r1][c1] - table[r0][c1] - table[r1][c0] + table[r0][c0]

    feature_rows = []
    for sample_idx in range(data.shape[0]):
        table = compute_black_values(data[sample_idx])

        row = []
        for x1, y1, x2, y2 in rectangles:
            half_x = int((x2 - x1) / 2)
            half_y = int((y2 - y1) / 2)

            # Split along y.
            up_black = corner_diff(table, x1, y1, x2, y2 - half_y)
            lw_black = corner_diff(table, x1, y1 + half_y, x2, y2)
            # Split along x.
            rt_black = corner_diff(table, x1 + half_x, y1, x2, y2)
            lt_black = corner_diff(table, x1, y1, x2 - half_x, y2)

            row.append(up_black - lw_black)
            row.append(lt_black - rt_black)
        feature_rows.append(row)

    return np.array(feature_rows)

In [19]:
# Load the 'MNIST original' dataset (using ../Data as the scikit-learn data
# directory) and compute 2 features for each of 100 random rectangles.
# NOTE(review): fetch_mldata appears to be unavailable in recent scikit-learn
# releases (fetch_openml is the documented replacement) — confirm against the
# installed version before re-running this cell.
# NOTE(review): the rectangles are drawn without a fixed seed, so the features
# and the scores recorded below will differ between runs.
mnist = fetch_mldata('MNIST original', data_home='../Data')
harr_data = extract_haar_features(generate_random_rectangles(100), mnist.data)

In [21]:
# Hold out 33% of the samples for evaluation; random_state is fixed so the
# split itself is repeatable.
mxtrain, mxtest, mytrain, mytest = train_test_split(harr_data, mnist.target, test_size=0.33, random_state=42)

In [22]:
# Sanity check: one row per image, two features per rectangle.
harr_data.shape

(70000, 200)

In [26]:
# Confirm that the train/test feature and label arrays line up.
print(mxtrain.shape, mytrain.shape, mxtest.shape, mytest.shape)

(46900, 200) (46900,) (23100, 200) (23100,)


In [27]:
# Logistic regression on the Haar-like features; the returned estimator is
# echoed as the cell output.
runLR(mxtrain, mytrain, mxtest, mytest)

Accuracy:  0.912943722943723


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
# Decision tree on the same features; the returned estimator is echoed as the
# cell output.
runDTC(mxtrain, mytrain, mxtest, mytest)

Accuracy:  0.8874891774891774


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')