## Import Libraries

In [1]:
import os
import gzip
import numpy as np
import copy
import random


## Dataset Loading

In [2]:
## to extract data from downloaded files

def load_mnist(path, kind='train'):
    """Read one gzipped IDX split (images + labels) from the folder `path`.

    Parameters
    ----------
    path : str
        Directory holding `<kind>-labels-idx1-ubyte.gz` and
        `<kind>-images-idx3-ubyte.gz`.
    kind : str
        File-name prefix of the split to read (default 'train').

    Returns
    -------
    (images, labels)
        `labels` is a 1-D uint8 array; `images` is a uint8 array of shape
        (len(labels), 784), one flattened image per row.
    """
    def _read_payload(filename, header_bytes):
        # Decompress the whole file and view it as bytes, skipping the header.
        with gzip.open(os.path.join(path, filename), 'rb') as handle:
            return np.frombuffer(handle.read(), dtype=np.uint8,
                                 offset=header_bytes)

    # The label file carries an 8-byte header, the image file a 16-byte one.
    labels = _read_payload('%s-labels-idx1-ubyte.gz' % kind, 8)
    pixels = _read_payload('%s-images-idx3-ubyte.gz' % kind, 16)

    images = pixels.reshape(len(labels), 784)
    return images, labels
## use your file location: either set the MNIST_FASHION_DIR environment variable
## to the folder holding the four *-ubyte.gz files, or edit the fallback path below
DATA_DIR = os.environ.get('MNIST_FASHION_DIR',
                          '/Users/princychahal/Documents/MNIST Fashion2')
X_train, y_train = load_mnist(DATA_DIR, kind='train')
X_test, y_test = load_mnist(DATA_DIR, kind='t10k')

## Data Preprocessing

In [3]:
# data preprocessing with scaling

def data_scale(X):
    """Return `X` divided element-wise by 255.0 (the input is not modified)."""
    return X / 255.0

In [4]:
# data preprocessing with normalization

def data_normalize(X):
    """Standardise each column of `X` to zero mean and unit variance.

    Parameters
    ----------
    X : array-like
        Samples along axis 0, features along the remaining axes.

    Returns
    -------
    (X_normalized, mean, std_deviation)
        The standardised data plus the per-feature mean and standard
        deviation that were used, so the same transform can be applied to
        other data as `(X_other - mean) / std_deviation`.

    Notes
    -----
    A feature that is constant over all samples has a standard deviation of
    zero; dividing by it would yield NaN/inf. Such entries are replaced by 1
    (the feature then simply becomes 0 after centring), and the returned
    `std_deviation` already contains this replacement.
    """
    mean = np.mean(X, axis=0)
    std_deviation = np.std(X, axis=0)
    # Guard against division by zero for constant features.
    std_deviation = np.where(std_deviation == 0, 1.0, std_deviation)
    X = (X - mean) / std_deviation
    return (X, mean, std_deviation)



## Data Augmentation

In [5]:
# # Data augmentation on training data

# View every flat 784-pixel row as a 28x28 image so it can be flipped/patched.
# The label is appended as a last column to every variant so that the final
# shuffle keeps pixels and labels aligned.
n_train = len(X_train)
image = X_train.reshape(n_train, 28, 28)
y = y_train[:,np.newaxis]
final_x = np.hstack((X_train,y))

## horizontal flip on training data (reverse the column axis of every image)
hor_image = np.flip(image,axis=2).reshape(n_train,784)
final_hor = np.hstack((hor_image,y))


## vertical flip on training data (reverse the row axis of every image)
ver_image = np.flip(image,axis=1).reshape(n_train,784)
final_ver = np.hstack((ver_image,y))

## copy-paste augmentation on training data
## (rows 0-14, columns 14-27 of each image are overwritten with the same
## region taken from a randomly chosen image of the working copy)
cutpaste_image = image.copy()
for i in range(n_train):
    # randrange excludes its upper bound; random.randint(0, n_train) could
    # return n_train itself and index one past the last image.
    j = random.randrange(n_train)
    cutpaste_image2 = cutpaste_image[j]
    cutpaste_image[i][0:15,14:] = cutpaste_image2[0:15,14:]
cutpaste_image = cutpaste_image.reshape(n_train,784)
final_cutpaste = np.hstack((cutpaste_image,y))



## combining all X_train, horizontal flip data and vertical flip data
## (final_cutpaste is built above but is not part of the combined set)
final = np.vstack((final_x,final_hor,final_ver))
np.random.shuffle(final)
final_y = final[:,-1]
final_X = final[:,:-1]
print(final_y.shape)
print(final_X.shape)


(180000,)
(180000, 784)


In [6]:
##

In [7]:
## Data augmentation on testing data

# View every flat 784-pixel test row as a 28x28 image; labels are appended as
# a last column to every variant so the final shuffle keeps them aligned.
n_test = len(X_test)
image_test = X_test.reshape(n_test,28,28)
y_new = y_test[:,np.newaxis]
final_x_test = np.hstack((X_test,y_new))

## horizontal flip on testing data (reverse the column axis of every image)
hor_test_image = np.flip(image_test,axis=2).reshape(n_test,784)
final_test_hor = np.hstack((hor_test_image,y_new))


## vertical flip on testing data (reverse the row axis of every image)
ver_test_image = np.flip(image_test,axis=1).reshape(n_test,784)
final_test_ver = np.hstack((ver_test_image,y_new))

## copy-paste augmentation on testing data
cutpaste_test_image = image_test.copy()
for i in range(n_test):
    # randrange excludes its upper bound; random.randint(0, n_test) could
    # return n_test itself and index one past the last image.
    j = random.randrange(n_test)
    cutpaste_test_image2 = cutpaste_test_image[j]
    cutpaste_test_image[i][0:15,14:] = cutpaste_test_image2[0:15,14:]
cutpaste_test_image = cutpaste_test_image.reshape(n_test,784)
# Pair the test images with the test labels (y_new); the training label
# column has a different number of rows and cannot be stacked here.
final_test_cutpaste = np.hstack((cutpaste_test_image,y_new))

## combining all X_test, horizontal flip and vertical flip data
## (final_test_cutpaste is built above but is not part of the combined set)
test_data = np.vstack((final_x_test,final_test_hor,final_test_ver))
np.random.shuffle(test_data)
y_data_test = test_data[:,-1]
X_data_test = test_data[:,:-1]
print(y_data_test.shape)
print(X_data_test.shape)

(30000,)
(30000, 784)


## Experiments


In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
## four separate, identically configured classifiers (100 trees each),
## one for each of the preprocessing experiments that follow
model1, model2, model3, model4 = (
    RandomForestClassifier(n_estimators = 100) for _ in range(4)
)


Data augmentation on training and testing data

In [10]:

# Raw pixel values; flip-augmented training and test sets.
model1.fit(final_X, final_y)
y_preds = model1.predict(X_data_test)

# A zero difference marks a correct prediction, so accuracy is the share of zeros.
y_comp = y_data_test - y_preds
acc = y_comp.size - np.count_nonzero(y_comp)
acc_percent = acc / y_comp.size
print(acc_percent)



0.8718


Scaling+data augmentation on training and testing data

In [11]:

# Same augmented data, with every pixel value divided by 255 first.
scaled_data = data_scale(final_X)
model2.fit(scaled_data, final_y)

scaled_test_data = data_scale(X_data_test)
y_preds = model2.predict(scaled_test_data)

# A zero difference marks a correct prediction, so accuracy is the share of zeros.
y_comp = y_data_test - y_preds
acc = y_comp.size - np.count_nonzero(y_comp)
acc_percent = acc / y_comp.size
print(acc_percent)


0.8709333333333333


Normalization+data augmentation on training and testing data

In [12]:

# Standardise the augmented training data, then apply the training-set
# mean and standard deviation to the test data.
norm_data, mean, std_deviation = data_normalize(final_X)
model3.fit(norm_data, final_y)

norm_test_data = (X_data_test - mean) / std_deviation
y_preds = model3.predict(norm_test_data)

# A zero difference marks a correct prediction, so accuracy is the share of zeros.
y_comp = y_data_test - y_preds
acc = y_comp.size - np.count_nonzero(y_comp)
acc_percent = acc / y_comp.size
print(acc_percent)


0.8712333333333333


Scaling+normalization+data augmentation on training and testing data

In [13]:


# Standardise the already-scaled training data (`scaled_data` comes from the
# scaling experiment cell), then push the test data through the same two steps
# using the training-set statistics.
scale_norm_data, norm_mean, norm_deviation = data_normalize(scaled_data)
model4.fit(scale_norm_data, final_y)

scaled_test_data = data_scale(X_data_test)
scale_norm_test_data = (scaled_test_data - norm_mean) / norm_deviation
y_preds = model4.predict(scale_norm_test_data)

# A zero difference marks a correct prediction, so accuracy is the share of zeros.
y_comp = y_data_test - y_preds
acc = y_comp.size - np.count_nonzero(y_comp)
acc_percent = acc / y_comp.size
print(acc_percent)



0.8742666666666666


Data augmentation on testing data only

In [14]:

# Train on the original (un-augmented) training set, evaluate on the
# flip-augmented test set.
model5 = RandomForestClassifier(n_estimators = 100)
model5.fit(X_train, y_train)
y_preds = model5.predict(X_data_test)

# A zero difference marks a correct prediction, so accuracy is the share of zeros.
y_comp = y_data_test - y_preds
acc = y_comp.size - np.count_nonzero(y_comp)
acc_percent = acc / y_comp.size
print(acc_percent)

0.6110333333333333


Normalization + Scaling + data augmentation on testing data only

In [15]:


# Train on the original training set after scaling + standardisation, and
# evaluate on the flip-augmented test set transformed with the training-set
# statistics.
model6 = RandomForestClassifier(n_estimators = 100)

scaled_org_data = data_scale(X_train)
scale_norm_org_data, norm_org_mean, norm_org_deviation = data_normalize(scaled_org_data)
model6.fit(scale_norm_org_data, y_train)

scaled_test_data = data_scale(X_data_test)
scale_norm_org_test_data = (scaled_test_data - norm_org_mean) / norm_org_deviation
y_preds = model6.predict(scale_norm_org_test_data)

# A zero difference marks a correct prediction, so accuracy is the share of zeros.
y_comp = y_data_test - y_preds
acc = y_comp.size - np.count_nonzero(y_comp)
acc_percent = acc / y_comp.size
print(acc_percent)

0.6091666666666666


## Random Forest Visualization

In [None]:
from sklearn import tree
from dtreeviz.trees import *

# 28x28 view of the original training images.
# NOTE(review): `other_image` is not used anywhere else in this cell.
other_image = X_train.reshape(60000, 28, 28)

# A forest with one depth-3 tree, fitted on the augmented training data.
# NOTE(review): this rebinds `model1`, replacing the 100-tree forest fitted
# in the first experiment cell.
model1 = RandomForestClassifier(n_estimators=1, max_depth=3)
model1.fit(final_X, final_y)
y_preds = model1.predict(X_data_test)

# A zero difference marks a correct prediction, so accuracy is the share of zeros.
y_comp = y_data_test - y_preds
acc = y_comp.size - np.count_nonzero(y_comp)
acc_percent = acc / y_comp.size
print(acc_percent)

# Display names handed to dtreeviz as `class_names`.
fashion_class_names = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot"
]
# One feature name per pixel column: px0 ... px783.
pixel_feature_names = [f"px{i}" for i in range(28 * 28)]

# Render the forest's only tree, using the first 1000 augmented samples.
viz = dtreeviz(
    model1.estimators_[0],
    final_X[:1000],
    final_y[:1000],
    class_names=fashion_class_names,
    feature_names=pixel_feature_names
)

## edit the save path
viz.save("/Users/princychahal/Documents/github/test.svg")