# PREDICTING MNIST DIGITS

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# READ THE DATA

In [None]:
# Load the MNIST training CSV into a DataFrame and preview its first rows.
train_csv_path = '../input/mnist-in-csv/mnist_train.csv'
X_raw = pd.read_csv(train_csv_path)

X_raw.head()

In [None]:
# Separate the target column from the remaining (pixel) columns.
y_labels = X_raw['label']
X = X_raw.drop(columns='label')

In [None]:
# Preview the first five rows of the feature columns.
X.head(n=5)

In [None]:
# Preview the first five labels.
y_labels.head(n=5)

In [None]:
# Reshape the flat pixel rows into a stack of 28x28 images.
X_images = X.to_numpy().reshape(-1, 28, 28)

# Report shape, dtype and value range of the full stack, one item per line.
for summary in (X_images.shape, X_images.dtype, X_images.min(), X_images.max()):
    print(summary)

# Keep only the first five images for the step-by-step walkthrough below.
X_images = X_images[:5]
print(X_images.shape)

#  PLOT A SAMPLE

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Index of the image to inspect.
sample = 2

# Dump the raw pixel values row by row, then draw the same image.
digit = X_images[sample]
for pixel_row in digit.tolist():
    print(pixel_row)

plt.imshow(digit, cmap='Greys')
plt.show()


# Change pixels at or below threshold to 0

In [None]:
# Pixels with a value at or below this cut-off are cleared to 0.
threshold = 100

faint_pixels = X_images <= threshold
X_images[faint_pixels] = 0

# Show the sample again, as numbers and as a picture, after thresholding.
digit = X_images[sample]
for pixel_row in digit.tolist():
    print(pixel_row)

plt.imshow(digit, cmap='Greys')
plt.show()


# FIND FIRST AND LAST NON-ZERO ROWS AND COLUMNS FOR CROPPING

In [None]:

# Boolean mask of the pixels that are still non-zero ("dark").
ink = X_images > 0

# dark_rows is a 2d bool array: one line per image, one entry per image row,
# True when that row contains at least one dark pixel.
dark_rows = ink.sum(axis=2) != 0
print('Dark rows: ', dark_rows)

# dark_cols is the same idea for columns: one line per image, one entry per
# image column, True when that column contains at least one dark pixel.
dark_cols = ink.sum(axis=1) != 0
print('\nDark columns: ', dark_cols)

In [None]:

def _first_last_true(masks):
    """Return two arrays with the index of the first and last True of each mask."""
    # np.where on a 1d bool mask returns a 1-tuple; [0] is the array of True positions
    hits = [np.where(mask)[0] for mask in masks]
    firsts = np.asarray([positions[0] for positions in hits])
    lasts = np.asarray([positions[-1] for positions in hits])
    return firsts, lasts


# Bounding rows and columns of the dark region of every image.
first_dark_row_indices, last_dark_row_indices = _first_last_true(dark_rows)
first_dark_col_indices, last_dark_col_indices = _first_last_true(dark_cols)

for caption, indices in (
    ('First dark row indices: ', first_dark_row_indices),
    ('First dark column indices: ', first_dark_col_indices),
    ('Last dark row indices: ', last_dark_row_indices),
    ('Last dark column indices: ', last_dark_col_indices),
):
    print(caption, indices)


# CROPPING AND RESIZING

In [None]:
from skimage.transform import resize

crop_size = 20  # side length of the square output images

# Output stack with one crop_size x crop_size slot per input image,
# using the same dtype as X_images.
n_images = X_images.shape[0]
images_resized = np.empty((n_images, crop_size, crop_size), dtype=X_images.dtype)

for idx, digit in enumerate(X_images):
    top, bottom = first_dark_row_indices[idx], last_dark_row_indices[idx]
    left, right = first_dark_col_indices[idx], last_dark_col_indices[idx]

    # Cut out the dark bounding box (both ends inclusive) and rescale it
    # with skimage.transform.resize.
    bounding_box = digit[top:(bottom + 1), left:(right + 1)]
    images_resized[idx] = resize(bounding_box, (crop_size, crop_size), preserve_range=True)

print(images_resized.shape)

# NEW IMAGE VS OLD IMAGE SAMPLE COMPARISON

In [None]:
# Apply thresholding again, this time to the resized images.
images_resized[images_resized <= threshold] = 0

# Show the original sample first, then its cropped-and-resized version.
for picture in (X_images[sample], images_resized[sample]):
    plt.imshow(picture, cmap='Greys')
    plt.show()

# FUNCTION FOR PROCESSING DATA

We could have written one big function from the start to process the data for both training and evaluation,
but I wanted to show the output of each step to build intuition. Let's now write a function that does all of the processing.

In [None]:
def process_raw_images(X_raw, threshold=100, crop_size=20):
    """Threshold, crop and rescale a table of flattened 28x28 images.

    Steps, per image:
      1. pixels at or below ``threshold`` are set to 0;
      2. the image is cropped to the bounding box of its remaining non-zero
         pixels;
      3. the crop is rescaled to ``crop_size`` x ``crop_size`` with
         ``skimage.transform.resize`` (imported earlier in this notebook);
      4. the threshold is applied once more to the rescaled image.

    Parameters
    ----------
    X_raw : pandas.DataFrame
        One image per row. A ``'label'`` column is dropped when present; the
        remaining columns must reshape to 28x28 per row.
    threshold : int, optional
        Cut-off used in steps 1 and 4 (default 100).
    crop_size : int, optional
        Side length of the square output images (default 20).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_images, crop_size, crop_size)`` with the same
        dtype as the pixel array. An image with no pixel above ``threshold``
        has nothing to crop and is returned as all zeros.
    """
    # errors='ignore' lets unlabeled (evaluation) frames pass through unchanged
    pixel_columns = X_raw.drop(columns='label', errors='ignore')
    pixels = pixel_columns.to_numpy().reshape(-1, 28, 28)

    # np.where builds a new array, so the result does not depend on whether
    # to_numpy() handed back a view of the DataFrame's data
    pixels = np.where(pixels <= threshold, 0, pixels)

    ink = pixels > 0
    row_has_ink = ink.any(axis=2)   # per image: which rows hold a dark pixel
    col_has_ink = ink.any(axis=1)   # per image: which columns hold a dark pixel
    has_ink = row_has_ink.any(axis=1)

    # argmax on a boolean array gives the position of the first True; running
    # it on the reversed mask and mirroring the result gives the last True
    last_position = row_has_ink.shape[1] - 1
    first_dark_row_indices = row_has_ink.argmax(axis=1)
    last_dark_row_indices = last_position - row_has_ink[:, ::-1].argmax(axis=1)
    first_dark_col_indices = col_has_ink.argmax(axis=1)
    last_dark_col_indices = last_position - col_has_ink[:, ::-1].argmax(axis=1)

    # zeros (not empty) so that skipped blank images have a defined value
    images_resized = np.zeros((pixels.shape[0], crop_size, crop_size), dtype=pixels.dtype)

    for counter, img in enumerate(pixels):
        if not has_ink[counter]:
            # nothing survived thresholding: there is no bounding box to crop
            continue
        cropped = img[first_dark_row_indices[counter]:(last_dark_row_indices[counter] + 1),
                      first_dark_col_indices[counter]:(last_dark_col_indices[counter] + 1)]
        # the resized values are cast to images_resized's dtype on assignment
        images_resized[counter] = resize(cropped, (crop_size, crop_size), preserve_range=True)

    # apply thresholding again
    images_resized[images_resized <= threshold] = 0

    return images_resized

# TEST MODELS ON THE PROCESSED DATA

In [None]:
# Load the training CSV and run it through the preprocessing function.
mnist_train_path = '../input/mnist-in-csv/mnist_train.csv'
RawData = pd.read_csv(mnist_train_path)

# Target labels, and the processed images produced from the same frame.
y = RawData['label']
X_resized = process_raw_images(RawData)

In [None]:

# flatten data from 3 dimensions to 2 dimensions (one feature row per image)
X_resized_flat = X_resized.reshape(X_resized.shape[0], -1)

# create training and validation data (80/20 split, seeded for repeatability)
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(
    X_resized_flat, y, train_size=0.8, test_size=0.2, random_state=0)

# create random forest model and check accuracy on validation data;
# the forest is seeded like the split so the printed accuracy is reproducible
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)
predictions = model.predict(X_valid)

# fraction of validation samples whose predicted label matches the true label
accuracy = np.mean(predictions == y_valid)

print(accuracy)
