# Fetching Data Using The Kaggle API

In [1]:
from kaggle.api.kaggle_api_extended import KaggleApi
from zipfile import ZipFile

api = KaggleApi() 
api.authenticate()

def fetch_data():
    api.competition_download_files('digit-recognizer')
    zf = ZipFile('digit-recognizer.zip')
    zf.extractall('/Users/soosan/Documents/ML/MNIST Digit Classifier/Data/')
    zf.close()
    
fetch_data()

In [None]:
'''Uses the Kaggle API to download the datasets, unzip them and store them in the local machine'''

# Loading Training And Testing Data

In [16]:
import pandas as pd
import os

def load_data():
    csv_train_path = os.path.join('/Users/soosan/Documents/ML/MNIST Digit Classifier/Data', 'train.csv')
    csv_test_path = os.path.join('/Users/soosan/Documents/ML/MNIST Digit Classifier/Data', 'test.csv')
    return pd.read_csv(csv_train_path), pd.read_csv(csv_test_path)

In [17]:
'''Stores the path of the csv file downloaded and returns it in the form of a DataFrame'''

'Stores the path of the csv file downloaded and returns it in the form of a DataFrame'

# Data Overview

In [18]:
train_data, test_data = load_data()

print(train_data.head(), '\n')
print(train_data.shape, '\n')

print(test_data.head(), '\n')
print(test_data.shape, '\n')

   label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \
0      1       0       0       0       0       0       0       0       0   
1      0       0       0       0       0       0       0       0       0   
2      1       0       0       0       0       0       0       0       0   
3      4       0       0       0       0       0       0       0       0   
4      0       0       0       0       0       0       0       0       0   

   pixel8  ...  pixel774  pixel775  pixel776  pixel777  pixel778  pixel779  \
0       0  ...         0         0         0         0         0         0   
1       0  ...         0         0         0         0         0         0   
2       0  ...         0         0         0         0         0         0   
3       0  ...         0         0         0         0         0         0   
4       0  ...         0         0         0         0         0         0   

   pixel780  pixel781  pixel782  pixel783  
0         0         0         

In [19]:
'''We can see that there are 42k images. Each image can be represented by a 2D array of 28 x 28 pixels. This 2D array
   when flattened into a single dimension has 1 x 784 pixels. There's a label column that signifies the digit drawn by
   the user. There are 10 classes as there are 10 digits we have to identify.
   
   The test data has 28000 images with 784 columns signifying the pixels.'''

"We can see that there are 42k images. Each image can be represented by a 2D array of 28 x 28 pixels. This 2D array\n   when flattened into a single dimension has 1 x 784 pixels. There's a label column that signifies the digit drawn by\n   the user. There are 10 classes as there are 10 digits we have to identify.\n   \n   The test data has 28000 images with 784 columns signifying the pixels."

# Data Manipulation And Cleaning

In [20]:
def scale(train, test):
    train = train/255
    test = test/255
    return train, test

In [21]:
'''The pixel values for the images range from 0(black) to 255(white). Since MLPs(Multi Layer Perceptons) are sensitive
   to feature scaling, we will divide the train and test set with 255, so as to keep the values between 0 and 1.'''

'The pixel values for the images range from 0(black) to 255(white). Since MLPs(Multi Layer Perceptons) are sensitive\n   to feature scaling, we will divide the train and test set with 255, so as to keep the values between 0 and 1.'

# Model Training

In [22]:
'''MLPs are trained using Backpropagation, which is a supervised learning algorithm using gradient descent which 
   calculates the gradient of the loss fucntion with respect to the weight spaces of a feedforward neural network.'''

'MLPs are trained using Backpropagation, which is a supervised learning algorithm using gradient descent which \n   calculates the gradient of the loss fucntion with respect to the weight spaces of a feedforward neural network.'

In [23]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [24]:
X = train_data.iloc[:, 1:]
y = train_data.loc[:,'label'] #Splitting the training data into two dataframes cmprising features and labels

X, test_data = scale(X, test_data) #Scaling the DataFrames as MLP is sensitive to feature scaling

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2) #Splitting the data into train and test sets

In [25]:
NN_Classifier = MLPClassifier(max_iter = 3000, random_state = 1, learning_rate_init = 0.1) #Initializes Classifier

In [26]:
hyperparameter_space = {'hidden_layer_sizes': [(50,50,50), (50,20,10), (397,)],
                        'activation': ['tanh', 'relu', 'logistic'],
                        'solver': ['sgd', 'adam'],
                        'alpha': [1e-4, 0.05],
                        'learning_rate': ['constant','adaptive'],
                        }

'''Defines Parameter Space for hyperparameter tuning using GridSearch CV'''

'Defines Parameter Space for hyperparameter tuning using GridSearch CV'

In [27]:
OP_NN_Classifier = GridSearchCV(NN_Classifier, hyperparameter_space, n_jobs=-1, cv=3) #Creates a tuned classifier
OP_NN_Classifier.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=MLPClassifier(learning_rate_init=0.1, max_iter=3000,
                                     random_state=1),
             n_jobs=-1,
             param_grid={'activation': ['tanh', 'relu', 'logistic'],
                         'alpha': [0.0001, 0.05],
                         'hidden_layer_sizes': [(50, 50, 50), (50, 20, 10),
                                                (397,)],
                         'learning_rate': ['constant', 'adaptive'],
                         'solver': ['sgd', 'adam']})

In [28]:
print("Training Set Score: ", OP_NN_Classifier.score(X_train, y_train))
print("Testing Set Score: ", OP_NN_Classifier.score(X_test, y_test))

Training Set Score:  1.0
Testing Set Score:  0.9798809523809524


# Predicting On Test Data

In [36]:
predictions = OP_NN_Classifier.predict(test_data)
df = pd.DataFrame(predictions)
df.index = range(1, len(df) + 1)
df.to_csv('/Users/soosan/Documents/ML/MNIST Digit Classifier/Data/predictions.csv')