In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    # Skip directories without files so no blank line is emitted for them.
    if full_paths:
        print('\n'.join(full_paths))

# Any results you write to the current directory are saved as output.

In [None]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import time

In [None]:
# Load the Digit Recognizer training CSV from the notebook's ../input directory.
train = pd.read_csv('../input/digit-recognizer/train.csv')
# Preview the first rows as a quick sanity check of the columns.
train.head()

In [None]:
# Print pandas' concise summary of the frame (index, column dtypes, memory usage).
train.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
train.describe()

The training dataset contains 784 pixel columns (pixel0 to pixel783), which hold the 28 * 28 pixel values of each handwritten digit from 0 to 9. The "label" column is our target column. The pixel columns contain integer values from 0 to 255, inclusive.

In [None]:
# Number of training rows per digit label, most frequent first.
train["label"].value_counts()

The labels from 0 to 9 are almost uniformly distributed.

Separating the label values and column values in different datasets before commencing with data processing

In [None]:
# Target vector: the digit each row represents.
y = train["label"]
# Feature matrix: every column except the target.
X = train.drop(columns="label")

Before running any algorithm, we will scale the dataset

In [None]:
# Pull the feature matrix out of the DataFrame as a NumPy array.
X_values = X.values
# Standardise every column with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed later in the notebook, so held-out rows influence the scaling
# statistics. Consider fitting on the training split only.
X_std = StandardScaler().fit_transform(X_values)

Breaking the data into training and test set

In [None]:
# Hold out 25% of the scaled rows for evaluation; stratifying on y keeps the
# digit proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_std,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [None]:
# Report the shape of each part of the split, one line per array.
split_shapes = {
    "X_train: ": X_train.shape,
    "X_test: ": X_test.shape,
    "y_train: ": y_train.shape,
    "y_test: ": y_test.shape,
}
for label, shape in split_shapes.items():
    print(label, shape)

We will run the algorithm on the data without dimensionality reduction and then check the efficacy with a dimensionality reduction technique 

**Logistic Regression**

In [None]:
import time

# Multinomial logistic regression trained with the SAGA solver.
log = LogisticRegression(
    multi_class="multinomial",
    solver="saga",
    max_iter=200,
    random_state=42,
)

# Time only the call to fit.
start_time = time.time()
log.fit(X_train, y_train)
end_time = time.time()
time1 = end_time - start_time
print("Time elapsed: ", time1)

# Predictions on the held-out split feed the classification report below.
y_pred = log.predict(X_test)

# Accuracy Estimation
train_accuracy = log.score(X_train, y_train)
test_accuracy = log.score(X_test, y_test)
print('Accuracy Score (Train Data):', np.round(train_accuracy, decimals=3))
print('Accuracy Score (Test Data):', np.round(test_accuracy, decimals=3))

# Classification Report
logistic_report = classification_report(y_test, y_pred)
print(logistic_report)

> It took about 269 seconds to fit the data with 784 columns. 
Overall accuracy achieved is 92%.
However, the model is slightly overfitting, as the training accuracy is higher than the test accuracy.

**Random Forest Classifier**

In [None]:
# Shallow random forest: 300 trees of depth 5, trained on all available cores.
rfc = RandomForestClassifier(
    n_estimators=300,
    max_depth=5,
    n_jobs=-1,
    random_state=42,
)

# Time only the call to fit.
start_time = time.time()
rfc.fit(X_train, y_train)
end_time = time.time()
time1 = end_time - start_time
print("Time elapsed: ", time1)

# Predictions on the held-out split feed the classification report below.
y_pred = rfc.predict(X_test)

# Accuracy Estimation
train_accuracy = rfc.score(X_train, y_train)
test_accuracy = rfc.score(X_test, y_test)
print('Accuracy Score (Train Data):', np.round(train_accuracy, decimals=3))
print('Accuracy Score (Test Data):', np.round(test_accuracy, decimals=3))

# Classification Report
random_forest_report = classification_report(y_test, y_pred)
print(random_forest_report)

Random Forest Classifier executed in about 10 seconds.
However, the accuracy is lower than that of Logistic Regression.
We might need to tune the parameters to achieve better performance.

**XGBClassifier**

In [None]:
# Gradient-boosted trees for the 10-class digit problem.
clf = XGBClassifier(
    max_depth=5,
    n_jobs=-1,
    objective='multi:softmax',
    num_class=10,
    eval_metric="mlogloss",
    random_state=42,
)

# Time only the call to fit.
start_time = time.time()
clf.fit(X_train, y_train)
end_time = time.time()
time1 = end_time - start_time
# Label the timing like the other model cells so the output is self-explanatory.
print("Time elapsed: ", time1)

# Predictions on the held-out split feed the classification report below.
y_pred = clf.predict(X_test)

# Accuracy Estimation
print('Accuracy Score (Train Data):', np.round(clf.score(X_train, y_train), decimals=3))
print('Accuracy Score (Test Data):', np.round(clf.score(X_test, y_test), decimals=3))

# Classification Report
XGB_report = classification_report(y_test, y_pred)
print(XGB_report)

XGB Classifier executed in 899 seconds. Again the model is overfitting, as we achieved 100% accuracy on the training set but 97.3% on the test set.

We will now decrease the number of columns using Principal Component Analysis (PCA) and check performance and accuracy

PCA is fitted on a small subset of the data so that we do not end up spending a lot of time on the analysis. Here we will fit PCA on X_pca, which is just 30% of the entire dataset.

In [None]:
# train test split: PCA data and non PCA data
# X_pca receives a random 30% of the scaled rows; X_data receives the remaining 70%.
# NOTE(review): this split is independent of the model train/test split, so
# X_pca may contain rows that are later used for evaluation - confirm this is acceptable.
X_data, X_pca = train_test_split(X_std, test_size=0.3, random_state=1)

In [None]:
# Inspect the subset that PCA will be fitted on.
X_pca

In [None]:
# Wrap the PCA subset in a DataFrame (columns get default integer labels).
X_pca=pd.DataFrame(X_pca)

In [None]:
# Display the subset again, now rendered as a DataFrame.
X_pca

Here we will aim to explain 98% of the variance with PCA. We could reduce or increase it as per the needs of our project

In [None]:
# A fractional n_components asks PCA to keep as many components as are needed
# to reach that share of explained variance.
pca = PCA(n_components=0.98)
pca.fit(X_pca)

We will draw an elbow plot to check the optimal number of features that can explain 98% of the variance in data

In [None]:
# Cumulative share of variance (in %) explained by the first k principal
# components. Summing the exact ratios avoids accumulating per-component
# rounding error in the running total.
var = np.cumsum(pca.explained_variance_ratio_) * 100
# Component counts start at 1, so the x-axis reads as "number of components kept"
# rather than a zero-based array index.
component_counts = np.arange(1, len(var) + 1)

# The whitegrid style must be active while the axes are created, so the whole
# figure is built inside the context manager. (A bare plt.style.context(...)
# call that is never entered has no effect.)
with sns.axes_style('whitegrid'):
    fig, ax = plt.subplots()
    ax.plot(component_counts, var)
    ax.set_ylabel('% Variance Explained')
    ax.set_xlabel('Number of Features')
    ax.set_title('PCA Analysis')
    ax.set_ylim(30, 100.5)
plt.show()

In [None]:
# Report how many components the variance-targeted PCA kept.
n_kept = int(pca.n_components_)
print(f'{n_kept} components explain 98% of the variation in data')

Just a little more than half of all 784 features can explain the intended variance

In [None]:
# Re-fit PCA with an explicit component count. The count is read from the
# already-fitted variance-targeted PCA instead of being hard-coded, so it stays
# correct if the data or the variance threshold changes.
n_components_98 = pca.n_components_
pca = PCA(n_components=n_components_98, random_state=0)
pca.fit(X_pca)

# Project the PCA subset itself and confirm the reduced width.
X_pca_t = pca.transform(X_pca)
print(X_pca_t.shape)

In [None]:
# Project the full scaled dataset onto the components learned from X_pca.
X_std_t = pca.transform(X_std)
# Confirm the row count is unchanged and the column count matches the component count.
print(X_std_t.shape)

The actual dataset X_std is PCA transformed

In [None]:
# Wrap the reduced matrix in a DataFrame (columns get default integer labels).
X_std_t = pd.DataFrame(data = X_std_t)

In [None]:
# Display the PCA-reduced dataset.
X_std_t

We will now repeat the steps from train_test_split to running the machine learning algorithms using the reduced dataset

In [None]:
# Same 75/25 stratified split as before, this time on the PCA-reduced features.
# Note that this rebinds X_train, X_test, y_train and y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X_std_t,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [None]:
# Report the shape of each part of the reduced split, one line per array.
split_labels = ("X_train: ", "X_test: ", "y_train: ", "y_test: ")
split_parts = (X_train, X_test, y_train, y_test)
for label, part in zip(split_labels, split_parts):
    print(label, part.shape)

**Logistic Regression on PCA reduced dataset**

In [None]:
import time

# Multinomial logistic regression (SAGA solver) on the PCA-reduced features.
log = LogisticRegression(
    multi_class="multinomial",
    solver="saga",
    max_iter=200,
    random_state=42,
)

# Time only the call to fit.
start_time = time.time()
log.fit(X_train, y_train)
end_time = time.time()
time1 = end_time - start_time
print("Time elapsed: ", time1)

# Predictions on the held-out split feed the classification report below.
y_pred = log.predict(X_test)

# Accuracy Estimation
train_accuracy = log.score(X_train, y_train)
test_accuracy = log.score(X_test, y_test)
print('Accuracy Score (Train Data):', np.round(train_accuracy, decimals=3))
print('Accuracy Score (Test Data):', np.round(test_accuracy, decimals=3))

# Classification Report
logistic_report = classification_report(y_test, y_pred)
print(logistic_report)

**Random Forest Classifier on PCA reduced dataset**

In [None]:
# Shallow random forest (300 trees, depth 5) on the PCA-reduced features.
rfc = RandomForestClassifier(
    n_estimators=300,
    max_depth=5,
    n_jobs=-1,
    random_state=42,
)

# Time only the call to fit.
start_time = time.time()
rfc.fit(X_train, y_train)
end_time = time.time()
time1 = end_time - start_time
print("Time elapsed: ", time1)

# Predictions on the held-out split feed the classification report below.
y_pred = rfc.predict(X_test)

# Accuracy Estimation
train_accuracy = rfc.score(X_train, y_train)
test_accuracy = rfc.score(X_test, y_test)
print('Accuracy Score (Train Data):', np.round(train_accuracy, decimals=3))
print('Accuracy Score (Test Data):', np.round(test_accuracy, decimals=3))

# Classification Report
random_forest_report = classification_report(y_test, y_pred)
print(random_forest_report)

**XGB Classifier on PCA reduced dataset**

In [None]:
# Gradient-boosted trees for the 10-class digit problem, on the PCA-reduced features.
clf = XGBClassifier(
    max_depth=5,
    n_jobs=-1,
    objective='multi:softmax',
    num_class=10,
    eval_metric="mlogloss",
    random_state=42,
)

# Time only the call to fit.
start_time = time.time()
clf.fit(X_train, y_train)
end_time = time.time()
time1 = end_time - start_time
# Label the timing like the other model cells so the output is self-explanatory.
print("Time elapsed: ", time1)

# Predictions on the held-out split feed the classification report below.
y_pred = clf.predict(X_test)

# Accuracy Estimation
print('Accuracy Score (Train Data):', np.round(clf.score(X_train, y_train), decimals=3))
print('Accuracy Score (Test Data):', np.round(clf.score(X_test, y_test), decimals=3))

# Classification Report
XGB_report = classification_report(y_test, y_pred)
print(XGB_report)