In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Dataset

In [None]:
# Load the digit-recognizer training CSV and preview its first rows
train_csv_path = "/kaggle/input/digit-recognizer/train.csv"
mnist_df = pd.read_csv(train_csv_path)
mnist_df.head()

In [None]:
# (rows, columns) of the training frame
mnist_df.shape

Each `pixelx` column holds the pixel at row i and column j of a 28 x 28 image.
So, after excluding the label column (the actual digit), we get 28 x 28 = 784 pixel columns.

# Constructing a digit from pixel values

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Render a handful of training rows as 28 x 28 images
for row_idx in (10, 25, 1000, 2500):
    pixels = mnist_df.iloc[row_idx, 1:]  # every column after the first one
    image = pixels.to_numpy().reshape(28, 28)
    plt.imshow(image, cmap="binary")
    plt.show()

In [None]:
# Print the first column (the label) of the same rows plotted above, to confirm the images.
# `.iloc[row, col]` replaces `mnist_df.iloc[val][0]`: integer-position lookup with `[]`
# on a label-indexed Series is deprecated in recent pandas.
for val in (10, 25, 1000, 2500):
    print(mnist_df.iloc[val, 0])

The printed labels match the plotted digits, so the data looks fine and we are good to go with our EDA and learning models.

# EDA

Checking missing or null values

In [None]:
# One boolean per column ("does it contain any null?"), summarised with describe()
mnist_df.isnull().any().describe()

There is no null or missing value present in the training dataset

In [None]:
import seaborn as sns

# Bar chart of how many training rows carry each label
plt.ioff()
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="label", data=mnist_df)

We can see that the dataset has a similar number of samples for each digit from 0 to 9

# Splitting of Training and Validation datasets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target column from the pixel features
y = mnist_df["label"]
X = mnist_df.drop(columns=["label"])

In [None]:
# Sanity check: print the shape of the features, then of the target
for part in (X, y):
    print(part.shape)

In [None]:
# Hold out 20% of the rows (test_size=0.2); random_state fixes the split for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Classification Models

1. SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Two linear classifiers trained with SGD: one with the default loss, one with logistic loss.
# The logistic loss is spelled "log_loss": the older spelling "log" was deprecated in
# scikit-learn 1.1 and is rejected by current releases.
sgd_clf = SGDClassifier(random_state = 42, n_jobs=-1)
sgd_clf_log = SGDClassifier(random_state = 42, n_jobs=-1, loss="log_loss")

In [None]:
# Fit both SGD variants on the training split
sgd_clf.fit(X_train, y_train)
sgd_clf_log.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Predict on the held-out split with both SGD models
y_pred_sgd_clf = sgd_clf.predict(X_test)
y_pred_sgd_clf_log = sgd_clf_log.predict(X_test)

# Accuracy of the default-loss model first, then of the logistic-loss model
for held_out_pred in (y_pred_sgd_clf, y_pred_sgd_clf_log):
    print(accuracy_score(y_test, held_out_pred))

Confusion Matrix to look into false positive and false negative predictions

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the default SGD model on the held-out split
y_test_pred_sgd = sgd_clf.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_test_pred_sgd)

From the confusion matrix we can see a lot of digits as false positives and negatives

Precision, Recall and F1 scores

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Weighted-average precision, recall and F1 of the SGD predictions, printed in that order
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test, y_test_pred_sgd, average="weighted"))

Let us try with some better classification model with higher accuracy and f1 score

2. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 500 trees, each limited to 30 leaf nodes; n_jobs=-1 parallelises the work
rfc = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=30,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
rfc.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the random forest on the held-out split
y_test_pred_rfc = rfc.predict(X_test)
accuracy_score(y_test, y_test_pred_rfc)

3. K-Nearest Neighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

**GridSearchCV to try out KNN at different hyperparameters**

In [None]:
# Candidate settings: two weighting schemes combined with three neighbourhood sizes
param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [5, 8, 13]}]

knn_clf = KNeighborsClassifier(n_jobs=-1)

# 3-fold cross-validated search over the grid
grid_search = GridSearchCV(
    estimator=knn_clf,
    param_grid=param_grid,
    cv=3,
    verbose=3,
)
grid_search.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the KNN grid search, and its mean cross-validated score
print(grid_search.best_params_)
print(grid_search.best_score_)

Let us try to increase accuracy of knn by hyperparameter tuning of k value

In [None]:
from sklearn.model_selection import cross_val_score

# Mean 3-fold cross-validation score of KNN for every k from 4 to 12
accuracies = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=3, n_jobs=-1, verbose=3
    ).mean()
    for k in np.arange(4,13)
]

print(accuracies)

In [None]:
# Plot the mean CV accuracy against the k values evaluated above (4 to 12)
fig, ax = plt.subplots()
ax.plot(np.arange(4,13), accuracies)
ax.set_xlabel('k in kNN')
ax.set_ylabel('Accuracy')
plt.show()

Visualizing accuracy for different k values in KNN (k = 4 to 12 were evaluated above), we can see it stays about the same up to k = 6, at around 96.3%, and then drops sharply for higher k

4. SVM Classification

In [None]:
from sklearn.decomposition import PCA

# PCA with default arguments, fitted on the training pixels;
# its explained-variance ratios are read in the next cell
pca = PCA()
pca.fit(X_train)

In [None]:
# Number of leading components needed for the cumulative explained variance to reach 95%
cumsum = np.cumsum(pca.explained_variance_ratio_)
reached_target = cumsum >= 0.95
d = reached_target.argmax() + 1  # argmax gives the first True position (0-based)
print(d)

In [None]:
# A better way is to pass the fraction of variance to preserve (0.95) as n_components,
# then fit on the training pixels and project them in one step
pca = PCA(n_components=0.95)
X_train_reduced = pca.fit_transform(X_train)

In [None]:
# Shape of the PCA-projected training data
X_train_reduced.shape

In [None]:
from sklearn.svm import SVC

# RBF-kernel SVM; note it is fitted on the original X_train, not on X_train_reduced
svm_clf = SVC(kernel="rbf", random_state=42, verbose=3)
svm_clf.fit(X_train, y_train)

In [None]:
# Accuracy of the SVM on the held-out split
y_test_pred_svm = svm_clf.predict(X_test)
accuracy_score(y_test, y_test_pred_svm)

**GridSearchCV to try out SVC at different hyperparameters of C**

In [None]:
# Candidate values of the SVC parameter C for the grid search below
params_grid_svm = {'C':[0.1, 1, 10, 100]}

In [None]:
# Grid search over C for an RBF-kernel SVC; no cv argument is given, so GridSearchCV's default is used
rbf_svc = SVC(kernel="rbf", random_state=42)
grid_search_svm = GridSearchCV(rbf_svc, param_grid=params_grid_svm, verbose=3)
grid_search_svm.fit(X_train, y_train)

In [None]:
# Best C found by the SVC grid search, and its mean cross-validated score
print(grid_search_svm.best_params_)
print(grid_search_svm.best_score_)

# Building an ensemble with the above classifiers

In [None]:
from sklearn.ensemble import VotingClassifier

# Fresh instances of the four classifiers tried above; these rebind the names
# sgd_clf, knn_clf and svm_clf used by earlier cells
sgd_clf = SGDClassifier(random_state = 42)
rf_clf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1, verbose=1)
knn_clf = KNeighborsClassifier(n_neighbors=5, weights="distance")
svm_clf = SVC(kernel="rbf", C=10, probability=True, random_state=42)

# Ensemble of all four; no `voting` argument is passed, so scikit-learn's default voting rule applies
voting_clf = VotingClassifier(estimators=[("sgd", sgd_clf), ("rfc", rf_clf), ("knn", knn_clf), ("svc", svm_clf)])
voting_clf.fit(X_train, y_train)

In [None]:
# Score every base classifier and the ensemble on the held-out split.
# The base classifiers are fitted here because VotingClassifier trains internal clones,
# leaving the objects passed to it unfitted. `voting_clf` itself was already fitted in
# the previous cell, so it is only evaluated instead of being trained a second time.
for clf in (sgd_clf, rf_clf, knn_clf, svm_clf, voting_clf):
    if clf is not voting_clf:
        clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

In [None]:
# Soft-voting ensemble of the random forest, KNN and SVC only (the SGD classifier is left out)
voting_clf = VotingClassifier(estimators=[("rfc", rf_clf), ("knn", knn_clf), ("svc", svm_clf)], voting="soft")
voting_clf.fit(X_train, y_train)

In [None]:
# Accuracy of the soft-voting ensemble on the held-out split
y_pred_votclf = voting_clf.predict(X_test)
accuracy_score(y_test, y_pred_votclf)

In [None]:
from sklearn.svm import SVC

# RBF-kernel SVM with C=10, this time fitted on the PCA-reduced training data
svm_clf = SVC(kernel="rbf", C=10, random_state=42, verbose=3)
svm_clf.fit(X_train_reduced, y_train)

In [None]:
# Score on the held-out split, projected with the same fitted PCA as the training data
svm_clf.score(pca.transform(X_test), y_test)

In [None]:
# New, unfitted PCA keeping 95% of the variance; it is fitted in the next cell
pca = PCA(n_components=0.95)
#X_train_reduced = pca.fit_transform(X_train)

In [None]:
# Refit the PCA and the SVM on all labelled rows (X, y), not just the training split
svm_clf.fit(pca.fit_transform(X), y)

In [None]:
# Predict the digit-recognizer test images with the fitted PCA + SVM
val = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')
pred = svm_clf.predict(pca.transform(val))

# Submission columns: ImageId,Label (ImageId counts rows from 1).
# Built as a fresh frame so the pixel frame `val` is not modified in place
# and `sub` is not a slice of it.
sub = pd.DataFrame({'ImageId': np.arange(1, len(pred) + 1), 'Label': pred})

In [None]:
# Write the SVM submission to the current directory, without the index column
sub.to_csv('submission_updated1.csv', index=False)

# Trying to go beyond 98.5% - 99% accuracy with CNN

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
# Import through tensorflow.keras, matching the `from tensorflow import keras` import above;
# the standalone `keras` package does not expose ImageDataGenerator here in Keras 3
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
# Divide both splits by 255 to rescale the pixel values
X_train_scaled, X_test_scaled = (split / 255 for split in (X_train, X_test))

In [None]:
# Reshape the scaled pixels to (samples, 28, 28, 1) for data augmentation and the CNN.
# np.asarray accepts both the DataFrame produced by the scaling cell and an already
# reshaped array, so re-running this cell no longer fails on a missing `.values`.
X_train_scaled = np.asarray(X_train_scaled).reshape(-1, 28, 28, 1)
X_test_scaled = np.asarray(X_test_scaled).reshape(-1, 28, 28, 1)

In [None]:
# Augmentation settings: random rotation, zoom, and horizontal/vertical shifts
datagen = ImageDataGenerator(
    rotation_range=10,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
)

In [None]:
# NOTE(review): per the Keras docs, fit() is only needed for the featurewise / ZCA options,
# none of which are set on this generator — confirm whether this call is required.
datagen.fit(X_train_scaled) # only the training images are passed to the generator; the test images are left untouched

In [None]:
# Iterator over augmented training images and their labels (default batch size, no seed set)
training_set = datagen.flow(X_train_scaled, y_train)

In [None]:
# Import the layers through tensorflow.keras so they come from the same Keras
# implementation as the optimizer and callbacks used below (`from tensorflow import keras`).
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense

model = Sequential()

# Block 1: two 5x5 convolutions with 32 filters, 2x2 max-pooling, dropout
model.add(Conv2D(32, kernel_size=5, input_shape=(28, 28, 1), activation="relu"))
model.add(Conv2D(32, kernel_size=5, activation="relu"))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.4))

# Block 2: two 3x3 convolutions with 64 filters, 2x2 max-pooling, dropout
model.add(Conv2D(64, kernel_size=3, activation="relu"))
model.add(Conv2D(64, kernel_size=3, activation="relu"))
model.add(MaxPooling2D(pool_size=2, strides=2))
model.add(Dropout(0.4))

# Classifier head: 300-unit dense layer, dropout, then a 10-way softmax (one output per digit)
model.add(Flatten())
model.add(Dense(300, activation = "relu"))
model.add(Dropout(0.4))
model.add(Dense(10, activation = "softmax"))

In [None]:
# Compile the CNN with the Adam optimizer (learning rate 0.001) and track accuracy.
# sparse_categorical_crossentropy is used because the labels are plain integers, not one-hot vectors.
model.compile(optimizer=keras.optimizers.Adam(learning_rate = 0.001), loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [None]:
# Disabled alternative: early stopping with a patience of 10
#callbacks_es = keras.callbacks.EarlyStopping(patience=10)
# Checkpoint to the working directory, keeping only the best model seen so far
# (no `monitor` is passed, so the callback's default metric decides what "best" means)
callbacks_cp = keras.callbacks.ModelCheckpoint(filepath="/kaggle/working/mykeras_model.h5", save_best_only=True)

In [None]:
# Train for 30 epochs on the augmented batches, validating on the held-out split,
# with the checkpoint callback attached
history = model.fit(training_set, validation_data = (X_test_scaled, y_test), epochs=30, verbose=2, callbacks=[callbacks_cp])

In [None]:
# Re-read the digit-recognizer test CSV; an earlier cell added Label/ImageId columns to the previous `val`
val = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')

In [None]:
# Rescale the test pixels by 255 and reshape them to (samples, 28, 28, 1), as done for the training data
val_scaled = (val / 255).values.reshape(-1, 28, 28, 1)
val_scaled.shape

The model's prediction `pred` holds a probability for each class, so we take the class with the highest probability as the predicted digit

In [None]:
# Per-class outputs of the softmax layer for every test image
pred = model.predict(val_scaled)

# Take the most probable class as the predicted digit
result = np.argmax(pred, axis=1)
result1 = pd.Series(result, name="Label")
# ImageId counts from 1 and is sized from the predictions rather than a hard-coded 28001
image_ids = pd.Series(range(1, len(result1) + 1), name="ImageId")
sub = pd.concat([image_ids, result1], axis=1)

In [None]:
# Write the CNN submission to the current directory, without the index column
sub.to_csv('submission_updated4.csv', index=False)

In [None]:
# trying to tune more hyperparameters using keras tuner

We used Keras Tuner to find the best hyperparameters for our CNN. It found 300 units for the Dense layer and a learning rate of 0.001 for the Adam optimizer

In [None]:
# Install or upgrade keras-tuner quietly; no version is pinned, so the installed release may change between runs
!pip install -q -U keras-tuner

In [None]:
# `keras_tuner` is the current module name of the keras-tuner package; `kerastuner` is the
# deprecated alias, kept as a fallback for older installations
try:
    import keras_tuner as kt
except ImportError:
    import kerastuner as kt

In [None]:
def model_builder(hp):
    """Build and compile the CNN for one Keras Tuner trial.

    Two hyperparameters are drawn from ``hp``:
    ``units`` (width of the hidden Dense layer, 150 to 300 in steps of 50) and
    ``learning_rate`` (Adam learning rate, one of 1e-1, 1e-2, 1e-3).

    The convolutional stack mirrors the hand-built CNN above, without the Dropout layers.

    Returns:
        The compiled Sequential model.
    """
    conv_stack = [
        Conv2D(32, kernel_size=5, input_shape=(28, 28, 1), activation="relu"),
        Conv2D(32, kernel_size=5, activation="relu"),
        MaxPooling2D(pool_size=2),
        Conv2D(64, kernel_size=3, activation="relu"),
        Conv2D(64, kernel_size=3, activation="relu"),
        MaxPooling2D(pool_size=2, strides=2),
    ]

    # Tunable width of the hidden dense layer
    dense_units = hp.Int('units', min_value=150, max_value=300, step=50)
    head = [
        Flatten(),
        Dense(units=dense_units, activation="relu"),
        Dense(10, activation="softmax"),
    ]

    model = keras.models.Sequential(conv_stack + head)

    # Tunable learning rate for Adam
    learning_rate = hp.Choice('learning_rate', values=[1e-1, 1e-2, 1e-3])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )

    return model

In [None]:
# Hyperband search over model_builder's hyperparameters, ranking trials by validation accuracy.
# No directory/project name is passed, so the tuner's default location is used for its results.
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10)

In [None]:
# Early stopping monitors validation loss with a patience of 5 epochs.
# NOTE(review): the tuner was built with max_epochs=10 while epochs=30 is passed here —
# confirm which of the two Hyperband actually honours.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
tuner.search(training_set, validation_data = (X_test_scaled, y_test), epochs=30, verbose=2, callbacks=[stop_early])

In [None]:
# Hyperparameters of the single best trial
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Best learning rate found for the Adam optimizer
print(best_hps.get('learning_rate'))

In [None]:
# Best number of units found for the hidden Dense layer
print(best_hps.get('units'))