# Digit Recognizer for Kaggle
Data: [Kaggle Digit Recognizer competition](https://www.kaggle.com/competitions/digit-recognizer/data)

In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import cv2
import os
import pandas as pd
from math import ceil
from tensorflow.keras.utils import to_categorical

In [None]:
# Read both competition CSV files into DataFrames.
train = pd.read_csv(filepath_or_buffer="../input/digit-recognizer/train.csv")
test = pd.read_csv(filepath_or_buffer="../input/digit-recognizer/test.csv")

# Import and basic info

In [None]:
# Report the (rows, columns) shape of each split, one per line.
print(
    "train shape info {}".format(train.shape),
    "test shape info {}".format(test.shape),
    sep="\n",
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Column dtypes of the training frame.
train.dtypes

In [None]:
# Column dtypes of the test frame.
test.dtypes

In [None]:
# Distinct values of the `label` column, in ascending order.
sorted(train["label"].unique())

**Each pixel column in the training set has a name like `pixelx`, where x is an integer between 0 and 783, inclusive. To locate this pixel on the image, decompose x as x = i * 28 + j, where i and j are integers between 0 and 27, inclusive. Then `pixelx` is located on row i and column j of a 28 x 28 matrix (zero-indexed).**

We have to parse the dataset into readable images: each row of pixel values must be reshaped into a 28 x 28 matrix.

In [None]:
# Take the whole first training row (every column) as a plain list
# and display how many entries it holds.
first_image = list(train.iloc[0])
len(first_image)

In [None]:

# `first_image` is a complete row of `train`, so it starts with the `label`
# column and the 784 pixel values follow (the parsing cell further down slices
# `1:` for the same reason). Skip the label: feeding the raw row in would put
# the label at pixel (0, 0), shift every pixel by one cell and drop the last one.
# reshape is row-major, i.e. pixel x lands on row x // 28, column x % 28.
result = np.array(first_image[1:]).reshape(28, 28)
    

In [None]:
# Render the matrix held in `result` as a grayscale image.
plt.imshow(result,cmap='gray')

In [None]:
# Preview the first ten training digits on a 3 x 4 grid (two slots stay empty).
fig = plt.figure(figsize=(10, 10))
for i in range(10):
    # Column 0 of `train` is the label, so only columns 1: are pixels. Using
    # the full row would shift the whole image by one pixel.
    # reshape is row-major: pixel x -> row x // 28, column x % 28.
    digit = train.iloc[i, 1:].to_numpy().reshape(28, 28)

    ax = fig.add_subplot(3, 4, i + 1)
    ax.imshow(digit, cmap='gray')
    # Show the ground-truth label so each panel can be checked at a glance.
    ax.set_title("label: {}".format(train['label'].iloc[i]))

plt.show()
  
    

# Outlier Detection

In [None]:
from sklearn.ensemble import IsolationForest


def detect_outliers(dataframe, contamination, random_state=None):
    """Flag outlier rows of ``dataframe`` with an Isolation Forest.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to analyse. Only its ``int64`` and ``float64`` columns are used
        as features (int64 columns first, then float64 columns).
    contamination : float or 'auto'
        Passed straight to ``IsolationForest(contamination=...)``.
    random_state : int or None, optional
        Seed forwarded to ``IsolationForest``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for reproducible results.

    Returns
    -------
    numpy.ndarray of bool
        One entry per row of ``dataframe``; ``True`` marks a row the model
        predicted as an outlier.
    """
    # Select the feature columns from the frame that was passed in, not from
    # the global `train`, so the function also works for other frames.
    numeric_cols = (
        list(dataframe.select_dtypes(['int64']).columns)
        + list(dataframe.select_dtypes(['float64']).columns)
    )
    features = dataframe[numeric_cols]

    model = IsolationForest(contamination=contamination, random_state=random_state)
    model.fit(features)

    # predict() returns -1 for outliers and +1 for inliers.
    return model.predict(features) == -1


# Boolean mask over the rows of `train`. The seed is fixed so the same rows
# are flagged on every run and the visualisation below stays reproducible.
index = detect_outliers(train, 0.001, random_state=42)


In [None]:
# Show the first training row selected by the boolean outlier mask `index`.
train[index].iloc[0,:]

## Visualizing one outlier

In [None]:
# Take the first row flagged as an outlier. Column 0 is the label, so only
# columns 1: are pixel values. Using the full row would shift the image by
# one pixel and drop the last one.
first_image = list(train[index].iloc[0, 1:])

# reshape is row-major: pixel x -> row x // 28, column x % 28.
result = np.array(first_image).reshape(28, 28)

plt.imshow(result, cmap='gray')

**We can see that this digit is poorly written.**

# Parsing dataframes

In [None]:

# Turn every row of pixel values into a 28 x 28 image in one vectorised step.
# This replaces the per-pixel Python loops and the hard-coded row counts
# (42000 / 28000); -1 lets NumPy infer the number of images from the frame.
# reshape is row-major, so pixel x still lands on row x // 28, column x % 28.

# Column 0 of `train` is the label, so only columns 1: are pixels.
X_train = train.iloc[:, 1:].to_numpy().reshape(-1, 28, 28)

# `test` is used in full: every column is treated as a pixel.
X_test = test.to_numpy().reshape(-1, 28, 28)




In [None]:
# Cast to float32 and divide by 255.0 (presumably the maximum 8-bit pixel
# intensity, which would map values into [0, 1] - confirm against the data).
# NOTE: this cell rebinds X_train / X_test, so running it twice divides twice.
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0

In [None]:
# One-hot encode the `label` column, then display the encoded array's shape.
digit_labels = train['label']
y_train = to_categorical(digit_labels)
y_train.shape

In [None]:
# Append a trailing single-channel axis, giving (n_images, 28, 28, 1),
# which matches the input_shape=(28, 28, 1) the Conv2D layer declares.
X_train = np.reshape(X_train, (len(X_train), 28, 28, 1))
X_test = np.reshape(X_test, (len(X_test), 28, 28, 1))

print(X_train.shape)
print(X_test.shape)


# TensorFlow model

In [None]:
from sklearn.model_selection import KFold
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.optimizers import SGD

# Small CNN: one 3x3 convolution (32 filters) -> 2x2 max-pooling -> flatten
# -> 100-unit dense layer -> 10-way softmax output.
model = Sequential([
    Conv2D(
        32,
        (3, 3),
        activation='relu',
        kernel_initializer='he_uniform',
        input_shape=(28, 28, 1),
    ),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(100, activation='relu', kernel_initializer='he_uniform'),
    Dense(10, activation='softmax'),
])

# Compile with SGD + momentum; the loss matches the one-hot encoded targets.
opt = SGD(learning_rate=0.01, momentum=0.9)
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)






In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Fit on the full training set for 10 epochs in mini-batches of 32.
# `history` keeps the object returned by fit().
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    batch_size=32,
)