### Import packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Silence every warning for the rest of the session. NOTE: this is a blanket
# filter, so it also hides warnings raised by the estimators fitted in this
# process (e.g. convergence warnings), not just cosmetic ones.
warnings.filterwarnings("ignore")

In [None]:
### show up to 999 columns when a DataFrame is displayed
pd.options.display.max_columns = 999

<br>

### Reading and Understanding Data

In [None]:
### load the training CSV and preview its first rows
df_train = pd.read_csv("../input/digit-recognizer/train.csv")
df_train.head()

In [None]:
### (rows, columns) of the training data
df_train.shape

In [None]:
### load the test CSV and preview its first rows
df_test = pd.read_csv("../input/digit-recognizer/test.csv")
df_test.head()

In [None]:
### count missing values per column in the training data, largest count first
df_train.isnull().sum().sort_values(ascending = False)

In [None]:
### count missing values per column in the test data, largest count first
df_test.isnull().sum().sort_values(ascending = False)

In [None]:
### number of training rows for each value of the `label` column
df_train.label.value_counts()

<br>

### Model Building

In [None]:
### split the target column off from the remaining feature columns
# NOTE: `pop` removes "label" from the frame in place, so `x_train` and
# `df_train` are the same (label-free) object afterwards, and this cell cannot
# be re-run without reloading `df_train` first.
x_train = df_train
y_train = x_train.pop("label")

In [None]:
### scaling the data
from sklearn.preprocessing import StandardScaler

In [None]:
### scaler with default settings; fitted on the training data in the next cell
scaler = StandardScaler()

In [None]:
### fit the scaler on the training features and scale them in one step
x_train_scaled = scaler.fit_transform(x_train)

In [None]:
### scale the test data with the statistics learned from the training data (no re-fitting)
df_test_scaled = scaler.transform(df_test)

In [None]:
### using PCA for dimensionality reduction
from sklearn.decomposition import IncrementalPCA
from sklearn.decomposition import PCA

In [None]:
### PCA without an explicit n_components (nothing is truncated); fixed seed for reproducibility
pca = PCA(random_state=42)

In [None]:
### fit PCA on the scaled training data; the scree plot drawn from this fit
### is used to check the optimal number of PCs required to represent the data
pca.fit(x_train_scaled)

In [None]:
### scree plot: cumulative explained-variance ratio against the number of components
var_cumu = pca.explained_variance_ratio_.cumsum()
plt.plot(np.arange(1, var_cumu.size + 1), var_cumu)
plt.show()

#### Using Logistic Regression to predict

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [None]:
### function that scores a PCA + logistic-regression model for a given number of PCs
def optimalModel(components, cv=5):
    """Return the mean cross-validated score of IncrementalPCA + LogisticRegression.

    Parameters
    ----------
    components : int
        Number of principal components kept by ``IncrementalPCA``.
    cv : int, default 5
        Number of folds passed to ``cross_val_score``.

    Returns
    -------
    float
        Mean of the per-fold scores returned by ``cross_val_score`` (the
        estimator's default scorer is used).

    Notes
    -----
    Reads the notebook globals ``x_train_scaled`` and ``y_train``.
    ``LogisticRegression`` is used with its default settings; because the
    notebook ignores all warnings globally, any convergence warning it emits
    will not be shown.
    """
    # Local import so this cell does not depend on an import added elsewhere.
    from sklearn.pipeline import make_pipeline

    # PCA is part of the cross-validated estimator, so it is re-fitted on the
    # training folds only. Fitting it once on all rows before splitting would
    # let the validation rows influence the projection and bias the score.
    # (`x_train_scaled` itself was still standardized on all rows upstream.)
    estimator = make_pipeline(
        IncrementalPCA(n_components=components),
        LogisticRegression(),
    )
    fold_scores = cross_val_score(
        estimator, x_train_scaled, y_train, cv=cv, n_jobs=-1
    )
    return fold_scores.mean()

In [None]:
### cross-validated score for each candidate number of components: 200, 225, ..., 475
pc_score = [optimalModel(n_pc) for n_pc in range(200, 500, 25)]

In [None]:
### plot the cross-validated score against the number of components tried
### (the x range must stay identical to the range used to build `pc_score`)
fig = plt.figure(figsize=(12,10))
plt.plot(range(200,500,25) , pc_score)
plt.show()

In [None]:
### final model: keep 225 principal components, then fit logistic regression on them
incPCA = IncrementalPCA(n_components=225)
incPCA.fit(x_train_scaled)
x_train_pc = pd.DataFrame(incPCA.transform(x_train_scaled))

lr = LogisticRegression()
lr.fit(x_train_pc, y_train)
# `fit` returns the estimator itself, so `model` and `lr` are the same object.
model = lr

<br>

### Predict the Test Value

In [None]:
### project the scaled test data onto the components learned from the training data
x_test_pc = pd.DataFrame(incPCA.transform(df_test_scaled))

In [None]:
### predict a label for every test row and store it as a new "Label" column
### (this adds a column to `df_test`, so `df_test` is no longer features-only)
df_test["Label"] = model.predict(x_test_pc)

In [None]:
### sanity check: number of predictions produced
df_test["Label"].shape

In [None]:
### number the test rows 1..N for the submission file
# Assigning an explicit 1-based index (instead of `+= 1`) keeps this cell
# idempotent: re-running it does not shift the ids a second time.
df_test.index = pd.RangeIndex(start=1, stop=len(df_test) + 1)

In [None]:
### write the submission file with the header "ImageId,Label"
# Without `index_label` the index column is written with an empty header.
df_test["Label"].to_csv("prediction.csv", index_label="ImageId")