In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


In [None]:
# Read the training and test CSV files into DataFrames in one pass.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/digit-recognizer/train.csv",
        "/kaggle/input/digit-recognizer/test.csv",
    )
)

In [None]:
# Display the training DataFrame as this cell's output.
df_train

In [None]:
# Distinct values present in the `label` column.
df_train.label.unique()

In [None]:
# Preview the first rows*cols training images together with their labels.
rows = 2
cols = 3

fig, axes = plt.subplots(rows, cols, figsize=(8, 10))
for i, ax in enumerate(axes.flat):
    # Everything after the first column is treated as pixel data; divide by 255
    # and reshape the flat row into a 28x28 grid for display.
    sample = np.reshape(df_train.iloc[i, 1:].to_numpy() / 255, (28, 28))
    ax.imshow(sample, cmap='gray')
    ax.set_title("Labeled class : {}".format(df_train["label"].iloc[i]))
fig.tight_layout()
plt.show()

# Exploratory Data Analysis

In [None]:
# Bar chart of the number of training samples per label, each bar annotated
# with its share of the whole training set.
plt.figure(figsize=(8, 6))
ax = sns.countplot(x='label', data=df_train)
ax.set_title("Label Distribution")

total = len(df_train.label)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    # The label text ends with a newline and is centred on the bar top.
    share_text = f'{100 * bar_height / total:.1f}%\n'
    ax.annotate(share_text, (bar_center, bar_height), ha='center', va='center')

In [None]:
# Summary statistics for every column of the training frame.
df_train.describe()

In [None]:
# Row-wise sum over all columns of the training frame
# (note: this includes the `label` column as well as the pixel columns).
df_train.sum(axis=1)

In [None]:
# (rows, columns) of the training frame.
df_train.shape

In [None]:
# Feature engineering: add the sum of all pixel values of each image as a new column.
# The pixel columns are selected by name rather than by position, so re-running this
# cell (when `sum` already exists) does not fold the previous `sum` into the new one.
pixels = [col for col in df_train.columns if col not in ("label", "sum")]
df_train["sum"] = df_train[pixels].sum(axis=1)

df_test["sum"] = df_test[pixels].sum(axis=1)

In [None]:
# Mean of the engineered `sum` feature for each label.
df_train.groupby(['label'])['sum'].mean()

In [None]:
# Number of rows in the training frame.
len(df_train)

In [None]:
# Experiment (disabled): add a feature counting the zero-valued entries in each row

#train = np.count_nonzero(df_train,axis=1)
#test = np.count_nonzero(df_test,axis=1)

#df_train['count_of_zero_percents'] = (784 - train)
#df_test['count_of_zero_percents'] = (784 - test)

# Outcome: this feature did not improve model performance, so it is left commented out.



In [None]:
# Separate the target column from the predictor columns of df_train.
# `.copy()` gives `targets` its own data, so later in-place edits of `targets`
# never write through to a column of the original frame.
targets = df_train["label"].copy()
features = df_train.drop(columns="label")

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every column with statistics fitted on the training features, then
# apply the same fitted transform to the test frame.
# New DataFrames are built (same columns and index) instead of assigning through
# `frame[:] = ...`, which relies on pandas implicitly changing the dtype of the
# existing columns when float values are written into them.
scaler = StandardScaler()
features = pd.DataFrame(
    scaler.fit_transform(features),
    columns=features.columns,
    index=features.index,
)
df_test = pd.DataFrame(
    scaler.transform(df_test),
    columns=df_test.columns,
    index=df_test.index,
)

In [None]:
# Drop the name `df_train`; the cells below work with `features` and `targets`.
del df_train

In [None]:
from sklearn.decomposition import PCA as sklearnPCA

# Project the scaled features onto their first two principal components.
# `random_state` is fixed so the projection is reproducible whenever scikit-learn
# selects a randomized SVD solver; it has no effect with a deterministic solver.
sklearn_pca = sklearnPCA(n_components=2, random_state=0)
Y_sklearn = sklearn_pca.fit_transform(features)

In [None]:
# Display the 2-component PCA projection of the training features.
Y_sklearn

In [None]:
# Referred to https://sebastianraschka.com/Articles/2015_pca_in_3_steps.html and  https://www.kaggle.com/arthurtok/interactive-intro-to-dimensionality-reduction

# Newer Matplotlib releases ship the seaborn style sheets under a "seaborn-v0_8-"
# prefix and no longer accept the old name, so use whichever this install provides.
grid_style = ('seaborn-v0_8-whitegrid'
              if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')
digit_colors = ('blue', 'red', 'green', 'yellow', 'purple',
                'black', 'brown', 'pink', 'orange', 'beige')

# Scatter of the first two principal components, one colour per label.
with plt.style.context(grid_style):
    plt.figure(figsize=(10, 8))
    for lab, col in zip(range(10), digit_colors):
        # Plain boolean array so the NumPy projection is indexed positionally.
        mask = np.asarray(targets == lab)
        plt.scatter(Y_sklearn[mask, 0],
                    Y_sklearn[mask, 1],
                    label=lab,
                    c=col)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.show()

In [None]:
# Row index of the feature frame (reused below as the index of the PCA result frame).
features.index

In [None]:
# Fit a 3-component PCA on the training features and project the test set with the
# same fitted components.
# `random_state` is fixed so the projection is reproducible whenever scikit-learn
# selects a randomized SVD solver; it has no effect with a deterministic solver.
sklearn_pca_3 = sklearnPCA(n_components=3, random_state=0)
Y_sklearn_3 = sklearn_pca_3.fit_transform(features)
Y_sklearn_3_test = sklearn_pca_3.transform(df_test)

In [None]:
# Wrap the 3-component PCA scores in a DataFrame that shares the index of `features`.
pca_column_names = ['PCA%i' % component for component in range(3)]
result = pd.DataFrame(
    Y_sklearn_3,
    index=features.index,
    columns=pca_column_names,
)

In [None]:
# Display the DataFrame holding the three PCA components.
result

In [None]:
# 3-D scatter of the three principal components, one colour per label.
# The points are drawn on the 3-D axes from the 3-component projection
# (`Y_sklearn_3`); the previous version drew the 2-component `Y_sklearn` through
# `plt.scatter`, so the third axis carried no data, and it also opened an extra
# empty figure before the real one.
my_dpi = 96

# Newer Matplotlib releases ship the seaborn style sheets under a "seaborn-v0_8-"
# prefix and no longer accept the old name, so use whichever this install provides.
grid_style = ('seaborn-v0_8-whitegrid'
              if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')
digit_colors = ('blue', 'red', 'green', 'yellow', 'purple',
                'black', 'brown', 'pink', 'orange', 'beige')

with plt.style.context(grid_style):
    fig = plt.figure(figsize=(10, 10), dpi=my_dpi)
    ax = fig.add_subplot(111, projection='3d')
    for lab, col in zip(range(10), digit_colors):
        # Plain boolean array so the NumPy projection is indexed positionally.
        mask = np.asarray(targets == lab)
        ax.scatter(Y_sklearn_3[mask, 0],
                   Y_sklearn_3[mask, 1],
                   Y_sklearn_3[mask, 2],
                   label=lab,
                   c=col, s=60)

    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.set_zlabel('Principal Component 3')
    ax.set_title("PCA on the Handwriting Data")
    # The scatter calls pass `label=`, so show the legend that uses those labels.
    ax.legend(loc='lower right')
    plt.show()

In [None]:
# Encode the class labels with a LabelEncoder; the fitted `encoder` is kept so the
# predictions can be mapped back with `inverse_transform` later.
# A new Series (same index and name) replaces `targets` instead of writing through
# `targets[:]`, so no in-place mutation of a possibly shared column takes place.
encoder = LabelEncoder()
targets = pd.Series(
    encoder.fit_transform(targets),
    index=targets.index,
    name=targets.name,
)

In [None]:
# Hold out a validation split of the 3-component PCA frame (seeded for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    result,
    targets,
    random_state=1,
)

# Making a Model and Predictions

In [None]:
# Model 1: gradient-boosted trees trained on the 3 principal components only.
# The class count is not passed: the previous `num_classes` keyword is not an
# XGBoost parameter (the native name is `num_class`), and the scikit-learn wrapper
# determines the number of classes from `y`.
model = XGBClassifier(max_depth=5, objective='multi:softprob', n_estimators=1000)

# NOTE(review): recent XGBoost releases expect `early_stopping_rounds` in the
# constructor rather than in `fit` — confirm against the installed version.
# `history` receives whatever `fit` returns (the fitted estimator, by the
# scikit-learn convention); it is not used further in this cell.
history = model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50)
acc = accuracy_score(y_val, model.predict(X_val))
print(f"Accuracy: {round(acc, 3)}")







In [None]:
# Hold out a validation split of the full scaled feature frame (same seed as before).
X_train, X_val, y_train, y_val = train_test_split(
    features,
    targets,
    random_state=1,
)


In [None]:

# Model 2: gradient-boosted trees trained on every scaled feature.
# The class count is not passed: the previous `num_classes` keyword is not an
# XGBoost parameter (the native name is `num_class`), and the scikit-learn wrapper
# determines the number of classes from `y`.
model = XGBClassifier(max_depth=5, objective='multi:softprob', n_estimators=1000)

# Both splits are passed as evaluation sets so their metrics are recorded per
# boosting round and can be plotted afterwards.
# NOTE(review): recent XGBoost releases expect `early_stopping_rounds` in the
# constructor rather than in `fit` — confirm against the installed version.
history = model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    early_stopping_rounds=5,
)
acc = accuracy_score(y_val, model.predict(X_val))
print(f"Accuracy: {round(acc, 3)}")


# Author's recorded result: 0.973 with two engineered features.

In [None]:
# Evaluation history of the most recently fitted model; the plotting cell reads
# results['validation_0'] and results['validation_1'] from it.
results = model.evals_result()

In [None]:
# Learning curves: per-iteration multi-class log-loss for both evaluation sets.
# `validation_0` is the first eval_set entry (the training split) and
# `validation_1` the second (the held-out validation split), so the second curve
# is labelled 'validation' rather than 'test'.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(results['validation_0']['mlogloss'], label='train')
ax.plot(results['validation_1']['mlogloss'], label='validation')
ax.legend()
ax.set_xlabel('iterations')
ax.set_ylabel('mlogloss')
plt.show()

In [None]:
from xgboost import plot_importance

# Draw the ten highest-ranked features of the fitted model, then enlarge the
# figure that the importance plot was drawn on.
ax = plot_importance(model, max_num_features=10)
ax.figure.set_size_inches(10, 8)
fig = ax.figure
plt.show()

# With Cross Validation

In [None]:
import gc
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold

# Stratified 4-fold cross-validation: train one model per fold, record its
# validation accuracy, and collect its predictions for the test set.
preds = []  # test-set predictions, one array per fold
acc = []    # validation accuracy per fold

kf = StratifiedKFold(n_splits=4, random_state=48, shuffle=True)

for fold, (trn_idx, val_idx) in enumerate(kf.split(features, targets), start=1):
    X_tr, X_val = features.iloc[trn_idx], features.iloc[val_idx]
    y_tr, y_val = targets.iloc[trn_idx], targets.iloc[val_idx]

    # The class count is not passed: the previous `num_classes` keyword is not an
    # XGBoost parameter (the native name is `num_class`), and the scikit-learn
    # wrapper determines the number of classes from `y`.
    model = XGBClassifier(max_depth=10, objective='multi:softprob', n_estimators=1000)
    # NOTE(review): recent XGBoost releases expect `early_stopping_rounds` in the
    # constructor rather than in `fit` — confirm against the installed version.
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)

    preds.append(model.predict(df_test))
    fold_accuracy = accuracy_score(y_val, model.predict(X_val))
    acc.append(fold_accuracy)

    print(f"fold: {fold} , accuracy: {round(fold_accuracy*100,3)}")

    # Release the per-fold slices before the next iteration.
    del X_tr, X_val, y_tr, y_val
    gc.collect()

In [None]:
# Report the mean of the per-fold validation accuracies as a percentage.
mean_accuracy_pct = round(np.mean(acc) * 100, 3)
print(f"the mean Accuracy is : {mean_accuracy_pct} ")

In [None]:
from scipy import stats

# Majority vote across folds: for every test row keep the most frequent prediction.
# The per-fold arrays are stacked to shape (n_folds, n_test_rows) and the mode is
# taken along axis 0. Depending on the SciPy version the mode comes back with
# shape (1, n_test_rows) or (n_test_rows,), so it is flattened rather than indexed
# with `[0][0]`, which reduces the 1-D form to a single scalar.
fold_predictions = np.vstack(preds)
predictions = np.asarray(stats.mode(fold_predictions, axis=0)[0]).ravel()


In [None]:
# Display the combined (mode over folds) test-set predictions.
predictions

In [None]:
# Map the encoded predictions back to the original label values with the fitted encoder.
predictions = encoder.inverse_transform(predictions)

In [None]:
# Build the submission file from the sample template.
# The absolute input path matches the one used to load train.csv and test.csv.
output = pd.read_csv("/kaggle/input/digit-recognizer/sample_submission.csv")

# A scalar would be broadcast to every row without any error, so check that there
# is exactly one prediction per submission row before assigning.
if np.shape(predictions) != (len(output),):
    raise ValueError(
        f"expected {len(output)} predictions, got shape {np.shape(predictions)}"
    )

output['Label'] = predictions
output.to_csv('submission.csv', index=False)