# PACKAGES AND LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings
from mpl_toolkits.mplot3d import Axes3D
import statsmodels.api as sm
import missingno as msno
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.neighbors import LocalOutlierFactor
from scipy.stats import levene
from scipy.stats import shapiro
from scipy.stats.stats import pearsonr
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import scale
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, roc_curve

#### WARNINGS 

In [None]:
# Silence the noisy warning categories so the notebook output stays readable.
for category in (DeprecationWarning, FutureWarning, UserWarning):
    filterwarnings("ignore", category=category)

# DATA

In [None]:
# Load the training CSV; this frame is kept as the untouched original.
DigitsTrain = pd.read_csv("../input/digit-recognizer/train.csv") # train main

In [None]:
# Load the test CSV; this frame is kept as the untouched original.
DigitsTest = pd.read_csv("../input/digit-recognizer/test.csv") # test main

In [None]:
# Work on a copy so later edits never touch the original training frame.
data = DigitsTrain.copy() # to protect train main data

In [None]:
# Work on a copy so later edits never touch the original test frame.
dataTest = DigitsTest.copy() # to protect test main data

# EXPLORATORY DATA ANALYSIS

#### TRAIN DATA

In [None]:
# Preview the first rows of the training frame.
print(data.head())

In [None]:
# (rows, columns) of the training frame.
print(data.shape)

In [None]:
# Column labels of the training frame.
print(data.columns)

In [None]:
# Number of columns in the training frame (counted directly instead of
# summing a value_counts() over the column labels).
print(len(data.columns))

In [None]:
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
data.info()

In [None]:
# Summary statistics, transposed so that each column becomes one row.
print(data.describe().T)

In [None]:
# Row index of the training frame.
print(data.index)

In [None]:
# Confirm which container type holds the training data.
print(type(data)) # checking type

In [None]:
# Pairwise correlation between all columns.
print(data.corr()) # checking correlation
# Not very informative for this dataset, but worth trying on other datasets.

In [None]:
# Pairwise covariance between all columns.
print(data.cov()) # checking covariance
# Not very informative for this dataset, but worth trying on other datasets.

In [None]:
# Mean of the "pixel0" column within each label group.
print(data.groupby(["label"])["pixel0"].mean()) # example for groupby
# Not very informative for this dataset, but worth trying on other datasets.

In [None]:
# Mean of the "pixel783" column within each label group.
print(data.groupby(["label"])["pixel783"].mean()) # example for groupby
# Not very informative for this dataset, but worth trying on other datasets.

In [None]:
# Select every row whose "pixel783" value equals the column maximum.
maxPixel783 = data[data["pixel783"] == data["pixel783"].max()] # example for checking max
# Not very informative for this dataset, but a useful pattern for other datasets.
# Print the index labels of the selected rows with their occurrence counts.
print(maxPixel783.index.value_counts())

In [None]:
# Number of columns in which every value is missing.
print(data.isnull().all().sum())

In [None]:
# Number of columns that contain at least one missing value.
print(data.isnull().any().sum())

In [None]:
# Number of rows that repeat an earlier row exactly.
print(data.duplicated().sum()) # checking duplicated

#### TEST DATA

In [None]:
# Preview the first rows of the test frame.
print(dataTest.head())

In [None]:
# (rows, columns) of the test frame.
print(dataTest.shape)

In [None]:
# Column labels of the test frame.
print(dataTest.columns)

In [None]:
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
dataTest.info()

In [None]:
# Summary statistics, transposed so that each column becomes one row.
print(dataTest.describe().T)

In [None]:
# Pairwise correlation between all test columns.
print(dataTest.corr())

In [None]:
# Pairwise covariance between all test columns.
print(dataTest.cov())

In [None]:
# Number of columns in the test frame (counted directly instead of
# summing a value_counts() over the column labels).
print(len(dataTest.columns))

In [None]:
# Number of test columns in which every value is missing.
print(dataTest.isnull().all().sum())

In [None]:
# Number of test columns that contain at least one missing value.
print(dataTest.isnull().any().sum())

In [None]:
# Number of test rows that repeat an earlier row exactly.
print(dataTest.duplicated().sum())

# SPLITTING TRAIN AND TEST FOR TRAINING

In [None]:
# Separate the prediction target from the remaining feature columns.
y = data["label"]  # prediction target
x = data.drop(columns="label")  # features

In [None]:
# Hold out 20% of the rows (fixed seed) for testing the models.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Min-max scaler with default settings; it is fitted in the next cell.
scaler = MinMaxScaler()

In [None]:
# Learn the per-feature min/max from the training split only, then apply that
# same mapping to the held-out split. Re-fitting on xTest would scale it with
# different statistics than the ones the models are trained on.
xTrain = scaler.fit_transform(xTrain)
xTest = scaler.transform(xTest)
# Scaling is done to speed up training.
# StandardScaler can be used as an alternative.

In [None]:
# Shape of the scaled training split.
print(xTrain.shape)

In [None]:
# Shape of the scaled held-out split.
print(xTest.shape)

# VISUALIZATION

In [None]:
# Bar chart of how many samples each label value has.
figure = plt.figure(figsize=(15,8))
# Pass the column by keyword: newer seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=data["label"],color="black")
plt.show()

In [None]:
# Render the first five training rows as 28x28 grayscale images.
for sample_idx in range(5):
    # Each row is one flattened image; imshow needs a 2-D uint8 array.
    pixels = x.iloc[sample_idx].to_numpy(dtype="uint8").reshape((28, 28))
    plt.imshow(pixels, cmap="gray")
    plt.axis("off")
    plt.show()

# MODELS

#### CLASSIFIER

In [None]:
# Fit five classifiers on the training split.
lj = LogisticRegression(solver="liblinear")
lj.fit(xTrain, yTrain)

gnb = GaussianNB()
gnb.fit(xTrain, yTrain)

knnc = KNeighborsClassifier()
knnc.fit(xTrain, yTrain)

cartc = DecisionTreeClassifier(random_state=42)
cartc.fit(xTrain, yTrain)

rfc = RandomForestClassifier(random_state=42, verbose=False)
rfc.fit(xTrain, yTrain)

In [None]:
# Collect the fitted classifiers so they can be evaluated in one loop.
modelsc = [lj,gnb,knnc,cartc,rfc]
# Five models are used here as an example;
# the other imported classifiers can be added in the same way.

In [None]:
# Report hold-out accuracy plus 10-fold cross-validated scores for each model.
for model in modelsc:
    name = model.__class__.__name__
    predict = model.predict(xTest)
    # cross_val_score refits clones of the model on folds of the held-out split.
    # With no `scoring` given, a classifier is scored by accuracy (not R^2).
    cv_accuracy = cross_val_score(model,xTest,yTest,cv=10,verbose=False).mean()
    # The scorer returns negated MSE, so flip the sign back.
    mse = -cross_val_score(model,xTest,yTest,cv=10,scoring="neg_mean_squared_error",verbose=False).mean()
    print(name + ": ")
    print("-" * 10)
    print("ACC-->",accuracy_score(yTest,predict))
    print("CV ACC-->",cv_accuracy)
    # The square root is taken, so the printed value is the RMSE, not the MSE.
    print("ROOT MEAN SQUARED ERROR-->",np.sqrt(mse))
    print("-" * 30)

As we can see, Random Forest performs best among these 5 models.

#### PREDICTION

In [None]:
# Read the test CSV again into a separate frame for the classifier prediction.
testlabel = pd.read_csv("../input/digit-recognizer/test.csv")

"dataTest" is reserved for the ANN section below, which is why the test set is loaded again here.

In [None]:
# The forest was trained on min-max scaled pixels, so the raw test pixels must
# pass through the same fitted scaler before prediction.
predictC = rfc.predict(scaler.transform(testlabel))

In [None]:
# Predicted label for the first test row.
print(predictC[0])

In [None]:
# Display the first test row as a 28x28 grayscale image so the prediction
# printed above can be checked by eye.
imageT = testlabel.iloc[0].to_numpy(dtype="uint8").reshape((28, 28))
plt.imshow(imageT, cmap="gray")
plt.axis("off")
plt.show()

The prediction matches the digit shown in the image.

#### SUBMISSION

In [None]:
# Shape of the classifier's prediction array.
print(predictC.shape)

In [None]:
# 1-based image ids, one per classifier prediction.
# (The original referenced `predictClass`, which is never defined; the
# predictions live in `predictC`.)
imageIDC = list(range(1, len(predictC) + 1))

In [None]:
# Print the full list of generated image ids.
print(imageIDC)

In [None]:
# Two-column submission frame: image id and the classifier's predicted label.
submissionClass = pd.DataFrame({"ImageId": imageIDC, "Label": predictC})

In [None]:
# Write the classifier submission without the DataFrame index column.
# (The original called `submissionC`, which is never defined; the frame
# built above is `submissionClass`.)
submissionClass.to_csv('submissionClass.csv', index=False)

# ANN MODEL

#### LIBRARY 

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
import tensorflow as tf

#### TRAIN

In [None]:
# Target labels and pixel features for the neural network.
yN = data["label"]
xN = data.drop(columns="label")

In [None]:
# Convert to a NumPy array and reshape each row into a 28x28 image.
# -1 lets NumPy infer the sample count instead of hard-coding 42000.
xN = np.array(xN)
xN = xN.reshape(-1, 28, 28)

In [None]:
# Shape of the image array after reshaping.
print(xN.shape)

#### CREATING

In [None]:
# Small fully connected network, assembled layer by layer.
ANNmodel = tf.keras.models.Sequential()
# inputs: multiply pixel values by 1/255, then flatten each 28x28 image
ANNmodel.add(tf.keras.layers.experimental.preprocessing.Rescaling(1./255))
ANNmodel.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
# hidden: one dense ReLU layer followed by dropout
ANNmodel.add(tf.keras.layers.Dense(128, activation='relu'))
ANNmodel.add(tf.keras.layers.Dropout(0.2))
# output: 10 raw scores with no activation, matching from_logits=True below
ANNmodel.add(tf.keras.layers.Dense(10))

# Integer labels with logit outputs -> sparse categorical cross-entropy.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
ANNmodel.compile(loss=loss_fn, optimizer='adam', metrics=['accuracy'])

In [None]:
# Early stopping watches the training loss with a patience of 3 epochs.
callback = tf.keras.callbacks.EarlyStopping(patience=3, monitor='loss')

# Train for up to 30 epochs, holding out 20% of the samples for validation.
MainModel = ANNmodel.fit(
    xN,
    yN,
    epochs=30,
    validation_split=0.2,
    callbacks=[callback],
)

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
ANNmodel.summary()

In [None]:
# Put the metrics recorded in the model's training history into a DataFrame.
loss = pd.DataFrame(ANNmodel.history.history)

In [None]:
# Preview the first rows of the training-history frame.
print(loss.head())

In [None]:
# Plot the columns of the training-history frame.
loss.plot()

In [None]:
# Training vs. validation accuracy per epoch.
# Each curve gets a label; without labels plt.legend() has nothing to show.
plt.plot(MainModel.history["accuracy"], label="accuracy")
plt.plot(MainModel.history["val_accuracy"], label="val_accuracy")
plt.xlabel("EPOCH")
plt.ylabel("ACC")
plt.legend()
plt.show()

#### PREDICTION

In [None]:
# Convert the test frame to a NumPy array for the network.
dataTest = np.array(dataTest)

In [None]:
# Shape of the test array before reshaping.
print(dataTest.shape)

In [None]:
# Reshape each test row into a 28x28 image.
# -1 lets NumPy infer the sample count instead of hard-coding 28000.
dataTest = dataTest.reshape(-1,28,28)

In [None]:
# Network outputs for every test image.
predictANN = ANNmodel.predict(dataTest)

In [None]:
# Display the first test image.
plt.imshow(dataTest[0])
plt.show()

In [None]:
# Predicted class per test image: the index of the largest output in each row.
# One vectorized call replaces the per-row Python loop.
p = np.argmax(predictANN, axis=1)

In [None]:
# Predicted class for the first test image.
print(p[0])

The prediction matches the digit shown in the image.

#### SUBMISSION

In [None]:
# 1-based image ids, one per ANN prediction.
imageID = list(range(1, len(p) + 1))

In [None]:
# Two-column submission frame: image id and the ANN's predicted label.
submission = pd.DataFrame({"ImageId": imageID, "Label": p})

In [None]:
# Write the ANN submission without the DataFrame index column.
submission.to_csv('submissionANN.csv', index=False)