# [Kaggle Digit Recognizer Challenge](https://www.kaggle.com/competitions/digit-recognizer)
Based on the DataCamp XGBoost tutorial: https://www.datacamp.com/tutorial/xgboost-in-python

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Read the labelled training set and the competition test set from local CSV files.
mnist_train = pd.read_csv("/home/rainer/Downloads/ML_datasets/mnist_train.csv")
mnist_test = pd.read_csv("/home/rainer/Downloads/ML_datasets/mnist_test.csv")

# Features: every column except 'label'.
X = mnist_train.drop(columns='label')
# Target: kept as a one-column DataFrame (double brackets), not a Series.
Y = mnist_train[['label']]

In [2]:
# Hold out part of the labelled rows; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=1,
)

# Wrap each partition in XGBoost's DMatrix container. Despite the `_reg`
# suffix, these matrices feed the multi-class objective configured in the
# next cell.
dtrain_reg = xgb.DMatrix(data=X_train, label=Y_train, enable_categorical=False)
dtest_reg = xgb.DMatrix(data=X_test, label=Y_test, enable_categorical=False)

# All labelled rows together, used for the final fit.
dfull = xgb.DMatrix(data=X, label=Y, enable_categorical=False)

In [3]:
# Booster settings shared by cross-validation and the final fit.
# NOTE(review): "gpu_hist" is reported as deprecated from XGBoost 2.0 onwards
# (replacement: tree_method="hist" together with device="cuda") — confirm the
# installed version before changing it.
params = dict(
    objective="multi:softmax",
    tree_method="gpu_hist",
    num_class=Y['label'].nunique(),  # number of distinct label values
)

# Number of boosting rounds to evaluate.
n = 100

# 5-fold cross-validation on the training split, tracking multi-class log loss
# and error rate; progress is printed every 10th round.
results = xgb.cv(
    params,
    dtrain_reg,
    num_boost_round=n,
    nfold=5,
    metrics=["mlogloss", "merror"],
    verbose_eval=10,
)

[0]	train-mlogloss:1.34988+0.00284	train-merror:0.12560+0.00192	test-mlogloss:1.38872+0.00740	test-merror:0.15978+0.00319
[10]	train-mlogloss:0.18988+0.00238	train-merror:0.02316+0.00076	test-mlogloss:0.29746+0.00994	test-merror:0.06384+0.00377
[20]	train-mlogloss:0.05867+0.00187	train-merror:0.00571+0.00049	test-mlogloss:0.17309+0.00855	test-merror:0.04756+0.00286
[30]	train-mlogloss:0.02317+0.00080	train-merror:0.00087+0.00016	test-mlogloss:0.13590+0.00937	test-merror:0.03987+0.00280
[40]	train-mlogloss:0.01095+0.00041	train-merror:0.00003+0.00003	test-mlogloss:0.12098+0.00970	test-merror:0.03622+0.00302
[50]	train-mlogloss:0.00604+0.00016	train-merror:0.00000+0.00000	test-mlogloss:0.11298+0.01050	test-merror:0.03438+0.00285
[60]	train-mlogloss:0.00380+0.00010	train-merror:0.00000+0.00000	test-mlogloss:0.10842+0.01082	test-merror:0.03298+0.00296
[70]	train-mlogloss:0.00268+0.00007	train-merror:0.00000+0.00000	test-mlogloss:0.10590+0.01113	test-merror:0.03241+0.00262
[80]	train-mloglo

In [4]:
# Fit the final booster on every labelled row (dfull) with the same
# hyperparameters that were cross-validated above.
# The round count reuses `n` from the CV cell instead of repeating the literal
# 100, so the final model always trains for the number of rounds that was
# actually evaluated.
model = xgb.train(
    params=params,
    dtrain=dfull,
    num_boost_round=n,
)

# The competition test frame is wrapped without a label argument.
final_test = xgb.DMatrix(mnist_test, enable_categorical=False)

# With the "multi:softmax" objective, predict() yields one predicted class
# value per row (cast to int in the next cell).
predictions = model.predict(final_test)

In [5]:
# Assemble the submission table in one chain:
#   1. wrap the raw predictions in a single "Label" column,
#   2. cast that column to int,
#   3. add a 1-based "ImageId" running over the rows,
#   4. put "ImageId" first.
predictionsDf = (
    pd.DataFrame(predictions, columns=["Label"])
    .astype({"Label": int})
    .assign(ImageId=lambda frame: range(1, len(frame) + 1))
    [["ImageId", "Label"]]
)

In [6]:
# Bare last expression: Jupyter renders the submission frame as the cell output.
predictionsDf

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3
...,...,...
27995,27996,9
27996,27997,7
27997,27998,3
27998,27999,9


In [7]:
# Write the submission file; index=False keeps the DataFrame index out of the
# CSV so only the ImageId and Label columns are written.
predictionsDf.to_csv(
    path_or_buf='/home/rainer/Downloads/ML_datasets/mnist_test_prediction.csv',
    index=False,
)