In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#from lightgbm import LGBMClassifier
#import lightgbm as lgb
#import optuna.integration.lightgbm as lgb

#from xgboost import XGBClassifier
import xgboost as xgb

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Let pandas render up to 100 columns before truncating wide frames.
pd.options.display.max_columns = 100

# 1. Import data

In [None]:
# Load the competition files: the labelled training rows, the unlabelled test
# rows, and the submission template that is filled in at the end.
train = pd.read_csv(
    "/kaggle/input/tabular-playground-series-jun-2021/train.csv"
)
test = pd.read_csv(
    "/kaggle/input/tabular-playground-series-jun-2021/test.csv"
)
sample_submission = pd.read_csv(
    "/kaggle/input/tabular-playground-series-jun-2021/sample_submission.csv"
)

# 2. Preprocessing

In [None]:
# Show the submission template as the cell output to inspect its columns.
sample_submission

In [None]:
# Show the raw training frame as the cell output.
train

In [None]:
# Show the raw test frame as the cell output.
test

In [None]:
# Remove the "id" column from both frames so it is not fed to the model as a
# feature. errors="ignore" keeps this cell re-runnable: once "id" is gone, a
# second execution is a no-op instead of raising KeyError.
train = train.drop(columns=["id"], errors="ignore")
test = test.drop(columns=["id"], errors="ignore")

In [None]:
# Visualise where values are missing in the training frame.
msno.matrix(train, color=(0, 0.3, 0.3), figsize=(10, 6))

In [None]:
# Visualise where values are missing in the test frame.
msno.matrix(test, color=(0, 0.3, 0.3), figsize=(10, 6))

In [None]:
# Bar chart of how many training rows fall in each target class, with the
# classes ordered by their sorted label.
plt.figure(figsize=(10, 6))
target_order = sorted(train['target'].unique())
sns.countplot(data=train, x='target', order=target_order)

In [None]:
# Summary statistics of the training features (target excluded), one row per
# feature, with the mean, std and median columns highlighted.
(
    train.drop(columns=['target'])
    .describe()
    .T
    .style
    .bar(subset=['mean'], color=px.colors.qualitative.G10[0])
    .background_gradient(subset=['std'], cmap='Greens')
    .background_gradient(subset=['50%'], cmap='BuGn')
)

In [None]:
# Summary statistics of the test features, one row per feature, with the
# mean, std and median columns highlighted.
(
    test.describe()
    .T
    .style
    .bar(subset=['mean'], color=px.colors.qualitative.G10[0])
    .background_gradient(subset=['std'], cmap='Greens')
    .background_gradient(subset=['50%'], cmap='BuGn')
)

In [None]:
# Replace the class labels in train['target'] with integer codes; the fitted
# encoder stays available as `le`.
# NOTE(review): presumably yields Class_1 -> 0 ... Class_9 -> 8, matching the
# explicit mapping this cell previously carried as a commented-out line — confirm.
le = LabelEncoder().fit(train['target'])
train['target'] = le.transform(train['target'])

In [None]:
# Show the training frame again as the cell output, after the preprocessing above.
train

# 3. Check the correlation between features

In [None]:
# Pairwise Pearson correlation between the columns of train; the resulting
# matrix is shown as the cell output.
train_corr = train.corr(method='pearson')
train_corr

In [None]:
# Heatmap of the correlation matrix. The colour scale is limited to
# [0, 0.12], so values outside that range are drawn at the scale limits.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(
    train_corr,
    ax=ax,
    cmap='coolwarm',
    center=0,
    vmin=0,
    vmax=0.12,
    annot=False,
    square=False,
);

# 4. Modeling

In [None]:
# Separate the label column from the feature columns.
y = train['target']
X = train.drop(columns='target')

In [None]:
# Hold out 20% of the rows for validation; the seed is fixed so the split is
# reproducible across runs.
# NOTE(review): the original line carried the terse note "(0.2) (7,43) 1~43",
# presumably a record of seeds that were tried — confirm.
x_train, x_test, t_train, t_test = train_test_split(
    X,
    y,
    random_state=43,
    test_size=0.2,
)

In [None]:
# XGBoost
# Wrap the training and hold-out splits in DMatrix objects.
dtrain = xgb.DMatrix(data=x_train, label=t_train)
dtest = xgb.DMatrix(data=x_test, label=t_test)

xgb_params = {
    # Task: multi-class classification. multi:softprob yields the probability
    # of each class, whereas multi:softmax yields the predicted class.
    'objective': 'multi:softprob',
    'num_class': 9,
    'eval_metric': 'mlogloss',
    # Tree depth, step size and L2 regularisation.
    'max_depth': 9,
    'learning_rate': 0.0201,
    'reg_lambda': 29.326,
    # Row and column subsampling.
    'subsample': 0.818,
    'colsample_bytree': 0.235,
    'colsample_bylevel': 0.453,
    'colsample_bynode': 0.82,
}

In [None]:
# Training
# Both splits are evaluated while boosting and their per-round metric history
# is collected in evals_result. Boosting runs for at most 10000 rounds with an
# early-stopping patience of 10 rounds, and progress is logged every 10 rounds.
evals_result = {}
evals = [(dtrain, 'train'), (dtest, 'eval')]
bst = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    evals=evals,
    evals_result=evals_result,
    num_boost_round=10000,
    early_stopping_rounds=10,
    verbose_eval=10,
)

In [None]:
# Score the hold-out split using only the trees up to the best early-stopping
# round. iteration_range replaces the ntree_limit argument and the
# best_ntree_limit attribute, which were deprecated in XGBoost 1.4 and removed
# in 2.0; the range is half-open, hence best_iteration + 1.
pred = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))
# Each row of pred holds one score per class; keep the index of the largest.
pred_max = np.argmax(pred, axis=1)

# Accuracy
acc = accuracy_score(t_test, pred_max)
print('Accuracy:', acc)

In [None]:
# Feature importance
# Draw the booster's feature-importance chart on a dedicated square figure.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(1, 1, 1)
xgb.plot_importance(booster=bst, ax=ax)

In [None]:
# Training performance
# Plot the per-round log loss recorded during training for both evaluated splits.
for split_name in ('train', 'eval'):
    plt.plot(evals_result[split_name]['mlogloss'], label=split_name)
plt.title('Training performance')
plt.xlabel('Boosting round')
plt.ylabel('Log loss')
plt.legend()
plt.show()

# 5. Prediction

In [None]:
# Wrap the test features in a DMatrix for prediction. `test` is already a
# DataFrame, so it is passed directly instead of being re-wrapped in
# pd.DataFrame and then rebound to a different type under the same name.
testData = xgb.DMatrix(test)

In [None]:
# Predict class probabilities for the test set using only the trees up to the
# best early-stopping round, so the submission comes from the same model state
# that was scored on the hold-out split (the range is half-open, hence + 1).
pred = bst.predict(testData, iteration_range=(0, bst.best_iteration + 1))

# 6. Make submission file

In [None]:
# Fill the nine class columns of the submission template with the test-set
# predictions, then write the file without the index column.
class_columns = [
    'Class_1', 'Class_2', 'Class_3',
    'Class_4', 'Class_5', 'Class_6',
    'Class_7', 'Class_8', 'Class_9',
]
sample_submission[class_columns] = pred
sample_submission.to_csv('xgb.csv', index=False)

In [None]:
# Show the filled-in submission frame as the cell output for a final visual check.
sample_submission