In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_digits
from sklearn.metrics import confusion_matrix, classification_report

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import lightgbm as lgb
#import optuna.integration.lightgbm as lgb

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")

In [None]:
# Show up to 100 columns when a DataFrame is displayed, so wide frames are
# not elided in the middle.
pd.options.display.max_columns = 100

# 1. Import data

In [None]:
# Load the three competition CSV files from the Kaggle input directory.
# NOTE(review): `sumple_submission` looks like a misspelling of "sample"; the
# name is referenced again in a later cell, so it is left unchanged here.
sumple_submission = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv")
train = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/test.csv")

# 2. Preprocessing

In [None]:
# Display the sample submission frame read from sample_submission.csv.
sumple_submission

In [None]:
# Work on an independent copy of the raw training frame. Wrapping it with
# pd.DataFrame(train) does not copy the underlying data, so later in-place
# edits of df_train (such as re-encoding the target column) could leak back
# into `train`.
df_train = train.copy()
df_train

In [None]:
# Bar chart of the number of rows per target class, ordered by frequency.
class_order = df_train['target'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='target', data=df_train, order=class_order, ax=ax)

In [None]:
#df_train.info()

In [None]:
# Summary statistics of the training features ('id' and 'target' excluded),
# transposed to one row per feature, with the mean shown as a bar and the
# std / median columns shaded by value.
train_stats = df_train.drop(columns=['id', 'target']).describe().T
(
    train_stats.style
    .bar(subset=['mean'], color=px.colors.qualitative.G10[0])
    .background_gradient(subset=['std'], cmap='Greens')
    .background_gradient(subset=['50%'], cmap='BuGn')
)

In [None]:
# Work on an independent copy of the raw test frame. Wrapping it with
# pd.DataFrame(test) does not copy the underlying data, so any in-place edit
# of df_test could otherwise leak back into `test`.
df_test = test.copy()
df_test

In [None]:
#df_test.info()

In [None]:
# Summary statistics of the test features ('id' excluded), transposed to one
# row per feature, with the mean shown as a bar and the std / median columns
# shaded by value.
test_stats = df_test.drop(columns=['id']).describe().T
(
    test_stats.style
    .bar(subset=['mean'], color=px.colors.qualitative.G10[0])
    .background_gradient(subset=['std'], cmap='Greens')
    .background_gradient(subset=['50%'], cmap='BuGn')
)

In [None]:
# Encode the string class labels as the integer ids 0..3.
target_mapping = {'Class_1':0, 'Class_2':1, 'Class_3':2, 'Class_4':3}
# Only encode while the column still holds labels: mapping an already-encoded
# column a second time would turn every value into NaN, so this guard makes
# the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df_train['target']):
    encoded_target = df_train['target'].map(target_mapping)
    # A label missing from the mapping would silently become NaN; fail loudly.
    if encoded_target.isna().any():
        unknown = sorted(set(df_train['target'][encoded_target.isna()].astype(str)))
        raise ValueError(f"Unexpected target labels: {unknown}")
    df_train['target'] = encoded_target

In [None]:
# Display the training frame again to check the 'target' column.
df_train

In [None]:
# Display the test frame for comparison with the training frame.
df_test

# 3. Check the correlation between each item

In [None]:
# Pairwise Pearson correlation between the columns of the training frame.
df_train_corr = df_train.corr(method='pearson')
df_train_corr

In [None]:
# Heat map of the correlation matrix. The colour scale is clipped to
# [-0.03, 0.03] and centred on zero.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(
    df_train_corr,
    vmin=-0.03,
    vmax=0.03,
    center=0,
    square=False,
    annot=False,
    cmap='coolwarm',
    ax=ax,
);

# 4. Extract items with high correlation coefficient

In [None]:
# Minimum absolute correlation with the target that a column needs in order to
# be used as a predictor. The value 0 keeps every column whose correlation is
# non-zero (columns with a NaN correlation are dropped); raise it to select
# only the more strongly correlated features.
MIN_ABS_CORR = 0

# Correlation of each column with the target, excluding the row identifier and
# the target itself.
target_corr = df_train_corr.loc['target'].drop(labels=['id', 'target'], errors='ignore')

# Names of the selected feature columns, in their original column order.
predictor_cols = target_corr[target_corr.abs() > MIN_ABS_CORR].index.tolist()

# 5. Modeling

In [None]:
# Feature matrix: the selected predictor columns as a float32 array.
x = df_train[predictor_cols].to_numpy().astype('float32')

# Label vector: the encoded target flattened to 1-D and stored as int32.
t = df_train['target'].to_numpy().ravel().astype('int32')

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible.
# Author's note kept from the original line: (0.2) (29,41,55,68,70,122,128,155)
# -- presumably the test size and the seeds that were tried.
x_train, x_test, t_train, t_test = train_test_split(
    x,
    t,
    test_size=0.2,
    random_state=128,
)

In [None]:
# XGBoost: wrap the training and hold-out arrays in DMatrix containers.
dtrain = xgb.DMatrix(data=x_train, label=t_train)
dtest = xgb.DMatrix(data=x_test, label=t_test)
# Booster parameters. The number in the trailing comment on each line was
# written by the original author and appears to be the XGBoost default.
xgb_params = {
        # Multi-class classification ('multi:softprob' returns the probability
        # of each class; 'multi:softmax' would return the predicted class).
        'objective': 'multi:softprob',
        'num_class': 4,
        # Step-size shrinkage. 'eta' is an alias of 'learning_rate' in XGBoost,
        # so only one of the two may be given. The dict previously also carried
        # 'eta': 0.3 (the default), which conflicted with this value and made
        # the effective learning rate ambiguous.
        'learning_rate': 0.05,# 0.3
        'eval_metric': 'mlogloss',
        'subsample': 0.5,# 1
        'colsample_bytree': 1,# 1
        'colsample_bylevel': 1,# 1
        'reg_alpha': 0.7,# 0
        'reg_lambda': 0.9,# 1
        'max_depth': 3,# 6
        'min_child_weight': 1# 1
    }
 

In [None]:
# Training
# Boost for at most 10000 rounds, logging every 10th round. Training stops
# once the metric on the last entry of `evals` (the hold-out set) has not
# improved for 10 consecutive rounds. Per-round metrics are collected in
# `evals_result`.
evals_result = {}
evals = [(dtrain, 'train'), (dtest, 'eval')]
bst = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=10000,
    evals=evals,
    early_stopping_rounds=10,
    evals_result=evals_result,
    verbose_eval=10,
)

In [None]:
# Predict class probabilities on the hold-out split using only the trees up to
# and including the best early-stopping round. `iteration_range` replaces the
# deprecated `ntree_limit` / `best_ntree_limit` pair, which newer XGBoost
# releases no longer accept.
pred = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))
# Most probable class per row.
pred_max = np.argmax(pred, axis=1)

# Accuracy
acc = accuracy_score(t_test, pred_max)
print('Accuracy:', acc)

In [None]:
# Feature importance of the trained booster, drawn on a 10x10 inch figure.
importance_fig, importance_ax = plt.subplots(figsize=(10, 10))
xgb.plot_importance(bst, ax=importance_ax)

In [None]:
# Training performance: multi-class log loss per boosting round for the
# training set and the hold-out set.
fig, ax = plt.subplots()
for split_name in ('train', 'eval'):
    ax.plot(evals_result[split_name]['mlogloss'], label=split_name)
ax.set_ylabel('Log loss')
ax.set_xlabel('Boosting round')
ax.set_title('Training performance')
ax.legend()
plt.show()

# 6. Prediction

In [None]:
# Build the prediction input: the same predictor columns as used for training,
# converted to a float32 array and wrapped in a DMatrix.
test_features = df_test[predictor_cols].to_numpy().astype('float32')
testData = xgb.DMatrix(test_features)

In [None]:
# Predict class probabilities for the test set with the trees up to and
# including the best early-stopping round, matching how the hold-out
# predictions are made. Without the explicit range, some XGBoost releases
# would also use the rounds trained after the best one.
result = bst.predict(testData, iteration_range=(0, bst.best_iteration + 1))

# 7. Make submission file

In [None]:
# Build the submission frame in one step: one probability column per class,
# in the order the labels were encoded (Class_1 -> column 0, ...).
class_columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
df = pd.DataFrame(result, columns=class_columns)

# Take the ids straight from the test set instead of counting up from a
# hard-coded 100000. This keeps rows aligned with the predictions even if the
# ids are not contiguous, and avoids shadowing the builtin `id`.
df.insert(0, 'id', df_test['id'].to_numpy())

df.to_csv(path_or_buf='submission.csv', index=False)
df