![](https://i.imgur.com/bGpKLYh.png)

## [TPS May 2021] EDA + XGBOOST ##


* #### **Import Library** ####
* #### **Extract main colors** ####
* #### **Read Dataset** ####
* #### **EDA** ####
    - Target value distribution
    - Train and Test Dataset
    - Comparison Train and Test
    - Check null data
    - Distribution
    - Correlation
    - Umap
* #### **XGBOOST** ####
    - Training
    - Feature Importance
    - Submission

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import warnings
warnings.filterwarnings('ignore')

## Extract main 4 colors from TPS May Poster

In this competition there are 4 target classes and a total of 50 integer features. For color consistency throughout this notebook, I extract the 4 main colors from the TPS May 2021 poster.

In [None]:
from skimage import io
from sklearn.cluster import KMeans

# Cluster the poster's pixels in RGB space and report every cluster centre as
# a hex colour string, so a palette can be picked from the printed list.

# Keep only the first three channels and flatten to one row per pixel.
img = io.imread('https://i.imgur.com/bGpKLYh.png')[:,:,:3]
img = img.reshape((img.shape[0] * img.shape[1], 3))

k = 10
# Pin the seed and the number of restarts: without them the cluster centres
# (and therefore the printed colours) can differ from one run to the next.
clt = KMeans(n_clusters=k, n_init=10, random_state=0)
clt.fit(img)

# Each centre is a float RGB triple; truncate the channels to ints before
# formatting them as two-digit hex values.
colors = [
    '#%02x%02x%02x' % tuple(int(channel) for channel in center)
    for center in clt.cluster_centers_
]

sns.palplot(colors)
plt.axis('off')
print(colors)

In [None]:
def custom_palette(custom_colors):
    """Make ``custom_colors`` the active seaborn palette and draw a preview.

    Args:
        custom_colors: Colours in any form accepted by ``sns.color_palette``.
    """
    palette = sns.color_palette(custom_colors)
    # set_palette() is called for its side effect only; nothing is kept from it.
    sns.set_palette(palette)
    sns.palplot(palette, size=0.8)
    # Hide tick marks and tick labels so only the swatches remain visible.
    plt.tick_params(axis='both', labelsize=0, length=0)

# The four colours used for the four target classes in every plot below.
# NOTE(review): presumably hand-picked from the k-means output of the poster — confirm.
main_colors = ['#f03aa5', '#40c2f3', '#c489ce', '#bb3ca9']
# Activate the palette globally and show a swatch preview.
custom_palette(main_colors)

## Read Dataset ##

In [None]:
# Load the three competition files in one pass: training data, test data and
# the sample submission.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-may-2021/train.csv',
        '../input/tabular-playground-series-may-2021/test.csv',
        '../input/tabular-playground-series-may-2021/sample_submission.csv',
    )
)

In [None]:
# Report (rows, columns) for both frames.
for _caption, _frame in (('Shape of train dataset : ', train),
                         ('Shape of test dataset : ', test)):
    print(_caption, _frame.shape)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the last rows of the test data.
test.tail()

## EDA(Exploratory Data Analysis) ##

#### Target Value Distribution

In [None]:
# Take labels and wedge sizes from the same Series so that every wedge is
# paired with its own class name. unique() lists classes in order of
# appearance while value_counts() sorts by frequency, so building the two
# lists separately can attach a label to the wrong wedge.
target_counts = train['target'].value_counts().sort_index()
labels = list(target_counts.index)
data = list(target_counts)

plt.figure(figsize=(8,8))
plt.pie(data, autopct='%1.1f%%', labels=labels, textprops={'fontsize':15, 'color':'#505050'})

# A white disc over the centre turns the pie into a donut chart.
my_circle = plt.Circle((0,0), 0.8, color='white')
p = plt.gcf()
p.gca().add_artist(my_circle)

plt.legend(labels, loc='upper right', prop={'size':12})
plt.show()

In [None]:
# Remove the 'id' column from both frames in place.
for _frame in (train, test):
    _frame.drop(columns='id', inplace=True)

#### Making CMAP from main_colors

In [None]:
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap

# Two-stop linear colormap running from the first to the second main colour.
gradient_stops = main_colors[:2]
cmap = LinearSegmentedColormap.from_list("", gradient_stops)
cmap

#### Train Dataset Describe

In [None]:
# Summary statistics with one row per column: a bar for the mean and a colour
# gradient for the standard deviation and the median.
(
    train.describe().T.style
    .bar(subset=['mean'], color=main_colors[0])
    .background_gradient(subset=['std'], cmap=cmap)
    .background_gradient(subset=['50%'], cmap=cmap)
)

#### Test Dataset Describe

In [None]:
# Same summary table for the test set, using the second main colour for the bars.
(
    test.describe().T.style
    .bar(subset=['mean'], color=main_colors[1])
    .background_gradient(subset=['std'], cmap=cmap)
    .background_gradient(subset=['50%'], cmap=cmap)
)

#### Comparison of Statistics of Train dataset and Test Dataset

In [None]:
def diff_color(x):
    """Return a CSS ``color`` declaration chosen by the sign of ``x``.

    Negative values use ``main_colors[2]``, positive values use
    ``main_colors[3]``, and anything else (zero included) is black.
    """
    if x < 0:
        color = main_colors[2]
    elif x > 0:
        color = main_colors[3]
    else:
        color = 'black'
    return f'color: {color}'

# Train-minus-test difference of the describe() statistics, one row per
# feature. Selecting test.columns already limits the table to the features,
# so every row is kept and only the 'count' column is removed.
(
    (train.describe() - test.describe())[test.columns]
    .T
    .drop(columns='count')
    .style
    .bar(subset=['mean', 'std'], align='mid', color=[main_colors[0], main_colors[1]])
    .applymap(diff_color, subset=['min', 'max'])
)

#### Check missing data

None

#### Distribution Check

In [None]:
# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in newer
# Matplotlib releases; use whichever spelling this installation provides.
for _style_name in ('seaborn-notebook', 'seaborn-v0_8-notebook'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

# Four views of feature_1 split by target: KDE, stacked histogram,
# histogram with KDE overlay, and count plot.
f, ax = plt.subplots(2, 2, figsize=(18, 9))
sns.kdeplot(x='feature_1', hue='target', data=train, alpha=0.2, linewidth=0.6, fill=True,
           legend=True, ax=ax[0][0])
sns.histplot(x='feature_1', hue='target', data=train, palette=main_colors, multiple='stack',
             ax=ax[0][1])
sns.histplot(x='feature_1', hue='target', data=train, kde=True, palette=main_colors, ax=ax[1][0])
sns.countplot(x='feature_1', hue='target', data=train, palette=main_colors, ax=ax[1][1])
ax[1][1].legend(title='target', loc='upper right')

In [None]:
fig, axes = plt.subplots(17, 3, figsize=(18, 54))

# Fix the hue order to the sorted class names so that every panel maps the
# same class to the same entry of main_colors, independently of any other cell.
target_order = sorted(train['target'].unique())
for idx, ax in zip(range(50), axes.flatten()):
    feature = 'feature_{}'.format(idx)
    sns.kdeplot(x=feature, hue='target', hue_order=target_order, palette=main_colors,
                data=train, alpha=0.5, linewidth=0.6, fill=True, legend=False, ax=ax)

    ax.set_title('Feature_{}, Unique Values: {}'.format(idx, len(train[feature].unique())),
                 loc='right', weight='bold', fontsize=12)

# 17 x 3 yields 51 panels for 50 features: hide frame, ticks and axis labels
# on all of them, including the unused last panel.
for ax in axes.flatten():
    ax.axis('off')

fig.tight_layout()
plt.show()

In [None]:
# Number of distinct values per column as a two-column frame.
unique_train = pd.DataFrame(train.nunique()).reset_index()
unique_train.columns = ['features', 'count']
# Remove the target row by name instead of relying on its position.
unique_train = unique_train[unique_train['features'] != 'target']
cols = train.columns.drop('target')
train_counts = unique_train['count'].values

f, ax = plt.subplots(1, 1, figsize=(18,9))
sns.barplot(data=unique_train, x=cols, y='count', ax=ax, color=main_colors[0])
plt.xticks(rotation=90)
plt.title('Train Dataset Features Unique Values', fontsize=20)

# Write each count just above its bar.
for i, c in enumerate(train_counts):
    plt.text(x=i-0.3, y=c+1, s=c)
plt.show()

In [None]:
unique_test = pd.DataFrame(test.nunique()).reset_index()
# Name the columns before concatenating so both frames share one schema;
# otherwise the combined frame ends up with four half-empty columns.
unique_test.columns = ['features', 'count']
unique_total = pd.concat([unique_train, unique_test])
cols = test.columns
test_counts = unique_test['count'].values
compare_counts = train_counts - test_counts

# Draw the test bars over the train bars on the same axes for comparison.
f, ax = plt.subplots(1, 1, figsize=(18,9))
sns.barplot(data=unique_train, x=cols, y='count', color=main_colors[0], label='Train', alpha=1, ax=ax)
sns.barplot(data=unique_test, x=cols, y='count', color=main_colors[1], label='Test', alpha=0.9, ax=ax)

plt.xticks(rotation=90)
plt.title('Test Dataset Features Unique Values compared to Train Dataset', fontsize=20)

# Write each test count just above its bar.
for i, c in enumerate(test_counts):
    plt.text(x=i-0.3, y=c+1, s=c)
plt.show()

In [None]:
# Replace the class-name strings in the target column with integer codes 0-3.
class_to_index = {'Class_1':0, 'Class_2':1, 'Class_3':2, 'Class_4':3}
train['target'] = train['target'].map(class_to_index)

In [None]:
# Per-class mean of every feature: one row per feature, one column per class.
feature_cols = train.columns.drop('target')
mean_train = pd.DataFrame(index=feature_cols)

for class_idx in range(4):
    class_rows = train.loc[train['target'] == class_idx, feature_cols]
    # Encoded target k stands for 'Class_{k+1}', so label the column with the
    # original class name rather than the zero-based code.
    mean_train['Class_{}'.format(class_idx + 1)] = class_rows.mean().values

mean_train.T

In [None]:
# One line per class showing the mean of every feature.
fig, ax = plt.subplots(figsize=(18, 9))
sns.lineplot(data=mean_train, palette=main_colors, ax=ax)
ax.set_title('Train Features Mean', fontsize=20)
plt.xticks(rotation=90)
plt.show()

#### Correlation

In [None]:
# Pairwise correlations of all columns; the upper triangle (diagonal included)
# is masked so that each pair is shown once.
corr = train.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(18, 18))
sns.heatmap(corr, mask=mask, cmap=cmap, square=True, linewidths=0.5, robust=True, center=0)
plt.show()

#### Umap

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import umap

In [None]:
reducer = umap.UMAP()
train_u = train.drop('target', axis=1).values
target_u = train['target']
# Standardise the features before computing the embedding.
scaled_train_u = StandardScaler().fit_transform(train_u)
# The labels are passed as the second argument, so they take part in the fit.
embedding = reducer.fit_transform(scaled_train_u, target_u)

f, ax = plt.subplots(1, 1, figsize=(18,18))
# x and y are passed by keyword: newer seaborn releases accept only `data`
# as a positional argument.
sns.scatterplot(x=embedding[:, 0], y=embedding[:, 1], s=5.0,
                hue='target', data=train, palette=main_colors, ax=ax)
ax.legend(['Class_1', 'Class_2', 'Class_3', 'Class_4'], fontsize=15)
plt.title('Umap_fit_transfrom with target data', fontsize=20)
plt.show()

In [None]:
f, ax = plt.subplots(2, 2, figsize=(18, 18))
# One panel per class, filled row by row. x and y are passed by keyword because
# newer seaborn releases accept only `data` positionally; `data` itself is not
# needed since both coordinates are given as arrays.
for class_idx, panel in enumerate(ax.flatten()):
    in_class = (target_u == class_idx).values
    sns.scatterplot(x=embedding[in_class, 0], y=embedding[in_class, 1], s=5.0,
                    color=main_colors[class_idx], ax=panel)
    # Encoded target k corresponds to the original label 'Class_{k+1}'.
    panel.set_title('Class_{}'.format(class_idx + 1), fontsize=15)
plt.show()

In [None]:
# Same embedding pipeline as before, but fitted on the features only.
scaled_train_u2 = StandardScaler().fit_transform(train_u)
embedding2 = reducer.fit_transform(scaled_train_u2)

f, ax = plt.subplots(1, 1, figsize=(18,18))
# x and y are passed by keyword: newer seaborn releases accept only `data`
# as a positional argument.
sns.scatterplot(x=embedding2[:, 0], y=embedding2[:, 1], s=5.0,
                hue='target', data=train, palette=main_colors, ax=ax)
ax.legend(['Class_1', 'Class_2', 'Class_3', 'Class_4'], fontsize=15)
plt.title('Umap_fit_transfrom without target data')
plt.show()

## XGBOOST

#### Training

In [None]:
from xgboost import XGBClassifier, plot_importance
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split, KFold, GroupKFold, StratifiedKFold

import optuna
from optuna import Trial, visualization

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Separate the label column from the feature matrix.
y = train['target']
X = train.drop(columns='target')

In [None]:
# Fit an XGBoost classifier on the full training set with otherwise default settings.
xgb_params = {'tree_method': 'gpu_hist', 'use_label_encoder': False}
model = XGBClassifier(**xgb_params)
model.fit(X, y)

#### Feature Importance

In [None]:
# Importance chart of the fitted model, drawn in the first main colour.
fig, ax = plt.subplots(figsize=(18, 18))
plot_importance(model, ax=ax, color=main_colors[0])
ax.set_title('Feature Importance', fontsize=20)
plt.show()

#### Submission

In [None]:
# Display the predicted class probabilities for the test set.
model.predict_proba(test)

In [None]:
# One probability column per class, plus the ids taken from the sample submission.
class_columns = ['Class_1','Class_2','Class_3','Class_4']
class_probabilities = model.predict_proba(test)
submission_xgb = pd.DataFrame(class_probabilities, columns=class_columns)
submission_xgb['id'] = submission['id']
submission_xgb

In [None]:
# Write the predictions to the working directory, leaving out the DataFrame index.
submission_xgb.to_csv('./submission_xgb.csv', index=False)