# 0. Setting

## 0.1 Calling Basic Libraries

In [None]:
# import basic library
from sklearn.impute import SimpleImputer
from IPython.display import display
import plotly.figure_factory as ff
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.experimental import enable_iterative_imputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from xgboost import XGBRegressor
from xgboost import XGBClassifier
from lightgbm import LGBMRegressor
from lightgbm import LGBMClassifier
from catboost import CatBoostRegressor
from catboost import CatBoostClassifier

## 0.2 Data Setting

In [None]:
# print the full path of every file found under the Kaggle input directory
import os
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# load the competition tables: training data, test data and the sample submission
DATA_DIR = "../input/tabular-playground-series-sep-2021"
train = pd.read_csv(DATA_DIR + "/train.csv")
test = pd.read_csv(DATA_DIR + "/test.csv")
sample = pd.read_csv(DATA_DIR + "/sample_solution.csv")

In [None]:
# information about test and train data
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only appended a stray "None" to the output.
train.info()
test.info()

# 1. EDA

## 1.1 Skimming the Data sets

In [None]:
# first five rows of the training data
train.iloc[:5]

- 120 columns in train data

 > The dataset includes an 'id' column, 118 features and one target variable, 'claim'.

In [None]:
# summary statistics of the training data, one row per column
train.describe().transpose()

- The summary statistics from `describe()` alone do not make it clear what each feature represents.

## 1.2 Checking the Missing Values

In [None]:
# shape of the training data and its total number of missing cells
train_rows, train_cols = train.shape
train_missing = train.isna().sum().sum()
print(" train data")
print(f' Number of rows: {train_rows}\n Number of columns: {train_cols}\n No. of missing values: {train_missing}')

In [None]:
# shape of the test data and its total number of missing cells
test_rows, test_cols = test.shape
test_missing = test.isna().sum().sum()
print(" test data")
print(f' Number of rows: {test_rows}\n Number of columns: {test_cols}\n No. of missing values: {test_missing}')

- The training set has 1,820,782 missing values.
- The testing set has 936,218 missing values.

In [None]:
# missing-value count of every training column, largest first
print("number of misssing values by feature")
train.isna().sum().sort_values(ascending=False)

In [None]:
# Missing values per column, for the train and test data side by side.
def _null_summary(frame, label):
    """Return one row per column of `frame` with its missing-value count and share.

    The result has the columns 'Features', 'null_<label> (count)' and
    'null_<label> (%)' (the percentage rounded to 2 decimals and rendered as
    text such as '1.61%'), sorted by the count in descending order.
    """
    counts = frame.isna().sum()
    percents = np.round(100 * counts / len(frame), 2)
    summary = pd.DataFrame({
        'Features': counts.index,
        f'null_{label} (count)': counts.to_numpy(),
        f'null_{label} (%)': ['{}%'.format(pc) for pc in percents],
    })
    return summary.sort_values(by=f'null_{label} (count)', ascending=False)


DF1 = _null_summary(train, 'train')
DF2 = _null_summary(test, 'test')

# Join on the feature name instead of relying on the two frames sharing the
# same positional index; columns present in only one data set get NaN on the
# other side. The outer merge reorders the rows, so sort again afterwards.
df = (
    DF1.merge(DF2, on='Features', how='outer')
       .sort_values(by='null_train (count)', ascending=False, ignore_index=True)
)
df

- It seems that every feature has approximately the same number of missing values.

In [None]:
# Number of missing feature values in each row, next to that row's target.
df = pd.DataFrame()
df["n_missing"] = train.drop(["id", "claim"], axis=1).isna().sum(axis=1)
df["claim"] = train["claim"].copy()

# One bin per integer count, with edges shared by both classes. Letting each
# hist() call pick its own 10 bins gives every class different, non-integer
# bin edges, so the two overlaid histograms are not directly comparable.
bin_edges = np.arange(df["n_missing"].max() + 2) - 0.5

fig, ax = plt.subplots(figsize=(12,5))
for claim_value, color in ((0, "darkseagreen"), (1, "darkorange")):
    ax.hist(df.loc[df["claim"] == claim_value, "n_missing"],
            bins=bin_edges, edgecolor="black",
            color=color, alpha=0.7, label=f"claim is {claim_value}")
ax.set_title("Missing Values Distribution in Each Target Class", fontsize=20, pad=15)
ax.set_xlabel("Missing Values Per Row", fontsize=14, labelpad=10)
ax.set_ylabel("Number of Rows", fontsize=14, labelpad=10)
ax.legend(fontsize=14)
plt.show()

- The plot shows that, for rows with claim = 0, the number of missing values per row is skewed toward the lowest counts.
- For rows with claim = 1, the number of missing values per row is spread more widely than for claim = 0.

In [None]:
# class balance of the 'claim' target over the full training data
claim_counts = train["claim"].value_counts()
claim_percent = claim_counts.values/(len(train)/100)

fig, ax = plt.subplots(figsize=(6, 6))

bars = ax.bar(claim_counts.index,
              claim_counts.values,
              edgecolor="black",
              width=0.4)
ax.set_title("Claim (target) values distribution", fontsize=20, pad=15)
ax.set_ylabel("Amount of values", fontsize=14, labelpad=15)
ax.set_xlabel("Claim (target) value", fontsize=14, labelpad=10)
ax.set_xticks(claim_counts.index)
ax.tick_params(axis="both", labelsize=14)
# percentage above each bar, absolute count inside it
ax.bar_label(bars, [f"{x:2.2f}%" for x in claim_percent],
                 padding=5, fontsize=15)
ax.bar_label(bars, [f"{x:2d}" for x in claim_counts.values],
                 padding=-30, fontsize=15)
ax.margins(0.2, 0.12)
ax.grid(axis="y")

plt.show();

- Before the rows with NaN values are dropped, 'claim' = 0 and 'claim' = 1 have approximately the same number of rows.

In [None]:
# share of rows without any missing value, and the class counts among those rows
train1 = train[train.isna().sum(axis=1)==0]
print("proportion of no null data : %.2f" %(len(train1)/len(train)*100))
# each label now reports the class it names (the two filters used to be swapped)
print("number of claim 1 in no null data : %d" %(len(train1[train1['claim']==1])))
print("number of claim 0 in no null data : %d" %(len(train1[train1['claim']==0])))

In [None]:
# class balance of the 'claim' target among the rows without missing values
complete_counts = train1["claim"].value_counts()
complete_percent = complete_counts.values/(len(train1)/100)

fig, ax = plt.subplots(figsize=(6, 6))

bars = ax.bar(complete_counts.index,
              complete_counts.values,
              edgecolor="black",
              width=0.4)
ax.set_title("Claim (target) values distribution", fontsize=20, pad=15)
ax.set_ylabel("Amount of values", fontsize=14, labelpad=15)
ax.set_xlabel("Claim (target) value", fontsize=14, labelpad=10)
ax.set_xticks(complete_counts.index)
ax.tick_params(axis="both", labelsize=14)
# percentage above each bar, absolute count inside it
ax.bar_label(bars, [f"{x:2.2f}%" for x in complete_percent],
                 padding=5, fontsize=15)
ax.bar_label(bars, [f"{x:2d}" for x in complete_counts.values],
                 padding=-30, fontsize=15)
ax.margins(0.2, 0.12)
ax.grid(axis="y")

plt.show();

- However, if the rows with NaN values are dropped, the proportions of 'claim' = 0 and 'claim' = 1 become very different.
- The plot shows that most of the missing values are located in rows where 'claim' = 1.
- Thus, the classes will be imbalanced if the rows with NaN values are simply dropped.


## 1.3 Checking the Distribution of Features

In [None]:
# Split the target off the training features. DataFrame.pop removes the column
# from `train` in place, so the guard keeps this cell safe to re-run: without it
# a second execution fails with a KeyError because 'claim' is already gone.
if 'claim' in train.columns:
    target = train.pop('claim')

In [None]:
# leading rows of each table only (presumably a small subset to keep the
# distribution plots below cheap - TODO confirm the intended sample size)
train_ = train.iloc[:9579]
test_ = test.iloc[:4934]

In [None]:
# distribution of Features f1 to f60
# 'id' is dropped by name: it is a row identifier, not a feature, and slicing
# the first 60 columns by position would plot it in place of the 60th feature.
features = [col for col in train.columns if col != 'id'][:60]
ncol = 6
nrow = int(np.ceil(len(features) / ncol))

# squeeze=False keeps `axes` two-dimensional even for a single row
fig, axes = plt.subplots(nrow, ncol, figsize=(24, 30), squeeze=False)
fig.subplots_adjust(top=0.95)
for axis, feature in zip(axes.flat, features):
    # fill= is the current name of the deprecated shade= argument
    sns.kdeplot(train_[feature], fill=True, color='cyan', alpha=0.5, label='train', ax=axis)
    sns.kdeplot(test_[feature], fill=True, color='darkblue', alpha=0.5, label='test', ax=axis)
    axis.set_xlabel(feature, fontsize=9)
    axis.legend()
# hide any grid cells that did not receive a feature
for axis in axes.flat[len(features):]:
    axis.set_visible(False)
fig.suptitle('DistPlot: train & test data', fontsize=20)
plt.show()

In [None]:
# distribution of the remaining features: every column from position 60 onward
# (presumably f60 to f118, given that column 0 is 'id' - verify against train.columns)
features = list(train.columns[60:])
ncol = 6
nrow = int(np.ceil(len(features) / ncol))

# squeeze=False keeps `axes` two-dimensional even for a single row
fig, axes = plt.subplots(nrow, ncol, figsize=(24, 30), squeeze=False)
fig.subplots_adjust(top=0.95)
for axis, feature in zip(axes.flat, features):
    # fill= is the current name of the deprecated shade= argument
    sns.kdeplot(train_[feature], fill=True, color='cyan', alpha=0.5, label='train', ax=axis)
    sns.kdeplot(test_[feature], fill=True, color='darkblue', alpha=0.5, label='test', ax=axis)
    axis.set_xlabel(feature, fontsize=9)
    axis.legend()
# hide the grid cells that did not receive a feature (the grid is a multiple of 6)
for axis in axes.flat[len(features):]:
    axis.set_visible(False)
fig.suptitle('DistPlot: train & test data', fontsize=20)
plt.show()

- Features in both the training and testing sets have similar distributions.
- Thus, the same imputation is expected to work for both the training and testing sets.

## 1.4 Checking the Box-plots

In [None]:
# outliers of train data
# min-max scale every column to [0, 1] so all features fit on a common axis
df_plot = ((train - train.min())/(train.max() - train.min()))
# column positions drawn in each panel; position 0 is skipped
# (presumably the 'id' column - verify against train.columns)
column_ranges = [(1, 30), (30, 60), (60, 90), (90, 120)]
fig, ax = plt.subplots(len(column_ranges), 1, figsize = (25,25))
for axis, (start, stop) in zip(ax, column_ranges):
    sns.boxplot(data = df_plot.iloc[:, start:stop], ax = axis)
# title the figure so it can be told apart from the test-data box-plots
fig.suptitle('Box-plots of min-max scaled columns: train data', fontsize=20)
plt.show()

In [None]:
# outliers of test data
# min-max scale every column to [0, 1] so all features fit on a common axis
df_plot = ((test - test.min())/(test.max() - test.min()))
# column positions drawn in each panel; position 0 is skipped
# (presumably the 'id' column - verify against test.columns)
column_ranges = [(1, 30), (30, 60), (60, 90), (90, 119)]
fig, ax = plt.subplots(len(column_ranges), 1, figsize = (25,25))
for axis, (start, stop) in zip(ax, column_ranges):
    sns.boxplot(data = df_plot.iloc[:, start:stop], ax = axis)
# title the figure so it can be told apart from the train-data box-plots
fig.suptitle('Box-plots of min-max scaled columns: test data', fontsize=20)
plt.show()

- Boxplots show that both training and testing sets are similarly distributed.

In [None]:
# correlation of train
corr = train.corr()
# hide the upper triangle (diagonal included): the matrix is symmetric
mask = np.triu(np.ones_like(corr, dtype = bool))

plt.figure(figsize = (15, 15))
# name the data set in the title so the train and test heatmaps can be told apart
plt.title('Correlation matrix: train data')
sns.heatmap(corr, mask = mask, cmap = 'Spectral_r', linewidths = .5)

plt.show()

In [None]:
# correlation of test
corr = test.corr()
# hide the upper triangle (diagonal included): the matrix is symmetric
mask = np.triu(np.ones_like(corr, dtype = bool))

plt.figure(figsize = (15, 15))
# name the data set in the title so the train and test heatmaps can be told apart
plt.title('Correlation matrix: test data')
sns.heatmap(corr, mask = mask, cmap = 'Spectral_r', linewidths = .5)

plt.show()

- The correlations within the two datasets are also similar.
- Overall, the features in the training and testing sets are very similar.