In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.cluster import MiniBatchKMeans


# Pandas display configuration: show more rows/columns, never truncate cell
# text, and render floats with five decimal places
display_options = {
    'display.max_rows': 150,
    'display.max_columns': 500,
    'display.max_colwidth': None,
    'display.float_format': '{:.5f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Data import**

In [None]:
# Directory that holds the competition CSV files
input_dir = "/kaggle/input/tabular-playground-series-sep-2021"

# Load both splits; add nrows=... to the train read for a quick experiment on a subsample
train = pd.read_csv(input_dir + "/train.csv", low_memory=False)
test = pd.read_csv(input_dir + "/test.csv", low_memory=False)

# Summary of the train frame, with memory usage measured in "deep" mode
train.info(memory_usage="deep")

In [None]:
# Summary of the test frame, with memory usage measured in "deep" mode
test.info(memory_usage="deep")

# **EDA**

In [None]:
# Palette shared by the plots below (note: "sandybrown" is listed twice)
colors = [
    "lightcoral",
    "sandybrown",
    "darkorange",
    "mediumseagreen",
    "lightseagreen",
    "cornflowerblue",
    "mediumpurple",
    "palevioletred",
    "lightskyblue",
    "sandybrown",
    "yellowgreen",
    "indianred",
    "lightsteelblue",
    "mediumorchid",
    "deepskyblue",
]

In [None]:
# Preview the first rows of the train frame
train.head()

In [None]:
# All column names of the train frame as an array
train.columns.values

In [None]:
# Row counts of the two splits, shown as shares of one pie
split_sizes = [len(train), len(test)]
split_labels = ["Train dataset", "Test dataset"]

fig, ax = plt.subplots(figsize=(5, 5))
fig.set_facecolor('white')
pie = ax.pie(
    split_sizes,
    labels=split_labels,
    colors=["salmon", "teal"],
    textprops={"fontsize": 15},
    autopct='%1.1f%%',
)
# Equal axis scaling keeps the pie circular
ax.axis("equal")
ax.set_title("Dataset length comparison", fontsize=18)
plt.show();

In [None]:
# Descriptive statistics, transposed so that each column of the frame becomes a row
train.describe().T

In [None]:
# Class frequencies of the target, computed once and reused for bars, ticks and labels
claim_counts = train["claim"].value_counts()
claim_percent = claim_counts.values / (len(train) / 100)

# Two label sets per bar: percentage above the bar, absolute count inside it
percent_labels = [f"{x:2.2f}%" for x in claim_percent]
count_labels = [f"{x:2d}" for x in claim_counts.values]

fig, ax = plt.subplots(figsize=(6, 6))

bars = ax.bar(
    claim_counts.index,
    claim_counts.values,
    color=colors,
    edgecolor="black",
    width=0.4,
)
ax.set_title("Claim (target) values distribution", fontsize=20, pad=15)
ax.set_ylabel("Amount of values", fontsize=14, labelpad=15)
ax.set_xlabel("Claim (target) value", fontsize=14, labelpad=10)
ax.set_xticks(claim_counts.index)
ax.tick_params(axis="both", labelsize=14)
ax.bar_label(bars, percent_labels, padding=5, fontsize=15)
ax.bar_label(bars, count_labels, padding=-30, fontsize=15)
ax.margins(0.2, 0.12)
ax.grid(axis="y")

plt.show();

The target value classes are balanced, which is good.

In [None]:
# Missing values per column of the train dataset, computed once
# (each isna().sum() call is a full scan of the frame)
train_missing = train.isna().sum()

fig, ax = plt.subplots(figsize=(16, 6))

bars = ax.bar(train_missing.index,
              train_missing.values,
              color="lightskyblue",
              edgecolor="black",
              width=0.7)
ax.set_title("Missing feature values distribution in the train dataset", fontsize=20, pad=15)
ax.set_ylabel("Missing values", fontsize=14, labelpad=15)
ax.set_xlabel("Feature", fontsize=14, labelpad=10)
# Put a tick on every second column only, to keep the axis readable.
# Only real column names are passed, so no empty placeholder strings are
# handed to the categorical x axis.
ax.set_xticks(list(train_missing.index[::2]))
ax.tick_params(axis="x", rotation=90, labelsize=8)
ax.margins(0.005, 0.12)
ax.grid(axis="y")

plt.show();

In [None]:
# Missing values per column of the test dataset, computed once
# (each isna().sum() call is a full scan of the frame)
test_missing = test.isna().sum()

fig, ax = plt.subplots(figsize=(16, 6))

bars = ax.bar(test_missing.index,
              test_missing.values,
              color="lightsteelblue",
              edgecolor="black",
              width=0.7)
ax.set_title("Missing feature values distribution in the test dataset", fontsize=20, pad=15)
ax.set_ylabel("Missing values", fontsize=14, labelpad=15)
ax.set_xlabel("Feature", fontsize=14, labelpad=10)
# Put a tick on every second column only, to keep the axis readable.
# Only real column names are passed, so no empty placeholder strings are
# handed to the categorical x axis.
ax.set_xticks(list(test_missing.index[::2]))
ax.tick_params(axis="x", rotation=90, labelsize=8)
ax.margins(0.005, 0.12)
ax.grid(axis="y")

plt.show();

As you can see, both the train and test datasets have missing values in every feature; only "id" and "claim" are complete. We need to handle these missing values carefully.

Let's check the feature value distributions in both datasets.

In [None]:
# Combined feature frame (train + test). It is used to give the two
# histograms of a feature one common value range.
df = pd.concat([train.drop(["id", "claim"], axis=1), test.drop("id", axis=1)], axis=0)
columns = df.columns.values

# Per-feature range, computed once up front instead of four times per subplot
feature_min = df.min()
feature_max = df.max()

cols = 4
# Ceiling division: exactly as many rows as needed (at least one), so no
# spare empty row is created when len(columns) is a multiple of cols
rows = max(1, -(-len(columns) // cols))

# squeeze=False keeps axs two-dimensional even when the grid has a single row
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16,130),
                        sharex=False, squeeze=False)

plt.subplots_adjust(hspace = 0.3)

# axs.flat walks the grid row by row, so panel i belongs to columns[i]
for i, panel in enumerate(axs.flat):
    if i >= len(columns):
        # Unused panels at the end of the last row are hidden
        panel.set_visible(False)
        continue

    col = columns[i]
    value_range = (feature_min[col], feature_max[col])
    panel.hist(train[col].values,
               range=value_range,
               bins=40,
               color="deepskyblue",
               edgecolor="black",
               alpha=0.7,
               label="Train Dataset")
    panel.hist(test[col].values,
               range=value_range,
               bins=40,
               color="palevioletred",
               edgecolor="black",
               alpha=0.7,
               label="Test Dataset")
    panel.set_title(col, fontsize=12, pad=5)
    # Pin the current tick positions before relabelling them in thousands
    panel.set_yticks(panel.get_yticks())
    panel.set_yticklabels([str(int(tick/1000))+"k" for tick in panel.get_yticks()])
    panel.tick_params(axis="y", labelsize=10)
    panel.tick_params(axis="x", labelsize=10)
    panel.grid(axis="y")
    panel.legend(fontsize=13)

#plt.suptitle("Feature values distribution in both datasets", y=0.99)
plt.show();

As you can see, the feature distributions in the train and test datasets are very similar. So the target distribution of the test predictions should probably be close to the one in the train dataset.

In [None]:
# The five features with the fewest distinct values (id and target excluded)
print("Features with the least amount of unique values:")
train.drop(["id", "claim"], axis=1).nunique().sort_values().head(5)

There are no categorical features in the dataset.

Let's look at feature correlation.

In [None]:
# Correlation matrix of every column except "id", rounded to 5 decimals
df = train.drop("id", axis=1).corr().round(5)

# Boolean mask over the diagonal and everything above it: that part of the
# matrix duplicates the lower triangle, so it is hidden in the plot
mask = np.triu(np.ones_like(df, dtype=bool))

# Draw the heatmap
plt.figure(figsize=(16,16))
heatmap_kwargs = dict(annot=False, mask=mask, cmap="RdBu",
                      annot_kws={"weight": "bold", "fontsize":13})
ax = sns.heatmap(df, **heatmap_kwargs)
ax.set_title("Feature correlation heatmap", fontsize=17)

# Tick label styling: vertical labels on x, horizontal labels on y
plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
         rotation_mode="anchor", weight="normal")
plt.setp(ax.get_yticklabels(), rotation=0, ha="right",
         rotation_mode="anchor", weight="normal")
plt.show();

The linear correlation between the features is very weak. Some features have a particularly low correlation with the target value, even compared with the other features:

In [None]:
# Rows of the correlation matrix whose correlation with the target lies strictly inside (-0.001, 0.001)
df.loc[df["claim"].abs() < 0.001, "claim"]

It is probably worth trying to drop them and checking whether that improves the result.

# **Data preprocessing**

In [None]:
# Feature columns are the ones whose name begins with "f"
features = [name for name in train.columns if name.startswith("f")]

The idea of adding a new feature below is taken from [this notebook](https://www.kaggle.com/hiro5299834/tps-sep-2021-single-lgbm) by [BIZEN](https://www.kaggle.com/hiro5299834).

In [None]:
# New feature: how many of the feature columns are missing in each row
for frame in (train, test):
    frame['n_missing'] = frame[features].isna().sum(axis="columns")

In [None]:
# Replace missing values, one feature at a time, with that feature's median.
# The median is fitted on the train column and then applied to the test column.
imputer = SimpleImputer(strategy="median")
for col in features:
    train_column = train[col].to_numpy().reshape(-1, 1)
    test_column = test[col].to_numpy().reshape(-1, 1)
    train[col] = imputer.fit_transform(train_column).ravel()
    test[col] = imputer.transform(test_column).ravel()

In [None]:
# Standardizing each feature to zero mean and unit variance (StandardScaler
# does not map values to a [0,1] range). The statistics are fitted on the
# train column and then applied to the matching test column.
scaler = StandardScaler()
for col in features:
    # ravel() hands pandas a 1-D array for the column instead of an (n, 1) matrix
    train[col] = scaler.fit_transform(np.array(train[col]).reshape(-1,1)).ravel()
    test[col] = scaler.transform(np.array(test[col]).reshape(-1,1)).ravel()

In [None]:
# Target column, and the model inputs with the identifier (and target) removed
y = train["claim"]
X = train.drop(columns=["id", "claim"])
X_test = test.drop(columns="id")

# **Model training**

In [None]:
# Hyperparameters passed as keyword arguments to the XGBoost classifier
xgb_params = dict(
    objective='binary:logistic',
    use_label_encoder=False,
    n_estimators=2600,
    learning_rate=0.04,
    subsample=0.66,
    colsample_bytree=0.1,
    max_depth=8,
    booster='gbtree',
    gamma=5.5,
    reg_alpha=81.8,
    reg_lambda=72.0,
    random_state=42,
    tree_method='gpu_hist',
    n_jobs=4,
)

In [None]:
%%time
# Training a model on a full dataset
model = XGBClassifier(**xgb_params)
model.fit(X, y,
          verbose=False)
# Making probability of class "1" predictions
preds = model.predict_proba(X_test)[:, 1]

# **Feature importances**

In [None]:
# Importance of each feature as a share of the total, most important first
importance_share = model.feature_importances_ / model.feature_importances_.sum()
df = (
    pd.DataFrame({"Feature": X.columns, "Importance": importance_share})
    .sort_values("Importance", axis=0, ascending=False)
)

In [None]:
# One horizontal bar per feature, in the order of the importance table
bar_positions = np.arange(len(df["Feature"]))
bar_thickness = 0.4

fig, ax = plt.subplots(figsize=(16, 30))
bars1 = ax.barh(bar_positions, df["Importance"], height=bar_thickness,
                color="mediumorchid", edgecolor="black")

ax.set_title("Feature importances", fontsize=30, pad=15)
ax.set_ylabel("Feature names", fontsize=20, labelpad=15)
ax.set_xlabel("Feature importance", fontsize=20, labelpad=15)
ax.set_yticks(bar_positions)
ax.set_yticklabels(df["Feature"], fontsize=15)
ax.tick_params(axis="x", labelsize=15)
ax.grid(axis="x")

# Second x axis along the top edge, labelled like the bottom one
ax2 = ax.secondary_xaxis('top')
ax2.set_xlabel("Feature importance", fontsize=20, labelpad=15)
ax2.tick_params(axis="x", labelsize=15)

ax.margins(0.04, 0.01)
# Flip the y axis so the first row of the table is drawn at the top
ax.invert_yaxis()

# **Predictions submission**

In [None]:
# Submission table: each test id paired with its predicted class "1" probability
predictions = pd.DataFrame({"id": test["id"], "claim": preds})

predictions.to_csv('submission.csv', index=False, header=predictions.columns)
predictions.head()