Problem statement

The dataset used for this competition is synthetic, but it is based on a real dataset and was generated using a CTGAN. The original dataset deals with predicting whether a claim will be made on an insurance policy. Although the features are anonymized, they have properties relating to real-world features. 

Import

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

List available input files

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

Read

In [None]:
# Read the three competition files in a fixed order: train, test, sample submission
long_train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-sep-2021/train.csv",
        "/kaggle/input/tabular-playground-series-sep-2021/test.csv",
        "/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv",
    )
)

In [None]:
# Work on the first 500,000 training rows only.
# .copy() gives `train` its own data: it is modified in place further down,
# and mutating a plain slice of `long_train` triggers SettingWithCopyWarning.
train = long_train.iloc[:500000].copy()

In [None]:
# Display the test frame read from test.csv
test

In [None]:
# Display the sample submission frame read from sample_solution.csv
submission

Analyse claim

In [None]:
# Number of training rows per value of the target column
claim_counts = train['claim'].value_counts()
print(claim_counts)

In [None]:
# Plot the distribution of the target column
claim_values = train['claim']
sns.displot(claim_values)

Drop claim from train

In [None]:
# Split the target column off the feature frame.
# The guard makes the cell safe to re-run: once 'claim' has been removed, a
# second execution would otherwise raise KeyError. Rebinding `train` (instead
# of an in-place drop) also avoids mutating a slice of the original frame.
if 'claim' in train.columns:
    target = train['claim']
    train = train.drop(columns=['claim'])

Combine train and test

In [None]:
# Stack train on top of test so both receive identical preprocessing.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement, and ignore_index=True yields the same fresh 0..n-1 row index
# that reset_index() followed by dropping the 'index' column produced.
combi = pd.concat([train, test], ignore_index=True)
combi

Drop id from combi

In [None]:
# Remove the row identifier column from the combined frame
combi = combi.drop(columns=['id'])
combi

In [None]:
# Summary statistics for every numeric column of the combined frame
combi.describe()

Check missing values

In [None]:
# Total number of missing cells across the whole combined frame
combi.isna().sum().sum()

Impute missing values

In [None]:
# Fill missing values column by column:
#   * floating-point columns -> column median
#   * every other column     -> most frequent value (first mode)
# fillna with np.nan semantics replaces the former replace(np.NaN, ...) calls:
# the np.NaN alias no longer exists in NumPy 2.0. The dtype helper also
# recognises float32 columns, which the string comparison == 'float' missed.
for col in combi.columns:
    column = combi[col]
    if not column.isna().any():
        continue  # nothing to impute in this column
    if pd.api.types.is_float_dtype(column):
        fill_value = column.median()
    else:
        modes = column.mode()
        if modes.empty:
            continue  # column is entirely missing, so there is no mode to use
        fill_value = modes.iloc[0]
    combi[col] = column.fillna(fill_value)
combi

In [None]:
# Re-count missing cells after imputation
combi.isna().sum().sum()

Normalise

In [None]:
# Min-max scale every column into [0, 1].
col_min = combi.min()
col_range = combi.max() - col_min
# A constant column has a range of 0, and 0 / 0 would fill it with NaN.
# Dividing by 1 instead leaves such a column at 0 after the shift.
col_range = col_range.replace(0, 1)
combi = (combi - col_min) / col_range
combi

Define X and y

In [None]:
# The first len(train) rows of combi are the training rows; the rest are test rows
n_train_rows = len(train)

y = target
X = combi.iloc[:n_train_rows]
X_test = combi.iloc[n_train_rows:]

SelectKBest

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 65 features with the highest chi-squared score against the target.
# The selector is fitted on the training rows only, then applied to both sets.
skb = SelectKBest(score_func=chi2, k=65)
skb.fit(X, y)

X = skb.transform(X)
X_test = skb.transform(X_test)
X.shape, y.shape, X_test.shape

Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation, stratified on the target
split_parts = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1,
    stratify=y,
)
X_train, X_val, y_train, y_val = split_parts
X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape

Select model

In [None]:
# HistGradientBoostingClassifier is a regular public estimator from
# scikit-learn 1.0 onwards, where the experimental opt-in import only remains
# as a deprecated shim that emits a warning. Import it directly and fall back
# to the opt-in only on older releases that still require it.
try:
    from sklearn.ensemble import HistGradientBoostingClassifier
except ImportError:
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
    from sklearn.ensemble import HistGradientBoostingClassifier

model = HistGradientBoostingClassifier(max_iter=25000, random_state=1)
model.fit(X_train, y_train)

# Mean accuracy on the same rows the model was fitted on
print(model.score(X_train, y_train))

Predict on validation set

In [None]:
# Mean accuracy on the held-out validation rows, plus the predicted labels
# that the following cells inspect
val_accuracy = model.score(X_val, y_val)
y_pred = model.predict(X_val)
print(val_accuracy)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the actual classes, columns the predicted classes
val_confusion = confusion_matrix(y_val, y_pred)
print(val_confusion)

In [None]:
# Side-by-side view of actual and predicted labels for the validation rows
comparison_columns = {
    'Actual': y_val,
    'Predicted': y_pred,
}
df = pd.DataFrame(comparison_columns)
df

Predict on test set

In [None]:
# Submit the predicted probability of the positive class (claim == 1) rather
# than hard 0/1 labels: a ranking metric such as ROC AUC needs graded scores.
# NOTE(review): assumes the competition is scored on ROC AUC and that claim is
# encoded as 0/1 — confirm against the competition's evaluation page.
positive_col = list(model.classes_).index(1)
pred = model.predict_proba(X_test)[:, positive_col]

Prepare submission

In [None]:
# Overwrite the placeholder claim column with the test predictions
submission['claim'] = pred

# Write the file, then read it back to display exactly what was saved
submission.to_csv('submission.csv', index=False)
submission = pd.read_csv("submission.csv")
submission