Problem statement

For this competition, you will be predicting a binary target based on a number of feature columns given in the data. The feature columns cat0 - cat18 are categorical, and the feature columns cont0 - cont10 are continuous.

The dataset used for this competition is synthetic, but it is based on a real dataset and was generated using a CTGAN. The original dataset deals with predicting the amount of an insurance claim. Although the features are anonymized, they have properties relating to real-world features.

Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

Load datasets

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Read the training CSV into a DataFrame and display it.
train = pd.read_csv(
    filepath_or_buffer="/kaggle/input/tabular-playground-series-mar-2021/train.csv"
)
train


In [None]:
# Read the test CSV into a DataFrame and display it.
test = pd.read_csv(
    filepath_or_buffer="/kaggle/input/tabular-playground-series-mar-2021/test.csv"
)
test


In [None]:
# Read the sample submission CSV into a DataFrame and display it.
sample = pd.read_csv(
    filepath_or_buffer="/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv"
)
sample

Analyse target

In [None]:
# Number of training rows for each distinct value of the target column.
rows_by_target = train.groupby('target')
target_count = rows_by_target['target'].count()
target_count

In [None]:
# Share of the training rows held by each target value, as a percentage.
n_train_rows = len(train)
percent_target = target_count / n_train_rows * 100
percent_target

In [None]:
# Bar chart of the per-class row counts, with the y axis starting at zero.
class_sizes = train.groupby('target')['target'].count()
class_sizes.plot.bar(ylim=0)
plt.show()

Merge train and test

In [None]:
# Separate the label from the features without mutating `train`, so this cell
# can be re-run: an in-place drop would make `train.target` fail the second time.
target = train.target
train_features = train.drop(columns=['target'])

# Keep the test ids for the submission file, and the number of training rows
# so the combined frame can be split back apart by position later.
test_id = test.id
train_length = len(train_features)

# Stack the training rows on top of the test rows. ignore_index gives the result
# a unique 0..n-1 index instead of repeating each source frame's row labels.
frames = [train_features, test]
combo = pd.concat(frames, ignore_index=True)
combo

Drop id

In [None]:
# Remove the identifier column from the combined frame.
# Rebinding (instead of inplace=True) plus errors='ignore' keeps the cell
# re-runnable: a second execution no longer raises KeyError for the missing 'id'.
combo = combo.drop(columns=['id'], errors='ignore')
combo

Check for missing values

In [None]:
# Count of missing entries in each column of the combined frame.
combo.isna().sum()

Analytics

In [None]:
# The cat* columns are not ordinal-encoded until a later cell, so restrict the
# correlation to numeric columns. Recent pandas versions raise if corr() is
# called on a frame that still contains non-numeric columns.
corrmat = combo.select_dtypes(include='number').corr()

# Draw the heatmap on the axes created here rather than relying on the
# implicit "current axes".
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

In [None]:
# Print the column names, non-null counts and dtypes of the combined frame.
combo.info()

In [None]:
# Summary statistics for the combined frame.
combo.describe()

Ordinal encode

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder

# Names of the categorical feature columns: cat0 ... cat18.
cat_columns = ['cat' + str(i) for i in range(19)]

# OrdinalEncoder learns the categories of each input column independently, so a
# single fit_transform over all categorical columns yields the same codes as
# encoding the columns one at a time.
enc = OrdinalEncoder()
combo[cat_columns] = enc.fit_transform(combo[cat_columns])
combo

Normalise combo

In [None]:
# Min-max scale every column to [0, 1]. The minimum and range are taken over
# the combined frame, i.e. over the train and test rows together.
column_min = combo.min()
column_range = combo.max() - column_min
combo = (combo - column_min) / column_range
combo

Define X and y

In [None]:
# The first `train_length` rows of the combined frame are the training rows;
# everything after them is the test set.
y = target
X = combo.iloc[:train_length]
X_test = combo.iloc[train_length:]

In [None]:
# Display the training labels.
y

In [None]:
# Display the training feature rows.
X

In [None]:
# Display the test feature rows.
X_test

Split X

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows as a validation set (shuffled, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=1,
)

# Shapes of every partition, in the order train X/y, validation X/y, test X.
tuple(part.shape for part in (X_train, y_train, X_val, y_val, X_test))

Select Model

In [None]:
from sklearn.linear_model import LogisticRegression


# Build the classifier, fit it on the training split, then report its score
# on that same split.
model = LogisticRegression(
    C=100,
    class_weight='balanced',
    random_state=1,
    max_iter=10000,
)
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
print(train_score)

In [None]:
# Disabled alternative model: uncommenting the lines below would rebind `model`
# to a HistGradientBoostingClassifier fitted on the same training split.
#from sklearn.experimental import enable_hist_gradient_boosting  
#from sklearn.ensemble import HistGradientBoostingClassifier

#model = HistGradientBoostingClassifier(max_iter=50000, random_state=1).fit(X_train, y_train)
#print(model.score(X_train, y_train))

Predict on validation set

In [None]:
# Class predictions for the validation rows, kept for the confusion matrix.
y_pred = model.predict(X_val)

# Score of the fitted model on the held-out validation split.
val_score = model.score(X_val, y_val)
print(val_score)

In [None]:
from sklearn.metrics import confusion_matrix

# Compare the validation labels with the predictions made for them.
confusion_matrix(y_true=y_val, y_pred=y_pred)

Predict on test set

In [None]:
# Predict a class label for every test row.
# NOTE(review): predict() returns hard labels; if the competition is scored on a
# ranking metric such as ROC AUC, model.predict_proba(X_test)[:, 1] would be the
# appropriate output instead — confirm against the evaluation rules.
predictions = model.predict(X_test)

Prepare submission

In [None]:
# Pair each test id with its prediction and write the result without the index.
output = pd.DataFrame({'id': test_id}).assign(target=predictions)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
# Read the saved submission file back in and display it as a sanity check.
submission = pd.read_csv("submission.csv")
submission