Problem statement

The May 2022 edition of the Tabular Playground Series is a binary classification problem that includes a number of different feature interactions. This competition is an opportunity to explore various methods for identifying and exploiting these feature interactions.

Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

Read files

In [None]:
# Directory holding the competition files; defined once so the three reads
# below cannot drift apart and the location is changed in a single place.
DATA_DIR = "/kaggle/input/tabular-playground-series-may-2022"

# Training rows (these include the `target` column used later as the label).
train = pd.read_csv(f"{DATA_DIR}/train.csv")
# Rows to predict on.
test = pd.read_csv(f"{DATA_DIR}/test.csv")
# Template whose `target` column is filled with predictions at the end.
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Display the training frame.
train

In [None]:
# Display the test frame.
test

In [None]:
# Display the sample submission frame.
submission

Analyse

In [None]:
# Print a concise summary of the training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Descriptive statistics for the columns of the training frame.
train.describe()

Analyse target

In [None]:
# The target is a binary label, so show the class balance as a bar chart of
# counts. `sns.distplot` is deprecated in seaborn, and the density curve it
# fits is not meaningful for a two-valued variable.
sns.countplot(x=train['target']);

Define target

In [None]:
# Keep the label column in its own variable; it is used as `y` further down.
target = train.loc[:, 'target']
target

Combine train and test

In [None]:
# Stack the feature columns of train and test so every later preprocessing
# step is applied to both sets identically.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# replacement and, without `ignore_index`, keeps the original row labels just
# as `append` did. Row order (train first, then test) is what the later
# positional split back into X / X_test relies on.
combi = pd.concat([train.drop(columns=['target']), test])
# Drop the row identifier and `f_27`.
# NOTE(review): `f_27` is presumably dropped because it is non-numeric —
# confirm against the output of `train.info()`.
combi = combi.drop(columns=['id', 'f_27'])
combi

Heatmap

In [None]:
# Pairwise correlation matrix of the combined feature columns.
corr = combi.corr()

# Render the matrix as a heatmap with square cells, capping the colour scale
# at 0.8, on an explicitly created 12x9 axes.
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(data=corr, vmax=0.8, square=True, ax=ax);

In [None]:
# Show the correlation matrix through the notebook's rich DataFrame display
# instead of the plain-text `print` representation.
corr

Remove columns that have a high correlation

In [None]:
# Greedy redundancy filter over the upper triangle of the correlation matrix:
# for every column that is still kept, drop each later column whose absolute
# correlation with it reaches the threshold.
CORR_THRESHOLD = 0.80
n_features = corr.shape[0]
columns = np.full((n_features,), True, dtype=bool)
for i in range(n_features):
    if not columns[i]:
        # Column i has already been removed, so it must not knock out further
        # columns (otherwise A~B and B~C would drop C even though B is gone).
        continue
    for j in range(i + 1, n_features):
        # abs(): a strong negative correlation is as redundant as a positive one.
        if abs(corr.iloc[i, j]) >= CORR_THRESHOLD:
            columns[j] = False
# Take the labels from `corr` rather than `combi`: the mask has one entry per
# row of `corr`, so this stays aligned even if the cell is re-run after
# `combi` has already been narrowed.
selected_columns = corr.columns[columns]
combi = combi[selected_columns]
combi

Scale data

In [None]:
# Min-max scaling: shift each column by its minimum and divide by its range,
# computed over the combined train + test rows.
col_min = combi.min()
col_range = combi.max() - col_min
combi = (combi - col_min) / col_range
combi

Define X and y

In [None]:
# Undo the earlier stacking by position: the first len(train) rows of `combi`
# are the training features, the remainder are the test features.
n_train = len(train)
X = combi.iloc[:n_train]
X_test = combi.iloc[n_train:]
# Labels for the training rows.
y = target

Split dataset for training and validation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows for validation; the fixed seed makes the
# split reproducible.
splits = train_test_split(X, y, random_state=42, test_size=0.1)
X_train, X_val, y_train, y_val = splits
# Shapes of every partition, in the order train X, val X, train y, val y, test X.
tuple(part.shape for part in (X_train, X_val, y_train, y_val, X_test))

Select model - Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the logistic-regression classifier with a fixed seed, then fit it on
# the training partition.
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)
# Score of the fitted model on the same data it was trained on.
train_score = model.score(X_train, y_train)
print(train_score)

Predict on validation set

In [None]:
# Predicted labels for the held-out validation rows (reused by the confusion
# matrix below).
y_pred = model.predict(X_val)
# Score of the model on the validation partition.
val_score = model.score(X_val, y_val)
print(val_score)

Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs. predicted labels on the validation partition.
val_confusion = confusion_matrix(y_true=y_val, y_pred=y_pred)
print(val_confusion)

Predict on test set

In [None]:
# Score the test set with the probability of the positive class rather than
# hard class labels.
# NOTE(review): the competition is understood to be scored on ROC AUC, which
# ranks predictions and is degraded by thresholded labels — confirm on the
# competition's evaluation page.
# `predict_proba` columns follow `model.classes_`; column 1 is the second
# (larger) label, i.e. the positive class assuming the target is coded 0/1.
# The previous `astype(int)` and negative-value clamp were no-ops on class
# labels (and would zero out probabilities), so they are removed.
preds = model.predict_proba(X_test)[:, 1]
preds

Submit

In [None]:
# Bracket assignment always targets the column; attribute assignment
# (`submission.target = ...`) only works when the column already exists and
# would otherwise set a plain Python attribute instead of a column.
submission['target'] = preds
submission.to_csv('submission.csv', index=False)
# Read the file back to check what was actually written to disk.
submission = pd.read_csv("submission.csv")
submission