For this competition, you will be predicting a binary target based on 100 feature columns given in the data. All columns are continuous.

The data is synthetically generated by a GAN that was trained on a real-world dataset used to identify spam emails via various extracted features from the email.

Import, load and read

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Load the three competition CSV files into DataFrames:
#   train      - rows that include the `target` column (used further down)
#   test       - rows to predict on
#   submission - sample submission whose `target` column is overwritten later
train = pd.read_csv("/kaggle/input/tabular-playground-series-nov-2021/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-nov-2021/test.csv")
submission = pd.read_csv("/kaggle/input/tabular-playground-series-nov-2021/sample_submission.csv")

In [None]:
# Bare expression: the notebook renders the training frame as the cell output.
train

In [None]:
# Bare expression: the notebook renders the test frame as the cell output.
test

In [None]:
# Bare expression: the notebook renders the sample submission as the cell output.
submission

Analyse target

In [None]:
# Plot the distribution of the target column to check class balance visually.
sns.displot(train.target)

In [None]:
# Count how many rows fall into each distinct target value.
train.target.value_counts()

Drop target and create combined dataframe

In [None]:
# Keep the labels aside before the feature frames are merged.
target = train.target

# Work on a copy so the original `train` frame is left untouched.
train_copy = train.copy()

# Stack the train features (without `target`) on top of the test rows so that
# later preprocessing is applied to both consistently.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent and, like append, keeps each frame's own index.
combi = pd.concat([train_copy.drop('target', axis=1), test])
combi

Check for null values

In [None]:
# Total number of missing cells in the combined frame (per-column sums, then summed).
combi.isnull().sum().sum()

Analyse combi

In [None]:
# Print a concise summary of the combined frame (columns, dtypes, non-null counts).
combi.info()

In [None]:
# Show per-column summary statistics for the combined frame.
combi.describe()

Drop ID because it adds no value to prediction

In [None]:
# Remove the identifier column from the combined frame, modifying it in place.
combi.drop(columns='id', inplace=True)
combi

Normalise

In [None]:
# Min-max scale every column using statistics of the combined (train + test) frame.
col_min = combi.min()
col_range = combi.max() - col_min
combi = (combi - col_min) / col_range
combi

Standardise (alternative to normalisation; currently disabled)

In [None]:
#combi = (combi - combi.mean()) / combi.std()
#combi

Define X, y and X_test

In [None]:
# The train rows were stacked first in `combi`, so split it back by position:
# the first len(train) rows are the training features, the rest are test features.
y = target
X = combi.iloc[: len(train)]
X_test = combi.iloc[len(train) :]

Feature selection

In [None]:
from sklearn.feature_selection import SelectPercentile, chi2

# Keep the top 25% of features as ranked by the chi-squared score against y.
# NOTE(review): the selector is fitted on all of X before the train/validation
# split performed later in this file, so validation rows influence which
# features are kept.
transformer = SelectPercentile(score_func=chi2, percentile=25)
transformer.fit(X, y)

# Apply the same fitted column mask to both the training and the test features.
X = transformer.transform(X)
X_test = transformer.transform(X_test)

X.shape, X_test.shape

Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

Select model

In [None]:
from sklearn.linear_model import LogisticRegressionCV

# Logistic regression that tunes its regularisation strength by 5-fold
# cross-validation over a grid of 50 candidate C values.
model = LogisticRegressionCV(Cs=50, cv=5, random_state=42)
model.fit(X_train, y_train)

# Score on the rows the model was fitted on.
print(model.score(X_train, y_train))

Predict on validation set

In [None]:
# Class predictions for the held-out validation rows.
y_pred = model.predict(X_val)
# Print the model's score on the validation split.
print(model.score(X_val, y_val))

Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Tabulate true validation labels against the model's predicted labels.
val_confusion = confusion_matrix(y_true=y_val, y_pred=y_pred)
print(val_confusion)

Predict on X_test

In [None]:
# Class-label predictions for the test rows.
# NOTE(review): if the competition is scored on predicted probabilities
# (e.g. ROC AUC), model.predict_proba(X_test)[:, 1] may be the better
# output - confirm against the competition metric.
prediction = model.predict(X_test)

Prepare prediction for submission

In [None]:
# Overwrite the sample submission's target column with the model output.
submission["target"] = prediction

# Save without the index column, then reload the file to inspect what was written.
submission.to_csv("submission.csv", index=False)
submission = pd.read_csv("submission.csv")
submission
