Problem Statement

The dataset used for this competition is synthetic but based on a real dataset (in this case, the actual Titanic data!) and was generated using a CTGAN. The statistical properties of this dataset are very similar to those of the original Titanic dataset, but there's no way to "cheat" by using public labels for predictions. How well does your model perform on truly private test labels?

Import

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:

import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))


Load and Read

In [None]:
# Load the competition's training set, test set and sample submission file.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-apr-2021/train.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/test.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv',
    )
)

In [None]:
# Preview the training frame loaded from train.csv.
train

In [None]:
# Preview the test frame loaded from test.csv.
test

In [None]:
# Preview the frame loaded from sample_submission.csv.
submission

Plot survivors

In [None]:
# Scatter Age against Fare, one colour per outcome, with marker area
# proportional to the fare paid. Each outcome subset is filtered once and
# reused, and the figure is labelled so it can be read on its own.
fig, ax = plt.subplots(figsize=(25, 7))
# Survivors are drawn first and non-survivors on top, as in the original cell.
for outcome, colour, label in ((1, 'green', 'Survived'), (0, 'red', 'Did not survive')):
    group = train[train['Survived'] == outcome]
    ax.scatter(group['Age'], group['Fare'], c=colour, s=group['Fare'], label=label)
ax.set(title='Age vs Fare by survival', xlabel='Age', ylabel='Fare')
ax.legend();

Analyse survived

In [None]:
# Bar chart of how many passengers fall into each value of `Survived`.
outcome_counts = train.groupby('Survived')['Survived'].count()
outcome_counts.plot(kind='bar', ylim=0)
plt.show()

Filter survivors

In [None]:
# Partition passengers on Survived == 1; `survived` (an alias of `include`)
# is the frame the following analysis cells work on.
is_survivor = train['Survived'].eq(1)
include = train[is_survivor]
exclude = train[~is_survivor]
survived = include
survived

In [None]:
# Number of survivors for each value of Sex.
by_sex = survived.groupby('Sex')
survived_sex = by_sex['Survived'].count()
survived_sex

In [None]:
# Bar chart of survivor counts per Sex.
sex_counts = survived.groupby('Sex')['Survived'].count()
sex_counts.plot(kind='bar', ylim=0)
plt.show()

In [None]:
# Number of survivors for each value of Pclass.
by_pclass = survived.groupby('Pclass')
survived_pclass = by_pclass['Survived'].count()
survived_pclass

In [None]:
# Bar chart of survivor counts per Pclass.
pclass_counts = survived.groupby('Pclass')['Survived'].count()
pclass_counts.plot(kind='bar', ylim=0)
plt.show()

In [None]:
# Violin plot of the Age distribution among survivors.
# sns.axes_style() only *returns* a style dictionary; called bare it has no
# effect. Using it as a context manager around figure/axes creation is what
# actually applies the "dark" style (and only to this figure).
with sns.axes_style("dark"):
    plt.figure(figsize=(10, 6))
    plt.title("Ages Frequency")
    sns.violinplot(y=survived["Age"])
plt.show()

In [None]:
# Median Age among survivors.
survivor_ages = survived['Age']
survived_median_age = survivor_ages.median()
survived_median_age

In [None]:
# Violin plot of the Fare distribution among survivors.
# sns.axes_style() only *returns* a style dictionary; called bare it has no
# effect. Using it as a context manager around figure/axes creation is what
# actually applies the "dark" style (and only to this figure).
with sns.axes_style("dark"):
    plt.figure(figsize=(10, 6))
    plt.title("Fare Frequency")
    sns.violinplot(y=survived["Fare"])
plt.show()

In [None]:
# Median Fare among survivors.
survivor_fares = survived['Fare']
survived_median_fare = survivor_fares.median()
survived_median_fare

In [None]:
# Bar chart of survivor counts per Embarked value.
embarked_counts = survived.groupby('Embarked')['Survived'].count()
embarked_counts.plot(kind='bar', ylim=0)
plt.show()

In [None]:
# Number of survivors for each value of Embarked.
by_embarked = survived.groupby('Embarked')
survived_embark = by_embarked['Survived'].count()
survived_embark

Drop survived

In [None]:
# Set the target column aside, then remove it from the training frame.
# NOTE: this cell rebinds `train`, so running it a second time fails because
# the Survived column is no longer there.
survive = train['Survived']
train = train.drop(columns=['Survived'])
train

Merge train and test

In [None]:
# Stack train on top of test so both get identical preprocessing.
# ignore_index is left at its default, so each frame keeps its own row labels;
# the split back into train/test later is done by position.
combi = pd.concat((train, test), axis=0)
combi

Drop passenger ID, name, ticket, cabin

In [None]:
# Remove the identifier and free-text columns that are not used as features.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
combi = combi.drop(columns=unused_columns)
combi

Check for null values

In [None]:
# Missing-value count per column, before imputation.
combi.isna().sum()

Impute missing values

In [None]:
# Fill gaps in Age, Fare and Embarked with each column's most frequent value
# (the first entry of .mode()).
# The result is assigned back instead of calling
# `combi.<col>.fillna(..., inplace=True)`: that form modifies a Series obtained
# from the frame, which pandas warns about and which does not update `combi`
# when Copy-on-Write is enabled.
impute_columns = ['Age', 'Fare', 'Embarked']
combi = combi.fillna({col: combi[col].mode()[0] for col in impute_columns})

In [None]:
# Missing-value count per column, after imputation.
combi.isna().sum()

Ordinal encode categorical columns

In [None]:
# Encode Sex as 0/1.
# NOTE: .map() yields NaN for values missing from the dict, so re-running this
# cell on the already-encoded column would blank it out.
sex1 = {'male': 0, 'female': 1}
combi['Sex'] = combi['Sex'].map(sex1)

In [None]:
# Encode Embarked as 1/2/3.
# NOTE: .map() yields NaN for values missing from the dict, so re-running this
# cell on the already-encoded column would blank it out.
embark1 = {'C': 1, 'Q': 2, 'S': 3}
combi['Embarked'] = combi['Embarked'].map(embark1)

Convert age and fare to integer

In [None]:
# Cast Age and Fare to int in a single call.
# Presumably both are float columns at this point, in which case the cast
# discards the fractional part — confirm this loss of precision is intended.
combi = combi.astype({'Age': int, 'Fare': int})

Normalise combi

In [None]:
# Min-max scale every column to [0, 1]; the minima and ranges are computed
# over the combined train + test frame.
col_min = combi.min()
col_range = combi.max() - col_min
combi = combi.sub(col_min).div(col_range)
combi

Define X and y

In [None]:
# Split the combined frame back apart by position: the first len(train) rows
# are the training features, the remainder are the test features.
n_train = len(train)
y = survive
X = combi.iloc[:n_train]
X_test = combi.iloc[n_train:]


Split X for training and validation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation, stratified on the target
# so both parts keep the same class balance; seeded for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=1,
    stratify=y,
)
tuple(part.shape for part in (X_train, y_train, X_val, y_val, X_test))

Select model

In [None]:
# The experimental import is only required on scikit-learn < 1.0; on newer
# versions it is unnecessary (and emits a warning) but is kept so the cell
# still runs on older environments.
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingClassifier

# `loss` is deliberately left at its default. The original
# loss='binary_crossentropy' was deprecated in scikit-learn 1.1 and removed in
# 1.3, while its replacement 'log_loss' is rejected by older releases. The
# default resolves to binary cross-entropy for a two-class target on both
# sides of that change, so the fitted model is the same.
model = HistGradientBoostingClassifier(learning_rate=.01, max_iter=50000, random_state=1)
model.fit(X_train, y_train)

# Accuracy on the training split (optimistic; compare with the validation score).
model.score(X_train, y_train)

Predict on validation set

In [None]:
# Class predictions for the held-out validation rows (kept in `y_pred`),
# followed by the model's accuracy on that split.
y_pred = model.predict(X_val)
val_accuracy = model.score(X_val, y_val)
val_accuracy

Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true validation labels, columns the predicted labels.
val_confusion = confusion_matrix(y_true=y_val, y_pred=y_pred)
val_confusion

Predict on test set

In [None]:
# Class predictions for the competition test rows.
test_features = X_test
prediction = model.predict(test_features)
prediction

Prepare submission

In [None]:
# Overwrite the sample submission's Survived column with the model's predictions.
submission['Survived'] = prediction
submission

In [None]:
# Write the submission without the index column, then read the file back to
# check what was actually saved.
output_path = 'submission.csv'
submission.to_csv(output_path, index=False)
submission = pd.read_csv(output_path)
submission