# Introduction
Hello!

In this kernel you will find my approach to the "Tabular Playground Series - Apr 2021" competition: a neural network, combined with gradient-boosting models in a voting ensemble.

# Table of contents:

1. Meeting our data

2. Visualization and data analysis

3. Data cleaning

4. Feature engineering and encoding

5. Creating and evaluating a model

    5.1 Neural network

    5.2 Other models

    5.3 Voting ensemble

# 1. Meeting our data

In [None]:
import numpy as np
import pandas as pd

train = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/train.csv', index_col = 'PassengerId')
test = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/test.csv', index_col = 'PassengerId')

In [None]:
train.shape

In [None]:
test.shape

In [None]:
train.tail()

In [None]:
test.head()

In [None]:
train.isna().sum()

In [None]:
test.isna().sum()

In [None]:
train.dtypes.unique()

In [None]:
test.dtypes.unique()

In [None]:
train.select_dtypes(include = ['object']).describe()

In [None]:
train.drop('Survived', axis = 1).select_dtypes(exclude = ['object']).describe()

In [None]:
target = train.Survived.copy()
target

In [None]:
target.isna().any()

In [None]:
target.loc[target == 1].size / target.size

In [None]:
target.describe()

In [None]:
train.drop('Survived', axis = 1).columns.equals(test.columns)

# 2. Visualization and data analysis

In [None]:
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

sns.set_style('whitegrid')

In [None]:
plt.figure(figsize = (16, 6))
sns.countplot(x = train.Survived, palette = 'Purples_r')

In [None]:
def plot_grid(data, fig_size, grid_size, plot_type, target = ''):
    """
    Draw one subplot per column of ``data`` on a single figure.

    Parameters
    ----------
    data : DataFrame
        Source of the plotted columns.
    fig_size : tuple
        Figure size, passed to ``plt.figure(figsize = ...)``.
    grid_size : tuple
        ``(rows, columns)`` of the subplot grid.
    plot_type : str
        'histplot'  - histogram with KDE for every non-object column;
        'boxplot'   - box plot for every non-object column;
        'countplot' - count plot for every column except ``target``,
                      split by the values of ``target``.
    target : str
        Name of the column of ``data`` used as hue. Only read (and then
        required) when ``plot_type`` is 'countplot'.

    Raises
    ------
    ValueError
        If ``plot_type`` is not supported, if 'countplot' is requested and
        ``target`` is not a column of ``data``, or if the grid has fewer
        cells than there are columns to plot.
    """
    supported_types = ('histplot', 'boxplot', 'countplot')
    # Validate everything before a figure is created, so a bad call fails
    # with a clear message instead of leaving a blank figure behind.
    if plot_type not in supported_types:
        raise ValueError(f"plot_type must be one of {supported_types}, got {plot_type!r}")

    hue = None
    if plot_type == 'countplot':
        if target not in data.columns:
            raise ValueError(f"'countplot' needs target to be a column of data, got {target!r}")
        hue = data[target]
        columns = data.drop(target, axis = 1).columns
    else:
        columns = data.select_dtypes(exclude = 'object').columns

    n_cells = grid_size[0] * grid_size[1]
    if len(columns) > n_cells:
        raise ValueError(f"grid_size {grid_size} has {n_cells} cells but {len(columns)} columns need plotting")

    fig = plt.figure(figsize = fig_size)
    for i, column_name in enumerate(columns):
        fig.add_subplot(grid_size[0], grid_size[1], i + 1)
        if plot_type == 'histplot':
            sns.histplot(data[column_name], kde = True, color = 'blueviolet', stat = 'count')
        elif plot_type == 'boxplot':
            sns.boxplot(x = data[column_name], color = 'blueviolet')
        else:
            plot = sns.countplot(x = data[column_name], hue = hue, palette = 'Purples_r')
            plot.legend(loc = 'upper right', title = hue.name)
    plt.tight_layout()

In [None]:
plot_grid(train.drop('Survived', axis = 1), (16, 6), (2,3), 'histplot')

In [None]:
pd.pivot_table(train, index = 'Survived', values = ['Age', 'SibSp', 'Parch', 'Fare', 'Pclass'], aggfunc = 'mean')

In [None]:
plot_grid(train.select_dtypes(exclude = 'object').drop(['Fare', 'Age'], axis = 1), (16, 6), (1, 3), 'countplot', 'Survived')

In [None]:
print(f"{pd.pivot_table(train, index = 'Survived', columns = 'Pclass', values = 'Name', aggfunc ='count')} \n\n" +
      f"{pd.pivot_table(train, index = 'Survived', columns = 'SibSp', values = 'Name', aggfunc ='count')} \n\n" +
      f"{pd.pivot_table(train, index = 'Survived', columns = 'Parch', values = 'Name', aggfunc ='count')}")

In [None]:
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this cell working on pandas >= 2.0, where DataFrame.corr() no longer
# drops string columns silently and raises instead. The matrix is computed
# once and reused for both the heatmap and its upper-triangle mask.
train_corr = train.select_dtypes(include = 'number').corr()

plt.figure(figsize = (16, 6))
sns.heatmap(train_corr,
            annot = True,
            fmt = '.2f',
            square = True,
            cmap = "Purples_r",
            mask = np.triu(train_corr))

In [None]:
plot_grid(train.drop('Survived', axis = 1), (16, 6), (2,3), 'boxplot')

In [None]:
plot_grid(pd.concat([train.select_dtypes(include = 'object').drop(['Name', 'Ticket', 'Cabin'], axis = 1), target], axis = 1), (16, 6), (2,1), 'countplot', 'Survived')

In [None]:
print(f"{pd.pivot_table(train, index = 'Survived', columns = 'Sex', values = 'Name', aggfunc ='count')} \n\n" +
      f"{pd.pivot_table(train, index = 'Survived', columns = 'Embarked', values = 'Name', aggfunc ='count')}")

In [None]:
train.select_dtypes(include = 'object').nunique().sort_values(ascending = False)

# 3. Data cleaning

In [None]:
# Stack train (without the target) on top of test so missing values are
# summarised over all rows at once.
train_test = pd.concat([train.drop('Survived', axis = 1), test], keys = ['train', 'test'], axis = 0)

# Absolute count and percentage of NaNs per column.
na_counts = train_test.isna().sum()
na_percent = (na_counts / train_test.shape[0]) * 100
missing_values = pd.concat([na_counts, na_percent], axis = 1,
                           keys = ['Values missing', 'Percent of missing'])

# Show only the columns that actually miss something, worst first.
has_missing = missing_values['Percent of missing'] > 0
missing_values.loc[has_missing].sort_values(by = 'Percent of missing', ascending = False).style.background_gradient('Purples')

In [None]:
train_cleaning = train.drop('Survived', axis = 1).copy()
test_cleaning = test.copy()

# Results are assigned back instead of calling fillna(inplace = True) on a
# selected column: the in-place form writes to an intermediate object and
# leaves the frame unchanged once pandas Copy-on-Write is active.

# Missing cabins and tickets become the explicit category 'none'.
for col in ['Cabin', 'Ticket']:
    train_cleaning[col] = train_cleaning[col].fillna('none')
    test_cleaning[col] = test_cleaning[col].fillna('none')

# Age: the median of the training set fills both train and test.
age_median = train_cleaning['Age'].median()
train_cleaning['Age'] = train_cleaning['Age'].fillna(age_median)
test_cleaning['Age'] = test_cleaning['Age'].fillna(age_median)

# Embarked / Fare: filled per passenger class with the class mode / median.
# transform() keeps the original PassengerId index; groupby.apply() would
# prepend the group key to the index on pandas >= 2.0 and break the assignment.
train_cleaning['Embarked'] = train_cleaning.groupby('Pclass').Embarked.transform(lambda x: x.fillna(x.mode()[0]))
train_cleaning['Fare'] = train_cleaning.groupby('Pclass').Fare.transform(lambda x: x.fillna(x.median()))

# Test rows are filled with statistics of the same class taken from train only.
for i in train.Pclass.unique():
    test_in_class = test.Pclass == i
    train_in_class = train.loc[train.Pclass == i]
    test_cleaning.loc[test_in_class, 'Embarked'] = test_cleaning.loc[test_in_class, 'Embarked'].fillna(train_in_class.Embarked.mode()[0])
    test_cleaning.loc[test_in_class, 'Fare'] = test_cleaning.loc[test_in_class, 'Fare'].fillna(train_in_class.Fare.median())

In [None]:
train_cleaning.isnull().sum().max() + test_cleaning.isnull().sum().max()

# 4. Feature engineering and encoding

In [None]:
train_test_cleaning = pd.concat([train_cleaning, test_cleaning], keys = ['train', 'test'], axis = 0)
train_test_cleaning

In [None]:
# CabinLetter: first character of the last whitespace-separated token of Cabin,
# stripped and lower-cased. Rows holding the 'none' placeholder get NaN here.
train_test_cleaning['CabinLetter'] = train_test_cleaning.Cabin.str.split().apply(lambda x: x[-1][0].strip().lower() if x[0] != 'none' else np.nan)

In [None]:
train_test_cleaning.xs('train').groupby('Pclass').CabinLetter.apply(lambda x: x.value_counts().index[0])

In [None]:
train_cleaning_new = train_test_cleaning.xs('train').copy()
test_cleaning_new = train_test_cleaning.xs('test').copy()

# Fill missing cabin letters with the most frequent letter of the passenger's
# class. transform() returns a result aligned to the original index;
# groupby.apply() would add the group key as an extra index level on
# pandas >= 2.0 and the assignment below would fail.
train_cleaning_new['CabinLetter'] = train_cleaning_new.groupby('Pclass')['CabinLetter'].transform(lambda x: x.fillna(x.mode()[0]))

# Test rows receive the per-class mode computed on the training rows.
for i in train.Pclass.unique():
    test_in_class = test_cleaning_new.Pclass == i
    train_class_mode = train_cleaning_new.loc[train_cleaning_new.Pclass == i].CabinLetter.mode()[0]
    test_cleaning_new.loc[test_in_class, 'CabinLetter'] = test_cleaning_new.loc[test_in_class, 'CabinLetter'].fillna(train_class_mode)

train_test_cleaning = pd.concat([train_cleaning_new, test_cleaning_new], keys = ['train', 'test'], axis = 0)

In [None]:
def _cabin_number(cabin):
    """Return the integer after the first character of a cabin code, 0 for 'none'."""
    if cabin == 'none':
        return 0
    return int(cabin[1:])


def _ticket_number(ticket):
    """Return the numeric part of a ticket: the whole ticket or its last token, else 0."""
    if ticket.isnumeric():
        return int(ticket)
    if ticket == 'none':
        return 0
    last_token = ticket.split(' ')[-1]
    return int(last_token) if last_token.isnumeric() else 0


def _ticket_letters(ticket):
    """Return the ticket prefix (all tokens but the last) without '.', '/', lower-cased."""
    prefix_tokens = ticket.split(' ')[:-1]
    if len(prefix_tokens) > 0:
        return ''.join(prefix_tokens).replace('.', '').replace('/', '').lower()
    return 'none'


def _family_size_group(members):
    """Bucket a head count (passenger included) into 'no family' / 'medium' / 'large'."""
    if members == 1:
        return 'no family'
    if members == 2 or members == 3:
        return 'medium'
    return 'large'


train_test_cleaning['CabinNumbers'] = train_test_cleaning.Cabin.apply(_cabin_number)

train_test_cleaning['TicketNumbers'] = train_test_cleaning.Ticket.apply(_ticket_number)
train_test_cleaning['TicketLetters'] = train_test_cleaning.Ticket.apply(_ticket_letters)
train_test_cleaning['TicketIsNumeric'] = train_test_cleaning.Ticket.apply(lambda ticket: int(ticket.isnumeric()))

# Siblings/spouses + parents/children + the passenger, then bucketed.
family_members = train_test_cleaning.SibSp + train_test_cleaning.Parch + 1
train_test_cleaning['FamilySize'] = family_members.apply(_family_size_group)

# Disabled experiments, kept for reference:
# train_test_cleaning['AgeGroup'] = train_test_cleaning['Age'].apply(lambda x: 'infant' if (x < 1) 
#                                                                    else 'child' if (x >= 1 and x <= 11)                                                                    
#                                                                    else 'teen' if (x >= 12 and x <= 17)
#                                                                    else 'adult' if (x >= 18 and x <= 64)
#                                                                    else 'adult+')

# train_test_cleaning['Surname'] = train_test_cleaning['Name'].apply(lambda x: x.split(',')[0].lower())
train_test_cleaning['Embarked'] = train_test_cleaning['Embarked'].str.lower()

In [None]:
train_test_cleaning

In [None]:
train_cleaning_target_cleaned = pd.concat([train_test_cleaning.xs('train'), target], axis = 1)
train_cleaning_target_cleaned

In [None]:
print(f"{pd.pivot_table(train_cleaning_target_cleaned, index = 'Survived', columns = 'CabinLetter', values = 'Name', aggfunc ='count')} \n\n" +
      f"{pd.pivot_table(train_cleaning_target_cleaned, index = 'Survived', values = 'TicketNumbers', aggfunc = (lambda x: x.mode()[0]))} \n\n" +
      f"{pd.pivot_table(train_cleaning_target_cleaned, index = 'Survived', columns = 'TicketIsNumeric', values = 'Name', aggfunc ='count')} \n\n" +
#       f"{pd.pivot_table(train_cleaning_target_cleaned, index = 'Survived', columns = 'AgeGroup', values = 'Name', aggfunc ='count')} \n\n" +
      f"{pd.pivot_table(train_cleaning_target_cleaned, index = 'Survived', columns = 'FamilySize', values = 'Name', aggfunc ='count')}")

In [None]:
pd.pivot_table(train_cleaning_target_cleaned, index = 'Survived', columns = 'TicketLetters', values = 'Name', aggfunc = 'count')

In [None]:
train_cleaning_target_cleaned.select_dtypes(include = 'object').nunique().sort_values(ascending = False)

In [None]:
plot_grid(train_cleaning_target_cleaned.drop(['Survived', 'Pclass', 'TicketIsNumeric', 'SibSp', 'Parch'], axis = 1), (16, 6), (2, 3), 'histplot')

In [None]:
plot_grid(train_cleaning_target_cleaned.drop(['Name', 'Ticket', 'Cabin', 'Age', 'Fare', 'TicketNumbers', 'TicketLetters', 'CabinNumbers'],
                                             axis = 1), (16, 6), (3, 3), 'countplot', 'Survived')

In [None]:
# 'Age', 'Fare', 'TicketNumbers', 'CabinNumbers'
fig, axs = plt.subplots(2, 2, figsize = (16, 6))
sns.histplot(hue = train_cleaning_target_cleaned.Survived, x = train_cleaning_target_cleaned.Age, palette = {0 : 'black', 1 : 'purple'}, ax = axs[0][0])
axs[0][0].set_title('Age distribution')
sns.histplot(hue = train_cleaning_target_cleaned.Survived, x = train_cleaning_target_cleaned.Fare, palette = {0 : 'black', 1 : 'purple'}, ax = axs[0][1])
axs[0][1].set_title('Fare distribution')

sns.histplot(hue = train_cleaning_target_cleaned.Survived, x = train_cleaning_target_cleaned.TicketNumbers, palette = {0 : 'black', 1 : 'purple'}, ax = axs[1][0])
axs[1][0].set_title('TicketNumbers distribution')
sns.histplot(hue = train_cleaning_target_cleaned.Survived, x = train_cleaning_target_cleaned.CabinNumbers, palette = {0 : 'black', 1 : 'purple'}, ax = axs[1][1])
axs[1][1].set_title('CabinNumbers distribution')
plt.tight_layout()

In [None]:
# The frame still holds string columns (Name, Ticket, Sex, ...). Restrict the
# correlation to numeric columns so the call does not raise on pandas >= 2.0,
# and compute the matrix once for both the heatmap and its mask.
engineered_corr = train_cleaning_target_cleaned.select_dtypes(include = 'number').corr()

plt.figure(figsize = (16,6))
sns.heatmap(engineered_corr,
            annot = True,
            fmt = '.2f',
            square = True,
            cmap = "Purples_r",
            mask = np.triu(engineered_corr))

In [None]:
to_drop = ['Name',
           'Ticket',
           'Cabin']

train_test_cleaned = train_test_cleaning.drop(to_drop, axis = 1).copy()
train_test_cleaned

In [None]:
label_cols = ['TicketLetters', 'Sex', 'Pclass', 'TicketIsNumeric', 'FamilySize']#'Surname', 
onehot_cols = ['CabinLetter', 'Embarked']
numerical_cols = ['Age', 'SibSp', 'Parch', 'Fare', 'TicketNumbers', 'CabinNumbers']#'Pclass'

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# One-hot encoding.
# dtype is pinned to the historical default: pandas >= 2.0 returns bool dummies,
# and a frame mixing bool with numeric columns converts to an object array.
train_test_onehot = pd.get_dummies(train_test_cleaned[onehot_cols], dtype = 'uint8')
X_train_full_onehot, X_test_onehot = train_test_onehot.xs('train'), train_test_onehot.xs('test')

X_train_full, X_test = train_test_cleaned.xs('train'), train_test_cleaned.xs('test')

# Label encoding.
# The encoder vocabulary is built from train and test together so that a
# category present only in test does not make transform() raise. When test
# holds no unseen category the resulting codes equal those of a train-only fit.
labeled_train, labeled_test = {}, {}
for col in label_cols:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([X_train_full[col], X_test[col]]))

    labeled_train[col] = encoder.transform(X_train_full[col])
    labeled_test[col] = encoder.transform(X_test[col])
X_train_full_labeled = pd.DataFrame(labeled_train, index = X_train_full.index)
X_test_labeled = pd.DataFrame(labeled_test, index = X_test.index)

# Numerical features scaling (statistics come from the training rows only).
scaler = StandardScaler()
scaler.fit(X_train_full[numerical_cols])
X_train_full_scaled = pd.DataFrame(scaler.transform(X_train_full[numerical_cols]),
                                   columns = numerical_cols, index = X_train_full.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test[numerical_cols]),
                             columns = numerical_cols, index = X_test.index)

# Concatenating it all together.
# Every part keeps the PassengerId index, so the pieces are joined on it
# directly rather than through reset_index() / set_index().
X_train_full = pd.concat([X_train_full_onehot,
                          X_train_full_labeled,
                          X_train_full_scaled], axis = 1)
X_test = pd.concat([X_test_onehot,
                    X_test_labeled,
                    X_test_scaled], axis = 1)
X_train_full

In [None]:
X_test

In [None]:
y_train_full = target
y_train_full

# 5. Creating and evaluating a model

# 5.1 Neural network

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split

tf.random.set_seed(1)

In [None]:
from sklearn.model_selection import StratifiedKFold

# Stop once validation loss has not improved by min_delta for `patience`
# epochs and roll the model back to its best weights.
early_stopping = keras.callbacks.EarlyStopping(
    patience = 10,#100 80 40 20 10
    min_delta = 0.001,
    restore_best_weights = True,
)

k = 5
history = pd.DataFrame(columns = ['ValAccuracy', 'TrainAccuracy', 'StoppedEpoch'], index = range(k))

# Real k-fold validation: disjoint, stratified, seeded folds. Repeated
# train_test_split calls without a seed gave overlapping validation sets that
# changed on every run.
folds = StratifiedKFold(n_splits = k, shuffle = True, random_state = 1)

for fold, (train_idx, valid_idx) in enumerate(folds.split(X_train_full, y_train_full)):
    X_train, X_valid = X_train_full.iloc[train_idx], X_train_full.iloc[valid_idx]
    y_train, y_valid = y_train_full.iloc[train_idx], y_train_full.iloc[valid_idx]

    # A fresh network per fold so that no weights leak between folds.
    model = keras.Sequential([layers.BatchNormalization(input_shape = [X_train_full.shape[1]]),
                              layers.Dense(units = 16, activation = 'relu'),
                              layers.Dropout(rate = 0.1),

                              layers.BatchNormalization(),
                              layers.Dense(units = 16, activation = 'relu'),
                              layers.Dropout(rate = 0.1),

                              layers.BatchNormalization(),
                              layers.Dense(units = 1, activation = 'sigmoid')])

    model.compile(optimizer = 'adam',
                  loss = 'binary_crossentropy',
                  metrics = ['binary_accuracy'])

    fit_history = model.fit(X_train, y_train,
                            validation_data = (X_valid, y_valid),
                            batch_size = 512,
                            epochs = 1000,
                            callbacks = [early_stopping],
                            verbose = 0,)

    # .at stores the per-epoch list as a single cell value; .loc would try to
    # spread the list over several cells.
    history.at[fold, 'ValAccuracy'] = fit_history.history['val_binary_accuracy']
    history.at[fold, 'TrainAccuracy'] = fit_history.history['binary_accuracy']
    history.at[fold, 'StoppedEpoch'] = early_stopping.stopped_epoch

In [None]:
# One row of axes per fold, each showing validation (red) and train (blue)
# accuracy per epoch.
fig, axs = plt.subplots(k, figsize = (16, 32))
fig.suptitle(f'Train and validation accuracy for {k}-fold validation\n\n', fontsize = 16)

curves = (('ValAccuracy', 'red'), ('TrainAccuracy', 'blue'))
for i in range(k):
    fold_ax = axs[i]
    for column, colour in curves:
        sns.lineplot(data = history.loc[i, column], ax = fold_ax, color = colour)
    fold_ax.legend(['Validation', 'Train'])
    fold_ax.set_ylabel('Accuracy')
    fold_ax.set_xlabel('Epochs')

plt.tight_layout()

In [None]:
history.StoppedEpoch.mean()

In [None]:
# Final network: same architecture as in the validation loop, trained on the
# whole training set for a fixed number of epochs (no validation split, no
# early stopping).
n_features = X_train_full.shape[1]

model = keras.Sequential()
for layer in (layers.BatchNormalization(input_shape = [n_features]),
              layers.Dense(units = 16, activation = 'relu'),
              layers.Dropout(rate = 0.1),

              layers.BatchNormalization(),
              layers.Dense(units = 16, activation = 'relu'),
              layers.Dropout(rate = 0.1),

              layers.BatchNormalization(),
              layers.Dense(units = 1, activation = 'sigmoid')):
    model.add(layer)

model.compile(loss = 'binary_crossentropy',
              optimizer = 'adam',
              metrics = ['binary_accuracy'])

history = model.fit(X_train_full,
                    y_train_full,
                    epochs = 33,
                    batch_size = 512,
                    verbose = 0)

In [None]:
print(f"Train mean: {np.mean(history.history['binary_accuracy'])}"+"\n"+
      f"Train std: {np.std(history.history['binary_accuracy'])}")

In [None]:
predictions_nn = model.predict(X_test)

In [None]:
predictions_nn[predictions_nn > 0.5] = 1
predictions_nn[predictions_nn <= 0.5] = 0

In [None]:
predictions_nn[predictions_nn == 1].size

In [None]:
predictions_nn[predictions_nn == 0].size

In [None]:
predictions_nn.flatten().astype('int64')

# 5.2 Other models

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
def test_estimators(X, y, estimators, labels, cv):
    '''
    Cross-validate several estimators and tabulate the results.

    It takes: full train data and target, list of estimators,
              list of labels or names of estimators (one per estimator),
              cross validation splitting strategy (passed to cross_validate);
    And it returns: a DataFrame with one row per estimator and the columns
              'Model Name', 'Test accuracy' (mean test score), 'Test Std'
              (standard deviation of the test score) and 'Fit Time' (mean fit
              time), sorted by 'Test accuracy' in descending order.
              With no estimators the table is empty but has the same columns.
    It raises: ValueError if estimators and labels differ in length.
    '''
    estimators, labels = list(estimators), list(labels)
    if len(estimators) != len(labels):
        # zip() would silently drop the unmatched estimators or labels.
        raise ValueError(f"got {len(estimators)} estimators but {len(labels)} labels")

    columns = ['Model Name', 'Test accuracy', 'Test Std', 'Fit Time']
    rows = []
    for est, label in zip(estimators, labels):
        cv_results = cross_validate(est,
                                    X,
                                    y,
                                    cv = cv,
                                    n_jobs = -1)
        test_scores = cv_results['test_score']
        rows.append([label, test_scores.mean(), test_scores.std(), cv_results['fit_time'].mean()])

    # Fixed columns keep the sort below valid even when there are no rows.
    result_table = pd.DataFrame(rows, columns = columns)
    result_table.sort_values(by=['Test accuracy'], ascending = False, inplace = True)

    return result_table


# The name starts with "test_"; mark it so pytest does not collect it as a
# test case if this notebook is ever exported to a module.
test_estimators.__test__ = False

To save time, the models below are compared on a stratified 10% sample of the training data.

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, stratify = y_train_full, train_size = 0.1)
y_train

In [None]:
lr = LogisticRegression()
dt = DecisionTreeClassifier(random_state = 1)
rf = RandomForestClassifier()
# xgb = XGBClassifier()
lgbm = LGBMClassifier()
cb = CatBoostClassifier(allow_writing_files = False, logging_level = 'Silent')

estimators = [lr,
              dt,
              rf,
              lgbm, 
              cb]
#               xgb]

labels = ['LogRegression',
          'DecisionTree',
          'RandomForest',
          'LGBM',
          'CatBoost']
#           'XGB']

results = test_estimators(X_train, y_train, estimators, labels, cv = 10)
results.style.background_gradient(cmap = 'Purples')

In [None]:
cb.fit(X_train_full, y_train_full)
lgbm.fit(X_train_full, y_train_full)

In [None]:
predictions_cb = cb.predict(X_test)
predictions_lgbm = lgbm.predict(X_test)

# 5.3 Voting ensemble

In [None]:
submission = pd.DataFrame()

In [None]:
submission['PassengerId'] = X_test.index
submission['pr_nn'] = predictions_nn.flatten().astype('int64')
submission['pr_cb'] = predictions_cb
submission['pr_lgbm'] = predictions_lgbm

In [None]:
submission[[col for col in submission.columns if col.startswith('pr_')]].sum(axis = 1).value_counts()

In [None]:
# Majority vote over all prediction columns: a passenger is marked as survived
# when more than half of the models predict 1. The threshold follows the number
# of 'pr_' columns, so it stays correct if a model is added or removed
# (for the three models above it equals "at least 2 votes").
vote_cols = [col for col in submission.columns if col.startswith('pr_')]
submission['Survived'] = (submission[vote_cols].sum(axis=1) > len(vote_cols) / 2).astype(int)
submission

In [None]:
submission[['PassengerId', 'Survived']].to_csv('submission.csv', index = False)