# Introduction
Hello!

In this kernel you will find my full data science workflow for the "Tabular Playground Series - Apr 2021" competition.
I'm here to learn and improve, so by all means feel free to criticize or suggest anything in the comments section below; I would really appreciate it! :)

Also, I would like to recommend [this amazing notebook](https://www.kaggle.com/jitendramanwani/tps-april-2021-eda-viz-insights-model-end-2-end) to you.

# Table of contents:

1. Meeting our data

2. Visualization and data analysis

3. Data cleaning

4. Feature engineering and encoding

    4.1 Full data set

    4.2 Separating male and female sets

5. Creating and evaluating models

    5.1 Neural network

    5.2 Other models
    
    5.3 Parameter tuning with Optuna
    
    5.4 Creating tuned models
    
    5.5 Voting ensemble
    
    5.6 Stacking ensemble

# 1. Meeting our data

In [None]:
import numpy as np
import pandas as pd

# Load the competition train and test CSVs, using PassengerId as the row index.
train = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/train.csv', index_col = 'PassengerId')
test = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/test.csv', index_col = 'PassengerId')

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Peek at the last rows of the training frame.
train.tail()

In [None]:
# Peek at the first rows of the test frame.
test.head()

In [None]:
# Number of missing values per training column.
train.isna().sum()

In [None]:
# Number of missing values per test column.
test.isna().sum()

In [None]:
# Distinct column dtypes present in the training frame.
train.dtypes.unique()

In [None]:
# Distinct column dtypes present in the test frame.
test.dtypes.unique()

In [None]:
# describe() summary of the object-typed training columns.
train.select_dtypes(include = ['object']).describe()

In [None]:
# describe() summary of the non-object training columns, with the Survived target left out.
train.drop('Survived', axis = 1).select_dtypes(exclude = ['object']).describe()

In [None]:
# Keep the Survived column as a standalone copy; later cells use it as the label vector.
target = train.Survived.copy()
target

In [None]:
# True if any label is missing.
target.isna().any()

In [None]:
# Fraction of rows whose label equals 1.
target.loc[target == 1].size / target.size

In [None]:
# describe() summary of the label column.
target.describe()

In [None]:
# Check that train (without Survived) and test expose the same column labels.
train.drop('Survived', axis = 1).columns.equals(test.columns)

# 2. Visualization and data analysis

In [None]:
# Register pandas' formatters/converters with matplotlib before plotting.
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
# IPython magic: render figures inline in the notebook output.
%matplotlib inline
import seaborn as sns

# Apply seaborn's 'whitegrid' style to the figures drawn after this call.
sns.set_style('whitegrid')

In [None]:
# Bar chart of how many training rows fall into each Survived class.
plt.figure(figsize = (16, 6))
sns.countplot(x = train.Survived, palette = 'Purples_r')

In [None]:
def plot_grid(data, fig_size, grid_size, plot_type, target = ''):
    """
    Draw one subplot per column of `data` on a single figure.

    Parameters:
        data: DataFrame holding the columns to plot (and, for 'countplot',
              the hue column as well).
        fig_size: (width, height) forwarded to plt.figure.
        grid_size: (rows, columns) of the subplot grid.
        plot_type: 'histplot' or 'boxplot' (one plot per non-object column),
                   or 'countplot' (one plot per column except `target`).
        target: name of the column in `data` used as hue; only read when
                plot_type is 'countplot'.

    Raises:
        ValueError: if plot_type is not one of the supported names. Without
                    this check an unknown name silently produced a blank figure.
    """
    supported = ('histplot', 'boxplot', 'countplot')
    if plot_type not in supported:
        raise ValueError(f"plot_type must be one of {supported}, got {plot_type!r}")

    fig = plt.figure(figsize = fig_size)
    n_rows, n_cols = grid_size[0], grid_size[1]

    if plot_type == 'countplot':
        # Keep the hue Series separate instead of rebinding the `target` argument.
        hue = data[target]
        columns = data.drop(hue.name, axis = 1).columns
    else:
        columns = data.select_dtypes(exclude = 'object').columns

    # Subplot positions are 1-based.
    for position, column_name in enumerate(columns, start = 1):
        fig.add_subplot(n_rows, n_cols, position)
        if plot_type == 'histplot':
            sns.histplot(data[column_name], kde = True, color = 'blueviolet', stat = 'count')
        elif plot_type == 'boxplot':
            sns.boxplot(x = data[column_name], color = 'blueviolet')
        else:
            plot = sns.countplot(x = data[column_name], hue = hue, palette = 'Purples_r')
            plot.legend(loc = 'upper right', title = hue.name)
    plt.tight_layout()

In [None]:
# Histograms (with KDE) of every non-object feature column, laid out on a 2x3 grid.
plot_grid(train.drop('Survived', axis = 1), (16, 6), (2,3), 'histplot')

In [None]:
# Mean of each numeric feature within each Survived class.
pd.pivot_table(train, index = 'Survived', values = ['Age', 'SibSp', 'Parch', 'Fare', 'Pclass'], aggfunc = 'mean')

In [None]:
# Count plots of the non-object columns other than Fare and Age, split by Survived.
plot_grid(train.select_dtypes(exclude = 'object').drop(['Fare', 'Age'], axis = 1), (16, 6), (1, 3), 'countplot', 'Survived')

In [None]:
# Passenger counts per Survived class for each discrete numeric feature;
# the tables are separated by " \n\n" exactly as before.
print(' \n\n'.join(
    str(pd.pivot_table(train, index = 'Survived', columns = feature, values = 'Name', aggfunc = 'count'))
    for feature in ['Pclass', 'SibSp', 'Parch']
))

In [None]:
# Correlate the numeric columns only: calling corr() on a frame that still
# holds string columns raises on pandas >= 2.0. The matrix is computed once
# and reused for the upper-triangle mask.
corr_matrix = train.select_dtypes(include = 'number').corr()

plt.figure(figsize = (16, 6))
sns.heatmap(corr_matrix,
            annot = True,
            fmt = '.2f',
            square = True,
            cmap = "Purples_r",
            mask = np.triu(corr_matrix))

In [None]:
# Box plots of every non-object feature column, laid out on a 2x3 grid.
plot_grid(train.drop('Survived', axis = 1), (16, 6), (2,3), 'boxplot')

In [None]:
# Count plots of the object columns other than Name, Ticket and Cabin, split by Survived.
plot_grid(pd.concat([train.select_dtypes(include = 'object').drop(['Name', 'Ticket', 'Cabin'], axis = 1), target], axis = 1), (16, 6), (2,1), 'countplot', 'Survived')

In [None]:
# Passenger counts per Survived class for Sex and for Embarked;
# the tables are separated by " \n\n" exactly as before.
print(' \n\n'.join(
    str(pd.pivot_table(train, index = 'Survived', columns = feature, values = 'Name', aggfunc = 'count'))
    for feature in ['Sex', 'Embarked']
))

In [None]:
# Number of distinct values per object column, highest cardinality first.
train.select_dtypes(include = 'object').nunique().sort_values(ascending = False)

# 3. Data cleaning

In [None]:
# Stack train (without the label) on top of test, keyed by origin.
train_test = pd.concat([train.drop('Survived', axis = 1), test], keys = ['train', 'test'], axis = 0)

# Per-column count of missing values and the same figure as a percentage of all rows.
missing_values = train_test.isna().sum().to_frame('Values missing')
missing_values['Percent of missing'] = missing_values['Values missing'] / train_test.shape[0] * 100

# Show only the columns that actually have gaps, worst first.
(missing_values[missing_values['Percent of missing'] > 0]
 .sort_values(by = 'Percent of missing', ascending = False)
 .style.background_gradient('Purples'))

In [None]:
train_cleaning = train.drop('Survived', axis = 1).copy()
test_cleaning = test.copy()

# Cabin and Ticket: mark missing entries with an explicit 'none' placeholder.
# The result is assigned back rather than using fillna(inplace = True) on a
# column selection, which is not guaranteed to write through to the frame.
for column in ['Cabin', 'Ticket']:
    train_cleaning[column] = train_cleaning[column].fillna('none')
    test_cleaning[column] = test_cleaning[column].fillna('none')

# Every fill value below is computed from training rows only and then applied
# to the matching rows of both frames.
for sex in ['male', 'female']:
    train_is_sex = train_cleaning.Sex == sex
    test_is_sex = test_cleaning.Sex == sex

    # Age: median age of training passengers of the same sex.
    age_median = train_cleaning.loc[train_is_sex, 'Age'].median()
    train_cleaning.loc[train_is_sex, 'Age'] = train_cleaning.loc[train_is_sex, 'Age'].fillna(age_median)
    test_cleaning.loc[test_is_sex, 'Age'] = test_cleaning.loc[test_is_sex, 'Age'].fillna(age_median)

    for pclass in train_cleaning.Pclass.unique():
        train_rows = train_is_sex & (train_cleaning.Pclass == pclass)
        test_rows = test_is_sex & (test_cleaning.Pclass == pclass)

        # Embarked: most frequent port within the (sex, Pclass) group.
        # Fare: median fare within the same group. The test frame now uses the
        # median as well; it previously used the mode, unlike the train frame.
        embarked_mode = train_cleaning.loc[train_rows, 'Embarked'].mode()[0]
        fare_median = train_cleaning.loc[train_rows, 'Fare'].median()

        # Boolean-mask assignment keeps the original row index, so it does not
        # depend on how groupby.apply shapes its result across pandas versions.
        train_cleaning.loc[train_rows, 'Embarked'] = train_cleaning.loc[train_rows, 'Embarked'].fillna(embarked_mode)
        train_cleaning.loc[train_rows, 'Fare'] = train_cleaning.loc[train_rows, 'Fare'].fillna(fare_median)
        test_cleaning.loc[test_rows, 'Embarked'] = test_cleaning.loc[test_rows, 'Embarked'].fillna(embarked_mode)
        test_cleaning.loc[test_rows, 'Fare'] = test_cleaning.loc[test_rows, 'Fare'].fillna(fare_median)

# train_cleaning['Embarked'].fillna('none', inplace = True)
# test_cleaning['Embarked'].fillna('none', inplace = True)

In [None]:
# Sanity check: largest per-column missing count in train plus the same for test (0 means nothing is left to fill).
train_cleaning.isnull().sum().max() + test_cleaning.isnull().sum().max()

# 4. Feature engineering and encoding

In [None]:
# Stack the cleaned train and test frames, keyed by origin, so features are engineered on both at once.
train_test_cleaning = pd.concat([train_cleaning, test_cleaning], keys = ['train', 'test'], axis = 0)
train_test_cleaning

In [None]:
def _cabin_letter(tokens):
    """Lower-cased first character of the last whitespace-separated cabin token, or NaN for the 'none' placeholder."""
    if tokens[0] == 'none':
        return np.nan
    return tokens[-1][0].strip().lower()


def _ticket_letters(ticket):
    """Ticket prefix (everything before the last space-separated part) without '.' and '/', lower-cased; NaN when there is no prefix."""
    prefix_parts = ticket.split(' ')[:-1]
    if not prefix_parts:
        return np.nan
    return ''.join(prefix_parts).replace('.', '').replace('/', '').lower()


train_test_cleaning['CabinLetter'] = train_test_cleaning.Cabin.str.split().apply(_cabin_letter)
train_test_cleaning['TicketLetters'] = train_test_cleaning.Ticket.apply(_ticket_letters)
# train_test_cleaning['CabinIsNull'] = train_test_cleaning.Cabin.apply(lambda x: 1 if x == 'none' else 0)
# train_test_cleaning['TicketIsNull'] = train_test_cleaning.Ticket.apply(lambda x: 1 if x == 'none' else 0)
# train_test_cleaning['EmbarkedIsNull'] = train_test_cleaning.Embarked.apply(lambda x: 1 if x == 'none' else 0)

In [None]:
# Split the combined frame back into independent train / test copies.
train_cleaning_new = train_test_cleaning.xs('train').copy()
test_cleaning_new = train_test_cleaning.xs('test').copy()

# Most frequent CabinLetter per Pclass among male training passengers.
train_cleaning_new.loc[train_cleaning_new.Sex == 'male'].groupby('Pclass').CabinLetter.apply(lambda x: x.value_counts().index[0])

In [None]:
# Most frequent CabinLetter per Pclass among female training passengers.
train_cleaning_new.loc[train_cleaning_new.Sex == 'female'].groupby('Pclass').CabinLetter.apply(lambda x: x.value_counts().index[0])

In [None]:
# Most frequent TicketLetters prefix per Pclass among male training passengers.
train_cleaning_new.loc[train_cleaning_new.Sex == 'male'].groupby('Pclass').TicketLetters.apply(lambda x: x.value_counts().index[0])

In [None]:
# Most frequent TicketLetters prefix per Pclass among female training passengers.
train_cleaning_new.loc[train_cleaning_new.Sex == 'female'].groupby('Pclass').TicketLetters.apply(lambda x: x.value_counts().index[0])

In [None]:
# train_cleaning_new['CabinLetter'] = train_cleaning_new.groupby('Pclass')['CabinLetter'].apply(lambda x: x.fillna(x.mode()[0]))

# Fill missing CabinLetter / TicketLetters with the most frequent value among
# training passengers of the same sex and Pclass, for both train and test.
# Boolean-mask assignment keeps the original row index, so the fill does not
# depend on how groupby.apply shapes its result across pandas versions.
for sex in ['male', 'female']:
    for pclass in train_cleaning_new.Pclass.unique():
        train_rows = (train_cleaning_new.Sex == sex) & (train_cleaning_new.Pclass == pclass)
        test_rows = (test_cleaning_new.Sex == sex) & (test_cleaning_new.Pclass == pclass)

        for column in ['CabinLetter', 'TicketLetters']:
            # The mode is taken before filling; filling with the mode does not change it.
            most_common = train_cleaning_new.loc[train_rows, column].mode()[0]
            train_cleaning_new.loc[train_rows, column] = train_cleaning_new.loc[train_rows, column].fillna(most_common)
            test_cleaning_new.loc[test_rows, column] = test_cleaning_new.loc[test_rows, column].fillna(most_common)

# Re-assemble the combined frame with the filled columns.
train_test_cleaning = pd.concat([train_cleaning_new, test_cleaning_new], keys = ['train', 'test'], axis = 0)

In [None]:
# describe() summary of the raw Fare and Age columns.
train.loc[:, ['Fare', 'Age']].select_dtypes(exclude = ['object']).describe()

In [None]:
# Numeric part of the cabin code (everything after the first character); 0 for the 'none' placeholder.
train_test_cleaning['CabinNumbers'] = train_test_cleaning.Cabin.apply(lambda x: int(x[1:]) if x != 'none' else 0)

# Numeric part of the ticket: the whole ticket if it is purely numeric, otherwise
# the last space-separated part when that part is numeric; 0 in every other case.
train_test_cleaning['TicketNumbers'] = train_test_cleaning.Ticket.apply(lambda x: int(x) if x.isnumeric() else 0 if x == 'none'
                                                                        else int(x.split(' ')[-1]) if (x.split(' ')[-1]).isnumeric() else 0)
# Bucket the ticket number into ranges; bucket 0 means no usable number.
train_test_cleaning['TicketNumbersGroup'] = train_test_cleaning['TicketNumbers'].apply(lambda x: 0 if (x == 0)
                                                                                       else 1 if (x > 0 and x <= 100000)
                                                                                       else 2 if (x > 100000 and x <= 260000)
                                                                                       else 3 if (x > 260000 and x <= 380000)
                                                                                       else 4 if (x > 380000 and x <= 538000)
                                                                                       else 5)

train_test_cleaning['TicketIsNumeric'] = train_test_cleaning.Ticket.apply(lambda x: 1 if x.isnumeric() else 0)

# Family size counts the passenger plus siblings/spouses and parents/children,
# then is binned: 0 = travelling alone, 1 = family of 2-3, 2 = larger family.
train_test_cleaning['FamilySize'] = train_test_cleaning.SibSp + train_test_cleaning.Parch + 1
train_test_cleaning['FamilySize'] = train_test_cleaning['FamilySize'].apply(lambda x: 0 if (x == 1)
                                                                            else 1 if (x == 2 or x == 3)
                                                                            else 2)
# FamilySize is already binned here, so a solo traveller is bin 0. Comparing
# against 1 (as before) flagged families of 2-3 as "alone" instead.
train_test_cleaning['IsAlone'] = train_test_cleaning['FamilySize'].apply(lambda x: 1 if (x == 0) else 0)

# train_test_cleaning['AgeGroup'] = train_test_cleaning['Age'].apply(lambda x: 0 if (x < 25) 
#                                                                    else 1 if (x >= 25 and x < 39)                                                                    
#                                                                    else 2 if (x >= 39 and x < 53)
#                                                                    else 3)

# Age in decades: 0 = under 10, 1 = 10-19, ..., 7 = 70-79, 8 = anything else.
train_test_cleaning['AgeGroup'] = train_test_cleaning['Age'].apply(lambda x: 0 if (x < 10)
                                                                   else 1 if (x >= 10 and x < 20)
                                                                   else 2 if (x >= 20 and x < 30)
                                                                   else 3 if (x >= 30 and x < 40)
                                                                   else 4 if (x >= 40 and x < 50)
                                                                   else 5 if (x >= 50 and x < 60)
                                                                   else 6 if (x >= 60 and x < 70)
                                                                   else 7 if (x >= 70 and x < 80)
                                                                   else 8)

# Fare in four bands (the cut points look like quartile boundaries - TODO confirm).
train_test_cleaning['FareGroup'] = train_test_cleaning['Fare'].apply(lambda x: 0 if (x < 10.04)
                                                                     else 1 if (x >= 10.04 and x < 24.46)
                                                                     else 2 if (x >= 24.46 and x < 33.5)
                                                                     else 3)

# Hand-picked grouping of ticket prefixes into four codes.
train_test_cleaning['TicketLettersGroup'] = train_test_cleaning.TicketLetters.apply(lambda x: 0 if x == 'pc'
                                                                                    else 3 if x in ['stono', 'stono2', 'sotono2', 'stonoq', 'aq3']
                                                                                    else 2 if x in ['sotonoq', 'fa', 'a5', 'ca', 'fcc', 'scow', 'casoton', 'a4', 'wc', 'swpp', 'c']
                                                                                    else 1)

# train_test_cleaning['Surname'] = train_test_cleaning['Name'].apply(lambda x: x.split(',')[0].lower())

train_test_cleaning['Embarked'] = train_test_cleaning['Embarked'].str.lower()

In [None]:
from scipy.stats import skew, boxcox_normmax
from scipy.special import boxcox1p

# Box-Cox transform each skewed numeric column. The lambda is estimated on the
# training rows only (shifted by +1) and then applied to train and test alike.
for column in ['Fare', 'CabinNumbers', 'TicketNumbers']:
    lamb = boxcox_normmax(train_test_cleaning.loc['train', column] + 1)
    for part in ['train', 'test']:
        train_test_cleaning.loc[part, column] = boxcox1p(train_test_cleaning.loc[part, column], lamb).values

In [None]:
# Display the combined frame after feature engineering and transforms.
train_test_cleaning

In [None]:
# Re-attach the label to the engineered training rows for the analysis cells below.
train_cleaning_target_cleaned = pd.concat([train_test_cleaning.xs('train'), target], axis = 1)
train_cleaning_target_cleaned

In [None]:
def _survival_counts(feature):
    """Passenger counts per Survived class for each value of `feature`."""
    return pd.pivot_table(train_cleaning_target_cleaned, index = 'Survived', columns = feature, values = 'Name', aggfunc = 'count')


# Same tables, same order and same " \n\n" separator as before; the second
# entry is the most frequent TicketNumbers value per Survived class.
engineered_tables = [
    _survival_counts('CabinLetter'),
    pd.pivot_table(train_cleaning_target_cleaned, index = 'Survived', values = 'TicketNumbers', aggfunc = (lambda x: x.mode()[0])),
    _survival_counts('TicketIsNumeric'),
    _survival_counts('AgeGroup'),
    _survival_counts('FareGroup'),
    _survival_counts('TicketLettersGroup'),
    _survival_counts('TicketNumbersGroup'),
    _survival_counts('IsAlone'),
    _survival_counts('FamilySize'),
]
print(' \n\n'.join(str(table) for table in engineered_tables))

In [None]:
# Passenger counts per Survived class for every ticket prefix.
pd.pivot_table(train_cleaning_target_cleaned, index = 'Survived', columns = 'TicketLetters', values = 'Name', aggfunc = 'count')

In [None]:
# Number of distinct values per object column after feature engineering, highest first.
train_cleaning_target_cleaned.select_dtypes(include = 'object').nunique().sort_values(ascending = False)

In [None]:
# Histograms of the four continuous features after the transforms above.
plot_grid(train_cleaning_target_cleaned.loc[:,['Age', 'Fare', 'TicketNumbers', 'CabinNumbers']], (16, 6), (2, 3), 'histplot')

In [None]:
# Count plots, split by Survived, of every column left after dropping the raw text and continuous columns (5x3 grid).
plot_grid(train_cleaning_target_cleaned.drop(['Name', 'Ticket', 'Cabin', 'Age', 'Fare', 'TicketNumbers', 'TicketLetters', 'CabinNumbers'],
                                             axis = 1), (16, 10), (5, 3), 'countplot', 'Survived')

In [None]:
# Share of each Survived class within every ticket prefix (rows sum to 1),
# ordered by the share of class 1.
survival_share = pd.crosstab(index = train_cleaning_target_cleaned.TicketLetters,
                             columns = train_cleaning_target_cleaned.Survived,
                             normalize = 'index')
survival_share = survival_share.sort_values(by = 1)
survival_share.plot.bar(figsize = (15, 7), stacked = True, color = {0: 'grey', 1: 'purple'})

# Horizontal reference lines at 0.8 (red) and 0.65 (green).
plt.axhline(y = 0.8, color = 'r', linestyle = '-')
plt.axhline(y = 0.65, color = 'g', linestyle = '-')

In [None]:
from matplotlib import ticker

# One histogram per continuous feature, coloured by Survived, stacked vertically.
fig, axs = plt.subplots(4, 1, figsize = (16, 16))
survival_palette = {0 : 'black', 1 : 'purple'}
for ax, feature in zip(axs, ['Age', 'Fare', 'TicketNumbers', 'CabinNumbers']):
    sns.histplot(hue = train_cleaning_target_cleaned.Survived,
                 x = train_cleaning_target_cleaned[feature],
                 palette = survival_palette,
                 ax = ax)
    if feature == 'Fare':
        # Only the Fare panel gets major x ticks every 25 units with plain scalar labels.
        ax.xaxis.set_major_locator(ticker.MultipleLocator(25))
        ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
    ax.set_title(f'{feature} distribution')
plt.tight_layout()

In [None]:
# Correlate the numeric columns only: the frame still holds string columns
# (Name, Sex, Ticket, ...), and corr() raises on those with pandas >= 2.0.
# The matrix is computed once and reused for the upper-triangle mask.
engineered_corr = train_cleaning_target_cleaned.select_dtypes(include = 'number').corr()

plt.figure(figsize = (16,10))
sns.heatmap(engineered_corr,
            annot = True,
            annot_kws = {"size": 13},
            fmt = '.2f',
            square = True,
            cmap = "Purples_r",
            mask = np.triu(engineered_corr))

In [None]:
# Raw text columns that are dropped now that features have been extracted from them.
to_drop = ['Name',
           'Ticket',
           'Cabin']

train_test_cleaned = train_test_cleaning.drop(to_drop, axis = 1).copy()
train_test_cleaned

In [None]:
# Column groups by preprocessing: label-encoded, one-hot encoded, and standard-scaled.
label_cols = ['AgeGroup', 'FamilySize', 'TicketLettersGroup', 'Pclass', 'IsAlone', 'TicketIsNumeric', 'Sex']
onehot_cols = ['CabinLetter', 'Embarked']
numerical_cols = ['SibSp', 'Parch', 'Fare', 'CabinNumbers', 'TicketNumbers']

# 4.1 Full data set

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# One-hot encode on the combined frame so train and test get the same dummy
# columns, then split; reset_index() exposes PassengerId as a regular column.
train_test_onehot = pd.get_dummies(train_test_cleaned[onehot_cols])
X_train_full_onehot = train_test_onehot.xs('train').reset_index()
X_test_onehot = train_test_onehot.xs('test').reset_index()

X_train_full = train_test_cleaned.xs('train')
X_test = train_test_cleaned.xs('test')

# Label encoding: each encoder is fitted on the training column only.
train_label_parts = [pd.DataFrame()]
test_label_parts = [pd.DataFrame()]
for col in label_cols:
    encoder = LabelEncoder()
    encoder.fit(X_train_full[col])

    encoded_train = pd.Series(encoder.transform(X_train_full[col]), name = col)
    encoded_test = pd.Series(encoder.transform(X_test[col]), name = col)
    train_label_parts.append(encoded_train)
    test_label_parts.append(encoded_test)
X_train_full_labeled = pd.concat(train_label_parts, axis = 1)
X_test_labeled = pd.concat(test_label_parts, axis = 1)

# Numerical features: scaler statistics come from the training rows only.
scaler = StandardScaler()
scaler.fit(X_train_full[numerical_cols])
X_train_full_scaled, X_test_scaled = (
    pd.DataFrame(scaler.transform(frame[numerical_cols]), columns = numerical_cols)
    for frame in (X_train_full, X_test)
)

# Stitch the three positionally-indexed parts together and restore PassengerId as the index.
X_train_full = pd.concat([X_train_full_onehot,
                          X_train_full_labeled,
                          X_train_full_scaled], axis = 1).set_index('PassengerId')
X_test = pd.concat([X_test_onehot,
                    X_test_labeled,
                    X_test_scaled], axis = 1).set_index('PassengerId')
X_train_full

In [None]:
# Display the fully encoded test matrix.
X_test

In [None]:
# The label vector for the full training set (same object as target, not a copy).
y_train_full = target
y_train_full

# 4.2 Separating male and female sets

This didn't improve accuracy; I left it here only as a reference.

In [None]:
# train_test_cleaned_male = train_test_cleaned.loc[train_test_cleaned.Sex == 'male'].copy()
# train_test_cleaned_female = train_test_cleaned.loc[train_test_cleaned.Sex == 'female'].copy()

In [None]:
# from sklearn.preprocessing import LabelEncoder
# from sklearn.preprocessing import StandardScaler, MinMaxScaler
# # Male
# # One-hot encoding
# train_test_onehot = pd.get_dummies(train_test_cleaned_male[onehot_cols])
# X_train_full_onehot, X_test_onehot = train_test_onehot.xs('train').reset_index(), train_test_onehot.xs('test').reset_index()

# X_train_full, X_test = train_test_cleaned_male.xs('train'), train_test_cleaned_male.xs('test')
# # Label encoding
# X_train_full_labeled = pd.DataFrame()
# X_test_labeled = pd.DataFrame()
# for col in label_cols:
#     encoder = LabelEncoder()
#     encoder.fit(X_train_full[col])
    
#     encoded_train = pd.Series(encoder.transform(X_train_full[col]), name = col)
#     X_train_full_labeled = pd.concat([X_train_full_labeled, encoded_train], axis = 1)
    
#     encoded_test = pd.Series(encoder.transform(X_test[col]), name = col)
#     X_test_labeled = pd.concat([X_test_labeled, encoded_test], axis = 1)
# # Numerical features scaling
# scaler = StandardScaler()
# scaler.fit(X_train_full[numerical_cols])
# X_train_full_scaled = pd.DataFrame(scaler.transform(X_train_full[numerical_cols]), columns = numerical_cols)
# X_test_scaled = pd.DataFrame(scaler.transform(X_test[numerical_cols]), columns = numerical_cols)
# # Concatenating it all together
# X_train_full_male = pd.concat([X_train_full_onehot, 
#                           X_train_full_labeled, 
#                           X_train_full_scaled], axis = 1)
# X_train_full_male.set_index('PassengerId', inplace = True)
# X_test_male = pd.concat([X_test_onehot, 
#                     X_test_labeled, 
#                     X_test_scaled], axis = 1)
# X_test_male.set_index('PassengerId', inplace = True)
# X_train_full_male

In [None]:
# y_train_full_male = target.loc[target.index.isin(X_train_full_male.index)].copy()
# y_train_full_male

In [None]:
# # Female
# # One-hot encoding
# train_test_onehot = pd.get_dummies(train_test_cleaned_female[onehot_cols])
# X_train_full_onehot, X_test_onehot = train_test_onehot.xs('train').reset_index(), train_test_onehot.xs('test').reset_index()

# X_train_full, X_test = train_test_cleaned_female.xs('train'), train_test_cleaned_female.xs('test')
# # Label encoding
# X_train_full_labeled = pd.DataFrame()
# X_test_labeled = pd.DataFrame()
# for col in label_cols:
#     encoder = LabelEncoder()
#     encoder.fit(X_train_full[col])
    
#     encoded_train = pd.Series(encoder.transform(X_train_full[col]), name = col)
#     X_train_full_labeled = pd.concat([X_train_full_labeled, encoded_train], axis = 1)
    
#     encoded_test = pd.Series(encoder.transform(X_test[col]), name = col)
#     X_test_labeled = pd.concat([X_test_labeled, encoded_test], axis = 1)
# # Numerical features scaling
# scaler = StandardScaler()
# scaler.fit(X_train_full[numerical_cols])
# X_train_full_scaled = pd.DataFrame(scaler.transform(X_train_full[numerical_cols]), columns = numerical_cols)
# X_test_scaled = pd.DataFrame(scaler.transform(X_test[numerical_cols]), columns = numerical_cols)
# # Concatenating it all together
# X_train_full_female = pd.concat([X_train_full_onehot, 
#                           X_train_full_labeled, 
#                           X_train_full_scaled], axis = 1)
# X_train_full_female.set_index('PassengerId', inplace = True)
# X_test_female = pd.concat([X_test_onehot, 
#                     X_test_labeled, 
#                     X_test_scaled], axis = 1)
# X_test_female.set_index('PassengerId', inplace = True)
# X_train_full_female

In [None]:
# y_train_full_female = target.loc[target.index.isin(X_train_full_female.index)].copy()
# y_train_full_female

# 5. Creating and evaluating models

# 5.1 Neural network

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split, StratifiedKFold

# Fix TensorFlow's global random seed.
tf.random.set_seed(1)

In [None]:
def _build_network(n_features):
    """Return a compiled binary classifier: two 16-unit ReLU blocks (batch norm, dense, dropout) and a sigmoid output."""
    network = keras.Sequential([layers.BatchNormalization(input_shape = [n_features]),
                                layers.Dense(units = 16, activation = 'relu'),
                                layers.Dropout(rate = 0.1),

                                layers.BatchNormalization(),
                                layers.Dense(units = 16, activation = 'relu'),
                                layers.Dropout(rate = 0.1),

                                layers.BatchNormalization(),
                                layers.Dense(units = 1, activation = 'sigmoid')])
    network.compile(optimizer = 'adam',
                    loss = 'binary_crossentropy',
                    metrics = ['binary_accuracy'])
    return network


# Stop a fold after 10 epochs without an improvement of at least 0.001 in the
# monitored quantity, and roll back to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    patience = 10,#100 80 40 20 10
    min_delta = 0.001,
    restore_best_weights = True,
)

k = 5
kf = StratifiedKFold(n_splits = k, shuffle = True, random_state = 1)

# One row per fold: per-epoch accuracy curves and the epoch at which training stopped.
history = pd.DataFrame(columns = ['ValAccuracy', 'TrainAccuracy', 'StoppedEpoch'], index = range(k))

for i, (train_idx, test_idx) in enumerate(kf.split(X_train_full, y_train_full)):
    X_train, X_valid = X_train_full.iloc[train_idx], X_train_full.iloc[test_idx]
    y_train, y_valid = y_train_full.iloc[train_idx], y_train_full.iloc[test_idx]

    # A fresh network per fold so no weights carry over between folds.
    model = _build_network(X_train.shape[1])
    model.fit(X_train, y_train,
              validation_data = (X_valid, y_valid),
              batch_size = 512,
              epochs = 1000,
              callbacks = [early_stopping],
              verbose = 0,)

    fold_log = model.history.history
    history.loc[i, 'ValAccuracy'] = fold_log['val_binary_accuracy']
    history.loc[i, 'TrainAccuracy'] = fold_log['binary_accuracy']
    history.loc[i, 'StoppedEpoch'] = early_stopping.stopped_epoch

In [None]:
# One panel per fold showing validation (red) and training (blue) accuracy per epoch.
fig, axs = plt.subplots(k, figsize = (16, 32))
fig.suptitle(f'Train and validation accuracy for {k}-fold validation\n\n', fontsize = 16)
for fold, ax in enumerate(axs):
    for curve, line_color in (('ValAccuracy', 'red'), ('TrainAccuracy', 'blue')):
        sns.lineplot(data = history.loc[fold, curve], ax = ax, color = line_color)
    ax.set_title(f'{fold+1} fold')
    ax.legend(['Validation', 'Train'])
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Epochs')

plt.tight_layout()

In [None]:
# Average epoch at which early stopping ended training across the folds, rounded to an integer.
round(history.StoppedEpoch.mean())

In [None]:
# Final network: same architecture as in the cross-validation loop, trained on the full training set.
model = keras.Sequential([layers.BatchNormalization(input_shape = [X_train_full.shape[1]]),
                              layers.Dense(units = 16, activation = 'relu'),
                              layers.Dropout(rate = 0.1),
                              
                              layers.BatchNormalization(),
                              layers.Dense(units = 16, activation = 'relu'),
                              layers.Dropout(rate = 0.1),
                              
                              layers.BatchNormalization(),
                              layers.Dense(units = 1, activation = 'sigmoid')])

model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['binary_accuracy'])

# No validation data or early stopping here: the epoch budget is the mean stopped epoch from the folds.
# NOTE(review): `history` is still the per-fold DataFrame when `epochs` is evaluated and is
# rebound to the fit result afterwards, so re-running this cell alone fails on .StoppedEpoch.
history = model.fit(X_train_full, y_train_full,
                    batch_size = 512,
                    epochs = round(history.StoppedEpoch.mean()),
                    verbose = 0)

In [None]:
# Mean and standard deviation of the per-epoch training accuracy of the final fit.
epoch_accuracy = history.history['binary_accuracy']
print(f"Train mean: {np.mean(epoch_accuracy)}\nTrain std: {np.std(epoch_accuracy)}")

In [None]:
# Model outputs for the test set (sigmoid activations, one per row).
predictions_nn = model.predict(X_test)

In [None]:
# Threshold in place at 0.5: values above become 1, the rest become 0.
# Order matters only in that the 1s written by the first line are not touched by the second.
predictions_nn[predictions_nn > 0.5] = 1
predictions_nn[predictions_nn <= 0.5] = 0

In [None]:
# Number of test rows predicted as class 1.
predictions_nn[predictions_nn == 1].size

In [None]:
# Number of test rows predicted as class 0.
predictions_nn[predictions_nn == 0].size

In [None]:
# Flattened int64 view of the predictions; the result is only displayed, predictions_nn itself is not reassigned.
predictions_nn.flatten().astype('int64')

# 5.2 Other models

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
def test_estimators(X, y, estimators, labels, cv):
    '''
    Cross-validate several estimators and tabulate the outcome.

    It takes: training data and target (handed to cross_validate as-is),
              a list of estimators,
              a list of display names, paired with the estimators by position,
              a cross validation splitting strategy;
    And it returns: a DataFrame with one row per estimator (name, mean and
              std of the test score, mean fit time), best mean score first.
    '''
    result_table = pd.DataFrame()

    for row, (est_name, est) in enumerate(zip(labels, estimators)):
        result_table.loc[row, 'Model Name'] = est_name

        # n_jobs = -1 is forwarded to cross_validate for every estimator.
        scores = cross_validate(est, X, y, cv = cv, n_jobs = -1)
        test_scores = scores['test_score']

        result_table.loc[row, 'Test accuracy'] = test_scores.mean()
        result_table.loc[row, 'Test Std'] = test_scores.std()
        result_table.loc[row, 'Fit Time'] = scores['fit_time'].mean()

    return result_table.sort_values(by = ['Test accuracy'], ascending = False)

Taking a sample to save some time.

In [None]:
# Stratified split that keeps 10% of the rows in X_train / y_train
# (fixed random_state so the sample is reproducible).
# NOTE(review): the benchmark cell that follows is called with
# X_train_full / y_train_full, so this sample appears to be unused there -
# confirm which data set was intended.
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, 
                                                      y_train_full, 
                                                      stratify = y_train_full,
                                                      train_size = 0.1,
                                                      random_state = 1)
y_train

In [None]:
# Untuned baselines, all with library defaults except where noted.
logreg = LogisticRegression()
dt = DecisionTreeClassifier(random_state = 1)
rf = RandomForestClassifier()
xgb = XGBClassifier()
lgbm = LGBMClassifier()
# CatBoost is silenced and told not to write its working files.
cb = CatBoostClassifier(allow_writing_files = False, logging_level = 'Silent')
lsvc = LinearSVC()

# Display name -> estimator, in the order they are benchmarked.
named_estimators = {
    'LogRegression': logreg,
    'DecisionTree': dt,
    'RandomForest': rf,
    'LGBM': lgbm,
    'CatBoost': cb,
    'LSVC': lsvc,
    'XGB': xgb,
}
estimators = list(named_estimators.values())
labels = list(named_estimators.keys())

# 5-fold comparison; the table is shaded so higher values stand out.
results = test_estimators(X_train_full, y_train_full, estimators, labels, cv = 5)
results.style.background_gradient(cmap = 'Purples')

# 5.3 Parameter tuning with Optuna

In [None]:
import optuna
from optuna.trial import TrialState

import keras.optimizers

from sklearn.metrics import accuracy_score

from xgboost import DMatrix, cv

def define_nn(trial):
    """Build an uncompiled Keras network whose shape is sampled from `trial`.

    The trial chooses 1-3 hidden blocks; each block is BatchNormalization ->
    Dense(relu) -> Dropout with its own sampled width (`units_<i>`) and
    dropout rate (`rate_<i>`).  A final BatchNormalization and a single
    sigmoid unit form the output.  The input width is read from the global
    `X_train_full`.
    """
    model = keras.Sequential()

    n_layers = trial.suggest_int('n_layers', 1, 3)

    for i in range(n_layers):
        units = trial.suggest_categorical(f'units_{i}', [8, 16, 32, 64, 128])
        rate = trial.suggest_float(f'rate_{i}', 0.2, 0.5)

        # Only the very first layer declares the input shape.
        norm_kwargs = {'input_shape': [X_train_full.shape[1]]} if i == 0 else {}
        model.add(layers.BatchNormalization(**norm_kwargs))
        model.add(layers.Dense(units = units, activation = 'relu'))
        model.add(layers.Dropout(rate = rate))

    model.add(layers.BatchNormalization())
    model.add(layers.Dense(units = 1, activation = 'sigmoid'))

    return model

def define_cb(trial):
    """Build a CatBoostClassifier whose hyperparameters are sampled from `trial`.

    `bagging_temperature` is only sampled for the Bayesian bootstrap and
    `subsample` only for the Bernoulli bootstrap; MVS adds nothing extra.
    """
    params = dict(
        objective = trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        colsample_bylevel = trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        depth = trial.suggest_int("depth", 1, 12),
        boosting_type = trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        bootstrap_type = trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        # 100, 150, ..., 2000 trees
        n_estimators = trial.suggest_int('n_estimators', 100, 2000, step = 50),
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log = True),
        l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1e-5, 1e-1, log = True),
        min_child_samples = trial.suggest_int('min_child_samples', 2, 20),
        random_strength = trial.suggest_float('random_strength', 0.05, 1, log = True),
    )

    bootstrap = params["bootstrap_type"]
    if bootstrap == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif bootstrap == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    return CatBoostClassifier(**params)

def define_lgbm(trial):
    """Build an LGBMClassifier whose hyperparameters are sampled from `trial`.

    Objective, metric, verbosity and boosting type are fixed; regularisation,
    tree shape, sampling fractions, tree count and learning rate are tuned.
    """
    fixed = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }

    tuned = {
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        'max_depth': trial.suggest_int('max_depth', 2, 12),
        # 100, 150, ..., 2000 trees
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000, step = 50),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log = True),
    }

    return LGBMClassifier(**fixed, **tuned)

def define_logreg(trial):
    """Build a saga-solver LogisticRegression sampled from `trial`.

    Tolerance, inverse regularisation strength, intercept, iteration cap,
    penalty type and l1_ratio are all tuned.
    """
    fixed = {
        'verbose': 0,
        'solver': 'saga',
    }

    # NOTE(review): l1_ratio is sampled for every penalty, although it is
    # presumably only meaningful for 'elasticnet' - confirm against the
    # installed scikit-learn version.
    tuned = {
        'tol': trial.suggest_float('tol', 1e-5, 1e-1, log = True),
        'C': trial.suggest_float('C', 1e-10, 1e10, log = True),
        'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]),
        # 50, 100, ..., 2000 iterations
        'max_iter': trial.suggest_int('max_iter', 50, 2000, step = 50),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet', 'none']),
        'l1_ratio': trial.suggest_float('l1_ratio', 1e-5, 1, log = True),
    }

    return LogisticRegression(**fixed, **tuned)

def define_lsvc(trial):
    """Build a LinearSVC whose tolerance, C, intercept and iteration cap are sampled from `trial`."""
    tuned = {
        'tol': trial.suggest_float('tol', 1e-5, 1e-1, log = True),
        'C': trial.suggest_float('C', 1e-10, 1e10, log = True),
        'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]),
        # 1000, 1050, ..., 3000 iterations
        'max_iter': trial.suggest_int('max_iter', 1000, 3000, step = 50),
    }

    return LinearSVC(verbose = 0, **tuned)

def define_rf(trial):
    """Build a RandomForestClassifier whose tree limits and forest size are sampled from `trial`."""
    max_depth = trial.suggest_int('max_depth', 5, 20)
    max_features = trial.suggest_int('max_features', 5, 15)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    # 50, 75, ..., 1000 trees
    n_estimators = trial.suggest_int('n_estimators', 50, 1000, step = 25)

    return RandomForestClassifier(max_depth = max_depth,
                                  max_features = max_features,
                                  min_samples_leaf = min_samples_leaf,
                                  min_samples_split = min_samples_split,
                                  n_estimators = n_estimators)

def _score_nn_holdout(trial, X_train_full, y_train_full):
    """Fit one Keras candidate on a stratified hold-out split and return its validation accuracy."""
    # A quarter of the rows is used for fitting, the remaining three quarters
    # for validation.
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_full,
                                                          y_train_full,
                                                          stratify = y_train_full,
                                                          train_size = 0.25)

    network = define_nn(trial)
    optimizer_name = trial.suggest_categorical('optimizer', ['Adam', 'RMSprop', 'SGD'])
    lr = trial.suggest_float('lr', 1e-5, 1e-1, log = True)
    # Keras optimizers are configured with the learning rate only; unlike
    # PyTorch they do not take the model (or its parameters) as an argument.
    optimizer = getattr(keras.optimizers, optimizer_name)(learning_rate = lr)

    network.compile(optimizer = optimizer,
                    loss = 'binary_crossentropy',
                    metrics = ['binary_accuracy'])

    epochs = trial.suggest_int('epochs', 10, 1000)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256, 512, 1024])
    network.fit(X_train, y_train,
                batch_size = batch_size,
                epochs = epochs,
                verbose = 0)

    # evaluate() returns [loss, binary_accuracy] for the compile() call above.
    results = network.evaluate(X_valid,
                               y_valid,
                               batch_size = batch_size)

    return results[1]


def _score_estimator_cv(trial, build_model, X_train_full, y_train_full, fit_params = None, k = 5):
    """Return the mean stratified k-fold accuracy of the estimator produced by `build_model(trial)`.

    The running mean is reported to the trial after every fold so the study's
    pruner can stop an unpromising trial early (raises optuna's TrialPruned).
    """
    fit_params = {} if fit_params is None else fit_params
    kf = StratifiedKFold(n_splits = k, shuffle = True, random_state = 1)

    history = []
    for i, (train_idx, test_idx) in enumerate(kf.split(X_train_full, y_train_full)):
        X_train = X_train_full.iloc[train_idx]
        y_train = y_train_full.iloc[train_idx]
        X_valid = X_train_full.iloc[test_idx]
        y_valid = y_train_full.iloc[test_idx]

        # A fresh estimator per fold; Optuna returns the same sampled values
        # for repeated suggest_* calls within one trial.
        estimator = build_model(trial)
        estimator.fit(X_train, y_train, **fit_params)

        preds = estimator.predict(X_valid)
        pred_labels = np.rint(preds)
        history.append(accuracy_score(y_valid, pred_labels))

        trial.report(np.mean(history), i)
        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return np.mean(history)


def _score_xgb_cv(trial, X_train_full, y_train_full, k = 5):
    """Cross-validate an XGBoost candidate with xgboost.cv and return the last mean test AUC.

    The number of boosting rounds kept by early stopping is stored on the
    trial as the user attribute "n_estimators".
    """
    dtrain = DMatrix(X_train_full, label = y_train_full)

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        # `eta` is an alias of `learning_rate` in XGBoost, so only one of the
        # two is sampled; supplying both would make the step size ambiguous.
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log = True),
    }

    if param["booster"] in ("gbtree", "dart"):
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    xgb_cv_results = cv(
        params = param,
        dtrain = dtrain,
        num_boost_round = 10000,
        nfold = k,
        stratified = True,
        early_stopping_rounds = 100,
        verbose_eval = False,
    )

    # Set n_estimators as a trial attribute; Accessible via study.trials_dataframe().
    trial.set_user_attr("n_estimators", len(xgb_cv_results))

    # Extract the score of the last retained boosting round.
    return xgb_cv_results["test-auc-mean"].values[-1]


def objective(trial, model, X_train_full, y_train_full):
    """Optuna objective shared by every model family tuned in this notebook.

    Parameters
    ----------
    trial : the Optuna trial used to sample hyperparameters.
    model : str, one of 'nn', 'cb', 'lgbm', 'logreg', 'lsvc', 'rf', 'xgb'.
    X_train_full, y_train_full : training features and target; they are
        indexed with `.iloc`, so pandas objects are expected.

    Returns
    -------
    The score to maximise: hold-out accuracy for 'nn', mean 5-fold accuracy
    for 'cb', 'lgbm', 'logreg', 'lsvc' and 'rf', mean 5-fold AUC for 'xgb'.

    Raises
    ------
    ValueError if `model` is not one of the keys above.
    optuna.exceptions.TrialPruned if the pruner stops a cross-validated trial.
    """
    # model key -> (estimator factory, extra keyword arguments for fit()).
    # Built at call time so that re-running a define_* cell takes effect.
    # NOTE(review): `verbose` is passed to fit() for CatBoost and LightGBM as
    # before - confirm the installed LightGBM version still accepts it.
    cv_models = {
        'cb': (define_cb, {'verbose': 0}),
        'lgbm': (define_lgbm, {'verbose': 0}),
        'logreg': (define_logreg, {}),
        'lsvc': (define_lsvc, {}),
        'rf': (define_rf, {}),
    }

    if model == 'nn':
        return _score_nn_holdout(trial, X_train_full, y_train_full)

    if model == 'xgb':
        return _score_xgb_cv(trial, X_train_full, y_train_full)

    if model in cv_models:
        build_model, fit_params = cv_models[model]
        return _score_estimator_cv(trial, build_model, X_train_full, y_train_full, fit_params)

    raise ValueError(f"Unknown model key: {model!r}")

In [None]:
# Tune the Keras network: maximise the objective, stopping after 100 trials
# or 600 seconds, whichever comes first.
study_nn = optuna.create_study(direction = 'maximize')
study_nn.optimize(lambda trial: objective(trial, 'nn', X_train_full, y_train_full), n_trials = 100, timeout = 600)

In [None]:
# Report on the Keras study: trial counts by final state, then the best trial.
pruned_trials, complete_trials = (
    study_nn.get_trials(deepcopy=False, states=[state])
    for state in (TrialState.PRUNED, TrialState.COMPLETE)
)

print("Keras NN study statistics: ")
print("  Number of finished trials: ", len(study_nn.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study_nn.best_trial
print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Tune CatBoost: maximise the objective, stopping after 500 trials or
# 600 seconds, whichever comes first.
study_cb = optuna.create_study(direction = 'maximize')
study_cb.optimize(lambda trial: objective(trial, 'cb', X_train_full, y_train_full), n_trials = 500, timeout = 600)

In [None]:
# Report on the CatBoost study: trial counts by final state, then the best trial.
pruned_trials, complete_trials = (
    study_cb.get_trials(deepcopy=False, states=[state])
    for state in (TrialState.PRUNED, TrialState.COMPLETE)
)

print("CatBoost study statistics: ")
print("  Number of finished trials: ", len(study_cb.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study_cb.best_trial
print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Tune LightGBM: maximise the objective, stopping after 500 trials or
# 600 seconds, whichever comes first.
study_lgbm = optuna.create_study(direction = 'maximize')
study_lgbm.optimize(lambda trial: objective(trial, 'lgbm', X_train_full, y_train_full), n_trials = 500, timeout = 600)

In [None]:
# Report on the LightGBM study: trial counts by final state, then the best trial.
pruned_trials, complete_trials = (
    study_lgbm.get_trials(deepcopy=False, states=[state])
    for state in (TrialState.PRUNED, TrialState.COMPLETE)
)

print("LightGBM study statistics: ")
print("  Number of finished trials: ", len(study_lgbm.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study_lgbm.best_trial
print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Tune logistic regression: maximise the objective, stopping after 100 trials
# or 600 seconds, whichever comes first.
study_logreg = optuna.create_study(direction = 'maximize')
study_logreg.optimize(lambda trial: objective(trial, 'logreg', X_train_full, y_train_full), n_trials = 100, timeout = 600)

In [None]:
# Report on the logistic regression study: trial counts by final state, then the best trial.
pruned_trials, complete_trials = (
    study_logreg.get_trials(deepcopy=False, states=[state])
    for state in (TrialState.PRUNED, TrialState.COMPLETE)
)

print("LogRegression study statistics: ")
print("  Number of finished trials: ", len(study_logreg.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study_logreg.best_trial
print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Tune LinearSVC: maximise the objective, stopping after 100 trials or
# 600 seconds, whichever comes first.
study_lsvc = optuna.create_study(direction = 'maximize')
study_lsvc.optimize(lambda trial: objective(trial, 'lsvc', X_train_full, y_train_full), n_trials = 100, timeout = 600)

In [None]:
# Report on the LinearSVC study: trial counts by final state, then the best trial.
pruned_trials, complete_trials = (
    study_lsvc.get_trials(deepcopy=False, states=[state])
    for state in (TrialState.PRUNED, TrialState.COMPLETE)
)

print("LinearSVC study statistics: ")
print("  Number of finished trials: ", len(study_lsvc.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study_lsvc.best_trial
print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Tune the random forest: maximise the objective, stopping after 100 trials
# or 600 seconds, whichever comes first.
study_rf = optuna.create_study(direction = 'maximize')
study_rf.optimize(lambda trial: objective(trial, 'rf', X_train_full, y_train_full), n_trials = 100, timeout = 600)

In [None]:
# Report on the random forest study: trial counts by final state, then the best trial.
pruned_trials, complete_trials = (
    study_rf.get_trials(deepcopy=False, states=[state])
    for state in (TrialState.PRUNED, TrialState.COMPLETE)
)

print("RandomForest study statistics: ")
print("  Number of finished trials: ", len(study_rf.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study_rf.best_trial
print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Tune XGBoost (this objective branch returns the cross-validated AUC):
# stop after 20 trials or 600 seconds, whichever comes first.
study_xgb = optuna.create_study(direction = 'maximize')
study_xgb.optimize(lambda trial: objective(trial, 'xgb', X_train_full, y_train_full), n_trials = 20, timeout = 600)

In [None]:
# Report on the XGBoost study: trial count, best score, its parameters and
# the number of boosting rounds stored on the best trial.
print("Number of finished trials: ", len(study_xgb.trials))
print("Best trial:")
trial = study_xgb.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

print(f"  Number of estimators: {trial.user_attrs['n_estimators']}")

Trying LGBMClassifier with separated male/female data sets

In [None]:
# study_lgbm_male = optuna.create_study(direction = 'maximize')
# study_lgbm_male.optimize(lambda trial: objective(trial, 'lgbm', X_train_full_male, y_train_full_male), n_trials = 500, timeout = 600)

In [None]:
# pruned_trials = study_lgbm_male.get_trials(deepcopy=False, states=[TrialState.PRUNED])
# complete_trials = study_lgbm_male.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

# print("LightGBM male study statistics: ")
# print("  Number of finished trials: ", len(study_lgbm_male.trials))
# print("  Number of pruned trials: ", len(pruned_trials))
# print("  Number of complete trials: ", len(complete_trials))

# print("Best trial:")
# trial = study_lgbm_male.best_trial

# print("  Value: ", trial.value)

# print("  Params: ")
# for key, value in trial.params.items():
#     print("    {}: {}".format(key, value))

In [None]:
# study_lgbm_female = optuna.create_study(direction = 'maximize')
# study_lgbm_female.optimize(lambda trial: objective(trial, 'lgbm', X_train_full_female, y_train_full_female), n_trials = 500, timeout = 600)

In [None]:
# pruned_trials = study_lgbm_female.get_trials(deepcopy=False, states=[TrialState.PRUNED])
# complete_trials = study_lgbm_female.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

# print("LightGBM female study statistics: ")
# print("  Number of finished trials: ", len(study_lgbm_female.trials))
# print("  Number of pruned trials: ", len(pruned_trials))
# print("  Number of complete trials: ", len(complete_trials))

# print("Best trial:")
# trial = study_lgbm_female.best_trial

# print("  Value: ", trial.value)

# print("  Params: ")
# for key, value in trial.params.items():
#     print("    {}: {}".format(key, value))

# 5.4 Creating tuned models

In [None]:
# Hyperparameters of the best trial in the Keras study.
study_nn.best_params

In [None]:
# Rebuild the best network found by the Keras study and fit it on the full
# training set.
best_nn_params = study_nn.best_params

nn = keras.Sequential()

for i in range(best_nn_params['n_layers']):
    # Only the very first layer declares the input shape.
    if i == 0:
        nn.add(layers.BatchNormalization(input_shape = [X_train_full.shape[1]]))
    else:
        nn.add(layers.BatchNormalization())
    nn.add(layers.Dense(units = best_nn_params[f'units_{i}'], activation = 'relu'))
    nn.add(layers.Dropout(rate = best_nn_params[f'rate_{i}']))

nn.add(layers.BatchNormalization())
nn.add(layers.Dense(units = 1, activation = 'sigmoid'))

# Keras optimizers are configured with the learning rate only; unlike PyTorch
# they do not take the model (or its parameters) as an argument.
opt = getattr(keras.optimizers, best_nn_params['optimizer'])(learning_rate = best_nn_params['lr'])

nn.compile(optimizer = opt, 
           loss = 'binary_crossentropy', 
           metrics = ['binary_accuracy'])

nn.fit(X_train_full, y_train_full,
       batch_size = best_nn_params['batch_size'],
       epochs = best_nn_params['epochs'],
       verbose = 0,)

In [None]:
# Re-create every model with the best hyperparameters found by its study.
lgbm = LGBMClassifier(**study_lgbm.best_params)
cb = CatBoostClassifier(allow_writing_files = False, logging_level = 'Silent', **study_cb.best_params)
lsvc = LinearSVC(**study_lsvc.best_params)
# The solver was fixed (not sampled) during tuning, so it is passed explicitly.
logreg = LogisticRegression(solver = 'saga', **study_logreg.best_params)
rf = RandomForestClassifier(**study_rf.best_params)
# Read the boosting-round count from the XGBoost study itself rather than from
# the global `trial`, which holds whichever study's best trial was printed last.
xgb = XGBClassifier(n_estimators = study_xgb.best_trial.user_attrs["n_estimators"], **study_xgb.best_params)

In [None]:
# Fit each tuned model on the full training set.
cb.fit(X_train_full, y_train_full)
lgbm.fit(X_train_full, y_train_full)
lsvc.fit(X_train_full, y_train_full)
logreg.fit(X_train_full, y_train_full)
rf.fit(X_train_full, y_train_full)
xgb.fit(X_train_full, y_train_full)

In [None]:
# Test-set predictions of every tuned model, in the same order as they were fitted.
(predictions_cb,
 predictions_lgbm,
 predictions_lsvc,
 predictions_logreg,
 predictions_rf,
 predictions_xgb) = (fitted.predict(X_test) for fitted in (cb, lgbm, lsvc, logreg, rf, xgb))

In [None]:
# Tuned network: predict, then turn the outputs into hard 0/1 labels in place,
# cutting at 0.5.  Both masks are taken before writing.
predictions_nn = nn.predict(X_test)
is_positive = predictions_nn > 0.5
is_negative = predictions_nn <= 0.5
predictions_nn[is_positive] = 1
predictions_nn[is_negative] = 0

# 5.5 Voting ensemble

In [None]:
# Empty frame that the following cells fill with per-model predictions.
submission = pd.DataFrame()

In [None]:
# One `pr_<model>` column per model, next to the passenger ids.
submission['PassengerId'] = X_test.index

model_votes = {
    # the network output is 2-D and float, so flatten and cast it first
    'pr_nn': predictions_nn.flatten().astype('int64'),
    'pr_cb': predictions_cb,
    'pr_lgbm': predictions_lgbm,
    'pr_lsvc': predictions_lsvc,
    'pr_logreg': predictions_logreg,
    'pr_rf': predictions_rf,
    'pr_xgb': predictions_xgb,
}
for column, votes in model_votes.items():
    submission[column] = votes

In [None]:
# How many rows received 0, 1, ... positive votes across the `pr_` columns.
vote_columns = [col for col in submission.columns if col.startswith('pr_')]
submission[vote_columns].sum(axis = 1).value_counts()

In [None]:
# Majority vote: a row is labelled 1 when at least 4 of the `pr_` columns say 1.
vote_columns = [col for col in submission.columns if col.startswith('pr_')]
positive_votes = submission[vote_columns].sum(axis=1)
submission['Survived'] = (positive_votes >= 4).astype(int)
submission

In [None]:
# Write the majority-vote labels to disk without the frame's index.
submission[['PassengerId', 'Survived']].to_csv('submission_voting.csv', index = False)

In [None]:
# Stand-alone submission built from the LinearSVC predictions only.
submission_svc = pd.DataFrame({'PassengerId': X_test.index}).assign(Survived = submission.pr_lsvc)
submission_svc.to_csv('submission_svc.csv', index = False)

In [None]:
# Stand-alone submission built from the CatBoost predictions only.
submission_cat = pd.DataFrame({'PassengerId': X_test.index}).assign(Survived = submission.pr_cb)
submission_cat.to_csv('submission_cat.csv', index = False)

In [None]:
# Stand-alone submission built from the neural network predictions only.
submission_nn = pd.DataFrame({'PassengerId': X_test.index}).assign(Survived = submission.pr_nn)
submission_nn.to_csv('submission_nn.csv', index = False)

In [None]:
# Stand-alone submission built from the LightGBM predictions only.
submission_lgbm = pd.DataFrame({'PassengerId': X_test.index}).assign(Survived = submission.pr_lgbm)
submission_lgbm.to_csv('submission_lgbm.csv', index = False)

# 5.6 Stacking ensemble

Decided not to use it, because it takes too much time to run and doesn't improve accuracy.

In [None]:
# Disabled stacking-ensemble experiment, kept commented out for reference.
# Outline: wrap the Keras network in KerasClassifier so scikit-learn can use it,
# rebuild the other models from their studies' best_params, then stack all seven
# with logreg as the final estimator (5-fold CV) and predict on X_test.

# from sklearn.ensemble import StackingClassifier, VotingClassifier
# from keras.wrappers.scikit_learn import KerasClassifier

# def NeuralNetwork():
#     nn = keras.Sequential()

#     for i in range(study_nn.best_params['n_layers']):
#         if (i == 0):
#             nn.add(layers.BatchNormalization(input_shape = [X_train_full.shape[1]]))
#             nn.add(layers.Dense(units = study_nn.best_params['units_0'], activation = 'relu'))
#             nn.add(layers.Dropout(rate = study_nn.best_params['rate_0']))
#         else:
#             str_units = 'units_' + str(i)
#             str_rate = 'rate_' + str(i)
#             nn.add(layers.BatchNormalization())
#             nn.add(layers.Dense(units = study_nn.best_params[str_units], activation = 'relu'))
#             nn.add(layers.Dropout(rate = study_nn.best_params[str_rate]))

#     nn.add(layers.BatchNormalization())
#     nn.add(layers.Dense(units = 1, activation = 'sigmoid'))

# NOTE(review): the next line passes nn.optimizer as the first positional argument
# to the optimizer constructor, but nn has not been compiled at this point —
# this looks unintended; confirm before re-enabling.
#     opt = getattr(keras.optimizers, study_nn.best_params['optimizer'])(nn.optimizer, lr = study_nn.best_params['lr'])

#     nn.compile(optimizer = opt, 
#                loss = 'binary_crossentropy', 
#                metrics = ['binary_accuracy'])
    
#     return nn

# nn = KerasClassifier(build_fn = NeuralNetwork, 
#                      epochs = study_nn.best_params['epochs'],
#                      batch_size = study_nn.best_params['batch_size'], 
#                      verbose = 0)

# nn._estimator_type = 'classifier'

# lgbm = LGBMClassifier(**study_lgbm.best_params)
# cb = CatBoostClassifier(allow_writing_files = False, logging_level = 'Silent', **study_cb.best_params)
# lsvc = LinearSVC(**study_lsvc.best_params)
# logreg = LogisticRegression(solver = 'saga', **study_logreg.best_params)
# rf = RandomForestClassifier(**study_rf.best_params)
# NOTE(review): 'trial' is not defined in this cell — presumably
# study_xgb.best_trial.user_attrs was intended; verify before re-enabling.
# xgb = XGBClassifier(n_estimators = trial.user_attrs["n_estimators"], **study_xgb.best_params)

# estimators = [
#     ('0', nn),
#     ('1', cb),
#     ('2', lgbm),
#     ('3', lsvc),
#     ('4', logreg),
#     ('5', rf),
#     ('6', xgb),
# ]

# stacked = StackingClassifier(estimators = estimators, final_estimator = logreg, 
#                              verbose = 0, cv = 5)
# stacked.fit(X_train_full, y_train_full)

# predictions = stacked.predict(X_test)

In [None]:
# Disabled along with the stacking cell: would write the stacked model's
# predictions to submission_stacked.csv.
# submission_stacked = pd.DataFrame({'PassengerId': X_test.index, 
#                                    'Survived': predictions})
# submission_stacked.to_csv('submission_stacked.csv', index = False)