# № 0. Getting started

### 1. Basic libraries

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns

import matplotlib as mpl
import matplotlib.pyplot as plt

Some global parameters for plots:

In [None]:
# Shared look for every figure in the notebook: thin white grid lines on a
# light-grey axes background, white figure background, black labels and ticks.
theme_rc = {
    'grid.linewidth': 0.6,
    'grid.color': 'white',
    'axes.linewidth': 1,
    'axes.facecolor': '#ECECEC',
    'axes.labelcolor': '#000000',
    'figure.facecolor': 'white',
    'xtick.color': '#000000',
    'ytick.color': '#000000',
}

sns.set_theme(rc = theme_rc)

### 2. Data

In [None]:
# Read the training and test splits into separate frames.
train_path, test_path = '../input/titanic/train.csv', '../input/titanic/test.csv'

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# № 1. Preliminary analysis and missing values

<h2><center> The first glimpse </center></h2>

In [None]:
# Preview the first two rows of the training set.
df_train.head(2)

In [None]:
# Preview the first two rows of the test set.
df_test.head(2)

<h2><center> Getting some info about columns </center></h2>

### 1. Comparing column names

In [None]:
# Column names that exist in the training set but not in the test set.
train_only_columns = df_train.columns.difference(df_test.columns)
train_only_columns.tolist()

### 2. A handmade <span style="color:#E85E40"> info() </span> function

In [None]:
def _summarise_columns(frame):
    '''Return one row per column of `frame`: name, non-null count,
    number of unique values and dtype.
    '''
    summary = pd.DataFrame(zip(frame.columns, frame.count(),
                               frame.nunique(), frame.dtypes))
    summary.columns = ['Column', 'Count', 'Unique values', 'Dtype']
    return summary


train_info = _summarise_columns(df_train)
test_info = _summarise_columns(df_test)

# Show both summaries side by side under the top-level keys 'Train' / 'Test'.
pd.concat([train_info, test_info], axis = 1, join = 'outer',
          keys = ['Train', 'Test'], ignore_index = False)

It is quite handy when you can see all at once: column names, counts, unique counts and data types. I bet there is a function for it; however, I just wanted to practice a little bit.

### 3. Getting unique values of each categorical variable

If you want to make sure that categorical features do not have any incorrectly labelled values, you should check what unique categories they have. In this case, it was done via creating tables with possible values for each variable.

In [None]:
from IPython.core.display import HTML

def multi_table(table_list):
    '''Render several tables side by side.

    Accepts a list of objects that expose `_repr_html_()` and returns a
    single `HTML` object: a one-row HTML table in which every input table
    occupies its own cell.
    '''
    cells = []
    for table in table_list:
        cells.append('<td>' + table._repr_html_() + '</td>')

    markup = ('<table><tr style="background-color:white;">'
              + ''.join(cells)
              + '</tr></table>')
    return HTML(markup)

In [None]:
# One value-count table per categorical variable, keyed by column name.
df_nunique = {}
for var in ('Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked'):
    level_counts = df_train[var].value_counts()
    df_nunique[var] = pd.DataFrame(level_counts)

In [None]:
# Display the value-count tables next to each other in a fixed order.
nunique_order = ('Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked')
multi_table([df_nunique[var] for var in nunique_order])

### 4. Changing data type of some columns

In [None]:
# Cast the same four columns to the pandas `category` dtype in both frames.
for frame in (df_train, df_test):
    for column in ('Sex', 'SibSp', 'Parch', 'Embarked'):
        frame[column] = frame[column].astype('category')

<h2><center> Missing values </center></h2>

In [None]:
# Share of missing values per training column, in percent,
# largest first and rounded to one decimal.
train_missing_pct = df_train.isnull().sum() / len(df_train) * 100
train_missing_pct.sort_values(ascending = False).round(1)

In [None]:
# Share of missing values per test column, in percent,
# largest first and rounded to one decimal.
test_missing_pct = df_test.isnull().sum() / len(df_test) * 100
test_missing_pct.sort_values(ascending = False).round(1)

### 1. Dealing with "Cabin"

A decision was made to remove "Cabin" since it had plenty of NaN values (almost 80%). It was unlikely that this feature would be helpful.

In [None]:
# Remove the "Cabin" column from both frames (in place).
for frame in (df_train, df_test):
    frame.drop(columns = 'Cabin', inplace = True)

### 2. Dealing with "Age"

#### 2.1 Training set 

NaNs in "Age" were replaced with group-wise median values. In order to increase the accuracy, "Age" needed to be broken down into smaller categories which were yet to be defined. Needless to say, these categories had to establish a meaningful connection with "Age" to be useful. To make my analysis easier, I plotted some graphs, combining <span style="color:#E85E40"> violinplots </span> with <span style="color:#E85E40"> stripplots </span>.

In [None]:
# Rows without any missing value; used only for the exploratory Age plots.
train_no_NA = df_train.dropna()

# Variables to plot "Age" against, in panel order.
age_plot_columns = ['Sex', 'SibSp', 'Parch', 'Embarked', 'Pclass', 'Survived']
train_cat_visual_0 = train_no_NA[age_plot_columns].columns.tolist()

In [None]:
# "Age" against each variable in `train_cat_visual_0`, one panel per variable:
# a violin plot, the raw points (strip plot) and a point plot of the per-level
# median with an SD-based error bar.
with plt.rc_context(rc = {'figure.dpi': 300, 'axes.labelsize': 8, 
                          'xtick.labelsize': 7, 'ytick.labelsize': 7}):

    fig, ax = plt.subplots(2, 3, figsize = (10, 6))
    panels = ax.flatten()

    for column, axes in zip(train_cat_visual_0, panels):

        sns.violinplot(ax = axes, x = train_no_NA[column], 
                       y = train_no_NA['Age'],
                       scale = 'width', linewidth = 0.5, 
                       palette = 'viridis', inner = None)

        # Only the violins exist on the panel at this point, so only they fade.
        plt.setp(axes.collections, alpha = 0.3)

        sns.stripplot(ax = axes, x = train_no_NA[column], 
                      y = train_no_NA['Age'],
                      palette = 'viridis', alpha = 0.9, 
                      s = 1.5, jitter = 0.1)

        sns.pointplot(ax = axes, x = train_no_NA[column],
                      y = train_no_NA['Age'],
                      color = '#ff5736', scale = 0.25,
                      estimator = np.median, ci = 'sd',
                      errwidth = 0.5, capsize = 0.15, join = True)

        # Give every line and collection on the panel the same high z-order.
        plt.setp(axes.lines, zorder = 100)
        plt.setp(axes.collections, zorder = 100)

    # Hide the panels that received no variable. A plain slice replaces the
    # former `for ... else` + `indx` construct, which raised NameError when
    # the variable list was empty.
    for unused_axes in panels[len(train_cat_visual_0):]:
        unused_axes.set_visible(False)

plt.tight_layout()
plt.show()

Analysing these graphs, I concluded that variables such as "Survived", "Pclass", "Sex" and "SibSp" were valuable in predicting "Age". These features established a more or less clear connection with "Age" and, on top of that, the standard deviations were not too high, compared to other variables, which was crucial as I could be more certain that imputed values would be accurate enough. Note, I didn’t use "Survived" because it was not available in the test set.

"Pclass", "Sex" and "SibSp" were chosen for predicting "Age". 

In [None]:
age_group_keys = ['Pclass', 'Sex', 'SibSp']

# Number of passengers in every (Pclass, Sex, SibSp) group ...
Pclass_count_train = df_train.groupby(age_group_keys).size().to_frame('Count')

# ... and the median and standard deviation of "Age" per group, rounded to 0 decimals.
Pclass_mean_std_train = pd.pivot_table(
    df_train, index = age_group_keys, values = 'Age',
    aggfunc = (np.median, np.std)).round(0)

Pclass_count_train.join(Pclass_mean_std_train, how = 'outer')

Since some groups had NaN values, I performed the imputation in two passes. In the second pass I removed "SibSp" before grouping.

In [None]:
# Pass 1: fill missing ages with the median age of the passenger's
# (Pclass, Sex, SibSp) group; assignment aligns on the row index.
train_age_missing = df_train['Age'].isnull()
train_group_median_age = df_train.groupby(['Pclass', 'Sex', 'SibSp'])['Age'].transform('median')
df_train.loc[train_age_missing, 'Age'] = train_group_median_age

In [None]:
# Pass 2: ages still missing get the median of the coarser (Pclass, Sex) group.
train_age_missing = df_train['Age'].isnull()
train_group_median_age = df_train.groupby(['Pclass', 'Sex'])['Age'].transform('median')
df_train.loc[train_age_missing, 'Age'] = train_group_median_age

#### 2.2 Testing set 

In [None]:
# Pass 1 (test set): fill missing ages with the median age of the
# passenger's (Pclass, Sex, SibSp) group, computed on the test set itself.
test_age_missing = df_test['Age'].isnull()
test_group_median_age = df_test.groupby(['Pclass', 'Sex', 'SibSp'])['Age'].transform('median')
df_test.loc[test_age_missing, 'Age'] = test_group_median_age

In [None]:
# Pass 2 (test set): ages still missing get the median of the (Pclass, Sex) group.
test_age_missing = df_test['Age'].isnull()
test_group_median_age = df_test.groupby(['Pclass', 'Sex'])['Age'].transform('median')
df_test.loc[test_age_missing, 'Age'] = test_group_median_age

### 3. Dealing with "Embarked"

In [None]:
# Training rows where "Embarked" is missing.
embarked_missing = df_train['Embarked'].isnull()
df_train[embarked_missing]

Since there were only two NaN values, it made sense not to overthink it and replace values with the mode.

In [None]:
# Replace missing "Embarked" values with the most frequent port.
# Assign the result back instead of calling `fillna(..., inplace = True)` on
# the selected column: the chained in-place form is deprecated in pandas 2.x
# and leaves `df_train` unchanged when Copy-on-Write is enabled.
df_train['Embarked'] = df_train['Embarked'].fillna(df_train['Embarked'].mode()[0])

### 4. Dealing with "Fare"

In [None]:
# Test rows where "Fare" is missing.
fare_missing = df_test['Fare'].isnull()
df_test[fare_missing]

I went for the same strategy for "Fare". The only NaN value was replaced with the mean.

In [None]:
# Replace the missing "Fare" with the mean fare of the test set.
# Assign the result back instead of calling `fillna(..., inplace = True)` on
# the selected column: the chained in-place form is deprecated in pandas 2.x
# and leaves `df_test` unchanged when Copy-on-Write is enabled.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean())

# № 2. Further analysis and feature engineering

<h2><center> Getting rid of some columns </center></h2>

In [None]:
# Remove "PassengerId" and "Name" from both frames (in place).
for frame in (df_train, df_test):
    for unused_column in ('PassengerId', 'Name'):
        frame.drop(columns = unused_column, inplace = True)

<h2><center> Exploring categorical variables </center></h2>

### 1. Plotting categorical variables

In [None]:
# Start from every object/category column of the training set ...
categorical_columns = df_train.select_dtypes(include = ['object', 'category']).columns
train_cat_visual_1 = categorical_columns.tolist()

# ... leave "Ticket" out of the plots and add "Pclass" at the end.
train_cat_visual_1.remove('Ticket')
train_cat_visual_1 += ['Pclass']

In [None]:
# Two-colour palette for the "Survived" hue (reused by later cells).
my_palette_0 = ['#481567FF', '#238A8DFF']

# One count plot per variable in `train_cat_visual_1`, split by "Survived".
with plt.rc_context(rc = {'figure.dpi': 250, 'axes.labelsize': 6.5, 
                          'xtick.labelsize': 5.5, 'ytick.labelsize': 5.5,
                          'legend.fontsize': 5.5, 'legend.title_fontsize': 6}):

    fig, ax = plt.subplots(2, 3, figsize = (8, 5))
    panels = ax.flatten()

    for column, axes in zip(train_cat_visual_1, panels):

        sns.countplot(ax = axes, x = df_train[column], hue = df_train['Survived'], 
                      palette = my_palette_0, alpha = 0.8)

    # Hide the panels that received no variable. A plain slice replaces the
    # former `for ... else` + `indx` construct, which raised NameError when
    # the variable list was empty.
    for unused_axes in panels[len(train_cat_visual_1):]:
        unused_axes.set_visible(False)

    # Re-create the legend of the second and third panel in the upper right corner.
    panels[1].legend(title = 'Survived', loc = 'upper right')
    panels[2].legend(title = 'Survived', loc = 'upper right')

plt.tight_layout()
plt.show()

### 2. Grouped tables for categorical variables

In [None]:
# For every categorical variable: passenger counts per (level, Survived) pair.
df_groupby = {}
for var in ('Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked'):
    survival_counts = df_train.groupby([var, 'Survived']).size()
    df_groupby[var] = pd.DataFrame(survival_counts)

In [None]:
# Display the grouped tables next to each other in a fixed order.
groupby_order = ('Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked')
multi_table([df_groupby[var] for var in groupby_order])

### 3. Unbalanced features

After exploring graphs and tables, I could clearly see that "SibSp" and "Parch" had plenty of levels that contained just a few values. These features were imbalanced, so it made sense to first combine them and then regroup the new variable.

In [None]:
# Family size = siblings/spouses + parents/children + the passenger.
# Both columns are cast to int first because they were stored as categories.
for frame in (df_train, df_test):
    frame['Family_size'] = frame['SibSp'].astype('int') + frame['Parch'].astype('int') + 1

In [None]:
# Passenger counts per family size, split by "Survived".
family_plot_rc = {'figure.dpi': 110, 'axes.labelsize': 8,
                  'xtick.labelsize': 7, 'ytick.labelsize': 7,
                  'legend.fontsize': 7, 'legend.title_fontsize': 7.5}

with plt.rc_context(rc = family_plot_rc):

    fig, ax = plt.subplots(1, 1, figsize = (6, 3.5))

    sns.countplot(ax = ax, x = df_train['Family_size'],
                  hue = df_train['Survived'],
                  palette = my_palette_0, alpha = 0.8)

    ax.legend(loc = 'upper right', title = 'Survived')

plt.show()

In [None]:
# Boolean masks that bucket the training "Family_size" into four groups:
# 1, 2-3, 4 and more than 4 members (same order as `values_0`).
train_family_size = df_train['Family_size']

conditions_0 = [train_family_size.eq(1),
                train_family_size.ge(2) & train_family_size.lt(4),
                train_family_size.eq(4),
                train_family_size.gt(4)]

values_0 = ['Alone', 'Small', 'Medium', 'Large']

In [None]:
# Map each training row to the label of the first matching condition.
# `default` is given as the string '0': with string choices the implicit
# integer default 0 has no common dtype and raises TypeError on recent NumPy,
# while older NumPy silently converted it to '0' - so this keeps the old
# output and works on both.
df_train['Family_s'] = np.select(conditions_0, values_0, default = '0')

In [None]:
# Boolean masks that bucket the test "Family_size" into four groups:
# 1, 2-3, 4 and more than 4 members (same order as `values_1`).
test_family_size = df_test['Family_size']

conditions_1 = [test_family_size.eq(1),
                test_family_size.ge(2) & test_family_size.lt(4),
                test_family_size.eq(4),
                test_family_size.gt(4)]

values_1 = ['Alone', 'Small', 'Medium', 'Large']

In [None]:
# Map each test row to the label of the first matching condition.
# `default` is given as the string '0': with string choices the implicit
# integer default 0 has no common dtype and raises TypeError on recent NumPy,
# while older NumPy silently converted it to '0' - so this keeps the old
# output and works on both.
df_test['Family_s'] = np.select(conditions_1, values_1, default = '0')

In [None]:
# Passenger counts per family-size bucket, split by "Survived".
family_bucket_rc = {'figure.dpi': 110, 'axes.labelsize': 8,
                    'xtick.labelsize': 7, 'ytick.labelsize': 7,
                    'legend.fontsize': 7, 'legend.title_fontsize': 7.5}

with plt.rc_context(rc = family_bucket_rc):

    fig, ax = plt.subplots(1, 1, figsize = (6, 3.5))

    sns.countplot(ax = ax, x = df_train['Family_s'],
                  hue = df_train['Survived'],
                  palette = my_palette_0, alpha = 0.8)

    plt.show()

Lastly, I dropped unnecessary features.

In [None]:
# "Family_s" replaces these three columns, so drop them from both frames (in place).
for frame in (df_train, df_test):
    frame.drop(columns = ['SibSp', 'Parch', 'Family_size'], inplace = True)

In [None]:
# Preview the first two rows of the training set after the column changes.
df_train.head(2)

### 4. High cardinality features

When you encounter a variable with too many unique values, it can be useful to create a new variable with counts / frequencies instead of the categories. "Ticket" was transformed this way.

In [None]:
# Count encoding: replace each ticket by the number of training rows sharing it.
train_rows_per_ticket = df_train.groupby('Ticket').size()
Ticket_count_map_0 = train_rows_per_ticket.to_dict()

df_train['Ticket_count'] = df_train['Ticket'].map(Ticket_count_map_0)

In [None]:
# Distribution of the new "Ticket_count" feature in the training set.
ticket_plot_rc = {'figure.dpi': 110, 'axes.labelsize': 8,
                  'xtick.labelsize': 7, 'ytick.labelsize': 7}

with plt.rc_context(rc = ticket_plot_rc):

    fig, ax = plt.subplots(1, 1, figsize = (6, 3.5))

    sns.countplot(ax = ax, x = df_train['Ticket_count'],
                  palette = 'viridis', alpha = 0.85)

    plt.show()

So, instead of 681 unique values we got only 7:

In [None]:
# Number of distinct ticket values in the training set.
df_train['Ticket'].nunique()

In [None]:
# Number of distinct values of the count-encoded feature.
df_train['Ticket_count'].nunique()

The same was done for the test set.

In [None]:
# Count encoding for the test set, using counts computed on the test set itself.
test_rows_per_ticket = df_test.groupby('Ticket').size()
Ticket_count_map_1 = test_rows_per_ticket.to_dict()

df_test['Ticket_count'] = df_test['Ticket'].map(Ticket_count_map_1)

In [None]:
# "Ticket_count" replaces the raw "Ticket" column in both frames (dropped in place).
for frame in (df_train, df_test):
    frame.drop(columns = 'Ticket', inplace = True)

<h2><center> Exploring numeric variables </center></h2>

In [None]:
# Summary statistics of the float64 columns, one row per column, one decimal.
train_float_columns = df_train.select_dtypes(include = ['float64'])
train_float_columns.describe().transpose().round(1)

In [None]:
# Names of the float64 columns, used as the panel list for the numeric plots.
float_column_index = df_train.select_dtypes(include = ['float64']).columns
train_num_visual_0 = float_column_index.tolist()

In [None]:
# Each numeric variable against the row index, coloured by "Survived".
with plt.rc_context(rc = {'figure.dpi': 200, 'axes.labelsize': 8, 
                          'xtick.labelsize': 7, 'ytick.labelsize': 7,
                          'legend.fontsize': 7, 'legend.title_fontsize': 7.5}):

    fig, ax = plt.subplots(1, 2, figsize = (12, 4))
    panels = ax.flatten()

    for column, axes in zip(train_num_visual_0, panels):

        sns.scatterplot(ax = axes, y = df_train[column].index, x = df_train[column], 
                        hue = df_train['Survived'], palette = my_palette_0, s = 15)

    # Hide the panels that received no variable. A plain slice replaces the
    # former `for ... else` + `indx` construct, which raised NameError when
    # the variable list was empty.
    for unused_axes in panels[len(train_num_visual_0):]:
        unused_axes.set_visible(False)

    # Re-create the legend of the second panel in the upper right corner.
    ax[1].legend(title = 'Survived', loc = 'upper right')

    plt.tight_layout()
    plt.show()

In [None]:
# Stacked histograms of log(x + 1) for each numeric variable, split by
# "Survived", with vertical lines at the minimum, quartiles and maximum.
with plt.rc_context(rc = {'figure.dpi': 200, 'axes.labelsize': 8, 
                          'xtick.labelsize': 7, 'ytick.labelsize': 7,
                          'legend.fontsize': 7, 'legend.title_fontsize': 7.5}):

    fig, ax = plt.subplots(1, 2, figsize = (12, 4))

    for column, axes in zip(train_num_visual_0, ax.flatten()):

        # Transform once and reuse it for both the histogram and the quantiles.
        log_values = np.log(df_train[column] + 1)

        sns.histplot(ax = axes, x = log_values, hue = df_train['Survived'], 
                     palette = my_palette_0, alpha = 0.8, multiple = 'stack')

        # sns.histplot has some issues with its legend, so it is rebuilt by hand.
        # Matplotlib 3.7 renamed `Legend.legendHandles` to `legend_handles` and
        # later removed the old name; support both.
        legend = axes.get_legend()
        handles = (legend.legend_handles if hasattr(legend, 'legend_handles')
                   else legend.legendHandles)
        legend.remove()
        axes.legend(handles, ['0', '1'], title = 'Survived', loc = 'upper right')

        Quantiles = np.quantile(log_values, [0, 0.25, 0.50, 0.75, 1])

        for q in Quantiles:
            axes.axvline(x = q, linewidth = 0.8, color = '#ff5736')

plt.tight_layout()
plt.show()

I could see that "Fare" was significantly skewed. Nonetheless, I didn’t want to use binning in this case, as I didn’t see a clear way of doing it. It seemed a little bit arbitrary. Frankly speaking, I thought that leaving these variables as they were would be a better idea than somehow "rebuilding" them.

<h2><center> Encoding </center></h2>

### 1. Encoding ordinal features

I treated "Pclass" as an ordinal variable due to the fact that it represented socio-economic status. Taking into account that "Pclass" was already correctly mapped, I did not need to encode it.

### 2. Encoding nominal features

#### 2.1 Training set

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Work on copies so the encoded matrices do not touch df_train / df_test.
X_train, X_test = df_train.copy(), df_test.copy()

In [None]:
# One-hot encode the nominal features of the training set.
# NOTE(review): `sparse=` and `get_feature_names` were removed in recent
# scikit-learn releases (replaced by `sparse_output=` and
# `get_feature_names_out`) - confirm the installed version before upgrading.
OHE =  OneHotEncoder(sparse = False, handle_unknown = 'ignore')

# Fit the encoder on the training data. The encoded frame reuses the row index
# of X_train so the concat below pairs rows correctly even if the index is not
# the default 0..n-1 range (a fresh RangeIndex would misalign them).
X_train_OHE = pd.DataFrame(
    OHE.fit_transform(X_train[['Sex', 'Embarked', 'Family_s']]),
    columns = OHE.get_feature_names(['Sex', 'Embarked', 'Family_s']),
    index = X_train.index)

# Drop the raw nominal columns and the target, then append the dummy columns.
X_train.drop(['Sex', 'Embarked', 'Family_s', 'Survived'], axis = 1, 
             inplace = True)

X_train = pd.concat([X_train, X_train_OHE], axis = 1)

In [None]:
# Target vector: an independent copy of the "Survived" column.
y = df_train['Survived'].copy()

#### 2.2 Testing set

In [None]:
# Encode the test set with the encoder that was fitted on the training set.
# `transform` (not `fit_transform`) keeps the categories and column order
# learned from the training data; re-fitting on the test set would derive them
# from the test data instead and make `handle_unknown = 'ignore'` pointless.
# The encoded frame reuses the row index of X_test so the concat pairs rows correctly.
X_test_OHE = pd.DataFrame(
    OHE.transform(X_test[['Sex', 'Embarked', 'Family_s']]),
    columns = OHE.get_feature_names(['Sex', 'Embarked', 'Family_s']),
    index = X_test.index)

# Drop the raw nominal columns, then append the dummy columns.
X_test.drop(['Sex', 'Embarked', 'Family_s'], axis = 1, inplace = True)

X_test = pd.concat([X_test, X_test_OHE], axis = 1)

# № 3. Training models

<div style = "color: #000000;
             display: fill;
             padding: 8px;
             border-radius: 5px;
             border-style: solid;
             border-color: #a63700;
             background-color: rgba(235, 125, 66, 0.3)">
    
<span style = "font-size: 20px; font-weight: bold">Note:</span> 
<span style="font-size: 15px">I used a sequential grid search because the data set was quite small; nevertheless, it still took me quite some time to finish parameter tuning. On the other hand, a randomized search that I did not cover in this notebook (but heavily relied on in my other <a href="https://www.kaggle.com/suprematism/top-7-useful-graphs-and-encoding-techniques">notebook</a>) gave me nearly the same leaderboard scores. So, by and large, I guess you should not opt for a grid search.</span>
</div>

### 1. XGBoost

I built XGBoost using early stopping and I also tried plotting some graphs. Hopefully, they can be helpful to readers.

In [None]:
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
# Hold out 20% of the labelled data; the seed makes the split repeatable.
split_parts = train_test_split(X_train, y, test_size = 0.2, random_state = 999)
x_train, x_test, y_train, y_test = split_parts

#### 1.1 Tuning "learning_rate" and "n_estimators"

In [None]:
# Cross-validation scheme: stratified 10-fold, repeated 3 times, seeded.
CV = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 999)

XGB_0 = xgb.XGBClassifier(use_label_encoder = False,
                          objective = 'binary:logistic')

# Stage 1: search over "n_estimators" and "learning_rate";
# every other parameter is pinned to a single value.
XGB_param_0 = dict(n_estimators = [20, 50, 100, 200, 300],
                   learning_rate = [0.01, 0.05, 0.07, 0.1],
                   max_depth = [4],
                   min_child_weight = [4],
                   gamma = [0.2],
                   subsample = [0.9],
                   colsample_bytree = [0.9])

XGB_grid_0 = GridSearchCV(estimator = XGB_0, param_grid = XGB_param_0,
                          scoring = 'neg_log_loss', cv = CV,
                          verbose = False) # n_jobs = 6

# Early stopping monitors log-loss on the hold-out split (x_test, y_test).
XGB_tuned_0 = XGB_grid_0.fit(x_train, y_train,
                             eval_set = [[x_test, y_test]],
                             eval_metric = 'logloss',
                             early_stopping_rounds = 15,
                             verbose = False)

In [None]:
# Best mean CV score; the scorer is negated log-loss, so flip the sign.
round(-XGB_tuned_0.best_score_, 3)

In [None]:
# Log-loss of the best stage-1 model on the hold-out split.
holdout_proba_0 = XGB_tuned_0.predict_proba(x_test)
round(log_loss(y_test, holdout_proba_0), 3)

In [None]:
# Keep the columns needed for plotting and turn negated log-loss back into log-loss.
xgb_0_result_columns = ['mean_test_score', 'param_n_estimators', 'param_learning_rate']
XGB_tuned_0_results = pd.DataFrame(XGB_tuned_0.cv_results_)[xgb_0_result_columns]

XGB_tuned_0_results['mean_test_score'] = -XGB_tuned_0_results['mean_test_score']

In [None]:
# Mean CV log-loss against "n_estimators", one line per learning rate.
xgb_0_plot_rc = {'figure.dpi': 110, 'axes.labelsize': 8,
                 'xtick.labelsize': 7, 'ytick.labelsize': 7,
                 'legend.fontsize': 7, 'legend.title_fontsize': 7.5}

with plt.rc_context(rc = xgb_0_plot_rc):

    fig, ax = plt.subplots(1, 1, figsize = (7, 4))

    sns.lineplot(ax = ax, data = XGB_tuned_0_results,
                 x = 'param_n_estimators', y = 'mean_test_score',
                 hue = 'param_learning_rate',
                 marker = 'o', palette = 'viridis')

    plt.show()

In [None]:
# Confusion matrix of the best stage-1 model on the hold-out split.
confusion_rc_0 = {'figure.dpi': 110, 'axes.labelsize': 8,
                  'xtick.labelsize': 7, 'ytick.labelsize': 7}

with plt.rc_context(rc = confusion_rc_0):

    fig, ax = plt.subplots(1, 1, figsize = (4, 3))

    holdout_confusion_0 = confusion_matrix(y_test, XGB_tuned_0.predict(x_test))

    sns.heatmap(holdout_confusion_0, ax = ax,
                annot = True, fmt = 'd', cmap = 'YlGnBu',
                annot_kws = {'fontsize': 8})

    plt.show()

In [None]:
# Parameter combination with the best mean CV score in stage 1.
XGB_tuned_0.best_params_

#### 1.2 Tuning "max_depth" and "min_child_weight"

In [None]:
XGB_1 = xgb.XGBClassifier(use_label_encoder = False,
                          objective = 'binary:logistic')

# Stage 2: search over "max_depth" and "min_child_weight";
# the remaining parameters are fixed at the stage-1 winners.
stage_0_best = XGB_tuned_0.best_params_

XGB_param_1 = {name: [stage_0_best.get(name)]
               for name in ('n_estimators', 'learning_rate', 'gamma',
                            'subsample', 'colsample_bytree')}
XGB_param_1['max_depth'] = [4, 5, 6]
XGB_param_1['min_child_weight'] = [2, 3, 4, 5]

XGB_grid_1 = GridSearchCV(estimator = XGB_1, param_grid = XGB_param_1,
                          scoring = 'neg_log_loss', cv = CV,
                          verbose = False) # n_jobs = 6

# Early stopping monitors log-loss on the hold-out split (x_test, y_test).
XGB_tuned_1 = XGB_grid_1.fit(x_train, y_train,
                             eval_set = [[x_test, y_test]],
                             eval_metric = 'logloss',
                             early_stopping_rounds = 15,
                             verbose = False)

In [None]:
# Best mean CV score; the scorer is negated log-loss, so flip the sign.
round(-XGB_tuned_1.best_score_, 3)

In [None]:
# Log-loss of the best stage-2 model on the hold-out split.
holdout_proba_1 = XGB_tuned_1.predict_proba(x_test)
round(log_loss(y_test, holdout_proba_1), 3)

In [None]:
# Keep the columns needed for plotting and turn negated log-loss back into log-loss.
xgb_1_result_columns = ['mean_test_score', 'param_max_depth', 'param_min_child_weight']
XGB_tuned_1_results = pd.DataFrame(XGB_tuned_1.cv_results_)[xgb_1_result_columns]

XGB_tuned_1_results['mean_test_score'] = -XGB_tuned_1_results['mean_test_score']

In [None]:
# Mean CV log-loss against "max_depth", one line per "min_child_weight".
xgb_1_plot_rc = {'figure.dpi': 110, 'axes.labelsize': 8,
                 'xtick.labelsize': 7, 'ytick.labelsize': 7,
                 'legend.fontsize': 7, 'legend.title_fontsize': 7.5}

with plt.rc_context(rc = xgb_1_plot_rc):

    fig, ax = plt.subplots(1, 1, figsize = (7, 4))

    sns.lineplot(ax = ax, data = XGB_tuned_1_results,
                 x = 'param_max_depth', y = 'mean_test_score',
                 hue = 'param_min_child_weight',
                 marker = 'o', palette = 'viridis')

    ax.legend(title = 'param_min_child_weight', ncol = 2)
    plt.show()

In [None]:
# Parameter combination with the best mean CV score in stage 2.
XGB_tuned_1.best_params_

#### 1.3 Tuning "gamma", "subsample" and "colsample_bytree"

In [None]:
XGB_2 = xgb.XGBClassifier(use_label_encoder = False,
                          objective = 'binary:logistic')

# Stage 3: search over "gamma", "subsample" and "colsample_bytree";
# the remaining parameters are fixed at the stage-2 winners.
stage_1_best = XGB_tuned_1.best_params_

XGB_param_2 = {name: [stage_1_best.get(name)]
               for name in ('n_estimators', 'learning_rate',
                            'max_depth', 'min_child_weight')}
XGB_param_2['gamma'] = [0.2, 0.3, 0.5, 0.7, 1, 2]
XGB_param_2['subsample'] = [0.8, 0.9]
XGB_param_2['colsample_bytree'] = [0.8, 0.9]

XGB_grid_2 = GridSearchCV(estimator = XGB_2, param_grid = XGB_param_2,
                          scoring = 'neg_log_loss', cv = CV,
                          verbose = False) # n_jobs = 6

# Early stopping monitors log-loss on the hold-out split (x_test, y_test).
XGB_tuned_2 = XGB_grid_2.fit(x_train, y_train,
                             eval_set = [[x_test, y_test]],
                             eval_metric = 'logloss',
                             early_stopping_rounds = 15,
                             verbose = False)

In [None]:
# Best mean CV score; the scorer is negated log-loss, so flip the sign.
round(-XGB_tuned_2.best_score_, 3)

In [None]:
# Log-loss of the best stage-3 model on the hold-out split.
holdout_proba_2 = XGB_tuned_2.predict_proba(x_test)
round(log_loss(y_test, holdout_proba_2), 3)

In [None]:
# Parameter combination with the best mean CV score in stage 3.
XGB_tuned_2.best_params_

#### 1.4 Retuning "learning_rate"

In [None]:
XGB_3 = xgb.XGBClassifier(use_label_encoder = False,
                          objective = 'binary:logistic')

# Stage 4: re-tune "learning_rate" on a 0.01 ... 0.19 grid (step 0.01);
# the remaining parameters are fixed at the stage-3 winners.
stage_2_best = XGB_tuned_2.best_params_

XGB_param_3 = {name: [stage_2_best.get(name)]
               for name in ('n_estimators', 'max_depth', 'min_child_weight',
                            'gamma', 'subsample', 'colsample_bytree')}
XGB_param_3['learning_rate'] = [step / 100.0 for step in range(1, 20)]

XGB_grid_3 = GridSearchCV(estimator = XGB_3, param_grid = XGB_param_3,
                          scoring = 'neg_log_loss', cv = CV,
                          verbose = False) # n_jobs = 6

# Early stopping monitors log-loss on the hold-out split (x_test, y_test).
XGB_tuned_3 = XGB_grid_3.fit(x_train, y_train,
                             eval_set = [[x_test, y_test]],
                             eval_metric = 'logloss',
                             early_stopping_rounds = 15,
                             verbose = False)

In [None]:
# Best mean CV score; the scorer is negated log-loss, so flip the sign.
round(-XGB_tuned_3.best_score_, 3)

In [None]:
# Log-loss of the best stage-4 model on the hold-out split.
holdout_proba_3 = XGB_tuned_3.predict_proba(x_test)
round(log_loss(y_test, holdout_proba_3), 3)

In [None]:
# Keep the columns needed for plotting and turn negated log-loss back into log-loss.
xgb_3_result_columns = ['mean_test_score', 'param_learning_rate']
XGB_tuned_3_results = pd.DataFrame(XGB_tuned_3.cv_results_)[xgb_3_result_columns]

XGB_tuned_3_results['mean_test_score'] = -XGB_tuned_3_results['mean_test_score']

In [None]:
# Mean CV log-loss against the learning rate.
xgb_3_plot_rc = {'figure.dpi': 110, 'axes.labelsize': 8,
                 'xtick.labelsize': 7, 'ytick.labelsize': 7,
                 'legend.fontsize': 7, 'legend.title_fontsize': 7.5}

with plt.rc_context(rc = xgb_3_plot_rc):

    fig, ax = plt.subplots(1, 1, figsize = (7, 4))

    sns.lineplot(ax = ax, data = XGB_tuned_3_results,
                 x = 'param_learning_rate', y = 'mean_test_score',
                 marker = 'o', color = '#481567FF')

    plt.show()

In [None]:
# Confusion matrix of the best stage-4 model on the hold-out split.
confusion_rc_3 = {'figure.dpi': 110, 'axes.labelsize': 8,
                  'xtick.labelsize': 7, 'ytick.labelsize': 7}

with plt.rc_context(rc = confusion_rc_3):

    fig, ax = plt.subplots(1, 1, figsize = (4, 3))

    holdout_confusion_3 = confusion_matrix(y_test, XGB_tuned_3.predict(x_test))

    sns.heatmap(holdout_confusion_3, ax = ax,
                annot = True, fmt = 'd', cmap = 'YlGnBu',
                annot_kws = {'fontsize': 8})

    plt.show()

In [None]:
# Parameter combination with the best mean CV score in stage 4.
XGB_tuned_3.best_params_

#### 1.5 Training the final model

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

In [None]:
# Evaluation scheme for the final models: 10-fold CV repeated 3 times, seeded.
CV_final = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 999)

# Take every tuned parameter from the last grid search.
tuned_xgb_params = {name: XGB_tuned_3.best_params_.get(name)
                    for name in ('n_estimators', 'learning_rate', 'max_depth',
                                 'min_child_weight', 'gamma', 'subsample',
                                 'colsample_bytree')}

XGB_final = xgb.XGBClassifier(use_label_encoder = False,
                              objective = 'binary:logistic',
                              eval_metric = 'logloss',
                              **tuned_xgb_params)

# Cross-validated negated log-loss on the full training matrix ...
XGB_CV_scores = cross_val_score(XGB_final, X_train, y,
                                cv = CV_final, scoring = 'neg_log_loss')

# ... followed by the final fit on all training rows.
XGB_fit_final = XGB_final.fit(X_train, y, verbose = False)

In [None]:
# Average CV log-loss of the final XGBoost model (sign flipped back).
round(-np.average(XGB_CV_scores), 3)

In [None]:
# Class predictions of the final XGBoost model for the encoded test matrix.
y_pred_XGB = XGB_fit_final.predict(X_test)

### 2. Random Forest

One more model that I decided to try was RF.

In [None]:
from sklearn.ensemble import RandomForestClassifier

#### 2.1 Tuning "n_estimators", "max_features" and "max_depth"

In [None]:
# Seed the forest (same seed as the CV splitters) so the grid-search scores
# and the selected parameters are reproducible between runs.
RF_0 = RandomForestClassifier(random_state = 999)

# Stage 1: search over "n_estimators", "max_features" and "max_depth".
RF_param_0 = {'n_estimators': [850, 900, 950, 1000],
              'max_features': [6, 7],
              'max_depth': [6, 7],
              'criterion': ['gini']}

RF_grid_0 = GridSearchCV(RF_0, RF_param_0, cv = CV, verbose = False, 
                         scoring = 'neg_log_loss') # n_jobs = 6

RF_tuned_0 = RF_grid_0.fit(X_train, y)

In [None]:
# Best mean CV score; the scorer is negated log-loss, so flip the sign.
round(-RF_tuned_0.best_score_, 3)

In [None]:
# Keep the columns needed for plotting and turn negated log-loss back into log-loss.
rf_0_result_columns = ['mean_test_score', 'param_n_estimators',
                       'param_max_features', 'param_max_depth']
RF_results_0 = pd.DataFrame(RF_tuned_0.cv_results_)[rf_0_result_columns]

RF_results_0['mean_test_score'] = -RF_results_0['mean_test_score']

In [None]:
# Mean CV log-loss against "n_estimators"; colour encodes "max_features",
# line style encodes "max_depth".
rf_0_plot_rc = {'figure.dpi': 110, 'axes.labelsize': 8,
                'xtick.labelsize': 7, 'ytick.labelsize': 7,
                'legend.fontsize': 7, 'legend.title_fontsize': 7.5}

with plt.rc_context(rc = rf_0_plot_rc):

    fig, ax = plt.subplots(1, 1, figsize = (7, 4))

    sns.lineplot(ax = ax, data = RF_results_0,
                 x = 'param_n_estimators', y = 'mean_test_score',
                 hue = 'param_max_features', style =  'param_max_depth',
                 palette = 'viridis', ci = None)

    ax.legend(ncol = 1, loc = 'upper left')
    plt.show()

In [None]:
# Parameter combination with the best mean CV score in RF stage 1.
RF_tuned_0.best_params_

#### 2.2 Tuning "min_samples_split"

In [None]:
# Seed the forest (same seed as the CV splitters) so the grid-search scores
# and the selected parameters are reproducible between runs.
RF_1 = RandomForestClassifier(random_state = 999)

# Stage 2: search over "min_samples_split";
# the remaining parameters are fixed at the stage-1 winners.
RF_param_1 = {'n_estimators': [RF_tuned_0.best_params_.get('n_estimators')],
              'max_features': [RF_tuned_0.best_params_.get('max_features')],
              'max_depth': [RF_tuned_0.best_params_.get('max_depth')],
              'criterion': [RF_tuned_0.best_params_.get('criterion')],
              'min_samples_split': [2, 3, 4, 5]}

RF_grid_1 = GridSearchCV(RF_1, RF_param_1, cv = CV, verbose = False, 
                         scoring = 'neg_log_loss') # n_jobs = 6 

RF_tuned_1 = RF_grid_1.fit(X_train, y)

In [None]:
# Best mean CV score; the scorer is negated log-loss, so flip the sign.
round(-RF_tuned_1.best_score_, 3)

In [None]:
# Keep the columns needed for plotting and turn negated log-loss back into log-loss.
rf_1_result_columns = ['mean_test_score', 'param_min_samples_split']
RF_results_1 = pd.DataFrame(RF_tuned_1.cv_results_)[rf_1_result_columns]

RF_results_1['mean_test_score'] = -RF_results_1['mean_test_score']

In [None]:
# Mean CV log-loss against "min_samples_split".
rf_1_plot_rc = {'figure.dpi': 110, 'axes.labelsize': 8,
                'xtick.labelsize': 7, 'ytick.labelsize': 7}

with plt.rc_context(rc = rf_1_plot_rc):

    fig, ax = plt.subplots(1, 1, figsize = (7, 4))

    sns.lineplot(ax = ax, data = RF_results_1,
                 x = 'param_min_samples_split', y = 'mean_test_score',
                 marker = 'o', color = '#481567FF')

    plt.show()

In [None]:
# Parameter combination with the best mean CV score in RF stage 2.
RF_tuned_1.best_params_

#### 2.3 Tuning "max_leaf_nodes"

In [None]:
# Seed the forest (same seed as the CV splitters) so the grid-search scores
# and the selected parameters are reproducible between runs.
RF_2 = RandomForestClassifier(random_state = 999)

# Stage 3: search over "max_leaf_nodes" (20 ... 34);
# the remaining parameters are fixed at the stage-2 winners.
RF_param_2 = {'n_estimators': [RF_tuned_1.best_params_.get('n_estimators')],
              'max_features': [RF_tuned_1.best_params_.get('max_features')],
              'max_depth': [RF_tuned_1.best_params_.get('max_depth')],
              'criterion': [RF_tuned_1.best_params_.get('criterion')],
              'min_samples_split': [RF_tuned_1.best_params_.get('min_samples_split')], 
              'max_leaf_nodes': list(range(20, 35, 1))}

RF_grid_2 = GridSearchCV(RF_2, RF_param_2, cv = CV, verbose = False, 
                         scoring = 'neg_log_loss') # n_jobs = 6

RF_tuned_2 = RF_grid_2.fit(X_train, y)

In [None]:
# Best mean CV score; the scorer is negated log-loss, so flip the sign.
round(-RF_tuned_2.best_score_, 3)

In [None]:
# Keep the columns needed for plotting and turn negated log-loss back into log-loss.
rf_2_result_columns = ['mean_test_score', 'param_max_leaf_nodes']
RF_results_2 = pd.DataFrame(RF_tuned_2.cv_results_)[rf_2_result_columns]

RF_results_2['mean_test_score'] = -RF_results_2['mean_test_score']

In [None]:
# Mean CV log-loss against "max_leaf_nodes".
rf_2_plot_rc = {'figure.dpi': 110, 'axes.labelsize': 8,
                'xtick.labelsize': 7, 'ytick.labelsize': 7}

with plt.rc_context(rc = rf_2_plot_rc):

    fig, ax = plt.subplots(1, 1, figsize = (7, 4))

    sns.lineplot(ax = ax, data = RF_results_2,
                 x = 'param_max_leaf_nodes', y = 'mean_test_score',
                 marker = 'o', color = '#481567FF')

    plt.show()

In [None]:
# Parameter combination with the best mean CV score in RF stage 3.
RF_tuned_2.best_params_

#### 2.4 Training the final model

In [None]:
# Final Random Forest built from the stage-3 winners. The seed (same as the
# CV splitters) makes the CV scores and the submitted predictions reproducible.
RF_final = RandomForestClassifier(
       n_estimators = RF_tuned_2.best_params_.get('n_estimators'),
       max_features = RF_tuned_2.best_params_.get('max_features'),
       max_depth = RF_tuned_2.best_params_.get('max_depth'),
       criterion = RF_tuned_2.best_params_.get('criterion'),
       min_samples_split = RF_tuned_2.best_params_.get('min_samples_split'), 
       max_leaf_nodes = RF_tuned_2.best_params_.get('max_leaf_nodes'),
       random_state = 999)

# Cross-validated negated log-loss on the full training matrix ...
RF_CV_scores = cross_val_score(RF_final, X_train, y, scoring = 'neg_log_loss', 
                               cv = CV_final)

# ... followed by the final fit on all training rows.
RF_fit_final = RF_final.fit(X_train, y)

In [None]:
# Average CV log-loss of the final Random Forest (sign flipped back).
round(-np.average(RF_CV_scores), 3)

In [None]:
# Class predictions of the final Random Forest for the encoded test matrix.
y_pred_RF = RF_fit_final.predict(X_test)

<div style = "color: #000000;
             display: fill;
             padding: 8px;
             border-radius: 5px;
             border-style: solid;
             border-color: #a63700;
             background-color: rgba(235, 125, 66, 0.3)">
    
<span style = "font-size: 20px; font-weight: bold">Note:</span> 
<span style="font-size: 15px">Taking into account that XGB test scores ("logloss") were higher than those of RF, I was inclined to believe that XGB was overfitting. In order to prevent that, further feature engineering was necessary. Thus, out of 2 models I picked RF.</span>
</div>

## Writing this notebook helped me get started with Python. Tackling a great number of interesting and to some extent practical problems, I was able to structure my knowledge which essentially was my primary purpose. I hope that by sharing it, I will help someone out!

In [None]:
# "PassengerId" was dropped from df_test during feature engineering, so read
# it again from the raw file. It goes into its own variable instead of
# overwriting `df_test`, which keeps the processed test frame intact.
submission_ids = pd.read_csv('../input/titanic/test.csv',
                             usecols = ['PassengerId'])['PassengerId']

# Pair every passenger id with the Random Forest prediction and write the file.
my_submission = pd.DataFrame({'PassengerId': submission_ids, 
                              'Survived': y_pred_RF})

my_submission.to_csv('submission.csv', index = False)