![](https://storage.googleapis.com/kaggle-competitions/kaggle/26478/logos/header.png?t=2021-03-29-17-07-0)

In [None]:
import numpy as np 
import pandas as pd
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, accuracy_score
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score

from sklearn.model_selection import train_test_split, cross_val_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import KFold
import optuna

pd.set_option('display.max_columns', None)

In [None]:
# Read the competition's train and test splits.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2021/train.csv',
        '../input/tabular-playground-series-apr-2021/test.csv',
    )
)
# Keep the test ids aside: they are needed for every submission file written later.
ID = test['PassengerId']

# Basic information

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Column dtypes and non-null counts of the training data.
train.info()

In [None]:
# Column dtypes and non-null counts of the test data.
test.info()

Great, we have missing values! Let's impute them.

# Preprocessing

**I will use all available data (train and test together) to impute the missing values**

In [None]:
# Stack train and test so the group statistics computed from `all_data` see every passenger.
all_data = pd.concat([train, test])

**Imputing embarked**

In [None]:
# Most frequent port per (Pclass, Sex, SibSp) group. The rule applied below is hard-coded
# (presumably read off this table) rather than looked up from it.
emb = pd.DataFrame(all_data.groupby(['Pclass', 'Sex', 'SibSp'])['Embarked'].apply(pd.Series.mode).reset_index()).drop('level_3', axis = 1)

# Fill missing ports with one vectorised rule per frame, addressed by column name instead
# of positional indices: first-class women with at most two siblings/spouses get 'C',
# every other passenger with a missing port gets 'S'.
for frame in (train, test):
    missing_port = frame['Embarked'].isna()
    first_class_women = (frame['Pclass'] == 1) & (frame['Sex'] == 'female') & (frame['SibSp'] <= 2)
    frame.loc[missing_port & first_class_women, 'Embarked'] = 'C'
    frame.loc[missing_port & ~first_class_women, 'Embarked'] = 'S'

**Imputing age**

In [None]:
# Mean age of every (Pclass, Sex, SibSp, Embarked) group, computed over train and test together.
age_keys = ['Pclass', 'Sex', 'SibSp', 'Embarked']
ages = all_data.groupby(age_keys).agg({'Age': 'mean'}).reset_index()

# Look up each passenger's group mean with a single left merge instead of scanning the
# whole `ages` table for every missing row. Passengers whose group is absent from `ages`
# (or whose group mean is itself NaN) keep a missing age, exactly as before.
for frame in (train, test):
    group_mean = frame[age_keys].merge(ages, on = age_keys, how = 'left')['Age']
    group_mean.index = frame.index  # merge returns a fresh 0..n-1 index; realign before fillna
    frame['Age'] = frame['Age'].fillna(group_mean)

**Imputing fare**

In [None]:
# Mean fare of every (Pclass, Sex, Embarked) group, computed over train and test together.
fare_keys = ['Pclass', 'Sex', 'Embarked']
fare = all_data.groupby(fare_keys).agg({'Fare': 'mean'}).reset_index()

# One left merge per frame replaces the nested row-by-row scan of the `fare` table.
# Passengers whose group is absent from `fare` keep a missing fare, exactly as before.
for frame in (train, test):
    group_mean = frame[fare_keys].merge(fare, on = fare_keys, how = 'left')['Fare']
    group_mean.index = frame.index  # merge returns a fresh 0..n-1 index; realign before fillna
    frame['Fare'] = frame['Fare'].fillna(group_mean)

**Delete unnecessary columns**

In [None]:
# Remove the identifier, ticket, cabin and name columns from both frames.
dropped_cols = ['PassengerId', 'Ticket', 'Cabin', 'Name']
train, test = (frame.drop(columns = dropped_cols) for frame in (train, test))

# EDA

**Distribution of survivors**

In [None]:
# Count each outcome explicitly so labels and values cannot drift apart:
# value_counts().reset_index() names its columns differently from pandas 2.0 on, and it
# orders rows by frequency rather than by class.
survival_counts = pd.DataFrame({
    'label': ['Not survived', 'Survived'],
    'count': [(train['Survived'] == 0).sum(), (train['Survived'] == 1).sum()],
})

fig = px.pie(survival_counts, values = 'count', names = 'label',
                 width = 600, height = 600)
# Donut styling: labels inside the ring, colours given in the same order as the rows above.
fig.update_traces(textposition = 'inside', 
                  textinfo = 'percent + label', 
                  hole = 0.78, 
                  marker = dict(colors = ['#A01D26','#ACBEBE'], line = dict(color = 'white', width = 2)))

# The title is placed in the middle of the donut hole.
fig.update_layout(title_text = 'Survivors', title_x = 0.5, title_y = 0.53, title_font_size = 32, title_font_family = 'Calibri Black', title_font_color = 'black',
                  showlegend = False)
                  
fig.show()

**Effect of class, gender and port of embarkation on survival**

In [None]:
def _survival_counts(feature):
    """Count passengers per (feature, Survived) pair as a flat frame with a 'count' column."""
    grouped = train.groupby([feature, 'Survived']).agg({'Survived': 'count'})
    return grouped.rename(columns = {'Survived': 'count'}).reset_index()

pclass, sex, embarked = (_survival_counts(col) for col in ('Pclass', 'Sex', 'Embarked'))

def percent(data):
    """Add a 'percent' column holding each row's share within its pair of rows.

    `data` is modified in place and nothing is returned. Rows are read as consecutive
    pairs (positions 0-1, 2-3, ...) with the counts in the third column. The first row
    of a pair gets its share of the pair total in percent, rounded to one decimal; the
    second row gets the rounded complement, so every pair sums to exactly 100.

    An odd number of rows raises IndexError when the missing partner row is read.
    """
    # Float from the start: the column receives floats, and writing floats into an
    # integer column is a deprecated dtype-incompatible assignment in recent pandas.
    data['percent'] = 0.0
    pct_pos = data.columns.get_loc('percent')  # no hard-coded column position
    for i in range(0, len(data.index), 2):
        first, second = data.iloc[i, 2], data.iloc[i + 1, 2]
        share = round(first / (first + second) * 100, 1)
        data.iloc[i, pct_pos] = share
        # Round the complement as well so float subtraction noise never reaches the labels.
        data.iloc[i + 1, pct_pos] = round(100 - share, 1)
            
# Add the within-pair percentages to each summary table (in place).
for summary in (pclass, sex, embarked):
    percent(summary)

# Readable outcome labels for the class chart's legend. Replacing by value avoids writing
# strings into fixed row positions of an integer column, and is safe to run more than once.
pclass['Survived'] = pclass['Survived'].replace({0: 'Not survived', 1: 'Survived'})

In [None]:
sns.set_style('white')  # set before any axes exist so all six panels share one style
bar_palette = ['#A01D26', '#ACBEBE']


def annotated_barplot(ax, data, x, y, suffix = '', title = None, show_legend = False):
    """Draw Survived-hued bars on `ax` and write each bar's height just above it.

    ax          -- matplotlib Axes to draw on
    data        -- summary frame containing the columns `x`, `y` and 'Survived'
    x, y        -- column names for the categories and the bar heights
    suffix      -- text appended to every bar label (e.g. '%')
    title       -- optional title for the row of panels
    show_legend -- keep the legend (upper left) instead of hiding it
    """
    ax.grid(color = 'gray', linestyle = ':', axis = 'y', zorder = 0, dashes = (1, 7))
    sns.barplot(data = data, x = x, y = y, hue = 'Survived', palette = bar_palette, ax = ax)
    for bar in ax.patches:
        height = bar.get_height()
        ax.annotate(f'{height:g}{suffix}', (bar.get_x() + bar.get_width() / 2, height),
                    ha = 'center', va = 'center',
                    size = 10,
                    xytext = (0, 5),
                    textcoords = 'offset points')
    if title is not None:
        # x = 1.1 pushes the title past the right edge of the left panel, i.e. towards
        # the middle of the two panels in the row.
        ax.set_title(title, size = 20, x = 1.1, y = 1.03)
    ax.set_xlabel('')
    ax.set_ylabel('')
    if show_legend:
        ax.legend(loc = 'upper left')
    else:
        ax.legend('').set_visible(False)


fig, axes = plt.subplots(3, 2, figsize = (18, 18))
fig.patch.set_facecolor('#fafafa')

# One row per feature: raw counts on the left, within-group percentages on the right.
panels = [(pclass, 'Pclass', 'Class'), (sex, 'Sex', 'Gender'), (embarked, 'Embarked', 'Embarked')]
for row, (summary, feature, title) in enumerate(panels):
    annotated_barplot(axes[row, 0], summary, feature, 'count', title = title, show_legend = (row == 0))
    annotated_barplot(axes[row, 1], summary, feature, 'percent', suffix = '%')

plt.show()

**Effect of age, SibSp, Parch and fare on survival**

In [None]:
sns.set_style('white')  # set before the axes are created so every panel uses it
fig, axes = plt.subplots(2, 2, figsize = (18, 18))
fig.patch.set_facecolor('#fafafa')

# (Survived value, legend label, fill colour) for the two density curves in each panel.
outcomes = [(0, 'Not survived', '#A01D26'), (1, 'Survived', '#ACBEBE')]

for ax, feature in zip(axes.flat, ['Age', 'SibSp', 'Parch', 'Fare']):
    ax.set_title(feature, size = 14)
    for survived, label, color in outcomes:
        # `fill` is the current name of the deprecated `shade` argument.
        sns.kdeplot(train.loc[train['Survived'] == survived, feature], color = color, fill = True,
                    label = label, alpha = 0.7, ax = ax)
    ax.grid(color = 'gray', linestyle = ':', axis = 'x', zorder = 0, dashes = (1, 7))
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.set_yticks([])  # only the shapes of the densities are compared, not their scale

# A single legend on the first panel covers all four.
axes[0, 0].legend(loc = 'upper left')

plt.show()

**Distribution of male and female survival rates by class**

In [None]:
# Passenger counts for every (Survived, Sex, Pclass) combination.
suv = train.groupby(['Survived', 'Sex', 'Pclass']).agg({'Survived': 'count'}).rename(columns = {'Survived': 'count'}).reset_index()
# Relabel by value rather than by row position, so the labels stay correct even when a
# combination is missing and the table has fewer than twelve rows.
suv['Survived'] = suv['Survived'].replace({0: 'Not survived', 1: 'Survived'})
suv['Pclass'] = suv['Pclass'].astype(str) + ' class'

# Rings from the centre outwards: outcome, sex, class.
fig = px.sunburst(suv, path = ['Survived', 'Sex', 'Pclass'], values = 'count', color = 'Survived',
                 color_discrete_map = {'Not survived': '#A01D26', 'Survived': '#ACBEBE'},
                 width = 700, height = 700)

# The title is drawn as an annotation placed above the plotting area (y = 1.1).
fig.update_layout(annotations = [dict(text = 'Distribution of male and female survival rates by class', 
                                      x = 0.5, y = 1.1, font_size = 24, showarrow = False, 
                                      font_family = 'Calibri Black',
                                      font_color = 'black')])

fig.update_traces(textinfo = 'label + percent parent')
                  
fig.show()

In [None]:
# Correlate numeric columns only: `train` still holds the string columns 'Sex' and
# 'Embarked', which DataFrame.corr() rejects on pandas >= 2.0. Computed once and reused.
corr = train.select_dtypes(include = 'number').corr()
# Non-zero entries of the upper triangle (diagonal included) mask the redundant half.
matrix = np.triu(corr)
palette = ['#ACBEBE', '#A01D26']
plt.figure(figsize=(13, 10))
sns.heatmap(corr, annot = True, cmap = palette, fmt=".2f", mask = matrix,
            vmin = -1, vmax = 1, linewidths = 0.1, linecolor = 'white', cbar = False)
plt.show()

# Conclusions of EDA

1. As in the original Titanic dataset, the most important factors affecting survival are the class and gender of the passenger. 
2. The port of embarkation is also of great importance. 
3. Age also affects survival, but not much. 
4. The number of siblings / spouses / parents / children has almost no effect on survival.

# Prepare for modeling

In [None]:
X = train.drop(['Survived'], axis = 1)
y = train['Survived']

num_cols = ['Age', 'SibSp', 'Parch', 'Fare']
cat_cols = ['Sex', 'Embarked']

def label_encoder(df, encoders = None):
    """Integer-encode the `cat_cols` columns of `df` in place and return `df`.

    encoders -- optional dict mapping column name to a fitted LabelEncoder. A column
                found in the dict is transformed with that encoder, so a second frame
                receives exactly the same codes as the first; a column not found is
                fitted on `df` and its encoder is stored in the dict. With the default
                (None) every column gets a fresh encoder fitted on `df`.

    Reusing an encoder raises ValueError if `df` contains a label it was not fitted on.
    """
    for col in cat_cols:
        if encoders is not None and col in encoders:
            df[col] = encoders[col].transform(df[col])
        else:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            if encoders is not None:
                encoders[col] = le
    return df

# Fit the scaler on the training features only and apply the same statistics to the
# test set; re-fitting on test would put the two frames on different scales.
sc = StandardScaler()
X[num_cols] = sc.fit_transform(X[num_cols])
test[num_cols] = sc.transform(test[num_cols])

# Same principle for the label codes: fit on train, reuse on test.
encoders = {}
X = label_encoder(X, encoders)
test = label_encoder(test, encoders)

# Categorical dtype so that LightGBM treats these columns as categories.
for i in ['Pclass', 'Sex', 'Embarked']:
    X[i] = X[i].astype('category')
    test[i] = test[i].astype('category')

X.head()

# Modeling 1

For modeling 1 I will use LGBM tuned with Optuna (100 trials)

In [None]:
def objective(trial, data = X, target = y):
    """Optuna objective: hold-out accuracy of an LGBMClassifier with sampled hyper-parameters.

    trial  -- optuna trial used to sample the hyper-parameters
    data   -- feature frame to split (defaults to the notebook's X)
    target -- labels aligned with `data` (defaults to the notebook's y)

    Returns the accuracy on a fixed 20% validation split (random_state = 2021). The
    same split is used as the early-stopping eval set.
    """
    # Split the arguments rather than the globals, so the parameters are honoured.
    X_train, X_val, y_train, y_val = train_test_split(data, target, test_size = 0.2, random_state = 2021)

    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 10, 500),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.6),
        'cat_feature': ['Pclass', 'Sex', 'Embarked'],
        'cat_smooth' : trial.suggest_int('cat_smooth', 10, 100),
        'cat_l2': trial.suggest_int('cat_l2', 1, 20),
        'min_data_per_group': trial.suggest_int('min_data_per_group', 50, 200),
        # Upper bound only: early stopping decides the actual number of trees.
        'n_estimators': 10000,
        'random_state': 2021,
        'metric': 'binary_logloss'
    }

    model = LGBMClassifier(**params)
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 100, verbose = False)
    y_pred = model.predict(X_val)

    return accuracy_score(y_val, y_pred)

# Search for the hyper-parameters that maximise the objective's return value (100 trials).
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

# Best sampled values plus the fixed settings that were not part of the search.
paramsLGBM = study.best_trial.params
params = {'n_estimators': 10000, 'random_state': 2021, 'metric': 'binary_logloss', 'cat_feature': ['Pclass', 'Sex', 'Embarked']}
paramsLGBM.update(params)

# 10-fold CV: train one model per fold and average the test-set probabilities.
folds = KFold(n_splits = 10, shuffle = True, random_state = 2021)
predictions = np.zeros(len(test))
for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    
    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = LGBMClassifier(**paramsLGBM)
   
    # The fold's validation part is only used as the early-stopping eval set.
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], verbose = False, early_stopping_rounds = 100)
    
    # Each fold contributes 1/n_splits of the final probability of class 1.
    predictions += model.predict_proba(test)[:,1] / folds.n_splits 

In [None]:
# Distribution of the averaged test-set probabilities.
sns.histplot(predictions)
plt.show()

In [None]:
# Label a passenger as survived when the averaged probability exceeds 0.5.
submission = pd.DataFrame({'PassengerId': ID, 'Survived': np.where(predictions > 0.5, 1, 0)})
submission.to_csv('submissionLGBM.csv', index = False)
submission.head()

Result - 0.78945 (from past notebook version)

Let's try to change threshold

In [None]:
# Same predictions with a lower decision threshold (0.45).
submission = pd.DataFrame({'PassengerId': ID, 'Survived': np.where(predictions > 0.45, 1, 0)})
submission.to_csv('submissionLGBM2.csv', index = False)
submission.head()

Result - 0.79450 (from past notebook version)

# Modeling 2

For modeling 2 I will try LGBM tuned with Optuna (30 minutes), scoring each trial with cross-validation (10 folds)

In [None]:
# Settings held fixed for every trial. objective() applies them on top of the sampled
# values, so a pinned key always wins over a sampled key of the same name.
pin_params = {'n_estimators': 10000, 'learning_rate': 0.05, 'metric': 'binary_logloss', 'cat_feature': ['Pclass', 'Sex', 'Embarked']}

def objective(trial):
    """Optuna objective: mean 10-fold validation accuracy of an LGBMClassifier.

    The sampled hyper-parameters are combined with `pin_params`, which take precedence.
    'learning_rate' is pinned there, so it is not sampled: a sampled value would be
    overwritten before training and would only show up, unevaluated, in
    `study.best_params`.

    Returns the mean validation accuracy over the folds.
    """
    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 10, 500),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.6),
        'cat_smooth' : trial.suggest_int('cat_smooth', 10, 100),
        'cat_l2': trial.suggest_int('cat_l2', 1, 20),
        'min_data_per_group': trial.suggest_int('min_data_per_group', 50, 200)
    }

    params.update(pin_params)

    model = LGBMClassifier(**params)
    scores = []
    k = KFold(n_splits = 10, shuffle = True, random_state = 2021)
    for i, (trn_idx, val_idx) in enumerate(k.split(X)):
        print(f"\n----- FOLD {i} -----")

        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        # Fit on the fold's training part only. Fitting on the full X, y would let the
        # model see the validation rows, making early stopping and the score meaningless.
        model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 25, verbose = 1000)

        tr_preds = model.predict(X_train)
        tr_score = accuracy_score(y_train, tr_preds)

        val_preds = model.predict(X_val)
        val_score = accuracy_score(y_val, val_preds)

        scores.append((tr_score, val_score))

        print(f"Fold {i} | Accuracy: {val_score}")

    scores = pd.DataFrame(scores, columns = ['train score', 'validation score'])

    return scores['validation score'].mean()

# Time-boxed search: keep running trials for 1800 seconds, maximising the objective.
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, timeout = 1800)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

In [None]:
# Combine the tuned values with the pinned ones. Pinned settings take precedence, just
# as they do inside objective(), so the model trained here uses the configuration the
# study actually scored.
pin_params = {**study.best_params, **pin_params}

model = LGBMClassifier(**pin_params) 
predictions = np.zeros(len(test))
k = KFold(n_splits = 10, shuffle = True, random_state = 2021)
for i, (trn_idx, val_idx) in enumerate(k.split(X, y)):
    print(f"\n----- FOLD {i} -----")
    
    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    # Train on the fold's training part only. Fitting on the full X, y would make the
    # ten models near-identical and would let early stopping watch rows it trained on.
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 25, verbose = 1000)
    # Each fold contributes 1/n_splits of the final probability of class 1.
    predictions += model.predict_proba(test)[:,1] / k.n_splits

In [None]:
# Label a passenger as survived when the averaged probability exceeds 0.5.
submission = pd.DataFrame({'PassengerId': ID, 'Survived': np.where(predictions > 0.5, 1, 0)})
submission.to_csv('submissionLGBM3.csv', index = False)
submission.head()

result - 0.77387. It's sad :(

(from past notebook version)

# Modeling 3

For modeling 3 I will try XGB tuned with Optuna (100 trials)

In [None]:
# Turn the categorical-dtype columns back into plain integers in both frames.
for frame in (X, test):
    for col in ('Pclass', 'Sex', 'Embarked'):
        frame[col] = frame[col].astype('int')

In [None]:
def objective(trial, data = X, target = y):
    """Optuna objective: hold-out accuracy of an XGBClassifier with sampled hyper-parameters.

    trial  -- optuna trial used to sample the hyper-parameters
    data   -- feature frame to split (defaults to the notebook's X)
    target -- labels aligned with `data` (defaults to the notebook's y)

    Returns the accuracy on a fixed 20% validation split (random_state = 2021). The
    same split is used as the early-stopping eval set.
    """
    # Split the arguments rather than the globals, so the parameters are honoured.
    X_train, X_val, y_train, y_val = train_test_split(data, target, test_size = 0.2, random_state = 2021)

    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'gamma': trial.suggest_float('gamma', 0.0001, 1.0, log = True),
        'alpha': trial.suggest_float('alpha', 0.0001, 10.0, log = True),
        'lambda': trial.suggest_float('lambda', 0.0001, 10.0, log = True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.6),
        'subsample': trial.suggest_float('subsample', 0.1, 0.6),
        'max_bin': trial.suggest_int('max_bin', 50, 500),
        # Upper bound only: early stopping decides the actual number of trees.
        'n_estimators': 10000,
        'random_state': 2021,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss'

    }

    model = XGBClassifier(**params)
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 100, verbose = False)
    y_pred = model.predict(X_val)

    return accuracy_score(y_val, y_pred)

# Search for the hyper-parameters that maximise the objective's return value (100 trials).
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

In [None]:
# Best sampled values plus the fixed settings that were not part of the search.
paramsXGB = study.best_trial.params
params = {'n_estimators': 10000, 'random_state': 2021}
paramsXGB.update(params)

# 10-fold CV: train one model per fold and average the test-set probabilities.
folds = KFold(n_splits = 10, shuffle = True, random_state = 2021)
predictions = np.zeros(len(test))
for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    
    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = XGBClassifier(**paramsXGB)
   
    # The fold's validation part is only used as the early-stopping eval set.
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], eval_metric = 'logloss', verbose = False, early_stopping_rounds = 100)
    
    # Each fold contributes 1/n_splits of the final probability of class 1.
    predictions += model.predict_proba(test)[:,1] / folds.n_splits 

In [None]:
# Distribution of the averaged test-set probabilities.
sns.histplot(predictions)
plt.show()

In [None]:
# Label a passenger as survived when the averaged probability exceeds 0.5.
submission = pd.DataFrame({'PassengerId': ID, 'Survived': np.where(predictions > 0.5, 1, 0)})
submission.to_csv('submissionXGB.csv', index = False)
submission.head()

Result - 0.79288 (from past notebook version)

Let's change threshold

In [None]:
# Same predictions with a lower decision threshold (0.45).
submission = pd.DataFrame({'PassengerId': ID, 'Survived': np.where(predictions > 0.45, 1, 0)})
submission.to_csv('submissionXGB2.csv', index = False)
submission.head()

Result - 0.79591 (from past notebook version)

# Random Forest (why not?)

In [None]:
rf = RandomForestClassifier(random_state = 2021)

# Grid of 5 forest sizes x 5 depths (3..7), i.e. 25 candidates.
params = { 
    'n_estimators': [200, 500, 1000, 2000, 5000],
    'max_depth' : range(3,8)
}

# Every candidate is scored by 5-fold cross-validated accuracy.
CV_rf = GridSearchCV(estimator = rf, param_grid = params, cv = 5, scoring = 'accuracy')
CV_rf.fit(X, y)

In [None]:
# Estimator selected by the grid search; displayed to show its parameters.
best_rf = CV_rf.best_estimator_
best_rf

In [None]:
# Probability of class 1 for every test passenger, thresholded at 0.5.
predictions = best_rf.predict_proba(test)[:,1]
submission = pd.DataFrame({'PassengerId': ID, 'Survived': np.where(predictions > 0.5, 1, 0)})
submission.to_csv('submissionRF.csv', index = False)
submission.head()

Result - 0.78961

Let's try to change threshold

In [None]:
# Same random-forest probabilities with a lower (0.45) ...
submission = pd.DataFrame({'PassengerId': ID, 'Survived': np.where(predictions > 0.45, 1, 0)})
submission.to_csv('submissionRF2.csv', index = False)

# ... and a higher (0.55) decision threshold.
submission = pd.DataFrame({'PassengerId': ID, 'Survived': np.where(predictions > 0.55, 1, 0)})
submission.to_csv('submissionRF3.csv', index = False)

Results - 0.79401 and 0.77871

# AutoML

According to recent observations, I have noticed that many masters successfully use AutoML, so I'll try to keep up.

First, let's try to create new features. For this I repeat my preprocessing without deleting columns (except PassengerId and Name).

In [None]:
# NOTE(review): this cell needs the raw 'Cabin' and 'Ticket' columns, which the
# "Delete unnecessary columns" cell removes. Per the text above, the loading and
# imputation cells have to be re-run (skipping that drop) before running this one.
for frame_name, frame in (('train', train), ('test', test)):
    absent = [col for col in ('Cabin', 'Ticket') if col not in frame.columns]
    if absent:
        raise KeyError(f'{frame_name} is missing {absent}: re-run loading and imputation without dropping them')


def _ticket_prefix(ticket):
    """Classify a ticket: 'Na' if missing, 'N' if purely numeric, else its first space-separated token."""
    if pd.isna(ticket):
        return 'Na'
    if ticket.isdigit():
        return 'N'
    return ticket.split(' ')[0]


# Columns are addressed by name and built with vectorised operations, so the result no
# longer depends on the exact column positions of the two frames.
for frame in (train, test):
    # First character of the cabin code; a missing cabin (NaN) becomes 'n' via str(nan).
    frame['Cabin'] = frame['Cabin'].map(lambda x: str(x)[0].strip())
    # The passenger plus siblings/spouses plus parents/children.
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
    frame['is_alone'] = (frame['FamilySize'] == 1).astype(int)
    frame['ticket_type'] = frame['Ticket'].map(_ticket_prefix)

In [None]:
# Remove the identifier, name and raw ticket columns from both frames.
automl_dropped_cols = ['PassengerId', 'Name', 'Ticket']
train, test = (frame.drop(columns = automl_dropped_cols) for frame in (train, test))

In [None]:
!pip install -U lightautoml

In [None]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler
import torch

In [None]:
N_THREADS = 4 # thread count: used for torch, the AutoML cpu_limit and the reader's n_jobs
N_FOLDS = 10 # fold count intended for AutoML (not referenced by the cells shown in this notebook)
RANDOM_STATE = 2021 # seed applied to numpy below
TEST_SIZE = 0.2 # hold-out fraction for a metric check (not referenced by the cells shown in this notebook)
TIMEOUT = 300 # time budget in seconds for the AutoML run

# Seed numpy's global generator and cap torch's thread count.
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [None]:
# Task
def acc_score(y_true, y_pred, **kwargs):
    """Accuracy of probability predictions after thresholding them at 0.5.

    Extra keyword arguments are forwarded to accuracy_score.
    """
    hard_labels = (y_pred > 0.5).astype(int)
    return accuracy_score(y_true, hard_labels, **kwargs)

# Binary task with log loss as the metric; acc_score is not passed to the task.
task = Task('binary', metric = 'logloss')

# Column role: 'Survived' is the target column
roles = {'target': 'Survived'}

In [None]:
%%time
# AutoML restricted to a linear model and LightGBM (plain and tuned), within the
# TIMEOUT budget and N_THREADS threads.
automl = TabularUtilizedAutoML(task = task, 
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS,
                       general_params = {'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'n_jobs': N_THREADS})
# fit_predict trains on `train` and returns predictions for the training rows
# (named out-of-fold here).
oof_pred = automl.fit_predict(train, roles = roles)
print(f'oof_pred:\n{oof_pred[:10]}\nShape = {oof_pred.shape}')

In [None]:
# Predict the test set with the fitted AutoML model.
predictions = automl.predict(test)
print(f'Prediction for test data:\n{predictions[:10]}\nShape = {predictions.shape}')

# Accuracy of the training-row predictions at a 0.5 threshold.
print('Check scores...')
print('OOF score: {}'.format(acc_score(train['Survived'].values, oof_pred.data[:, 0])))

In [None]:
# Threshold the first prediction column at 0.5 to obtain the submitted labels.
submission = pd.DataFrame({'PassengerId': ID, 'Survived': (predictions.data[:, 0] > 0.5).astype(int)})
submission.to_csv('submissionAutoML.csv', index = False)
submission.head()

Result - 0.79151