![Titanic](https://www.encyclopedia-titanica.org/ezoimgfmt/titanica.org/images/titanic-intro-header-1.jpg?ezimgfmt=ng%3Awebp%2Fngcb30%2Frs%3Adevice%2Frscb30-2)

# TPS APRIL 2021: The Titanic

This competition uses the Titanic dataset and tries to predict the survival of the passengers.<br/>
It is one of the most popular Machine Learning problems and there are numerous notebooks analyzing the dataset.

This notebook is a quick implementation of some effective algorithms and techniques in Machine Learning.

### Importing the libraries

In [None]:
import pandas as pd
import numpy as np

from datetime import datetime
from sklearn.preprocessing import  MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import  cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
import optuna

import matplotlib.pyplot as plt
import seaborn as sns

from warnings import simplefilter
simplefilter('ignore')


### Importing the dataset

In [None]:
# Read the three competition CSV files from the ../input directory.
# `train` and `test` are the raw frames later passed to manip();
# `sub` is the sample-submission frame whose 'Survived' column is
# overwritten before it is written out at the end of the notebook.
train= pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
test= pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
sub=pd.read_csv('../input/tabular-playground-series-apr-2021/sample_submission.csv')

## Pre-processing

Let's see the percentage of missing values.

In [None]:
# Share of missing entries per column of the training frame, in percent.
missing_counts = train.isna().sum()
print(missing_counts / len(train) * 100)

* Fare will be filled with the mean as less than 1% is missing.
* Cabin, Embarked and Ticket will be filled with inexpressive values as they are categorical values and impossible to guess.
* Age is an important feature, so below we look at how to fill its missing values.

In [None]:
# f=plt.figure(figsize=(18,12))
# ax1=f.add_subplot(2,3,1)
# sns.boxplot(train['Sex'],train['Age'])

# ax2=f.add_subplot(2,3,2)
# sns.boxplot(train['Pclass'],train['Age'])

# ax3=f.add_subplot(2,3,3)
# sns.boxplot(train['Embarked'],train['Age'])

# ax4=f.add_subplot(2,2,3)
# sns.heatmap(train.corr(),annot=True)

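Age is going to be filled according to Pclass, as it is the attribute most strongly correlated with Age and offers 3 classes with distinct average ages, as the plots show. (Note: the `manip` function below actually fills missing ages with the mean age per `Sex`.)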


In [None]:
def _ticket_prefix(ticket):
    """Return the first whitespace-separated token of a multi-token ticket, else 'X'."""
    parts = str(ticket).split()
    return parts[0] if len(parts) > 1 else 'X'


def manip(train, training=False):
    """Clean a raw passenger frame and return its encoded feature table.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw frame to process. Despite the name it is used for both the
        training and the test data. The columns Age, Sex, Cabin, Ticket,
        Fare, Embarked, Name, Pclass, SibSp and Parch are read, plus
        Survived when ``training`` is true.
    training : bool, default False
        When true the Survived column is carried through to the result.

    Returns
    -------
    pandas.DataFrame
        The selected columns after ``pd.get_dummies``, with every row that
        still contains a missing value removed.

    Notes
    -----
    The caller's frame is left untouched: all work happens on a copy, so
    the same raw frame can safely be passed in more than once.

    The Name and Ticket label codes are fitted on the frame that is passed
    in, so the integer codes produced by two separate calls (for example
    one for train and one for test) are not guaranteed to agree. The same
    holds for the set of dummy columns created by ``pd.get_dummies``.
    """
    df = train.copy()

    # Missing ages are replaced by the mean age of the passenger's sex.
    age_by_sex = df.groupby('Sex')['Age'].mean()
    df['Age'] = df['Age'].fillna(df['Sex'].map(age_by_sex))

    # Keep only the first character of the cabin; 'N' marks a missing cabin.
    df['Cabin'] = df['Cabin'].fillna('N').map(lambda x: x[0].strip())
    df['Ticket'] = df['Ticket'].fillna('N').map(_ticket_prefix)
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    df['Embarked'] = df['Embarked'].fillna('N')

    # The part of Name before the first comma is kept and label-encoded.
    df['Name'] = df['Name'].apply(lambda x: str.split(x, ',')[0])
    le = LabelEncoder()
    df['Name'] = le.fit_transform(df['Name'])
    df['Ticket'] = le.fit_transform(df['Ticket'])

    # PassengerId is simply not selected below, so no explicit drop is needed.
    columns = ['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Embarked', 'Fare',
               'Name', 'Ticket', 'Cabin']
    if training:
        columns.append('Survived')

    encoded = pd.get_dummies(df[columns])
    return encoded.dropna()

We prepare the data for the machine learning step: we preprocess it and split it into a training part and a hold-out part (no scaling is applied here).

In [None]:
# Run the shared preprocessing on both frames; only the training frame
# keeps its Survived column (training defaults to False for the test frame).
train = manip(train, training=True)
test = manip(test)


In [None]:
# Positional split of the processed training frame: the first 80000 rows
# are used for fitting, the remaining rows serve as a hold-out set
# (named X_test / y_test here, although they come from the training file).
y = train['Survived']
features = train.drop(columns='Survived')

X_train, y_train = features[:80000], y[:80000]
X_test, y_test = features[80000:], y[80000:]

# Modeling

Defining objective for Optuna hyperparameter tuning.

In [None]:
def objective_rf(trial):
    """Optuna objective for the random forest.

    Samples n_estimators, max_depth and random_state from the trial and
    returns the mean 5-fold cross-validation score on the module-level
    X_train / y_train.
    """
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        'max_depth': trial.suggest_int('max_depth', 10, 30),
        'random_state': trial.suggest_int('random_state', 42, 123),
    }
    forest = RandomForestClassifier(**sampled)
    fold_scores = cross_val_score(forest, X_train, y_train, n_jobs=-1, cv=5)
    return fold_scores.mean()

    
def objective_lgb(trial):
    """Optuna objective for the LightGBM classifier.

    Samples n_estimators, max_depth, random_state, learning_rate and
    num_leaves from the trial and returns the mean 5-fold cross-validation
    score on the module-level X_train / y_train.

    Every sampled value is passed to the model. num_leaves in particular
    must be forwarded: the study's best parameters are later unpacked
    straight into LGBMClassifier, so a parameter that is sampled but not
    used during tuning would be applied to the final model untested.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 2000),
        'max_depth': trial.suggest_int('max_depth', 10, 30),
        'random_state': trial.suggest_int('random_state', 42, 123),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 10),
        'num_leaves': trial.suggest_int('num_leaves', 40, 100),
    }
    model = LGBMClassifier(**params)
    return cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5).mean()

    
def objective_log(trial):
    """Optuna objective for logistic regression.

    Samples C, l1_ratio and random_state from the trial and returns the
    mean 5-fold cross-validation score on the module-level
    X_train / y_train.
    """
    C = trial.suggest_loguniform('C', 0.1, 10)
    # scikit-learn only accepts l1_ratio values inside [0, 1]; sampling
    # above 1 yields an invalid estimator configuration, so the upper
    # bound is capped at 1.0.
    # NOTE(review): l1_ratio only influences the fit when
    # penalty='elasticnet' is selected, which is not done here — confirm
    # whether elastic-net regularisation was intended.
    l1_ratio = trial.suggest_loguniform('l1_ratio', 0.1, 1.0)
    random_state = trial.suggest_int('random_state', 42, 123)

    model = LogisticRegression(l1_ratio=l1_ratio, C=C, random_state=random_state)
    return cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5).mean()


def objective_cat(trial):
    """Optuna objective for the CatBoost classifier.

    Samples learning_rate, iterations and depth from the trial and returns
    the mean 3-fold cross-validation score on the module-level
    X_train / y_train.
    """
    sampled = {
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.5),
        'iterations': trial.suggest_int('iterations', 100, 600),
        'depth': trial.suggest_int('depth', 5, 13),
    }
    booster = CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='AUC',
        bootstrap_type='Bayesian',
        **sampled,
    )
    fold_scores = cross_val_score(booster, X_train, y_train, n_jobs=-1, cv=3)
    return fold_scores.mean()

The number of trials for each study is chosen according to how computationally expensive the algorithm is.

In [None]:
def _run_study(objective, n_trials):
    """Create a maximising Optuna study, run `objective` for `n_trials` trials and return it."""
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    return study


# One study per model, executed in the same order as before.
study_rf = _run_study(objective_rf, 6)
study_log = _run_study(objective_log, 10)
study_lgb = _run_study(objective_lgb, 10)
study_cat = _run_study(objective_cat, 6)

Fetching the best parameters.

In [None]:
# Pull the parameter dict of the best trial out of each finished study.
rf_params, log_params, lgb_params, cat_params = (
    study.best_trial.params
    for study in (study_rf, study_log, study_lgb, study_cat)
)

print ('best rf param:', rf_params)
print ('\nbest log param:', log_params)
print ('\nbest lgb param:', lgb_params)
print ('\nbest cat param:', cat_params)

### Training the models

In [None]:
# Fit one model per algorithm with its tuned parameters on the training
# slice. Each fit() call returns the fitted estimator, so construction and
# fitting are chained.
log = LogisticRegression(**log_params).fit(X_train, y_train)

# LightGBM is fitted with early stopping and is evaluated on both the
# training slice and the hold-out slice.
lgb = LGBMClassifier(**lgb_params).fit(
    X_train,
    y_train,
    early_stopping_rounds=200,
    verbose=0,
    eval_set=[(X_train, y_train), (X_test, y_test)],
)

rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

cat = CatBoostClassifier(**cat_params).fit(X_train, y_train, silent=True)


Printing the accuracy score

## Pseudo labeling and retraining with folds

In [None]:
# Start again from the raw CSV files.
train = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
test_f = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')

# Train and test rows are stacked and preprocessed together. With
# training=True manip() keeps Survived and drops rows that contain missing
# values — presumably this removes the test rows again, since the test file
# is not expected to carry a Survived label (verify).
data = manip(pd.concat([train, test_f]), training=True)

# The test frame is also preprocessed on its own for later prediction.
test_f = manip(test_f, training=False)

features = data.drop(columns='Survived')
labels = data['Survived']

# First 80000 rows feed the folds, the remainder is the hold-out slice.
X, y = features[:80000], labels[:80000]
X_test, y_test = features[80000:], labels[80000:]

test = features[:80000]



y_all = pd.DataFrame()  # one column of 0/1 test-set predictions per fold
thresh = []             # decision threshold chosen in each fold
acc_thresh = []         # validation accuracy reached at that threshold
best_cat_acc = 0.75     # hold-out accuracy a fold model must exceed to become best_cat

# Candidate decision thresholds; the grid is the same for every fold, so it
# is built once instead of once per fold.
threshs = np.arange(0, 1.0, 0.01)

kfold = StratifiedKFold(n_splits=15, shuffle=True, random_state=420)
for i, (train_i, val_i) in enumerate(kfold.split(X, y)):
    print("\n FOLD", i)
    # X_train / y_train are deliberately module-level names: code further
    # down the notebook reuses whatever the last fold left in them.
    X_train = X.iloc[train_i]
    y_train = y.iloc[train_i]
    X_val = X.iloc[val_i]
    y_val = y.iloc[val_i]

    clf = CatBoostClassifier(**cat_params)
    clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], silent=True)

    # Score the fold model once on the hold-out slice and remember the
    # strongest model seen so far.
    holdout_acc = clf.score(X_test, y_test)
    if holdout_acc > best_cat_acc:
        best_cat_acc = holdout_acc
        best_cat = clf

    y_proba = clf.predict_proba(X_val)[:, 1]

    # Validation accuracy at every candidate threshold. The scan variable
    # must not be called `thresh`: that would rebind the history list to a
    # float and turn the bookkeeping below into array arithmetic.
    accs = np.array([
        accuracy_score(y_val, (y_proba > cutoff).astype(int))
        for cutoff in threshs
    ])
    max_acc = accs.max()
    max_acc_threshold = threshs[accs.argmax()]
    thresh.append(max_acc_threshold)

    acc = accuracy_score(y_val, np.where(y_proba > max_acc_threshold, 1, 0))
    acc_thresh.append(acc)
    print("Accuracy:", acc)

    # NOTE(review): the test-set prediction for this fold comes from
    # best_cat (the best model so far), not from this fold's clf, and
    # best_cat is only assigned once a fold beats the 0.75 floor — confirm
    # this is intended.
    ypred_fold = pd.DataFrame({
        'fold' + str(i): np.where(
            best_cat.predict_proba(test_f)[:, 1] > max_acc_threshold, 1, 0)
    })
    y_all = pd.concat([y_all, ypred_fold], axis=1)

# Majority vote over the per-fold predictions.
ypred = np.where(y_all.sum(axis=1) / len(y_all.columns) > 0.5, 1, 0)


In [None]:
# Attach the fold-voted predictions to the processed test frame as
# pseudo-labels and stack it under the processed training frame.
test_f['Survived'] = ypred
train = manip(train, training=True)
data = pd.concat([train, test_f], ignore_index=True)

# Fresh, label-free copy of the processed test frame for prediction.
test_f = manip(
    pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv'),
    training=False,
)

features = data.drop(columns='Survived')
labels = data['Survived']

# First 150000 rows feed the folds, the remainder is the hold-out slice.
X, y = features[:150000], labels[:150000]
X_test, y_test = features[150000:], labels[150000:]

test = features[:150000]

In [None]:
y_all = pd.DataFrame()  # one column of 0/1 test-set predictions per fold
thresh = []             # decision threshold chosen in each fold
acc_thresh = []         # validation accuracy reached at that threshold
best_lgb_acc = 0.75     # hold-out accuracy a fold model must exceed to become best_lgb

# Candidate decision thresholds; the grid is the same for every fold, so it
# is built once instead of once per fold.
threshs = np.arange(0.0, 1.0, 0.01)

kfold = StratifiedKFold(n_splits=15, shuffle=True, random_state=420)
for i, (train_i, val_i) in enumerate(kfold.split(X, y)):
    print("\n FOLD", i)
    # X_train / y_train are deliberately module-level names: code further
    # down the notebook reuses whatever the last fold left in them.
    X_train = X.iloc[train_i]
    y_train = y.iloc[train_i]
    X_val = X.iloc[val_i]
    y_val = y.iloc[val_i]

    clf = LGBMClassifier(**lgb_params)
    clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)],
            verbose=0, early_stopping_rounds=200)

    # Score the fold model once on the hold-out slice and remember the
    # strongest model seen so far.
    holdout_acc = clf.score(X_test, y_test)
    if holdout_acc > best_lgb_acc:
        best_lgb_acc = holdout_acc
        best_lgb = clf

    y_proba = clf.predict_proba(X_val)[:, 1]

    # Validation accuracy at every candidate threshold. The scan variable
    # must not be called `thresh`: that would rebind the history list to a
    # float and turn the bookkeeping below into array arithmetic.
    accs = np.array([
        accuracy_score(y_val, (y_proba > cutoff).astype(int))
        for cutoff in threshs
    ])
    max_acc = accs.max()
    max_acc_threshold = threshs[accs.argmax()]
    thresh.append(max_acc_threshold)

    acc = accuracy_score(y_val, np.where(y_proba > max_acc_threshold, 1, 0))
    acc_thresh.append(acc)
    print("Accuracy:", acc)

    # NOTE(review): the test-set prediction for this fold comes from
    # best_lgb (the best model so far), not from this fold's clf, and
    # best_lgb is only assigned once a fold beats the 0.75 floor — confirm
    # this is intended.
    ypred_fold = pd.DataFrame({
        'fold' + str(i): np.where(
            best_lgb.predict_proba(test_f)[:, 1] > max_acc_threshold, 1, 0)
    })
    y_all = pd.concat([y_all, ypred_fold], axis=1)

# Majority vote over the per-fold predictions.
ypred = np.where(y_all.sum(axis=1) / len(y_all.columns) > 0.5, 1, 0)



In [None]:
# Refresh the pseudo-labels on the processed test frame with the latest
# fold-voted predictions and stack it under the processed training frame.
test_f['Survived'] = ypred
data = pd.concat([train, test_f], ignore_index=True)

# Fresh, label-free copy of the processed test frame for prediction.
test_f = manip(
    pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv'),
    training=False,
)

features = data.drop(columns='Survived')
labels = data['Survived']

# First 150000 rows feed the folds, the remainder is the hold-out slice.
X, y = features[:150000], labels[:150000]
X_test, y_test = features[150000:], labels[150000:]

test = features[:150000]



y_all = pd.DataFrame()  # one column of 0/1 test-set predictions per fold
thresh = []             # decision threshold chosen in each fold
acc_thresh = []         # validation accuracy reached at that threshold
best_cat_acc = 0.75     # hold-out accuracy a fold model must exceed to become best_cat

# Candidate decision thresholds; the grid is the same for every fold, so it
# is built once instead of once per fold.
threshs = np.arange(0, 1.0, 0.01)

kfold = StratifiedKFold(n_splits=30, shuffle=True, random_state=420)
for i, (train_i, val_i) in enumerate(kfold.split(X, y)):
    print("\n FOLD", i)
    # X_train / y_train are deliberately module-level names: code further
    # down the notebook reuses whatever the last fold left in them.
    X_train = X.iloc[train_i]
    y_train = y.iloc[train_i]
    X_val = X.iloc[val_i]
    y_val = y.iloc[val_i]

    clf = CatBoostClassifier(**cat_params)
    clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], silent=True)

    # Score the fold model once on the hold-out slice and remember the
    # strongest model seen so far.
    holdout_acc = clf.score(X_test, y_test)
    if holdout_acc > best_cat_acc:
        best_cat_acc = holdout_acc
        best_cat = clf

    y_proba = clf.predict_proba(X_val)[:, 1]

    # Validation accuracy at every candidate threshold. The scan variable
    # must not be called `thresh`: that would rebind the history list to a
    # float and turn the bookkeeping below into array arithmetic.
    accs = np.array([
        accuracy_score(y_val, (y_proba > cutoff).astype(int))
        for cutoff in threshs
    ])
    max_acc = accs.max()
    max_acc_threshold = threshs[accs.argmax()]
    thresh.append(max_acc_threshold)

    acc = accuracy_score(y_val, np.where(y_proba > max_acc_threshold, 1, 0))
    acc_thresh.append(acc)
    print("Accuracy:", acc)

    # NOTE(review): the test-set prediction for this fold comes from
    # best_cat, not from this fold's clf. Until a fold beats the 0.75 floor
    # best_cat still refers to whatever an earlier cell assigned to it —
    # confirm this is intended.
    ypred_fold = pd.DataFrame({
        'fold' + str(i): np.where(
            best_cat.predict_proba(test_f)[:, 1] > max_acc_threshold, 1, 0)
    })
    y_all = pd.concat([y_all, ypred_fold], axis=1)

# Majority vote over the per-fold predictions.
ypred = np.where(y_all.sum(axis=1) / len(y_all.columns) > 0.5, 1, 0)

In [None]:
# Voting
# Weighted hard-voting ensemble over the four individually trained models
# and the two best fold models. It is fitted on X_train / y_train as left
# behind by the most recent fold loop.
ensemble_members = [
    ('random forest', rf),
    ('LightGBM', lgb),
    ('Catboost', cat),
    ('LogisticRegression', log),
    ('Kfold Cat', best_cat),
    ('Kfold LightGBM', best_lgb),
]
member_weights = [2, 3, 2, 1, 3, 5]

vote = VotingClassifier(ensemble_members, voting='hard', n_jobs=-1,
                        weights=member_weights)
vote.fit(X_train, y_train)


Using the previous models to build a vote model

In [None]:
# Print every model's score on the current X_test / y_test in one print
# call, so the output layout stays a single space-separated sequence.
# NOTE(review): X_test / y_test are the slices assigned by the last
# pseudo-labelling cell, not the hold-out the first four models were
# originally trained against — confirm the feature columns match.
report = [
    ('Random Forest:', rf),
    ('\nLogistic Regression:', log),
    ('\nLightGBM:', lgb),
    ('\nCatBoost:', cat),
    ('\nVote:', vote),
    ('\nKfold cat:', best_cat),
    ('\nKfold LGBM:', best_lgb),
]
print(*(item
        for label, model in report
        for item in (label, model.score(X_test, y_test))))

# Submitting predictions

In [None]:
#for submission
# Preprocess a fresh copy of the test file, predict with the voting
# ensemble and write the labels, cast to int, into the submission frame.
test = manip(
    pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv'),
    training=False,
)

vote_labels = vote.predict(test)
sub['Survived'] = vote_labels
sub['Survived'] = sub['Survived'].astype(int)
sub.to_csv('submission.csv', index=False)

In [None]:
# Second submission: the fold-voted predictions from the last
# pseudo-labelling loop, written as integers.
pseudo_labels = ypred
sub['Survived'] = pseudo_labels
sub['Survived'] = sub['Survived'].astype(int)
sub.to_csv('submission2.csv', index=False)