<img src="https://faithmag.com/sites/default/files/styles/article_full/public/2018-09/titanic2.jpg?h=6521bd5e&itok=H8td6QVv" alt="drawing" height="600" width="600"/>

# **Titanic dataset is one of the best known for those who start their journey with ML.**
### In this notebook, I will show you an easy approach to obtain nearly 80% accuracy. This score may be easily improved by a couple of methods which I will mention at the end. The purpose of this work is not to find the best possible solution for this problem but a simple one that performs decently.

## Please upvote my work if you find it helpful. Happy reading :)

In [None]:
import os
import numpy as np
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Directory that holds the competition CSV files.
base_path = '/kaggle/input/tabular-playground-series-apr-2021/'

# Resolve the three file locations in one pass.
train_path, test_path, sample_submission_path = (
    os.path.join(base_path, file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Load each CSV into its own DataFrame.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, sample_submission_path)
)

In [None]:
# Report (rows, columns) for both frames.
print(f'train shape: {train.shape}')
print(f'test shape: {test.shape}')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics for the training frame.
train.describe()

In [None]:
# Column dtypes and non-null counts; used to spot columns with missing values.
train.info()

### After the initial data inspection, it seems that the Name and PassengerId columns won't be needed. The Cabin and Ticket columns need to be transformed to be useful.

In [None]:
# Columns that are not used as features.
cols_to_drop = ['Name', 'PassengerId']

train.drop(columns=cols_to_drop, inplace=True)

# Keep the ids for the submission file; they must be captured before
# PassengerId is dropped from the test frame.
test_indexes = test['PassengerId']
test.drop(columns=cols_to_drop, inplace=True)

In [None]:
# Name of the label column.
target_col = 'Survived'

# Object-dtype columns are treated as categorical, everything else as numeric.
categorical_cols = train.columns[train.dtypes == "object"].tolist()
numerical_cols = train.columns[train.dtypes != "object"].tolist()

# The label is numeric but must not be used as a feature.
numerical_cols.remove(target_col)

In [None]:
def _ticket_prefix(ticket):
    """Return the first whitespace-separated token of a multi-part ticket.

    Tickets that consist of a single token (including the 'N' placeholder
    used for missing values) are mapped to 'N'.
    """
    parts = str(ticket).split()
    return parts[0] if len(parts) > 1 else 'N'


# Reduce Cabin to its first character, with 'N' standing in for missing values.
# The result is assigned back to the column instead of calling
# `frame.Cabin.fillna(..., inplace=True)`: that chained in-place form operates
# on a temporary Series, which pandas 2.x warns about and which leaves the
# frame unchanged when Copy-on-Write is enabled.
train['Cabin'] = train['Cabin'].fillna('N').str[0]
test['Cabin'] = test['Cabin'].fillna('N').str[0]

# Reduce Ticket to its leading token (see _ticket_prefix).
train['Ticket'] = train['Ticket'].fillna('N').map(_ticket_prefix)
test['Ticket'] = test['Ticket'].fillna('N').map(_ticket_prefix)

In [None]:
# Check the frame's dimensions after the column drops and Cabin/Ticket recoding.
train.shape

### Let's plot the categorical and discrete-valued columns with respect to Survived

In [None]:
# Columns to plot: every categorical column plus the discrete numeric ones.
# Ticket is left out of the plots.
cols = list(categorical_cols)
cols.extend(['Pclass', 'SibSp', 'Parch'])
cols.remove('Ticket')
print(cols)
num_of_cols = len(cols)

# One 12x12 panel per column, stacked vertically.
fig = plt.figure(figsize=(12, 12 * num_of_cols))

axes = []
for position, col in enumerate(cols, start=1):
    ax = fig.add_subplot(num_of_cols, 1, position)
    axes.append(ax)

    # Count of rows per category, split by the target value.
    sns.countplot(data=train, x=col, hue=target_col, ax=ax)
    ax.legend()

### As expected, sex seems to have a big impact. Embarked, Cabin and Pclass also give some valuable insight. The SibSp and Parch columns are hard to evaluate because the labels are not equally distributed (~45% survived and ~55% did not survive)
### Now let's inspect the numerical values with respect to the target value

In [None]:
def plot_face_grid(x):
    """Draw a filled KDE of column `x`, one panel per value of the target.

    Args:
        x: Name of the column in the global `train` frame to plot.
    """
    g = sns.FacetGrid(train, col=target_col, height=6)
    # `fill` is the current name of the keyword; seaborn deprecated the old
    # `shade` alias in 0.11 and has announced it will become an error.
    g.map(sns.kdeplot, x, fill=True).add_legend()

In [None]:
# Density of Age, one panel per target value.
plot_face_grid('Age')

### It seems that older people were more likely to survive compared to young adults

In [None]:
# Density of Fare, one panel per target value.
plot_face_grid('Fare')

### As in the previous chart, the data seems to be a bit skewed to the right

In [None]:
# Apply a Chi-Squared test in the next version of the notebook

### Now let's prepare data for the modeling
### You may experiment with different strategies for SimpleImputer and with Scalers

In [None]:
def remove_target(data, target):
    """Detach the `target` column from `data` and hand it back.

    The column is removed from `data` in place.

    Args:
        data: DataFrame that contains the target column.
        target: Name of the column to split off.

    Returns:
        The removed column.
    """
    target_values = data[target]
    data.drop(columns=[target], inplace=True)
    return target_values

In [None]:
from sklearn.preprocessing import LabelEncoder
class ModifiedLabelEncoder(LabelEncoder):
    """LabelEncoder variant whose encoded output is reshaped to one column.

    Both methods accept and ignore extra positional and keyword arguments,
    so they can be called with more than the single `y` argument.
    """

    def fit_transform(self, y, *args, **kwargs):
        """Fit on `y` and return its codes reshaped to (-1, 1)."""
        encoded = super().fit_transform(y)
        return encoded.reshape(-1, 1)

    def transform(self, y, *args, **kwargs):
        """Encode `y` with the fitted classes and reshape to (-1, 1)."""
        encoded = super().transform(y)
        return encoded.reshape(-1, 1)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Split the label off so the transformers only ever see feature columns.
train_y = remove_target(train, target_col)


# Ticket is integer-coded in its own branch instead of being one-hot encoded.
categorical_cols.remove('Ticket')
label_cols = ['Ticket']

# Numeric features: fill gaps with the column mean, then standardise.
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler())
    ])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time.
cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('one_hot', OneHotEncoder(handle_unknown='ignore'))
    ])

# Integer-code Ticket. OrdinalEncoder is used because it can map values that
# were not seen during fit to a sentinel (-1) instead of raising, which is
# needed now that the test set is transformed with the encoder fitted on the
# training set. Requires scikit-learn >= 0.24.
lab_pipeline = Pipeline([
    ('label_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

full_pipeline = ColumnTransformer([
        ('num', num_pipeline, numerical_cols),
        ('cat', cat_pipeline, categorical_cols),
        ('lab', lab_pipeline, label_cols)
    ])


# Fit every transformer on the training data only, then apply the same fitted
# transformers to the test data. Refitting on the test set would give it its
# own means, scales and category codes, so its columns would no longer mean
# the same thing as the columns the model is trained on.
train = full_pipeline.fit_transform(train, train_y)
test = full_pipeline.transform(test)

In [None]:
# Shape of the transformed training matrix produced by the pipeline.
train.shape

# Model selection

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate


from scipy.stats import uniform
from random import randint

# Seed NumPy's global random generator.
# NOTE(review): presumably intended to make the unseeded estimators below
# reproducible — confirm which of them actually draw from NumPy's global state.
np.random.seed(123)

# Candidate classifiers as (estimator, display name) pairs; each pair is
# unpacked by train_and_evaluate.
models = [
    (CatBoostClassifier(verbose=False), 'CatBoost'),
    (AdaBoostClassifier(), 'AdaBoost'),
    (RandomForestClassifier(), 'RandomForest'),
    (GaussianNB(), 'NB'),
    (ExtraTreesClassifier(), 'ExtraTreesClassifier'),
    (LogisticRegression(max_iter=600), 'LogisticRegression'),
    (KNeighborsClassifier(), 'KNeighbors'),
    (XGBClassifier(use_label_encoder=False), 'XGB'),
    (LGBMClassifier(), 'LGBM')
]

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

def train_and_evaluate(model, x, y):
    """Cross-validate one candidate and print its per-fold scores.

    Args:
        model: An (estimator, display name) pair.
        x: Feature matrix passed to `cross_validate`.
        y: Target values passed to `cross_validate`.
    """
    estimator, label = model
    fold_scores = cross_validate(estimator, x, y)['test_score']

    print(label)
    print(fold_scores)
    print('mean: ', fold_scores.mean())
    print('std: ', fold_scores.std(), '\n')

In [None]:
# Cross-validate every candidate on the transformed training data and print its scores.
for model in models:
    train_and_evaluate(model, train, train_y)

### As we can see, CatBoost and LGBM perform the best. We will focus on finding the best parameters for the second one. If you want to get a better result, you may increase n_iter to try a larger number of combinations.

In [None]:
# Search space for the LightGBM classifier. Lists are sampled uniformly;
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
parameters = {
    "n_estimators": list(range(20, 500)),
    "learning_rate": uniform(0.001, 0.199),
    "max_depth": [-1, 2, 4, 8, 16],
    "min_data_per_group": list(range(2,1000)),
    "num_leaves": list(range(2, 200)),
    "bagging_freq": list(range(1, 8)),
    "max_bin": list(range(2, 200)),
    "lambda_l1": uniform(0.001, 0.199),
    "lambda_l2": uniform(0.001, 0.199),
    # 0.71 .. 0.99 in steps of 0.01, followed by repeated 1.0 entries, so 1.0
    # is drawn more often than any single smaller value.
    "feature_fraction" : [i/100 if i/100 < 1 else 1. for i in range(71,120)],
    "bagging_fraction" : uniform(0.01, 0.99)
    }

model = LGBMClassifier(extra_trees=True, verbose=-1)
randomsearch = RandomizedSearchCV(model,
                                  param_distributions=parameters,
                                  n_iter=25,
                                  cv=4,
                                  random_state=1,
                                  scoring='accuracy',
                                  return_train_score=True,
                                  # With a single `scoring` metric, `refit` is just an
                                  # on/off switch. The previous value 'Accuracy' matched
                                  # no scorer name and only worked because it is truthy.
                                  refit=True,
                                  )

# fit() returns the fitted search object itself.
r = randomsearch.fit(train,train_y)
scores = r.cv_results_
# Best parameter set, refitted on the whole training matrix.
best = r.best_estimator_

In [None]:
# Print every sampled parameter set with its mean CV score, worst first.
# Sorting uses the score alone as the key, so ties keep their original order.
for mean_score, params in sorted(
    zip(scores["mean_test_score"], scores["params"]),
    key=lambda pair: pair[0],
):
    print(mean_score, params)

### The best result I got has accuracy slightly above 78.2%. Fortunately, because of cross-validation, we did not overfit and it gives us almost 80% for the submission


In [None]:
# Predict labels for the transformed test matrix with the best estimator from the search.
result = best.predict(test)

In [None]:
# Build the submission frame: one Survived prediction per PassengerId,
# with PassengerId as the index.
df = pd.DataFrame(
    {'Survived': result},
    index=pd.Index(test_indexes, name='PassengerId'),
)
df.head()

In [None]:
# Write the submission file; PassengerId is the index, so it is written as the first CSV column.
df.to_csv('submission.csv')

## Summary

### I hope you liked this notebook. If you have any suggestions, please share them in the comments.

### List of the todo things to improve final accuracy:
- Try pseudo labels. This works really well on this dataset. It may increase your final accuracy by > 1% (You first train the initial model to predict labels for the test set and later on you use a combined test and training set for training the final model) 
- Try stacking. (Train many different models and use the probabilities returned by them as an input to the final decision model; read more here: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html)
- Build the NN.
- Experiment with feature engineering. 