This notebook was written as part of a Machine Learning livestream (in Arabic), on the Al Fihriya Academy channel.
Feel free to [watch the livestream](https://www.youtube.com/watch?v=GDD0-7FUG4s) or [join us on Facebook](https://www.facebook.com/groups/1145703169114621).

For reference, this is the Titanic's layout (to motivate some of the analysis discussed below).

![](https://www.encyclopedia-titanica.org/files/1/figure-one-side-view-l.gif)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os

# Default size for every matplotlib figure drawn below.
plt.rcParams['figure.figsize'] = (12, 9)

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the competition's training CSV into the working DataFrame.
df = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/train.csv')

In [None]:
# Preview the first 10 rows.
df.head(10)

In [None]:
# Look at 20 randomly chosen Ticket values; the seed keeps the displayed
# sample identical across Restart & Run All.
df.Ticket.sample(20, random_state=0)

In [None]:
# Row count and mean of Survived for each Ticket value, largest groups first.
survived_by_ticket = df.groupby('Ticket').Survived
t_df = survived_by_ticket.agg(['count', 'mean'])
t_df = t_df.sort_values('count', ascending=False)
t_df

In [None]:
# Split Name on ',' once: the first piece becomes FamilyName, the second FirstName.
name_parts = df.Name.apply(lambda full_name: full_name.split(','))
df['FamilyName'] = name_parts.apply(lambda parts: parts[0])
df['FirstName'] = name_parts.apply(lambda parts: parts[1])

# Row count and mean of Survived per family name, keeping groups of size 2..9.
survived_by_family = df.groupby('FamilyName').Survived
t_df = survived_by_family.agg(['count', 'mean']).sort_values('count', ascending=False)
size_in_range = (t_df['count'] > 1) & (t_df['count'] < 10)
t_df[size_in_range]

In [None]:
# Row count and mean of Survived per first name, keeping groups of size 2..19.
survived_by_first_name = df.groupby('FirstName').Survived
t_df = survived_by_first_name.agg(['count', 'mean']).sort_values('count', ascending=False)
size_in_range = (t_df['count'] > 1) & (t_df['count'] < 20)
t_df[size_in_range]

In [None]:
# Mean of Survived for each value of Sex.
df.groupby('Sex').Survived.mean()

In [None]:
import seaborn as sns
# Violin plot of the Age distribution for each value of Survived.
sns.violinplot(x='Survived', y='Age', data=df)

In [None]:
# Mean of Survived for each value of Pclass.
df.groupby('Pclass').Survived.mean()

In [None]:
# Mean of Survived among rows with Parch > 0.
df[df.Parch > 0].Survived.mean()

In [None]:
# Mean of Survived among rows with Parch == 0, for comparison with the Parch > 0 group.
df[df.Parch == 0].Survived.mean()

In [None]:
# Correlation of every numeric/bool column with Survived, shown as a single row.
# Columns are selected explicitly because df also holds string columns:
# pandas >= 2.0 no longer drops them silently in corr() and raises instead,
# while select_dtypes works on old and new pandas alike.
df.select_dtypes(include=['number', 'bool']).corr()[['Survived']].T

In [None]:
# Mean of Survived among rows with SibSp > 2.
df[df.SibSp > 2].Survived.mean()

In [None]:
# Mean of Survived among rows with SibSp == 0.
df[df.SibSp == 0].Survived.mean()

In [None]:
# Mean of Survived for each value of Embarked, rendered as a one-column frame.
df.groupby('Embarked').Survived.mean().to_frame()

In [None]:
# First character of Cabin; missing Cabin values are passed through unchanged.
df['CabinLetter'] = df.Cabin.apply(lambda cabin: cabin if pd.isna(cabin) else cabin[0])

# Row count and mean of Survived per cabin letter.
survived_by_letter = df.groupby('CabinLetter').Survived
survived_by_letter.agg(['count', 'mean'])

## Pre-processing.

In [None]:
import re
def extract_ticket_prefix(ticket):
    """Return the text that precedes the first digit of a ticket string.

    Args:
        ticket: ticket string, or a missing value.

    Returns:
        The missing value itself when `ticket` is missing; the stripped text
        before the first digit when a digit exists (an empty string if the
        ticket starts with a digit); None when the ticket contains no digit.
    """
    if pd.isna(ticket):
        return ticket
    digit_match = re.search(r'\d', ticket)
    if digit_match is None:
        # No digit anywhere in the ticket.
        return None
    prefix = ticket[:digit_match.start()]
    return prefix.strip()

def _first_name_token(name):
    """Return the lower-cased second space-separated token of `name`, or NaN if there is none."""
    if not isinstance(name, str):
        return float('nan')
    tokens = name.split(' ')
    return tokens[1].lower() if len(tokens) > 1 else float('nan')


def preprocess_dataframe(df):
    """Return a copy of `df` with derived columns added; the input is not modified.

    Added / changed columns:
        CabinLetter: first character of Cabin (missing values passed through).
        FamilyId: lower-cased text before ', ' in Name + CabinLetter + Embarked + Pclass.
        TicketPrefix: result of extract_ticket_prefix on Ticket.
        FirstName: lower-cased second space-separated token of Name, NaN when
            the name is missing or has fewer than two tokens.
        Pclass: converted to str.

    Args:
        df: DataFrame with Name, Cabin, Embarked, Pclass and Ticket columns.

    Returns:
        A new DataFrame with the columns above.
    """
    df = df.copy()
    df['CabinLetter'] = df.Cabin.apply(lambda v : v if pd.isna(v) else v[0])
    # NOTE(review): this concatenation is NaN for any row where CabinLetter or
    # Embarked is missing -- confirm that is intended for FamilyId.
    df['FamilyId'] = df.Name.str.lower().str.split(', ').str[0] + df.CabinLetter + df.Embarked + df.Pclass.astype(str)
    df['TicketPrefix'] = df.Ticket.apply(extract_ticket_prefix)
    # A helper is used instead of indexing vs[1] directly so that a name with a
    # single token (or a missing name) yields NaN rather than raising.
    df['FirstName'] = df.Name.apply(_first_name_token)
    df['Pclass'] = df['Pclass'].astype(str)
    return df

# Start again from the raw training CSV (replacing the frame mutated during
# the exploration above) and add the derived columns in one step.
df = preprocess_dataframe(
    pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/train.csv')
)
df.head(10)

## Train the Survival Prediction Model.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; random_state fixes the split.
df_train, df_test = train_test_split(df, test_size=0.25, random_state=0)
# Seed the preview as well so the displayed rows are the same on every run.
df_train.sample(5, random_state=0)

In [None]:
# (rows, columns) of the held-out split.
df_test.shape

In [None]:
# Columns fed to the model as numeric features.
NUMERIC_FEATURES = ['Age', 'SibSp', 'Parch', 'Fare']

# Work on a copy of the training split and replace missing Age values with
# the mean Age of that copy.
num_df = df_train[NUMERIC_FEATURES].copy()
num_df['Age'] = num_df['Age'].fillna(num_df['Age'].mean())
num_df

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Columns that get one-hot encoded.
CATEGORICAL_FEATURES = ['Pclass', 'Sex', 'Embarked', 'CabinLetter']
# Copy of the categorical columns of the training split.
cat_df = df_train[CATEGORICAL_FEATURES].copy()
cat_df

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
one_hot_encoder = OneHotEncoder(drop='first')
cat_features = one_hot_encoder.fit_transform(cat_df).todense()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the legacy method on older releases.
get_names = getattr(one_hot_encoder, 'get_feature_names_out', None) or one_hot_encoder.get_feature_names
pd.DataFrame(cat_features, columns=get_names())

In [None]:
# Import StandardScaler in this cell so the name is defined when the notebook
# is run top to bottom on a fresh kernel.
from sklearn.preprocessing import StandardScaler

# Standardise the numeric features and show them with their column names.
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(num_df), columns=num_df.columns)
scaled_df

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Same arguments and seed as the earlier split; repeated so this cell
# defines df_train / df_test itself.
df_train, df_test = train_test_split(df, test_size=0.25, random_state=0)


def fit_transformers(df_train):
    """Fit the one-hot encoder and the scaler on a training frame.

    The encoder is fitted on the CATEGORICAL_FEATURES columns of `df_train`;
    the scaler is fitted on the unscaled output of compute_features.

    Args:
        df_train: DataFrame that already contains the CATEGORICAL_FEATURES columns.

    Returns:
        Tuple (scaler, one_hot_encoder) -- note the scaler comes first.
    """
    encoder = OneHotEncoder(drop='first')
    encoder.fit(df_train[CATEGORICAL_FEATURES])

    # scaler=None makes compute_features return the features unscaled.
    unscaled_features = compute_features(df_train, encoder, scaler=None)
    fitted_scaler = StandardScaler()
    fitted_scaler.fit(unscaled_features)
    return fitted_scaler, encoder

def compute_features(df, one_hot_encoder, scaler):
    """Build the model's feature matrix from a passenger frame.

    Args:
        df: DataFrame accepted by preprocess_dataframe that also contains the
            NUMERIC_FEATURES columns.
        one_hot_encoder: fitted encoder exposing `transform` and either
            `get_feature_names_out` or the legacy `get_feature_names`.
        scaler: fitted scaler exposing `transform`, or None to skip scaling.

    Returns:
        DataFrame with a fresh 0..n-1 index: the numeric columns first,
        followed by the one-hot encoded columns.
    """
    df = preprocess_dataframe(df)
    cat_features = one_hot_encoder.transform(df[CATEGORICAL_FEATURES])
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the legacy method on older releases.
    get_names = getattr(one_hot_encoder, 'get_feature_names_out', None) or one_hot_encoder.get_feature_names
    cat_df = pd.DataFrame(cat_features.toarray(), columns=get_names())
    # Drop the original index so the numeric and one-hot frames align row by row.
    num_df = df[NUMERIC_FEATURES].reset_index(drop=True)
    # Fixed fill values for missing Age / Fare.
    # NOTE(review): presumably approximate training-set averages -- confirm.
    num_df['Age'] = num_df['Age'].fillna(38.0)
    num_df['Fare'] = num_df['Fare'].fillna(44)
    features = pd.concat([num_df, cat_df], axis=1)
    # Explicit None check: do not rely on the truthiness of the scaler object.
    if scaler is not None:
        features = pd.DataFrame(scaler.transform(features), columns=features.columns)
    return features

# Fit the encoder and scaler on the training split only.
scaler, one_hot_encoder = fit_transformers(df_train)

# Labels for both splits.
y_train = df_train['Survived']
y_test = df_test['Survived']

# Feature matrices for both splits, built with the transformers fitted above.
X_train = compute_features(df_train, one_hot_encoder, scaler)
X_test = compute_features(df_test, one_hot_encoder, scaler)

In [None]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier

# Gradient-boosted classifier with default settings.
model = XGBClassifier()
# Baseline that always predicts the most frequent training label.
baseline = DummyClassifier(strategy='most_frequent')

# Fit both on the same training features and labels.
model.fit(X_train, y_train)
baseline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Classification report for the model on the held-out split.
print(classification_report(y_test, model.predict(X_test)))

In [None]:
# Same report for the baseline; zero_division=0 is passed because the baseline
# only ever predicts one label.
print(classification_report(y_test, baseline.predict(X_test), zero_division=0))

In [None]:
# Predicted labels for the held-out split.
model.predict(X_test)

## Submission.

In [None]:
# Train the model on all training data.
# Refit the transformers and the model on the full training CSV.
# The frame is preprocessed up front because fit_transformers reads the
# CATEGORICAL_FEATURES columns (which include the derived CabinLetter) directly.
sub_train_df = preprocess_dataframe(
    pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
)
scaler, one_hot_encoder = fit_transformers(sub_train_df)

y_train = sub_train_df['Survived']
X_train = compute_features(sub_train_df, one_hot_encoder, scaler)

# Refits the existing model object in place.
model.fit(X_train, y_train)

In [None]:
# Run inference on the test dataset and create a submission.
# Load the competition test CSV and build its features with the transformers
# fitted on the full training data. Note: this rebinds X_test.
sub_test_df = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
X_test = compute_features(sub_test_df, one_hot_encoder, scaler)

# Submission frame: PassengerId plus the predicted Survived label.
submission_df = sub_test_df[['PassengerId']].assign(Survived=model.predict(X_test))
submission_df.head(10)

In [None]:
# Write the submission to the working directory without the index column.
submission_df.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame.
submission_df

In [None]:
# List the working directory (submission.csv was written there above).
!ls