In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the competition CSV files from ../input/titanic into DataFrames.
train_data = pd.read_csv(filepath_or_buffer="../input/titanic/train.csv")
test_data = pd.read_csv(filepath_or_buffer="../input/titanic/test.csv")

In [None]:
# Sum of the Survived column in the training set.
# NOTE(review): presumably Survived is a 0/1 flag, making this the survivor count - confirm.
train_data["Survived"].sum()

In [None]:
# Print the training frame's column names, non-null counts and dtypes.
train_data.info()

In [None]:
# Descriptive statistics for the training frame.
train_data.describe()

In [None]:
# Print the test frame's column names, non-null counts and dtypes.
test_data.info()

In [None]:
# Descriptive statistics for the test frame.
test_data.describe()

Explore missing values - looks like we'll have to impute the age. Embarked only has two missing values, so we can impute this as well.

In [None]:
# Number of missing values in each column of the training frame.
train_data.isna().sum()

In [None]:
# Number of missing values in each column of the test frame.
test_data.isna().sum()

Explore the features available out of the box and their impact on survival rate

In [None]:
# Per-group mean of every numeric column, grouped by Embarked.
# numeric_only=True is needed on pandas >= 2.0: the default no longer drops
# string columns (e.g. Name), so a bare .mean() raises TypeError.
train_data.groupby("Embarked").mean(numeric_only=True)

In [None]:
# Per-group mean of every numeric column, grouped by Sex.
# numeric_only=True is needed on pandas >= 2.0: the default no longer drops
# string columns (e.g. Name), so a bare .mean() raises TypeError.
train_data.groupby("Sex").mean(numeric_only=True)

In [None]:
# Per-group mean of every numeric column, grouped by Pclass.
# numeric_only=True is needed on pandas >= 2.0: the default no longer drops
# string columns (e.g. Name), so a bare .mean() raises TypeError.
train_data.groupby("Pclass").mean(numeric_only=True)

In [None]:
# Per-group mean of every numeric column, grouped by SibSp.
# numeric_only=True is needed on pandas >= 2.0: the default no longer drops
# string columns (e.g. Name), so a bare .mean() raises TypeError.
train_data.groupby("SibSp").mean(numeric_only=True)

In [None]:
# Per-group mean of every numeric column, grouped by Parch.
# numeric_only=True is needed on pandas >= 2.0: the default no longer drops
# string columns (e.g. Name), so a bare .mean() raises TypeError.
train_data.groupby("Parch").mean(numeric_only=True)

Add some additional features - 

First extract the title from the name - could be an indicator of social status, which may be relevant to survival rate. We already know that sex is a huge influencer - but the rarer titles may also influence the outcome

In [None]:
data = [train_data, test_data]

# Titles pooled into a single "Rare" bucket.
rare_titles = ["Lady", "Countess", "Capt", "Col", "Don", "Dr", "Major", "Rev", "Sir", "Jonkheer", "Dona"]
# Spelling variants folded into their standard title.
title_aliases = {"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"}

for dataset in data:
    # The title is the word that directly precedes a "." in Name.
    # The pattern is a raw string: "\." in a normal string literal is an
    # invalid escape sequence (a warning today, an error in future Pythons).
    dataset["Title"] = (
        dataset["Name"]
        .str.extract(r" ([A-Za-z]+)\.", expand=False)
        .replace(rare_titles, "Rare")
        .replace(title_aliases)
        .fillna("Unknown")  # names with no matching title
    )

In [None]:
# Bar chart of Survived by Title (training set).
sns.catplot(
    data=train_data,
    kind="bar",
    x="Title",
    y="Survived",
    aspect=2.5,
)

Use the Parch & SibSp features to count the number of relatives travelling with each passenger, and add a flag for whether they travelled alone - as the visualisations below show, both appear to influence the survival rate

In [None]:
datasets = [train_data, test_data]

for dataset in datasets:
    # Total relatives aboard = parents/children + siblings/spouses.
    dataset["Relatives"] = dataset["Parch"] + dataset["SibSp"]
    # Yes/No flag: "Yes" when the passenger has no relatives aboard.
    dataset.loc[dataset["Relatives"] == 0, "TravelledAlone"] = "Yes"
    dataset.loc[dataset["Relatives"] > 0, "TravelledAlone"] = "No"

# numeric_only=True is needed on pandas >= 2.0: the default no longer drops
# string columns (e.g. Name), so a bare .mean() raises TypeError.
train_data.groupby("TravelledAlone").mean(numeric_only=True)

In [None]:
# Bar chart of Survived by number of relatives aboard (training set).
sns.catplot(
    data=train_data,
    kind="bar",
    x="Relatives",
    y="Survived",
    aspect=2.5,
)

Travelling alone, if you were male, had a negative impact on your chances of survival - women however survived more often when travelling alone!

In [None]:
# Bar chart of Survived by the TravelledAlone flag, split by Sex.
sns.catplot(
    data=train_data,
    kind="bar",
    x="TravelledAlone",
    y="Survived",
    hue="Sex",
)

Strip out the first character from the Cabin feature - this appears to be the deck - could be relevant

In [None]:
# Deck = first character of the Cabin value.
# Each frame must read its OWN Cabin column. The original built
# test_data["Deck"] from train_data["Cabin"], which index-aligned the cabins
# of unrelated training passengers onto the test rows.
for dataset in (train_data, test_data):
    dataset["Deck"] = dataset["Cabin"].str.slice(stop=1)

In [None]:
# Number of training rows per Deck value (value_counts skips missing values by default).
train_data["Deck"].value_counts()

In [None]:
# Bar chart of Survived by Deck (training set).
sns.catplot(
    data=train_data,
    kind="bar",
    x="Deck",
    y="Survived",
    aspect=2.5,
)

In [None]:
# Bar chart of Survived by Pclass (training set).
sns.catplot(
    data=train_data,
    kind="bar",
    x="Pclass",
    y="Survived",
    aspect=2.5,
)

Drop the original columns that the engineered features replace (Name, SibSp, Parch, Cabin), along with Ticket, which is not used.

In [None]:
# Remove the same raw columns from both frames, in place.
columns_to_drop = ["Name", "Ticket", "SibSp", "Parch", "Cabin"]
for frame in (train_data, test_data):
    frame.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Re-check the number of missing values per column in the training frame.
train_data.isna().sum()

Define a pipeline (probably not needed - this is just for my own practice)

In [None]:
# Detach the target: pop() returns the Survived column and removes it from
# train_data in place, leaving only feature columns behind.
y = train_data.pop("Survived")

In [None]:
# PassengerId is only the submission key (see the output cell). Leaving it in
# numerical_cols would feed a row identifier to the model as a feature, so it
# is excluded here; ColumnTransformer drops any column not listed below.
id_column = "PassengerId"
feature_columns = [cname for cname in train_data.columns if cname != id_column]

# Split the remaining columns by dtype: numeric vs. string (object).
numerical_cols = [cname for cname in feature_columns if train_data[cname].dtype in ["float64", "int64"]]
categorical_cols = [cname for cname in feature_columns if train_data[cname].dtype == "object"]

# Numeric columns: fill missing values with the column mean.
numerical_transformer = SimpleImputer(strategy="mean")

# Categorical columns: fill missing values with a constant placeholder, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols)
    ])

In [None]:
def score_model(train, target, n_estimators, max_depth, cv=2):
    """Cross-validate a random forest wrapped in the shared preprocessing pipeline.

    Parameters
    ----------
    train : feature table passed to ``cross_val_score`` as ``X``.
    target : labels passed to ``cross_val_score`` as ``y``.
    n_estimators : number of trees in the forest.
    max_depth : maximum depth of each tree.
    cv : cross-validation setting forwarded to ``cross_val_score``.
        Defaults to 2, the value that was previously hard-coded.

    Returns
    -------
    The accuracy scores returned by ``cross_val_score``.
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=0,  # fixed seed so repeated runs are comparable
    )
    # Uses the module-level `preprocessor` defined in an earlier cell.
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", forest)
    ])

    return cross_val_score(pipeline, train, target, cv=cv, scoring="accuracy")

In [None]:
# Candidate forest sizes: 50, 100, ..., 500.
n_estimators = list(range(50, 501, 50))

# Mean cross-validated accuracy per forest size, keyed by the size as a string.
# max_depth is held at 5 for every candidate.
scores = {}
for tree_count in n_estimators:
    scores[str(tree_count)] = score_model(train_data, y, tree_count, 5).mean()


In [None]:
# Display the n_estimators -> mean CV accuracy mapping built in the previous cell.
scores

In [None]:
# Line plot of mean cross-validated accuracy against forest size.
# Axis labels and a title are set so the figure is readable on its own.
plt.figure(figsize=(10, 5))
plt.plot(list(scores.keys()), list(scores.values()))
plt.xlabel("n_estimators")
plt.ylabel("Mean cross-validated accuracy")
plt.title("Random forest accuracy by number of trees")
plt.show()

In [None]:
# Final estimator: 250 trees, depth capped at 5, fixed seed.
my_final_model = RandomForestClassifier(
    n_estimators=250,
    max_depth=5,
    random_state=0,
)

# Chain the shared preprocessing with the final model.
final_steps = [("preprocessor", preprocessor), ("model", my_final_model)]
my_final_pipeline = Pipeline(steps=final_steps)

In [None]:
# Fit on the full training set, then predict for the test set.
# fit() returns the fitted pipeline, so the two calls can be chained.
preds = my_final_pipeline.fit(train_data, y).predict(test_data)

In [None]:
# Pair each test PassengerId with its prediction and write the submission
# file without the DataFrame index.
output = pd.DataFrame(
    {
        "PassengerId": test_data["PassengerId"],
        "Survived": preds,
    }
)
output.to_csv("my_submission.csv", index=False)