# Credits
Based on:
https://www.kaggle.com/code/mviola/titanic-wcg-knns-ensemble-0-82775-top-1/notebook

Some features are based on:
https://www.kaggle.com/code/gunesevitan/titanic-advanced-feature-engineering-tutorial/notebook

https://www.kaggle.com/code/konstantinmasich/titanic-0-82-0-83

I tried to achieve the highest possible score. To do that, I combined code from the highest-scoring Python notebooks mentioned above and made some changes: soft voting, more KNN models with different parameters, and more features.


# Imports

In [3]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

# Load data

In [4]:
# Read the train and test splits and stack them (train rows first) into one
# frame with a fresh 0..n-1 index so features are engineered on both at once.
train_data = pd.read_csv("../data/titanic/train.csv")
test_data = pd.read_csv("../data/titanic/test.csv")
data = pd.concat([train_data, test_data]).reset_index(drop=True)

# Feature Engineering

Add married status

In [5]:
# Title = text between the ", " that follows the surname and the first ".".
# NOTE(review): "Mlle" and "Ms" are folded into "Mrs" here, so they are also
# flagged as married below — confirm this is intended.
data["Title"] = (
    data["Name"]
    .str.split(", ", expand=True)[1]
    .str.split(".", expand=True)[0]
    .replace(["Mlle", "Ms"], "Mrs")
)
# 1 when the (normalized) title is "Mrs", otherwise 0.
data["Married_Status"] = data["Title"].eq("Mrs").astype(int)

Fill missing age based on title

In [6]:
# Re-derive the title as the first run of letters followed by a ".".
# Raw string: "\." inside a plain literal is an invalid escape sequence.
data["Title"] = data["Name"].str.extract(r"([A-Za-z]+)\.", expand=False)

# Replacing rare titles with more common ones
title_mapping = {
    "Mlle": "Miss",
    "Major": "Mr",
    "Col": "Mr",
    "Sir": "Mr",
    "Don": "Mr",
    "Mme": "Miss",
    "Jonkheer": "Mr",
    "Lady": "Mrs",
    "Capt": "Mr",
    "Countess": "Mrs",
    "Ms": "Miss",
    "Dona": "Mrs",
}
data["Title"] = data["Title"].replace(title_mapping)

# Median age per title, computed once and looked up BY LABEL. The previous
# integer lookup (`[titles.index(title)]`) only worked while the sorted group
# keys happened to line up with `titles` and relied on positional fallback.
median_age_by_title = data.groupby("Title")["Age"].median()

titles = ["Dr", "Master", "Miss", "Mr", "Mrs", "Rev"]
for title in titles:
    missing_age = data["Age"].isnull() & (data["Title"] == title)
    data.loc[missing_age, "Age"] = median_age_by_title[title]

# This Title column was only needed for the age imputation.
data = data.drop(columns="Title")

Group into deck groups based on cabin survival rate

In [7]:
# Deck is the first character of the cabin code; "M" marks a missing cabin.
data["Deck"] = data["Cabin"].str[0].fillna("M")

# Merge decks into coarser groups. The result is assigned back to the column:
# `data["Deck"].replace(..., inplace=True)` mutates a temporary Series and does
# not update `data` when pandas Copy-on-Write is active.
deck_groups = {
    "T": "M",
    "B": "BDE",
    "D": "BDE",
    "E": "BDE",
    "C": "CF",
    "F": "CF",
    "A": "AG",
    "G": "AG",
}
data["Deck"] = data["Deck"].replace(deck_groups)

# Create woman-child groups model

Extract surnames

In [8]:
# Surname is everything before the first comma in the passenger name.
data["Surname"] = data["Name"].map(lambda full_name: full_name.partition(",")[0])

Create ticket ID

In [9]:
def transform_to_ticket_ID(row):
    """Add a ``Ticket_ID`` entry to a passenger row and return the row.

    The ID joins, with "-", the row's ``Pclass``, its ``Ticket`` without the
    last character, its ``Fare`` and its ``Embarked`` value. The row passed in
    is modified in place and returned.
    """
    ticket_stem = row.Ticket[:-1]  # drop the final character of the ticket
    row["Ticket_ID"] = "{}-{}-{}-{}".format(row.Pclass, ticket_stem, row.Fare, row.Embarked)
    return row


# Row-wise pass (axis=1 is the same as axis="columns") that adds Ticket_ID.
data = data.apply(transform_to_ticket_ID, axis=1)

Create group id

In [10]:
def transform_to_group_ID(row):
    """Add a ``Group_ID`` entry ("<Surname>-<Ticket_ID>") to a row and return it.

    The row passed in is modified in place and returned.
    """
    row["Group_ID"] = "{}-{}".format(row.Surname, row.Ticket_ID)
    return row


# Row-wise pass (axis=1 is the same as axis="columns") that adds Group_ID.
data = data.apply(transform_to_group_ID, axis=1)

Create woman, man, boy groups

In [11]:
# Re-purpose Title as a coarse label: females are "woman", everyone else "man",
# then any passenger whose name contains "Master" is relabelled "boy".
data["Title"] = np.where(data["Sex"] == "female", "woman", "man")
data.loc[data["Name"].str.contains("Master"), "Title"] = "boy"

Assign `no_group` to men and to passengers whose group has only one member

In [12]:
# Men never belong to a woman-child group.
is_man = data["Title"] == "man"
data.loc[is_man, "Group_ID"] = "no_group"

# Size of each group counted over women and boys only (NaN for men).
group_sizes = data.loc[~is_man].groupby("Group_ID")["Group_ID"].transform("count")
data["Group_Count"] = group_sizes

# A "group" of one passenger is not a group.
data.loc[data["Group_Count"] <= 1, "Group_ID"] = "no_group"

Assign ungrouped women and boys to a group based on their ticket, if there is one

In [13]:
# For every woman/boy still marked "no_group", adopt the Group_ID of the first
# woman/boy (in row order) that shares the same Ticket_ID. That may be the
# passenger itself, in which case the value stays "no_group".
# The candidate rows are taken from the frame instead of a hard-coded
# range(0, 1309); a row's Group_ID is only written in its own iteration, so
# selecting the candidates up front visits exactly the same rows.
non_man = data["Title"] != "man"
ungrouped_rows = data.index[non_man & (data["Group_ID"] == "no_group")]
for i in ungrouped_rows:
    same_ticket = non_man & (data["Ticket_ID"] == data.loc[i, "Ticket_ID"])
    data.loc[i, "Group_ID"] = data.loc[same_ticket, "Group_ID"].iloc[0]

Calculate survival for groups

In [14]:
# Mean of Survived within each woman-child group, broadcast back to its members.
# Rows that are men or have no group are left as NaN.
grouped_rows = (data["Title"] != "man") & (data["Group_ID"] != "no_group")
data["Group_Survival"] = (
    data.loc[grouped_rows].groupby("Group_ID")["Survived"].transform("mean")
)

For groups that appear only in the test set, set group survival to 0 for 3rd class passengers and to 1 otherwise

In [15]:
# Columns that are handy when inspecting the groups interactively.
group_columns = ["PassengerId", "Survived", "Group_Survival", "Name", "Title", "Group_ID"]

# Groups that occur in the test rows (891..1308) but in none of the train rows,
# so no survival mean could be computed for them.
test_groups = set(data[891:1309].Group_ID.unique()) - set(data[0:891].Group_ID.unique())
# (A sorted view of these rows used to be built here, but it was neither
# displayed nor stored, so the statement has been removed.)

# Default for such groups: 0, overridden with 1 when the passenger is not in 3rd class.
in_test_only_group = data.Group_ID.isin(test_groups)
data.loc[in_test_only_group, "Group_Survival"] = 0
data.loc[in_test_only_group & (data.Pclass != 3), "Group_Survival"] = 1

Families prediction
For test passengers, default the prediction to 0; set women to 1 unless their group survival is 0, and set boys to 1 when their group survival is 1

In [16]:
# Rule-based prediction for the test rows (index labels 891..1308).
# Every write is a single `.loc[row_mask, column] = value`. The previous form,
# `data.loc[...][mask] = value`, is chained assignment: it writes into a
# temporary Series and does not reach `data` under pandas Copy-on-Write.
test_rows = (data.index >= 891) & (data.index <= 1308)
is_female = data["Sex"] == "female"

# Everyone in the test set is predicted to die by default ...
data.loc[891:1308, "Survival_Prediction"] = 0
# ... women survive ...
data.loc[is_female & test_rows, "Survival_Prediction"] = 1
# ... unless their group survival is 0 ...
data.loc[is_female & test_rows & (data["Group_Survival"] == 0), "Survival_Prediction"] = 0
# ... and boys survive when their group survival is 1.
data.loc[
    (data["Title"] == "boy") & test_rows & (data["Group_Survival"] == 1),
    "Survival_Prediction",
] = 1

Adjust fare by count of given ticket

In [17]:
# Fare divided by the number of rows that share the same ticket.
passengers_per_ticket = data.groupby("Ticket")["Ticket"].transform("count")
data["Adjusted_Fare"] = data["Fare"].div(passengers_per_ticket)

# Create Gender Based Survival Models

In [18]:
def get_gender_based_data(sex: str):
    """Return ``(train, test)`` rows of the given sex that have no group survival.

    Reads the notebook-level ``data`` frame. The train part is rows 0..890 and
    the test part rows 891..1308; both are filtered to ``Sex == sex`` with a
    null ``Group_Survival``.
    """
    ungrouped_of_sex = (data.Sex == sex) & (data.Group_Survival.isnull())
    train_part = data[0:891]
    test_part = data[891:1309]
    return train_part.loc[ungrouped_of_sex], test_part.loc[ungrouped_of_sex]

In [19]:
from typing import List


def create_preprocessor(numerical_columns: List[str], categorical_columns: List[str]):
    """Build the column-wise preprocessing step used by the survival models.

    Args:
        numerical_columns: columns imputed with ``SimpleImputer()`` defaults
            and then standardized with ``StandardScaler``.
        categorical_columns: columns imputed with the most frequent value and
            one-hot encoded (categories unseen at fit time are ignored).

    Returns:
        An unfitted ``ColumnTransformer`` with a "num" and a "cat" branch.
    """
    numeric_steps = [("imputer", SimpleImputer()), ("scaler", StandardScaler())]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]

    return ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=numeric_steps), numerical_columns),
            ("cat", Pipeline(steps=categorical_steps), categorical_columns),
        ]
    )

## Male Survival Model

Get isolated data

In [20]:
# Male passengers with a null Group_Survival, split into train and test rows.
train_male, test_male = get_gender_based_data("male")

Create preprocessor with given numerical and categorical columns

In [21]:
# Feature columns for the male model, split by how they are preprocessed.
# NOTE(review): Married_Status is derived from the "Mrs" title, so it is
# presumably constant for men — confirm it is wanted here.
numerical_cols_m = ["Adjusted_Fare"]
categorical_cols_m = ["Pclass", "Embarked", "Married_Status"]
preprocessor_m = create_preprocessor(
    numerical_columns=numerical_cols_m,
    categorical_columns=categorical_cols_m,
)

Create an ensemble of KNNs

In [22]:
# Five KNN classifiers that differ only in the number of neighbours.
m1, m2, m3, m4, m5 = (
    KNeighborsClassifier(n_neighbors=k) for k in (1, 3, 5, 7, 16)
)

# Soft voting: the ensemble combines the members' predicted probabilities.
male_voter = VotingClassifier(
    [("m1", m1), ("m2", m2), ("m3", m3), ("m4", m4), ("m5", m5)],
    voting="soft",
)

# Preprocess, then vote.
male_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor_m), ("voting", male_voter)]
)

In [23]:
# Training matrix and target for the male model.
features_m = ["Adjusted_Fare", "Pclass", "Embarked", "Married_Status"]
X_m = train_male.loc[:, features_m]
y_m = train_male.loc[:, "Survived"]

In [24]:
# Fit on the male training rows (fit returns the pipeline itself) and keep the
# in-sample predictions for inspection.
learn_train_m = male_pipeline.fit(X_m, y_m).predict(X_m)

In [25]:
# Probability of class 1 for each male test row, thresholded at 0.5.
X_test_m = test_male.loc[:, features_m]
proba_m = male_pipeline.predict_proba(X_test_m)[:, 1]
predictions_m = (proba_m > 0.5).astype(int)

In [26]:
# Names of the men the model predicts to survive.
predicted_to_survive = predictions_m == 1
survived_m = test_male.loc[predicted_to_survive, ["Name"]]
print(f"Predicted that {len(survived_m)} men will survive:", survived_m, sep="\n")

Predicted that 42 men will survive:
                                    Name
904                 Howard, Mr. Benjamin
911               Rothschild, Mr. Martin
919              Brady, Mr. John Bertram
921         Louch, Mr. Charles Alexander
925             Mock, Mr. Philipp Edmund
930                        Hee, Mr. Ling
937             Chevre, Mr. Paul Romaine
941             Smith, Mr. Lucien Philip
959      Tucker, Mr. Gilbert Milligan Jr
969       Aldworth, Mr. Charles Augustus
985                  Birnbaum, Mr. Jakob
986           Tenglin, Mr. Gunnar Isidor
992                   Weisz, Mr. Leopold
1000                   Swane, Mr. George
1017            Brobeck, Mr. Karl Rudolf
1019                Bowenur, Mr. Solomon
1022           Gracie, Col. Archibald IV
1028                 Schmidt, Mr. August
1034          Beauchamp, Mr. Henry James
1040              Lahtinen, Rev. William
1055       Peruschitz, Rev. Joseph Maria
1068     Stengel, Mr. Charles Emil Henry
1071           McCrie

## Solo Female Survival Model

Get isolated data

In [27]:
# Female passengers with a null Group_Survival, split into train and test rows.
train_female, test_female = get_gender_based_data("female")

Create preprocessor with given numerical and categorical columns

In [28]:
# Feature columns for the solo-female model, split by how they are preprocessed.
numerical_cols_f = ["Adjusted_Fare", "Age"]
categorical_cols_f = ["Pclass", "Deck", "Married_Status"]
preprocessor_f = create_preprocessor(
    numerical_columns=numerical_cols_f,
    categorical_columns=categorical_cols_f,
)

Create an ensemble of KNNs

In [29]:
# Five KNN classifiers that differ only in the number of neighbours.
f1, f2, f3, f4, f5 = (
    KNeighborsClassifier(n_neighbors=k) for k in (2, 5, 9, 10, 16)
)

# No `voting` argument is passed, so VotingClassifier's default is used.
# NOTE(review): the male ensemble passes voting="soft"; confirm the difference
# is deliberate.
female_voter = VotingClassifier(
    [("f1", f1), ("f2", f2), ("f3", f3), ("f4", f4), ("f5", f5)]
)

# Preprocess, then vote.
female_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor_f), ("voting", female_voter)]
)

In [30]:
# Training matrix and target for the solo-female model.
features_f = ["Adjusted_Fare", "Pclass", "Deck", "Age", "Married_Status"]
X_f = train_female.loc[:, features_f]
y_f = train_female.loc[:, "Survived"]

In [31]:
# Fit on the female training rows (fit returns the pipeline itself) and keep
# the in-sample predictions for inspection.
learn_train_f = female_pipeline.fit(X_f, y_f).predict(X_f)

In [32]:
# Class predictions for each female test row.
X_test_f = test_female.loc[:, features_f]
predictions_f = female_pipeline.predict(X_test_f)

In [33]:
# Names of the non-group women the model predicts not to survive.
predicted_to_die = predictions_f == 0
died_f = test_female.loc[predicted_to_die, ["Name"]]
print(f"Predicted that {len(died_f)} non-group women will not survive:", died_f, sep="\n")

Predicted that 8 non-group women will not survive:
                                        Name
897                     Connolly, Miss. Kate
963           Nieminen, Miss. Manta Josefina
977                       Barry, Miss. Julia
1097                McGowan, Miss. Katherine
1105  Andersson, Miss. Ida Augusta Margareta
1182  Daly, Miss. Margaret Marcella Maggie""
1204                     Carr, Miss. Jeannie
1303          Henriksson, Miss. Jenny Lovisa


# Save the output

Update the data according to the predictions made by the models

In [34]:
# Write each model's predictions back onto exactly the rows it was run on.
# test_female / test_male are the test rows of that sex with a null
# Group_Survival, which is the selection the previous masks expressed.
# A single `.loc[rows, column] = values` replaces the chained form
# `data.loc[...][mask] = values`, which assigns into a temporary Series and
# does not update `data` under pandas Copy-on-Write.
data.loc[test_female.index, "Survival_Prediction"] = predictions_f
data.loc[test_male.index, "Survival_Prediction"] = predictions_m

Generate output file

In [36]:
# Submission frame for the test rows (index labels 891..1308): passenger id
# plus the final prediction cast to int and exposed as "Survived".
output = (
    data.loc[891:1308, ["PassengerId", "Survival_Prediction"]]
    .rename(columns={"Survival_Prediction": "Survived"})
    .astype({"Survived": "int"})
)
# output.to_csv("/kaggle/working/submission.csv", index=False)

In [41]:
# Final integer predictions for the test rows (index labels 891..1308).
prediction = data.loc[891:1308, "Survival_Prediction"].astype("int")

In [38]:
from sklearn.metrics import accuracy_score

In [39]:
# Load the comparison submission and keep everything except the id column.
submission_a = pd.read_csv("../data/submission_a.csv").drop(columns="PassengerId")

In [42]:
# accuracy_score takes (y_true, y_pred), so the comparison labels go first and
# this notebook's predictions second. The one-column frame is flattened to 1-D
# so the comparison is explicitly positional (row i against row i).
# NOTE(review): submission_a is presumably another submission rather than
# ground truth, in which case this number is agreement, not accuracy.
accuracy = accuracy_score(submission_a.to_numpy().ravel(), prediction)

accuracy

0.7822966507177034