### Trialling TPOT

I have not used TPOT before, so this is a good opportunity to compare it with the model I produced after some basic EDA and modelling.

This is more of a draft/trial to see how TPOT performs and how much feature engineering is needed.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report
from tpot import TPOTClassifier

In [None]:
# Load the raw training data; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/raw/train.csv")

In [None]:
# Preview the first rows of the training frame.
df.head()

Only applying basic imputation and cleaning from the main notebook:

In [None]:
def _first_char(cabin):
    """Return the first character of a cabin string, stripped of whitespace."""
    return cabin[0].strip()


def _ticket_prefix(ticket):
    """Return the leading token of a multi-token ticket, or 'X' otherwise."""
    tokens = str(ticket).split()
    if len(tokens) > 1:
        return tokens[0]
    return 'X'


# The label lives in its own single-column frame, renamed to 'class'.
target = df[['Survived']].rename(columns={'Survived': 'class'})

# Missing values in these text columns are replaced by the placeholder 'X'.
na_fill = {'Embarked': 'X', 'Cabin': 'X', 'Ticket': 'X'}
df.fillna(value=na_fill, inplace=True)

# Reduce Cabin to its first character and Ticket to its prefix token.
df['Cabin'] = df['Cabin'].map(_first_char)
df['Ticket'] = df['Ticket'].map(_ticket_prefix)

# Median Age per Sex and median Fare per Pclass, computed on rows where both
# columns are present. Each dict has the shape {column: {group: median}}.
age_map = (
    df[['Age', 'Sex']]
    .dropna()
    .groupby('Sex')
    .median()
    .to_dict()
)
fare_map = (
    df[['Fare', 'Pclass']]
    .dropna()
    .groupby('Pclass')
    .median()
    .to_dict()
)

# Fill the gaps in each numeric column with its group median.
age_fallback = df['Sex'].map(age_map['Age'])
df['Age'] = df['Age'].fillna(age_fallback)
fare_fallback = df['Pclass'].map(fare_map['Fare'])
df['Fare'] = df['Fare'].fillna(fare_fallback)

In [None]:
# Columns that are standard-scaled by the preprocessing step.
numeric = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

# Columns that are one-hot encoded by the preprocessing step.
categorical = [
    'Pclass',
    'Sex',
    'Embarked',
    'Cabin',
    'Ticket',
]

In [None]:
# Print a summary (dtype, non-null count) of the label frame.
target.info()

In [None]:
# Print a per-column summary of the cleaned training frame, e.g. to check for remaining nulls.
df.info()

In [None]:
# Scale the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' encodes categories that were not seen during fit as
# all-zero rows instead of raising, so the fitted transformer can later be
# applied to data with new levels (e.g. ticket prefixes). The output on the
# training data is the same as with the default setting.
# NOTE(review): with the default sparse_threshold the combined output may be a
# SciPy sparse matrix - confirm the downstream estimator accepts sparse input.
pre_process = ColumnTransformer(
    [
        ('num', StandardScaler(), numeric),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ],
    n_jobs=-1,
)
prep = pre_process.fit_transform(df)

In [None]:
# Display the transformed feature matrix produced by the preprocessing step.
prep

In [None]:
# Display the label frame (single 'class' column).
target

The choice of `config_dict` has a huge impact on run time - either my GPU configuration is incomplete, or my GTX 1080 is showing its age!

In [None]:
# Search for a classification pipeline using TPOT's neural-network
# configuration, scored by F1 with 3-fold cross-validation. early_stop=5 ends
# the search after five generations without improvement.
# NOTE(review): max_eval_time_mins limits the time spent on a *single*
# pipeline; if 420 minutes was meant as the overall budget, the parameter is
# max_time_mins - confirm.
pipeline_optimizer = TPOTClassifier(max_eval_time_mins=420, cv=3, early_stop=5,
                                    random_state=42, verbosity=3, config_dict='TPOT NN',
                                    scoring='f1', n_jobs=-1)

# `target` is a one-column DataFrame; flatten it to the 1-D label array that
# fit() expects.
pipeline_optimizer.fit(prep, np.ravel(target))

# The cell previously ended with `model.fit(X_train, np.ravel(y_train))`, but
# none of those names are defined in this notebook, so it raised NameError.
# Bind `model` to the fitted optimizer instead, so the name that line expected
# now exists.
model = pipeline_optimizer

Apply the transformations to the test set and prepare the submission CSV.

In [None]:
file_out = "../data/inference/tpot.csv"
test_df = pd.read_csv("../data/raw/test.csv")

# Take exactly the columns the fitted transformer was given (including
# 'Ticket'), as an explicit copy so the assignments below modify this frame
# rather than a slice of test_df.
test_features = test_df[numeric + categorical].copy()

# Apply the same cleaning as the training frame: placeholder for missing text
# values, first character of Cabin, prefix token of Ticket.
test_features = test_features.fillna(value=na_fill)
test_features['Cabin'] = test_features['Cabin'].map(lambda x: x[0].strip())
test_features['Ticket'] = test_features['Ticket'].map(
    lambda x: str(x).split()[0] if len(str(x).split()) > 1 else 'X')

# Impute Fare and Age with the medians computed on the training data, so the
# test features are prepared exactly like the features the model was fit on.
test_features['Fare'] = test_features['Fare'].fillna(
    test_features['Pclass'].map(fare_map['Fare']))
test_features['Age'] = test_features['Age'].fillna(
    test_features['Sex'].map(age_map['Age']))

# Encode with the transformer fitted on the training data, then predict with
# the fitted TPOT optimizer.
# NOTE: transform() raises on categories unseen during fit unless the encoder
# was created with handle_unknown='ignore'.
test_unseen = pipeline_optimizer.predict(pre_process.transform(test_features))

# Write the submission file: one prediction per PassengerId.
test_df['Survived'] = test_unseen.tolist()
test_submission = test_df[['PassengerId', 'Survived']]
test_submission.to_csv(file_out, index=False)