### Trialling TPOT

I have not used TPOT before, so this is a good opportunity to compare it with the model I produced after some basic EDA and modelling.

This notebook needs significant work!

In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tpot import TPOTClassifier

In [None]:
# Load the raw training data; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/raw/train.csv")

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Target: the 'Survived' column, renamed to 'class'
# (presumably to follow TPOT's naming convention — confirm).
target = df[['Survived']].rename(columns={'Survived': 'class'})
features = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Cabin']]

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42)

# Later cells add and overwrite columns on both frames. Take explicit copies so
# those assignments write to independent frames instead of slices of `df`
# (which triggers pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# NOTE(review): these two lists are reassigned before their first use further
# down the notebook; they are kept here only so existing names stay defined.
categorical = ['Pclass', 'Sex', 'Embarked']
numeric = ['Age', 'SibSp', 'Parch', 'Fare']

In [None]:
# Family features: party size (SibSp + Parch + the passenger) and a flag for
# passengers whose party size is 1.
X_train['FamilySize'] = X_train['SibSp'] + X_train['Parch'] + 1
X_train['IsAlone'] = np.where(X_train['FamilySize'] <= 1, 1, 0)

# Missing embarkation ports become their own 'X' category.
X_train['Embarked'] = X_train['Embarked'].fillna('X')

# Fill missing Age/Fare with the median of the passenger's (Sex, Pclass) group.
# transform('median') returns a Series aligned to X_train's own index, whereas
# groupby(...).apply(...) prepends the group keys to the index on pandas >= 2.0,
# after which the result can no longer be assigned back as a column.
for col in ['Age', 'Fare']:
    group_median = X_train.groupby(['Sex', 'Pclass'], sort=False)[col].transform('median')
    X_train[col] = X_train[col].fillna(group_median)

# Has_Cabin must be derived before the missing cabins are overwritten below.
X_train['Has_Cabin'] = X_train['Cabin'].notna().astype(int)

# Reduce Cabin to its first character ('X' when missing or empty) and encode it
# as an integer code.
# NOTE(review): 'T' shares code 1 with 'A' — presumably deliberate; confirm.
cabin_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5,
             'F': 6, 'G': 7, 'T': 1, 'X': 8}
X_train['Cabin'] = X_train['Cabin'].str.strip().str[0].fillna('X').map(cabin_map)

In [None]:
# Apply the same feature engineering to the hold-out split.
X_test['FamilySize'] = X_test['SibSp'] + X_test['Parch'] + 1
X_test['IsAlone'] = np.where(X_test['FamilySize'] <= 1, 1, 0)

# Missing embarkation ports become their own 'X' category.
X_test['Embarked'] = X_test['Embarked'].fillna('X')

# Fill missing Age/Fare with the median of the passenger's (Sex, Pclass) group.
# transform('median') keeps X_test's own index; groupby(...).apply(...) prepends
# the group keys to the index on pandas >= 2.0 and breaks the column assignment.
# NOTE(review): the medians are computed from X_test itself rather than reused
# from X_train — consider imputing with training-set statistics instead.
for col in ['Age', 'Fare']:
    group_median = X_test.groupby(['Sex', 'Pclass'], sort=False)[col].transform('median')
    X_test[col] = X_test[col].fillna(group_median)

# Has_Cabin must be derived before the missing cabins are overwritten below.
X_test['Has_Cabin'] = X_test['Cabin'].notna().astype(int)

# Reduce Cabin to its first character ('X' when missing or empty) and encode it
# with the same integer codes used for the training split.
cabin_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5,
             'F': 6, 'G': 7, 'T': 1, 'X': 8}
X_test['Cabin'] = X_test['Cabin'].str.strip().str[0].fillna('X').map(cabin_map)

In [None]:
# Preview the engineered training features.
X_train.head()

In [None]:
# Summary statistics for the training features.
X_train.describe()

In [None]:
# Preview the engineered hold-out features.
X_test.head()

In [None]:
# Summary statistics for the hold-out features.
X_test.describe()

In [None]:
# Column groups fed to the baseline model; anything not listed is dropped.
categorical = ['Pclass', 'Sex', 'Embarked', 'Cabin', 'IsAlone', 'Has_Cabin']
numeric = ['Age', 'Fare', 'FamilySize']

# Standardise the numeric columns and one-hot encode the categorical ones,
# collapsing two-level categories to a single indicator column.
scale_step = (StandardScaler(), numeric)
encode_step = (OneHotEncoder(drop='if_binary'), categorical)
preprocessor = make_column_transformer(scale_step, encode_step, remainder='drop')

In [None]:
# Baseline: the preprocessing step followed by a gradient-boosting classifier.
# The estimator is seeded so repeated runs of the notebook give the same model
# and the same reports below.
model_gb = make_pipeline(
    preprocessor,
    GradientBoostingClassifier(random_state=42))

# y_train is a one-column DataFrame; ravel it to the 1-D array the estimator expects.
# The trailing semicolon suppresses the fitted-pipeline repr in the cell output.
model_gb.fit(X_train, np.ravel(y_train));

In [None]:
# In-sample report: predictions on the same rows the model was fitted on.
y_pred_gb = model_gb.predict(X_train)
train_report = classification_report(y_train, y_pred_gb)
print(train_report)

In [None]:
# Hold-out report: predictions on the 20% split the model was not fitted on.
y_unseen_gb = model_gb.predict(X_test)
holdout_report = classification_report(y_test, y_unseen_gb)
print(holdout_report)

In [None]:
# Score the competition test file with the baseline model and write a submission CSV.
file_out = "../data/inference/basic_eda_xgb.csv"
test_df = pd.read_csv("../data/raw/test.csv")

# Work on an explicit copy so the column assignments below do not write to a
# slice of test_df (which triggers pandas' SettingWithCopyWarning).
test_features = test_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Cabin']].copy()

# Same feature engineering as the training and hold-out splits.
test_features['FamilySize'] = test_features['SibSp'] + test_features['Parch'] + 1
test_features['IsAlone'] = np.where(test_features['FamilySize'] <= 1, 1, 0)
test_features['Embarked'] = test_features['Embarked'].fillna('X')

# Fill missing Age/Fare with the median of the passenger's (Sex, Pclass) group.
# transform('median') keeps the frame's own index; groupby(...).apply(...)
# prepends the group keys to the index on pandas >= 2.0 and breaks the assignment.
# NOTE(review): medians come from the test file itself, not from the training data.
for col in ['Age', 'Fare']:
    group_median = test_features.groupby(['Sex', 'Pclass'], sort=False)[col].transform('median')
    test_features[col] = test_features[col].fillna(group_median)

# Has_Cabin must be derived before the missing cabins are overwritten below.
test_features['Has_Cabin'] = test_features['Cabin'].notna().astype(int)

# Reduce Cabin to its first character ('X' when missing or empty) and encode it
# with the same integer codes used for the training split.
cabin_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5,
             'F': 6, 'G': 7, 'T': 1, 'X': 8}
test_features['Cabin'] = test_features['Cabin'].str.strip().str[0].fillna('X').map(cabin_map)

# Predict, attach the predictions to the original frame, and keep only the
# two columns written to the submission file.
test_unseen = model_gb.predict(test_features)
test_df['Survived'] = test_unseen.tolist()
test_submission = test_df[['PassengerId', 'Survived']]
test_submission.to_csv(file_out, index=False)

In [None]:
# Cast the discrete training columns to pandas' categorical dtype, one at a time.
for cat_col in ['Pclass', 'Sex', 'Embarked', 'Cabin', 'IsAlone', 'Has_Cabin']:
    X_train[cat_col] = X_train[cat_col].astype('category')

In [None]:
# Check the column dtypes after the categorical casts.
X_train.dtypes

In [None]:
# TPOT search settings: 'TPOT light' configuration, population of 20,
# 5-fold cross-validation scored on accuracy, a 30-minute time budget,
# and a fixed seed.
pipeline_optimizer = TPOTClassifier(
    config_dict='TPOT light',
    population_size=20,
    cv=5,
    scoring='accuracy',
    max_time_mins=30,
    random_state=42,
    verbosity=2,
)

In [None]:
# Column groups for the TPOT run; anything not listed is dropped.
t_num = ['Age', 'SibSp', 'Parch', 'Fare']
t_cat = ['Pclass', 'Sex', 'Embarked', 'Cabin', 'IsAlone', 'Has_Cabin']

# Standardise the numeric columns and one-hot encode the categorical ones,
# collapsing two-level categories to a single indicator column.
tpot_scale_step = (StandardScaler(), t_num)
tpot_encode_step = (OneHotEncoder(drop='if_binary'), t_cat)
preprocessor = make_column_transformer(tpot_scale_step, tpot_encode_step, remainder='drop')

# Chain the preprocessing step in front of the TPOT optimiser.
model = make_pipeline(preprocessor, pipeline_optimizer)

# y_train is a one-column DataFrame; ravel it to a 1-D array before fitting.
_ = model.fit(X_train, np.ravel(y_train))