In [531]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import os

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [532]:
def load_train_dataset(path):
    csv_path = os.path.join(path, "train.csv")
    return pd.read_csv(csv_path)

def load_test_dataset(path):
    csv_path = os.path.join(path, "test.csv")
    return pd.read_csv(csv_path)

In [533]:
train = load_train_dataset("input/titanic")
test = load_test_dataset("input/titanic")

print(len(train))
print(len(test))

891
418


In [534]:
from sklearn.base import BaseEstimator, TransformerMixin

class WasAccompainedTranformer(BaseEstimator, TransformerMixin):
    """
    Extracts the information of wether the passenger had anyone (spouses, children, siblings or parents) with them during the trip.
    """
    def fit(self):
        return self
    
    def transform(self, X):
        was_accompained = [1.0 if sibsp or parch else 0.0 for (index, (sibsp, parch)) in X[['SibSp', 'Parch']].iterrows()]
        return_df = X.copy()
        return_df['WasAccompained'] = was_accompained

        return return_df

In [535]:
class HadCabinTransformer(BaseEstimator, TransformerMixin):
    """
    Extracts if it has a cabin or not and then drop the attribute Cabin itself.
    There are too many NaN cabin's to do basically anything else with it.
    """
    def fit(self):
        return self
    
    def transform(self, X):
        had_cabin = [0.0 if type(cabin) == float else 1.0 for cabin in X['Cabin']]

        return_df = X.copy()
        return_df['HadCabin'] = had_cabin
        return_df = return_df.drop('Cabin', axis=1)

        return return_df

In [536]:
class FareOutlierRemover(BaseEstimator, TransformerMixin):
    """
    Removes outlers from the attribute Fare based on the inferior_limit and superior_limit attributes of the class.
    Defaults values (determined by the training set outliers):
    inferior_limit=0
    superior_limit=500
    """
    def __init__(self, inferior_limit=0, superior_limit=500):
        self.inferior_limit = inferior_limit
        self.superior_limit = superior_limit

    def fit(self):
        return self
    
    def transform(self, X):
        # calculating outlier-free mean
        X_outlier_mask = (X['Fare'] <= self.inferior_limit) | (X['Fare'] > self.superior_limit)
        df_without_fare_outliers = X[X_outlier_mask]
        clean_mean = df_without_fare_outliers['Fare'].mean()
        
        return_df = X.copy()
        return_df_outlier_mask = (return_df['Fare'] <= self.inferior_limit) | (return_df['Fare'] > self.superior_limit)
        return_df.loc[return_df_outlier_mask, 'Fare'] = clean_mean

        return return_df

In [537]:
class TitleTransformer(BaseEstimator, TransformerMixin):
    """
    Extracts titles from the names to try and get some useful info out of it.
    """
    def fit(self):
        return self
    
    def transform(self, X):
        titles = [name[name.index(',')+2:name.index('.')] for name in X['Name']]

        return_df = X.copy()

        return_df['Title'] = titles

        # grouping rare titles into the title "Rare" to avoid excessive dimensions when OneHotEncoding
        rare_titles = ['Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Lady',
               'Major', 'Rev', 'Sir', 'the Countess', 'Dona', 'Mlle', 'Mme']

        return_df['Title'] = return_df['Title'].replace(rare_titles, 'Rare')
        return_df['Title'] = return_df['Title'].replace('Ms', 'Miss')

        return_df = return_df.drop('Name', axis=1)

        return return_df

In [538]:
class AgeInputer(BaseEstimator, TransformerMixin):
    """
    Inputs Age based on the titles, so titles must already be in the df, which can be done through TitleTransformer.transform().
    """
    def fit(self):
        return self
    
    def transform(self, X):
        return_df = X.copy()

        mr = return_df[return_df['Title'] == "Mr"]
        master = return_df[return_df['Title'] == "Master"]
        mrs = return_df[return_df['Title'] == "Mrs"]
        miss = return_df[return_df['Title'] == "Miss"]

        age_mr = mr['Age'].mean()
        age_master = master['Age'].mean()
        age_mrs = mrs['Age'].mean()
        age_miss = miss['Age'].mean()
        age_rare = return_df['Age'].mean()

        return_df.loc[(return_df['Title'] == "Mr") & (return_df['Age'].isnull()), 'Age'] = age_mr
        return_df.loc[(return_df['Title'] == "Master") & (return_df['Age'].isnull()), 'Age'] = age_master
        return_df.loc[(return_df['Title'] == "Mrs") & (return_df['Age'].isnull()), 'Age'] = age_mrs
        return_df.loc[(return_df['Title'] == "Miss") & (return_df['Age'].isnull()), 'Age'] = age_miss
        return_df.loc[(return_df['Title'] == "Rare") & (return_df['Age'].isnull()), 'Age'] = age_rare

        return return_df

In [539]:
class NaDropper(BaseEstimator, TransformerMixin):
    def fit(self):
        return self
    
    def transform(self, X):
        return_df = X.dropna()

        return return_df

In [540]:
class OneHotTranformer(BaseEstimator, TransformerMixin):
    """
    Applies OneHotEncoder on the given columns via .transform() method
    """
    def __init__(self, categorical_columns: list):
        self.categorical_columns = categorical_columns

    def fit(self):
        return self
    
    def transform(self, X):
        encoder = OneHotEncoder()
        onehot_encoded = encoder.fit_transform(X[self.categorical_columns])
        onehot_df = pd.DataFrame(onehot_encoded.toarray(), columns=encoder.get_feature_names_out(self.categorical_columns), index=X.index)
        encoded_df = pd.concat([X, onehot_df], axis=1)
        encoded_df = encoded_df.drop(self.categorical_columns, axis=1)

        return encoded_df

In [541]:
class IsChildTransformer(BaseEstimator, TransformerMixin):
    """
    Extracts a categorical attribute that says if the passenger is a minor.
    """
    def fit(self):
        return self
    
    def transform(self, X):
        return_df = X.copy()

        isChild = [1.0 if age < 18 else 0.0 for age in X['Age']]

        return_df['IsChild'] = isChild

        return return_df

In [542]:
class FamilySizeTransformer(BaseEstimator, TransformerMixin):
    """
    Extracts a quantitative attribute based on Parch + Sibsp + 1
    """

    def fit(self):
        return self
    
    def transform(self, X):
        return_df = X.copy()

        family_sizes = [sibsp + parch + 1 for (index, (sibsp, parch)) in return_df[['SibSp', 'Parch']].iterrows()]

        return_df['FamilySize'] = family_sizes

        return return_df

In [543]:
class AgeGroupTransformer(BaseEstimator, TransformerMixin):
    """
    Extracts age groups
    """
    
    def fit(self):
        return self
    
    def transform(self, X):
        return_df = X.copy()

        return_df['Baby'] = (return_df['Age'] <= 5)
        return_df['Child'] = (return_df['Age'] > 5) & (return_df['Age'] <= 14)
        return_df['Teenager'] = (return_df['Age'] > 14) & (return_df['Age'] <= 18)
        return_df['Adult'] = (return_df['Age'] > 18) & (return_df['Age'] <= 30)
        return_df['OldAdult'] = (return_df['Age'] > 30) & (return_df['Age'] <= 60)
        return_df['Old'] = (return_df['Age'] > 60)

        return return_df

In [544]:
from sklearn.pipeline import Pipeline

# removing attributes below because they don't represent useful info for model training
train = train.drop(["Ticket"], axis=1)
test = test.drop(['Ticket'], axis=1)

# there is only one row with null Fare, its Pclass is 3
# inputting the fare mean from the Pclass 3
mean_pclass3 = test[test['Pclass'] == 3]['Fare'].mean()
test = test.fillna({'Fare': mean_pclass3})

print(train)

cleaning_extracting_pipeline = Pipeline([
    ('was_accompained', WasAccompainedTranformer()),
    ('had_cabin', HadCabinTransformer()),
    ('fare_outlier_remover', FareOutlierRemover()),
    ('title', TitleTransformer()),
    ('age_inputer', AgeInputer()),
    ('na_dropper', NaDropper()),
    ('is_child', IsChildTransformer()),
    ('family_size', FamilySizeTransformer()),
    ('one_hot', OneHotTranformer(['Sex', 'Embarked', 'Title']))
])

# train = train.drop('Cabin', axis=1)
prepared_train = cleaning_extracting_pipeline.transform(train)

prepared_test = cleaning_extracting_pipeline.transform(test)
prepared_test

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,WasAccompained,HadCabin,IsChild,FamilySize,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,892,3,34.500000,0,0,7.8292,0.0,0.0,0.0,1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,893,3,47.000000,1,0,7.0000,1.0,0.0,0.0,2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,894,2,62.000000,0,0,9.6875,0.0,0.0,0.0,1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,895,3,27.000000,0,0,8.6625,0.0,0.0,0.0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,896,3,22.000000,1,1,12.2875,1.0,0.0,0.0,3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,32.000000,0,0,8.0500,0.0,0.0,0.0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
414,1306,1,39.000000,0,0,108.9000,0.0,1.0,0.0,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
415,1307,3,38.500000,0,0,7.2500,0.0,0.0,0.0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
416,1308,3,32.000000,0,0,8.0500,0.0,0.0,0.0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [545]:
corr_matrix = prepared_train.corr()
corr_matrix['Survived']

PassengerId      -0.005028
Survived          1.000000
Pclass           -0.335549
Age              -0.093155
SibSp            -0.034040
Parch             0.083151
Fare              0.238411
WasAccompained    0.206207
HadCabin          0.313435
IsChild           0.124972
FamilySize        0.018277
Sex_female        0.541585
Sex_male         -0.541585
Embarked_C        0.169966
Embarked_Q        0.004536
Embarked_S       -0.151777
Title_Master      0.085998
Title_Miss        0.329227
Title_Mr         -0.547689
Title_Mrs         0.337892
Title_Rare        0.014509
Name: Survived, dtype: float64

TREINANDO OS MODELOS

In [546]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [547]:
# separando o atributo target e os id's do dataset de treino
ids_train = prepared_train['PassengerId']
Y = prepared_train['Survived']
X = prepared_train.drop(['PassengerId', 'Survived'], axis=1)

In [548]:
# separando o atributo target e os id's do dataset de teste
ids_test = prepared_test['PassengerId']

X_test = prepared_test.drop(['PassengerId'], axis=1)

In [549]:
voting_clf = VotingClassifier(
    estimators=[('xgbc', XGBClassifier()), ('cbc', CatBoostClassifier(verbose=False)), ('lgbmc', LGBMClassifier())],
    voting='soft'
)

voting_clf.fit(X, Y)

train_predictions = cross_val_predict(voting_clf, X, Y, cv=5)

print(accuracy_score(Y, train_predictions))

[LightGBM] [Info] Number of positive: 340, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 256
[LightGBM] [Info] Number of data points in the train set: 889, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.382452 -> initscore=-0.479153
[LightGBM] [Info] Start training from score -0.479153
[LightGBM] [Info] Number of positive: 272, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000661 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 236
[LightGBM] [Info] Number of data points in the train set: 711, number of used features: 19
[LightGBM] [Info] [binary:BoostF

In [550]:
test_predictions = pd.DataFrame(voting_clf.predict(X_test))

result = pd.DataFrame()

result['PassengerId'] = ids_test
result['Survived'] = test_predictions

result
result.to_csv("submission.csv", index=False)