In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import CategoricalNB, BernoulliNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, log_loss, make_scorer
import warnings
warnings.simplefilter('ignore')
import xgboost as xgb

In [None]:
# Global plot styling for any seaborn/matplotlib figures drawn in this notebook:
# "whitegrid" axes style and the "paper" scaling context.
sns.set_style("whitegrid")
sns.set_context("paper")

In [None]:
# Load the training data (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv("../data/raw/train.csv")
df.head()

In [None]:
# Load the data to be scored (path is relative to the notebook's working
# directory) and preview the first rows.
test_df = pd.read_csv("../data/raw/test.csv")
test_df.head()

In [None]:
# Split the raw training frame into the label and the raw predictor columns.
# `target` stays a one-column DataFrame; it is flattened when the model is fitted.
target = df[['Survived']]

# Take an explicit copy: later cells add and overwrite columns on `features`.
# Doing that on a bare slice of `df` triggers pandas' SettingWithCopy warning
# and leaves it ambiguous whether `df` itself is being modified.
features = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Cabin', 'Ticket']].copy()

In [None]:
# Sentinel 'X' marks a missing value in the text-valued columns; the same
# mapping is reused when the test set is prepared.
na_fill = {'Embarked': 'X', 'Cabin': 'X', 'Ticket': 'X'}

# Reassign instead of `inplace=True`: in-place filling on a frame derived from
# `df` is the pattern pandas warns about, and reassignment keeps the cell
# safe to re-run.
features = features.fillna(value=na_fill)

In [None]:
# Feature engineering for the training set.
#
# Each categorical signal is one-hot encoded into its own df_* frame; those
# frames plus a few binary flags are concatenated into the model matrix in the
# next cell. `age_map`, `bins1` and `bins2` are reused when the test set is
# prepared.
#
# NOTE: this cell overwrites columns of `features` in place (Ticket, Cabin,
# Sex, ...), so it is not idempotent -- re-create `features` before re-running.

# Port of embarkation ('X' = missing).
df_embarked = pd.get_dummies(features['Embarked'], prefix='Embarked')

# Ticket prefix: first whitespace-separated token of a multi-token ticket,
# upper-cased with non-alphanumeric characters removed. Single-token tickets
# carry no prefix and collapse to 'X'.
features['Ticket'] = (
    features['Ticket']
    .fillna('X')
    .map(lambda x: str(x).split()[0] if len(str(x).split()) > 1 else 'X')
    .str.upper()
    .replace("[^a-zA-Z0-9]*", "", regex=True)
)
df_ticket = pd.get_dummies(features['Ticket'], prefix='Ticket')

# Cabin: keep only the first character of the cabin string ('X' = missing).
features['Cabin'] = features['Cabin'].map(lambda x: x[0].strip())
df_cabin = pd.get_dummies(features['Cabin'], prefix='Cabin')

# Fare imputation, step 1: the original hand-picked per-class values, which
# apply to female passengers only.
female_fare_by_class = {1: 85.40, 2: 24.75, 3: 12.54}
female_fare = features['Pclass'].map(female_fare_by_class).where(features['Sex'] == 'female')
features['Fare'] = features['Fare'].fillna(female_fare)

# Fare imputation, step 2: any fare still missing (e.g. a male passenger) falls
# back to the median fare of the passenger's class. Without this such rows
# would be binned as the string 'nan' and get their own dummy column.
class_median_fare = features.groupby('Pclass')['Fare'].median()
features['Fare'] = features['Fare'].fillna(features['Pclass'].map(class_median_fare))

# Age imputation: per-sex median age. `age_map` has the shape
# {'Age': {<sex>: <median>}} and is reused for the test set.
age_map = features[['Age', 'Sex']].dropna().groupby(['Sex']).median().to_dict()
features['Age'] = features['Age'].fillna(features['Sex'].map(age_map['Age']))

# Age bucket (left-closed intervals) combined with sex, e.g. 'Y1male'.
bins1 = ['Y1', 'Y2', 'Y3', 'Y4', 'M1', 'M2', 'E']
features['Age_Bin'] = pd.cut(
    x=features['Age'],
    bins=[0, 5, 10, 15, 20, 30, 50, 1000],
    labels=bins1,
    right=False,
).astype('str')
features['Age_Bin'] = features['Age_Bin'] + features['Sex']
df_age_bin = pd.get_dummies(features['Age_Bin'], prefix='Age_bin')

# Fare bucket (left-closed intervals).
bins2 = ['L1', 'L2', 'L3', 'L4']
features['Fare_Bin'] = pd.cut(
    x=features['Fare'],
    bins=[0, 11, 30, 60, 10000],
    labels=bins2,
    right=False,
).astype('str')
df_fare_bin = pd.get_dummies(features['Fare_Bin'], prefix='Fare_bin')

# Binary sex flag (1 = female). Must come after the steps above, which still
# compare and concatenate the original string values of `Sex`.
features['Sex'] = (features['Sex'] == 'female').astype('uint8')

# Passenger class as one-hot columns.
df_pclass = pd.get_dummies(features['Pclass'], prefix='class')

# Family size (passenger included) and three mutually exclusive size flags.
features['FamilySize'] = features['Parch'] + features['SibSp'] + 1
features['Singleton'] = features['FamilySize'].eq(1).astype('uint8')
features['SmallFamily'] = features['FamilySize'].between(2, 4).astype('uint8')
features['LargeFamily'] = features['FamilySize'].ge(5).astype('uint8')

In [None]:
# Assemble the training design matrix: the one-hot blocks first, followed by
# the family-size flags and the binary sex column taken from `features`.
one_hot_blocks = [df_embarked, df_ticket, df_cabin, df_age_bin, df_fare_bin, df_pclass]
binary_flags = features[['Singleton', 'SmallFamily', 'LargeFamily', 'Sex']]
train = pd.concat(one_hot_blocks + [binary_flags], axis=1)

In [None]:
# Base estimator handed to the grid search; its hyper-parameters are supplied
# by the search's parameter grid rather than set here.
clf = GradientBoostingClassifier()

In [None]:
# Hyper-parameter grids, one per candidate estimator family.

# Naive Bayes.
no_param = dict(
    fit_prior=[True, False],
    alpha=[0.1, 0.3, 0.6, 0.9],
)

# Gradient boosting classifier.
# NOTE(review): "deviance" (loss) and "mse" (criterion) appear to have been
# renamed to "log_loss" and "squared_error" in recent scikit-learn releases --
# confirm against the installed version before running the search.
mini_param = dict(
    loss=["deviance"],
    learning_rate=[0.15, 0.3],
    n_estimators=[200, 400],
    subsample=[0.7, 0.9],
    criterion=["mse"],
    max_depth=[15, 30],
    max_features=["sqrt", None],
    random_state=[42],
    validation_fraction=[0.1],
    n_iter_no_change=[15],
)

# AdaBoost classifier.
ada_param = dict(
    n_estimators=[100, 200, 500],
    random_state=[42],
    learning_rate=[0.1, 0.3, 0.9],
)

In [None]:
# Exhaustive grid search over the gradient-boosting grid, scored by accuracy
# with 3-fold cross-validation; the best configuration is refitted on all rows.
# The `_nb` suffix is a leftover name: the estimator searched here is `clf`.
grid_nb = GridSearchCV(
    estimator=clf,
    param_grid=mini_param,
    scoring='accuracy',
    cv=3,
    refit=True,
    n_jobs=1,
    verbose=3,
)

In [None]:
# Run the search. `target` is a one-column DataFrame, so it is flattened to a
# 1-D label vector before fitting.
grid_nb.fit(train, np.ravel(target))

In [None]:
# Hyper-parameter combination selected by the grid search.
grid_nb.best_params_

In [None]:
# Predict on the same rows the model was fitted on, so the report below is an
# in-sample (optimistic) estimate rather than a hold-out score.
# NOTE: despite its name, `train_acc` holds predicted labels, not an accuracy.
train_acc = grid_nb.predict(train)
print(classification_report(target, train_acc))

In [None]:
# In-sample accuracy of the predictions computed on the training rows.
print(accuracy_score(target, train_acc))

In [None]:
# Feature engineering for the test set, mirroring the training-set steps.
# Statistics learned from the training rows (`age_map`, the class-median fares
# from `features`) and the shared bin labels (`bins1`, `bins2`) are reused so
# both sets are encoded the same way.
#
# NOTE: this cell overwrites columns of `test_features` in place, but it
# rebuilds `test_features` from `test_df` first, so it can be re-run.

# Explicit copy + non-inplace fill: mutating a bare slice of `test_df` is the
# pattern pandas' SettingWithCopy warning is about.
test_features = test_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Cabin', 'Ticket']].copy()
test_features = test_features.fillna(value=na_fill)

# Port of embarkation ('X' = missing).
test_df_embarked = pd.get_dummies(test_features['Embarked'], prefix='Embarked')

# Ticket prefix: first token of a multi-token ticket, upper-cased with
# non-alphanumeric characters removed; single-token tickets collapse to 'X'.
test_features['Ticket'] = (
    test_features['Ticket']
    .fillna('X')
    .map(lambda x: str(x).split()[0] if len(str(x).split()) > 1 else 'X')
    .str.upper()
    .replace("[^a-zA-Z0-9]*", "", regex=True)
)
test_df_ticket = pd.get_dummies(test_features['Ticket'], prefix='Ticket')

# Cabin: keep only the first character of the cabin string ('X' = missing).
test_features['Cabin'] = test_features['Cabin'].map(lambda x: x[0].strip())
test_df_cabin = pd.get_dummies(test_features['Cabin'], prefix='Cabin')

# Fare imputation, step 1: the original hand-picked per-class values, which
# apply to female passengers only.
female_fare_by_class = {1: 85.40, 2: 24.75, 3: 12.54}
female_fare = test_features['Pclass'].map(female_fare_by_class).where(test_features['Sex'] == 'female')
test_features['Fare'] = test_features['Fare'].fillna(female_fare)

# Fare imputation, step 2: any fare still missing (e.g. a male passenger) falls
# back to the TRAINING-set median fare of the passenger's class. Without this
# such rows would be binned as the string 'nan' and produce a dummy column the
# model never saw.
train_class_median_fare = features.groupby('Pclass')['Fare'].median()
test_features['Fare'] = test_features['Fare'].fillna(test_features['Pclass'].map(train_class_median_fare))

# Age imputation with the per-sex medians learned from the training set.
test_features['Age'] = test_features['Age'].fillna(test_features['Sex'].map(age_map['Age']))

# Age bucket (left-closed intervals) combined with sex, e.g. 'Y1male'.
test_features['Age_Bin'] = pd.cut(
    x=test_features['Age'],
    bins=[0, 5, 10, 15, 20, 30, 50, 1000],
    labels=bins1,
    right=False,
).astype('str')
test_features['Age_Bin'] = test_features['Age_Bin'] + test_features['Sex']
test_df_age_bin = pd.get_dummies(test_features['Age_Bin'], prefix='Age_bin')

# Fare bucket (left-closed intervals).
test_features['Fare_Bin'] = pd.cut(
    x=test_features['Fare'],
    bins=[0, 11, 30, 60, 10000],
    labels=bins2,
    right=False,
).astype('str')
test_df_fare_bin = pd.get_dummies(test_features['Fare_Bin'], prefix='Fare_bin')

# Binary sex flag (1 = female). Must come after the steps above, which still
# compare and concatenate the original string values of `Sex`.
test_features['Sex'] = (test_features['Sex'] == 'female').astype('uint8')

# Passenger class as one-hot columns.
test_df_pclass = pd.get_dummies(test_features['Pclass'], prefix='class')

# Family size (passenger included). Both terms must come from the test frame:
# the previous version added the training frame's SibSp, which pairs each test
# passenger with an unrelated training row by index.
test_features['FamilySize'] = test_features['Parch'] + test_features['SibSp'] + 1
test_features['Singleton'] = test_features['FamilySize'].eq(1).astype('uint8')
test_features['SmallFamily'] = test_features['FamilySize'].between(2, 4).astype('uint8')
test_features['LargeFamily'] = test_features['FamilySize'].ge(5).astype('uint8')

In [None]:
# Assemble the test design matrix from the one-hot blocks and binary flags.
test = pd.concat([test_df_embarked, test_df_ticket, test_df_cabin, test_df_age_bin, test_df_fare_bin, test_df_pclass,
                  test_features['Singleton'], test_features['SmallFamily'], test_features['LargeFamily'],
                  test_features['Sex']], axis=1)

# The dummies above were built independently of the training set, so the test
# frame can contain categories the model never saw and lack ones it did.
# Align to the exact training column set and order: unseen categories are
# dropped, missing ones are added as all-zero columns.
test = test.reindex(columns=train.columns, fill_value=0)

In [None]:
# Score the test matrix with the fitted grid search object.
test_preds = grid_nb.predict(test)

In [None]:
# Sanity check: number of predictions produced for the test set.
print(len(test_preds))

In [None]:
# Write the submission file (PassengerId, Survived) for this model.
# NOTE(review): the file name says "ab" while the estimator fitted above is a
# gradient boosting classifier -- confirm the intended output path.
file_out = "../data/inference/cv_ab.csv"
# This adds a `Survived` column to `test_df` itself, not to a copy.
test_df['Survived'] = test_preds.tolist()
test_submission = test_df[['PassengerId', 'Survived']]
test_submission.to_csv(file_out, index=False)

In [None]:
# Paths of the per-model prediction files that are combined into the ensemble.
ab = "../data/inference/cv_ab.csv"
# NOTE(review): this rebinds `xgb`, which the import cell uses as the alias for
# the xgboost module; after this cell `xgb` is a path string. The next cell
# reads this name, so renaming it requires changing both cells together.
xgb = "../data/inference/cv_acc_xgb.csv"
gbc = "../data/inference/cv_f1_gb.csv"
gbc_cat = "../data/inference/cv_gbc.csv"
nb = "../data/inference/cv_nb.csv"

In [None]:
# Load each model's prediction file. The `Survived` column is renamed to the
# model's short name, and `PassengerId` is renamed to a unique idN in all but
# the last frame so the side-by-side concat keeps a single `PassengerId`.
def _read_predictions(csv_path, column_renames):
    """Read one prediction CSV and rename its columns per `column_renames`."""
    return pd.read_csv(csv_path).rename(columns=column_renames)


ab_df = _read_predictions(ab, {'PassengerId': 'id1', 'Survived': 'ab'})
xgb_df = _read_predictions(xgb, {'PassengerId': 'id2', 'Survived': 'xgb'})
gbc_df = _read_predictions(gbc, {'PassengerId': 'id3', 'Survived': 'gbc'})
gbc_cat_df = _read_predictions(gbc_cat, {'PassengerId': 'id4', 'Survived': 'gbc_cat'})
nb_df = _read_predictions(nb, {'Survived': 'nb'})

In [None]:
# Place the per-model frames side by side. Rows are matched on the frames'
# index, not on passenger id.
# NOTE(review): this assumes every file lists passengers in the same order --
# confirm, or join on the id columns instead.
ens = pd.concat([ab_df, xgb_df, gbc_df, gbc_cat_df, nb_df], axis=1)

In [None]:
# Inspect column dtypes and non-null counts of the combined frame.
ens.info()

In [None]:
# Ensemble label = row-wise median of the five model predictions, cast to uint8.
# Assuming each prediction is 0/1, the median of five values is the majority vote.
ens['Survived'] = ens[['ab', 'xgb', 'gbc', 'gbc_cat', 'nb']].median(axis=1).astype('uint8')

In [None]:
# Write the ensemble submission. `PassengerId` here comes from the naive Bayes
# frame, the only one whose id column was not renamed on load.
ens[['PassengerId', 'Survived']].to_csv("../data/inference/ensemble.csv", index=False)