In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from xgboost import XGBClassifier

import sys 

# Record the interpreter and pandas versions so the run can be reproduced.
for label, version in (('python', sys.version), ('pandas', pd.__version__)):
    print(label + ' version:', version)

In [None]:
# Directory holding the Titanic CSV files, relative to this notebook.
dirc = '../../data/titanic/'

# Read the three input files in one pass: training rows, test rows and the
# submission template.
train, test, submission = (
    pd.read_csv(dirc + file_name)
    for file_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

# Stack train and test into one frame with a fresh 0..n-1 index so the
# cleaning steps below are applied to both at once.
data = pd.concat([train, test], axis=0, ignore_index=True, sort=False)

In [None]:
# Preview the first rows of the combined train + test frame.
data.head()

In [None]:
# Age grouped by SibSp value.
age_by_sibsp = data.groupby(['SibSp'])['Age']

# Mean age per SibSp value, kept as a lookup table.
agefill = age_by_sibsp.mean()

# Show the mean next to the number of known ages behind it.
age_by_sibsp.agg(['mean', 'count'])

In [None]:
# Number of missing values per column, most incomplete column first.
missing_counts = data.isna().sum().sort_values(ascending=False)
print(missing_counts)

# Row-by-column map of where the gaps are (one cell per value, no row labels).
plt.figure(figsize=(10, 10))
sns.heatmap(data.isna(), cmap='Reds', cbar=False, yticklabels=False)

In [None]:
# Build the cleaned modelling frame `data1` from the combined `data` frame.
# Every step assigns its result back to the column explicitly: calling
# `data1.Col.method(..., inplace=True)` operates on a temporary Series and is
# not guaranteed to update `data1` (it does not under pandas copy-on-write).

# fill Last Name
#data1['LN'] = data1.Name.map(lambda x: x.split(',')[0])

# drop these columns, either irrelevant or too many NaN
# (DataFrame.drop returns a new frame, so `data` itself is left untouched)
data1 = data.drop(columns=['PassengerId', 'Name', 'Cabin', 'Ticket'])

# fill Embarked with the most frequent value
most_embarked = data.Embarked.value_counts().idxmax()
data1['Embarked'] = data1['Embarked'].fillna(most_embarked)

# convert Sex to 0 and 1 values and store it as an integer column
# (astype raises if any value other than 'male'/'female' is present)
data1['Sex'] = data1['Sex'].map({'male': 0, 'female': 1}).astype('int64')

# fill the only missed Fare value using mean
#data1.Fare.fillna(data1.Fare.mean(), inplace=True)

# remember which ages were originally missing before they are imputed
data1['Age_missed'] = data1['Age'].isna()
#data1.Age.fillna(-1, inplace=True)

# use the SibSp-group average to fill NaN in Age; a group with no known age
# has a NaN mean, so its rows stay NaN
agefill = data.groupby(['SibSp']).Age.mean()
data1['Age'] = data1['Age'].fillna(data1['SibSp'].map(agefill))

data1.info(memory_usage='deep')

In [None]:
# Pairwise correlations of every column except Embarked
# (presumably excluded because it is still non-numeric at this point).
corr_matrix = data1.drop(columns='Embarked').corr()

# Annotated square heatmap of the correlation matrix.
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='YlGnBu', linewidths=0.01, square=True)

In [None]:
# Columns whose dtype is still 'object'; these get one-hot encoded.
obj_col = [name for name, dtype in data1.dtypes.items() if dtype == 'object']
print('to be dummied:', obj_col)

# Swap each object column for its indicator columns, appended on the right.
if obj_col:
    obj_dummies = pd.get_dummies(data1[obj_col])
    data1 = pd.concat([data1.drop(columns=obj_col), obj_dummies], axis=1)

# Rows with a Survived value form the training set; rows without one form
# the set to predict.
has_label = data1.Survived.notna()
train_set = data1[has_label]
test_set = data1[~has_label].drop(columns='Survived')

# Feature matrix and target for the labelled rows.
x = train_set.drop(columns='Survived')
y = train_set['Survived']

print(x.shape, test_set.shape)

In [None]:
# Hold out 10% of the labelled rows. The seed is fixed so the split, and
# therefore every score printed below, is the same on each run.
train_x, val_x, train_y, val_y = train_test_split(
    x, y, test_size=0.10, random_state=42
)

# Shallow, slowly-learning boosted trees; n_estimators is only an upper bound
# because early stopping (below) ends training sooner.
# `min_samples_leaf` was dropped: it is a scikit-learn tree parameter that
# XGBoost does not use (its closest analogue is `min_child_weight`).
model = XGBClassifier(max_depth=3, n_estimators=2000,
                      learning_rate=0.01, n_jobs=1,
                      reg_alpha=0.1, reg_lambda=0.1)

# Stop adding trees once the error on the held-out rows has not improved for
# 20 rounds.
# NOTE(review): recent XGBoost releases expect early_stopping_rounds and
# eval_metric on the constructor, and recent scikit-learn releases rename
# `fit_params` to `params` - confirm against the installed versions.
fitpm = {'eval_set': [(val_x, val_y)], 'early_stopping_rounds': 20,
         'eval_metric': 'error', 'verbose': False}

# 5-fold cross-validation on the training part; each fold early-stops on the
# same held-out rows.
cvs = cross_val_score(model, train_x, train_y, cv=5, n_jobs=4, verbose=2, fit_params=fitpm)
print('CV mean:', cvs.mean())

# Fit on the whole training part and score on the held-out rows. These rows
# also drive early stopping, so this accuracy is somewhat optimistic.
model.fit(train_x, train_y, **fitpm)
pred = model.predict(val_x)
print('validation score:', accuracy_score(val_y, pred))

In [None]:
# Refit on every labelled row for the submission.
# The held-out rows (val_x) are part of `x`, so early-stopping on them here
# would monitor training error. Instead, reuse the number of boosting rounds
# that early stopping selected when `model` was fitted on the training part
# (best_iteration is 0-based, hence the +1) and train without an eval set.
best_rounds = model.best_iteration + 1
final_model = XGBClassifier(**{**model.get_params(), 'n_estimators': best_rounds})
final_model.fit(x, y)

# Write the predictions into the submission template and save it.
submission['Survived'] = final_model.predict(test_set).astype(int)
submission.to_csv('submission.csv', index=False)