In [None]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier, RandomForestClassifier)
# Public import path; the private `sklearn.feature_extraction.dict_vectorizer`
# module is deprecated and no longer importable in newer scikit-learn releases.
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import KFold, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Directory that holds the input CSV files (train.csv / test.csv).
INPUT_DIR = '../input'

# Number of cross-validation folds.
N_FOLDS = 5
# Number of parameter settings sampled by each randomized search.
N_ITER = 50
# Seed passed as `random_state` to the estimators and the K-fold splitter.
SEED = 32

In [None]:
# Define function to extract titles from passenger names
def get_title(name):
    """Return the honorific (e.g. ``Mr``, ``Miss``) found in a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, as in ``"Braund, Mr. Owen Harris"``.

    Parameters
    ----------
    name : str
        Full passenger name.

    Returns
    -------
    str
        The title without the trailing period, or ``""`` when none is found.
    """
    # Raw string: '\.' inside a normal literal is an invalid escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    return title_search.group(1) if title_search else ""


def min_max_scale(train_data, test_data, cat_cols):
    """Scale the numeric columns of both frames to [0, 1] and fill missing values.

    The scaler is fitted on the concatenation of train and test (as the
    original code did), so both frames share the same min/max per column.
    Remaining NaNs are replaced by the column mean *in scaled units*.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Feature frames; they are not modified.
    cat_cols : list of str
        Columns to leave out (everything else is treated as numeric).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Scaled numeric train and test frames, keeping the input row index and
        the numeric column names.
    """
    # numeric attributes only
    num_train = train_data.drop(cat_cols, axis=1)
    num_test = test_data.drop(cat_cols, axis=1)
    num_all = pd.concat([num_train, num_test])

    # fit scaler on all data
    scaler = MinMaxScaler().fit(num_all)

    # Column means after scaling. The previous version used the means of the
    # unscaled frame (including non-numeric columns), whose labels never
    # matched the scaled frames, so no NaN was ever filled.
    fill_values = pd.DataFrame(scaler.transform(num_all),
                               columns=num_all.columns).mean()

    def _scale(frame):
        # Scale one frame and fill its NaNs with the scaled column means.
        scaled = pd.DataFrame(scaler.transform(frame),
                              columns=frame.columns, index=frame.index)
        return scaled.fillna(fill_values)

    return _scale(num_train), _scale(num_test)


def cat_vectorize(train_data, test_data, num_cols):
    """One-hot encode the categorical columns of the train and test frames.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Feature frames; they are not modified.
    num_cols : list of str
        Numeric columns to leave out (everything else is treated as categorical).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Dense one-hot matrices for train and test. The encoding is learned
        from the training frame only and then applied to the test frame.
    """
    # One dict per row. `orient='records'` keeps every row even when index
    # labels repeat, whereas `.T.to_dict()` keys rows by index label and
    # silently collapses duplicates (and transposes the whole frame first).
    cat_train_records = (train_data.drop(num_cols, axis=1)
                         .fillna('NA')
                         .to_dict(orient='records'))
    cat_test_records = (test_data.drop(num_cols, axis=1)
                        .fillna('NA')
                        .to_dict(orient='records'))

    # vectorize (encode as one hot)
    vectorizer = DictVectorizer(sparse=False)
    vec_train_data = vectorizer.fit_transform(cat_train_records)
    vec_test_data = vectorizer.transform(cat_test_records)

    return vec_train_data, vec_test_data


In [None]:
""" -------------------------------------- Data loading ---------------------------------------- """

# Read the train and test splits from INPUT_DIR.
df_train, df_test = (
    pd.read_csv(os.path.join(INPUT_DIR, file_name))
    for file_name in ('train.csv', 'test.csv')
)

# Both frames in one list so that later steps can loop over them.
df_full = [df_train, df_test]

print(df_train.head())


In [None]:
""" -------------------------------------- Feature Engineering ---------------------------------------- """

# Seeded generator so the random age imputation below is reproducible.
age_rng = np.random.RandomState(SEED)

# Median of the *raw* training fares. It has to be taken before the loop: the
# first iteration replaces df_train['Fare'] with bin codes, so computing it
# inside the loop would fill the test set with the median of those codes.
train_fare_median = df_train['Fare'].median()

# Uncommon titles are grouped as "Rare"; French/alternative spellings are
# folded into their common equivalent.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

# Right-inclusive bin edges: Fare -> codes 0..3, Age -> codes 0..4.
fare_bin_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
age_bin_edges = [-np.inf, 16, 32, 48, 64, np.inf]

for dataset in df_full:
    # Family name is the part of Name before the comma
    dataset['Last_Name'] = dataset['Name'].str.split(',').str[0]

    dataset['Name_length'] = dataset['Name'].str.len()

    # Feature that tells whether a passenger had a cabin on the Titanic
    dataset['Has_Cabin'] = dataset['Cabin'].notna().astype(int)

    # Create new feature FamilySize as a combination of SibSp and Parch
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # Create new feature IsAlone from FamilySize
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype(int)

    # Fill missing Embarked with 'S', then encode as integers
    dataset['Embarked'] = dataset['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    # Fill missing Fare with the training median, then bin it
    fare = dataset['Fare'].fillna(train_fare_median)
    dataset['Fare'] = pd.cut(fare, bins=fare_bin_edges, labels=False).astype(int)

    # Impute missing Age with random integers within one standard deviation
    # of this frame's mean age, then bin it. `.loc` avoids the chained
    # assignment of the previous version.
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_missing = dataset['Age'].isnull()
    dataset.loc[age_missing, 'Age'] = age_rng.randint(
        int(age_avg - age_std), int(age_avg + age_std), size=age_missing.sum())
    dataset['Age'] = pd.cut(dataset['Age'].astype(int), bins=age_bin_edges, labels=False).astype(int)

    # Create a new feature Title, containing the titles of passenger names;
    # unknown or missing titles are encoded as 0
    titles = (dataset['Name'].apply(get_title)
              .replace(rare_titles, 'Rare')
              .replace(title_synonyms))
    dataset['Title'] = titles.map(title_mapping).fillna(0).astype(int)

    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Drop the raw columns that have been replaced by engineered features.
    # The drop is done in place: `dataset = dataset.drop(...)` would only
    # rebind the loop variable and leave df_train / df_test untouched.
    # PassengerId stays because the submission step reads df_test['PassengerId'].
    dataset.drop(columns=['Name', 'Ticket', 'Cabin', 'SibSp'], inplace=True)

print(df_train.head(10))

In [None]:
""" -------------------------------------- Feature preparation ---------------------------------------- """

label_column = 'Survived'

# Row identifiers carry no information about survival and must not be used
# as model inputs.
id_columns = ['PassengerId']

# Feature frames: everything except the label and the identifiers.
x_train_df = df_train.drop([label_column] + id_columns, axis=1)

# get all feature column names
cols = list(x_train_df.columns.values)

# Select the same columns, in the same order, from the test frame; this fails
# loudly if a training feature is missing there.
x_test_df = df_test[cols]

# numeric columns
num_cols = x_train_df.select_dtypes(include=[np.number]).columns.tolist()

# categorical columns
cat_cols = [e for e in cols if e not in num_cols]

print(num_cols, cat_cols)

# scale everything to [0, 1]
x_num_train, x_num_test = min_max_scale(x_train_df, x_test_df, cat_cols)

# vectorize categorical columns
vec_x_cat_train, vec_x_cat_test = cat_vectorize(x_train_df, x_test_df, num_cols)

# build the feature vector
x_train = np.hstack((x_num_train, vec_x_cat_train))
x_test = np.hstack((x_num_test, vec_x_cat_test))

# labels or target attribute
y_train = df_train[label_column].astype(int)

# train and test must end up with the same feature layout
assert x_train.shape[1] == x_test.shape[1], "train/test feature count mismatch"
assert len(x_train) == len(y_train), "features and labels are misaligned"

In [None]:
""" -------------------------------------- Correlation report ---------------------------------------- """

# plt.figure(figsize=(14, 12))
# plt.title('Pearson Correlation of Features', y=1.05, size=15)
# sns.heatmap(pd.DataFrame(x_train).astype(float).corr(), linewidths=0.1, vmax=1.0, 
#             square=True, cmap=plt.cm.RdBu, linecolor='white', annot=True)

In [None]:
""" -------------------------------------- Cross validation ---------------------------------------- """

# split the data into train and test
# x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2)

In [None]:
""" ---------------------------------- Grid params initialization ----------------------------------- """

# Registry of first-level models, keyed by a short name. Each entry holds:
#   'model'       - the estimator class (instantiated later),
#   'params'      - the search space handed to RandomizedSearchCV,
#   'best_params' - (optional) parameters recorded from an earlier search,
#   'best_score'  - (optional) the accuracy recorded with those parameters.
# Entries that already carry 'best_score' are fitted directly with
# 'best_params' by the fitting cell; the others go through a random search.
# The commented-out entries are kept as-is and are currently disabled.
MODELS = {
#     'lr': {
#         'model': LogisticRegression,
#         'params': {
#             'fit_intercept': [True, False],
#             'multi_class': ['ovr'],
#             'penalty': ['l2'],
#             'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
#             'tol': [0.01, 0.05, 0.1, 0.5, 1, 5]
#         }
#     },
#     'lrcv': {
#         'model': LogisticRegressionCV,
#         'params': {
#             'Cs': [1, 2, 4, 8, 16, 32],
#             'fit_intercept': [True, False],
#             'refit': [True, False],
#             'multi_class': ['ovr'],
#             'penalty': ['l2'],
#             'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
#             'tol': [0.01, 0.05, 0.1, 0.5, 1, 5],
#             'cv': [cv]
#         },
#         'best_params': {'tol': 0.05, 'solver': 'newton-cg', 'refit': True, 'penalty': 'l2', 'multi_class': 'ovr', 'fit_intercept': False, 'cv': 4, 'Cs': 2},
#         'best_score': 0.8428731762065096
#     },
    # Support vector classifier
    'svc': {
        'model': SVC,
        'params': {
            'C': [0.1, 0.5, 1., 2., 4.],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'gamma': ['auto', 'scale'],
            'degree': range(5),
            'shrinking': [True, False],
            'probability': [True, False],
            'tol': [0.01, 0.05, 0.1, 0.5, 1, 5],
        },
        'best_params': {'tol': 0.5, 'shrinking': False, 'probability': True, 'kernel': 'rbf', 'gamma': 'scale', 'degree': 2, 'C': 4.0},
        'best_score': 0.8496071829405163
    },
#     'dt': {
#         'model': DecisionTreeClassifier,
#         'params': {
#             'criterion': ['gini', 'entropy'],
#             'max_depth': range(6, 10),
#             'max_features': ['auto', 'sqrt', 'log2', None],
#             'min_samples_split': [2, 5, 10], # Minimum number of samples required to split a node
#             'min_samples_leaf': [1, 2, 4], # Minimum number of samples required at each leaf node
#         }
#     },
    # Random forest
    'rf': {
        'model': RandomForestClassifier,
        'params': {
            'n_estimators': range(10, 251, 20),
            'max_features': ['auto', 'sqrt', 'log2', None],
            'max_depth': range(5, 20),
            'min_samples_split': [2, 5, 10], # Minimum number of samples required to split a node
            'min_samples_leaf': [1, 2, 4], # Minimum number of samples required at each leaf node
            'bootstrap': [True, False], # Method of selecting samples for training each tree,
            'random_state': [SEED],
            'n_jobs': [-1]
        },
        'best_params': {'random_state': 32, 'n_jobs': -1, 'n_estimators': 250, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': None, 'max_depth': 8, 'bootstrap': True},
        'best_score': 0.8204264870931538
    },
    # AdaBoost
    'ada': {
        'model': AdaBoostClassifier,
        'params': {
            'n_estimators': range(10, 251, 20),
            'learning_rate': [.01, .05, .1, .2, .5, 1, 2],
            'algorithm': ['SAMME', 'SAMME.R'],
            'random_state': [SEED],
        },
        'best_params': {'random_state': 32, 'n_estimators': 230, 'learning_rate': 0.05, 'algorithm': 'SAMME'},
        'best_score': 0.8148148148148148
    },
    # Extra trees (same search space as the random forest)
    'et': {
        'model': ExtraTreesClassifier,
        'params': {
            'n_estimators': range(10, 251, 20),
            'max_features': ['auto', 'sqrt', 'log2', None],
            'max_depth': range(5, 20),
            'min_samples_split': [2, 5, 10], # Minimum number of samples required to split a node
            'min_samples_leaf': [1, 2, 4], # Minimum number of samples required at each leaf node
            'bootstrap': [True, False], # Method of selecting samples for training each tree,
            'random_state': [SEED],
            'n_jobs': [-1]
        },
        'best_params': {'random_state': 32, 'n_jobs': -1, 'n_estimators': 150, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 11, 'bootstrap': False},
        'best_score': 0.8237934904601572
    },
    # Gradient boosting
    'gb': {
        'model': GradientBoostingClassifier,
        'params': {
            'n_estimators': range(10, 251, 20),
            'max_depth': range(5, 20),
            'loss': ['deviance', 'exponential'],
            'learning_rate': [.01, .05, .1, .2, .5, 1, 2],
            'subsample': [.25, .5, .8, 1.],
            'min_samples_split': [2, 5, 10], # Minimum number of samples required to split a node
            'min_samples_leaf': [1, 2, 4], # Minimum number of samples required at each leaf node
            'random_state': [SEED],
#             'n_jobs': [-1]
        },
        'best_params': {'subsample': 0.25, 'random_state': 32, 'n_estimators': 130, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 19, 'loss': 'exponential', 'learning_rate': 0.1},
        'best_score': 0.8204264870931538
    }
#     'xgb': {
#         'model': XGBClassifier,
#         'params': {
#             'n_estimators': range(8, 20),
#             'max_depth': range(5, 20),
#             'learning_rate': [.01, .05, .1, .2, .5, 1, 2],
#             'colsample_bytree': [.6, .7, .8, .9, 1]
#         }
#     }
}

In [None]:
""" -------------------------------------- Model fitting / hyper-parameter search ---------------------------------------- """

# Fit every registered model. Entries with a recorded 'best_score' are fitted
# with their recorded 'best_params' and re-scored with cross-validation so the
# result can be compared with the recorded value; the others go through a
# randomized hyper-parameter search whose winner is stored back in the entry.
# NOTE: the loop variable must stay named `k`; a later cell reads it.
for k, model in MODELS.items():
    print(f'Fitting {k}...')

    if 'best_score' in model:
        # Initialize with best parameters & fit to data
        model['best_estimator'] = model['model'](**model['best_params']).fit(x_train, y_train)

        # cross_val_score works on clones, so the fitted estimator is untouched
        scores = cross_val_score(model['best_estimator'], x_train, y_train, cv=N_FOLDS)
        score = sum(scores) / len(scores)
        diff = score - model['best_score']

        if diff > 0:
            print(f'Accuracy of model {k}: {score} (BIGGER for {diff})')
        elif diff < 0:
            print(f'Accuracy of model {k}: {score} (SMALLER for {-diff})')
        else:
            print(f'Accuracy of model {k}: {score} (SAME)')
        continue

    # Perform random search; seeded so that the sampled candidates (and hence
    # the reported best parameters) are the same on every run.
    searcher = RandomizedSearchCV(param_distributions=model['params'],
                                  estimator=model['model'](), scoring="accuracy",
                                  verbose=1, n_iter=N_ITER, cv=N_FOLDS,
                                  random_state=SEED)
    # Fit to data
    searcher.fit(x_train, y_train)

    # Print the best parameters and best accuracy
    print(f'Best parameters found for {k}: {searcher.best_params_}')
    print(f'Best accuracy found {k}: {searcher.best_score_}')

    model['best_estimator'] = searcher.best_estimator_
    model['best_params'] = searcher.best_params_
    model['best_score'] = searcher.best_score_

In [None]:
""" ---------------------------------- Preparing 2nd level features ------------------------------------ """

n_train = len(x_train)
n_test = len(x_test)

# shuffle=True is needed for random_state to have any effect; newer
# scikit-learn releases reject random_state when shuffle is False.
k_folds = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)


def get_oof(clf, x_train, y_train, x_test):
    """Build out-of-fold predictions of `clf` to use as second-level features.

    For every fold of `k_folds`, a fresh clone of `clf` is fitted on the
    other folds and used to predict the held-out training rows and the whole
    test set. Each training row is therefore predicted by a model that never
    saw its label; the test prediction is the mean over the folds.

    Parameters
    ----------
    clf : estimator
        Scikit-learn style estimator; it is cloned and never modified.
    x_train, y_train : array-like
        Training features and labels.
    x_test : array-like
        Test features.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Column vectors of shape (n_train, 1) and (n_test, 1).
    """
    # Plain arrays so that fold indices are positional, also for a Series.
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    x_test = np.asarray(x_test)

    oof_train = np.zeros(len(x_train))
    oof_test_folds = np.empty((k_folds.get_n_splits(), len(x_test)))

    for i, (train_index, valid_index) in enumerate(k_folds.split(x_train)):
        fold_clf = clone(clf).fit(x_train[train_index], y_train[train_index])

        oof_train[valid_index] = fold_clf.predict(x_train[valid_index])
        oof_test_folds[i, :] = fold_clf.predict(x_test)

    oof_test = oof_test_folds.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


# Create our OOF train and test predictions. These base results will be used as new features.
# Predicting the training rows with models fitted on those same rows would leak
# the labels into the second-level model, so real out-of-fold predictions are used.
et_oof_train, et_oof_test = get_oof(MODELS['et']['best_estimator'], x_train, y_train, x_test)  # Extra Trees
rf_oof_train, rf_oof_test = get_oof(MODELS['rf']['best_estimator'], x_train, y_train, x_test)  # Random Forest
ada_oof_train, ada_oof_test = get_oof(MODELS['ada']['best_estimator'], x_train, y_train, x_test)  # AdaBoost
gb_oof_train, gb_oof_test = get_oof(MODELS['gb']['best_estimator'], x_train, y_train, x_test)  # Gradient Boost
svc_oof_train, svc_oof_test = get_oof(MODELS['svc']['best_estimator'], x_train, y_train, x_test)  # Support Vector Classifier

# One column per first-level model
X_train = np.hstack((et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train))
X_test = np.hstack((et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test))
    

In [None]:
# Sanity check: the second-level feature matrix and the labels should have the same number of rows.
print(*(len(part) for part in (X_train, y_train)))

In [None]:
""" ----------------------------------- Fitting XGBoost classifier ------------------------------------- """

# Search space for the second-level XGBoost model; only used by the
# (currently disabled) random search below.
xgb_params = {
    'n_estimators': range(20, 501, 20),
    'max_depth': range(4, 21, 4),
    'learning_rate': [.01, .05, .1, .2, .5, 1, 2],
    'colsample_bytree': [.6, .7, .8, .9, 1]
}
# Second-level model with fixed hyper-parameters.
xgb = XGBClassifier(n_estimators=140, max_depth=4, learning_rate=0.2, colsample_bytree=0.6)

# Perform random search
# searcher = RandomizedSearchCV(param_distributions=xgb_params,
#                               estimator=XGBClassifier(), scoring="accuracy",
#                               verbose=1, n_iter=N_ITER, cv=N_FOLDS)
# Fit to data
# The message used to interpolate `k`, a loop variable left over from the
# model-fitting cell, and therefore named the wrong model.
print('Fitting xgb...')
xgb.fit(X_train, y_train)

pred = xgb.predict(X_test)

In [None]:
# pred = MODELS[max(MODELS, key=lambda k: MODELS[k]['best_score'])]['best_estimator'].predict(x_test)
# Pair each test passenger id with its predicted label and write the submission file.
submission = df_test[['PassengerId']].assign(Survived=pred)
submission.to_csv('gender_submission.csv', index=False)