In [107]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.dict_vectorizer import DictVectorizer

from sklearn.metrics.classification import classification_report, accuracy_score, confusion_matrix

from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.linear_model.logistic import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.preprocessing import MinMaxScaler

INPUT_DIR = './input'

In [108]:
# Define function to extract titles from passenger names
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""


def min_max_scale(train_data, test_data, cat_cols):
    
    data = pd.concat([train_data, test_data])
    
    # numeric attributes
    num_data = data.drop(cat_cols, axis=1)
    
    # fit scaler on all data
    scaler = MinMaxScaler().fit(num_data)
    
    # transform all data with scaler
    train_data = scaler.transform(train_data.drop(cat_cols, axis=1))
    test_data = scaler.transform(test_data.drop(cat_cols, axis=1))
    
    # scale to <0,1>
    num_train_data = pd.DataFrame(train_data)
    num_test_data = pd.DataFrame(test_data)

    # fill nan with mean column values
    num_train_data.fillna(data.mean(), inplace=True)
    num_test_data.fillna(data.mean(), inplace=True)

    return num_train_data, num_test_data


def cat_vectorize(train_data, test_data, num_cols):
    # categorical attributes
    cat_train_data = train_data.drop(num_cols, axis=1)
    cat_test_data = test_data.drop(num_cols, axis=1)

    cat_train_data.fillna('NA', inplace=True)
    cat_test_data.fillna('NA', inplace=True)

    cat_train_data_values = cat_train_data.T.to_dict().values()
    cat_test_data_values = cat_test_data.T.to_dict().values()

    # vectorize (encode as one hot)
    vectorizer = DictVectorizer(sparse=False)
    vec_train_data = vectorizer.fit_transform(cat_train_data_values)
    vec_test_data = vectorizer.transform(cat_test_data_values)

    return vec_train_data, vec_test_data


In [109]:
""" -------------------------------------- Data loading ---------------------------------------- """

# load dataframes
df_train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
df_test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))

df_full = [df_train, df_test]

print(df_train.head())


   PassengerId  Survived  Pclass    ...        Fare Cabin  Embarked
0            1         0       3    ...      7.2500   NaN         S
1            2         1       1    ...     71.2833   C85         C
2            3         1       3    ...      7.9250   NaN         S
3            4         1       1    ...     53.1000  C123         S
4            5         0       3    ...      8.0500   NaN         S

[5 rows x 12 columns]


In [110]:
""" -------------------------------------- Feature Engineering ---------------------------------------- """

for dataset in df_full:
    dataset['Name_length'] = dataset['Name'].apply(len)

    # Feature that tells whether a passenger had a cabin on the Titanic
    dataset['Has_Cabin'] = dataset["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

    # Create new feature FamilySize as a combination of SibSp and Parch
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # Create new feature IsAlone from FamilySize
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

    # Remove all NULLS in the Embarked column
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

    # Remove all NULLS in the Fare column and create a new feature CategoricalFare
    dataset['Fare'] = dataset['Fare'].fillna(df_train['Fare'].median())

    # df_train['CategoricalFare'] = pandas.qcut(df_train['Fare'], 4)

    # Create a New feature CategoricalAge
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

    # df_train['CategoricalAge'] = pandas.cut(df_train['Age'], 5)

    # Create a new feature Title, containing the titles of passenger names
    dataset['Title'] = dataset['Name'].apply(get_title)

    # Group all non-common titles into one single grouping "Rare"
    dataset['Title'] = dataset['Title'].replace(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Mapping titles
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    # Mapping Fare
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

    # Mapping Age
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[dataset['Age'] > 64, 'Age'] = 4

print(df_train.head(10))

   PassengerId  Survived  Pclass  ...   FamilySize  IsAlone  Title
0            1         0       3  ...            2        0      1
1            2         1       1  ...            2        0      3
2            3         1       3  ...            1        1      2
3            4         1       1  ...            2        0      3
4            5         0       3  ...            1        1      1
5            6         0       3  ...            1        1      1
6            7         0       1  ...            1        1      1
7            8         0       3  ...            5        0      4
8            9         1       3  ...            3        0      3
9           10         1       2  ...            2        0      3

[10 rows x 17 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [111]:
""" -------------------------------------- Feature preparation ---------------------------------------- """

label_column = 'Survived'

# get all column names
cols = list(df_train.columns.values)

# numeric columns
num_cols = [e for e in df_train.select_dtypes(include=[np.number]).columns.tolist() if e != label_column]

# categorical columns
cat_cols = [e for e in cols if e not in num_cols and e != label_column]

print(num_cols, cat_cols)

x_train, y_train = df_train.drop(label_column, axis=1), df_train[label_column]
x_test = df_test

# scale everything to [0, 1]
x_num_train, x_num_test = min_max_scale(x_train, x_test, cat_cols)

# vectorize categorical columns
vec_x_cat_train, vec_x_cat_test = cat_vectorize(x_train, x_test, num_cols)

# build the feature vector
x_train = np.hstack((x_num_train, vec_x_cat_train))
x_test = np.hstack((x_num_test, vec_x_cat_test))

# labels or target attribute
y_train = y_train.astype(int)

['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Name_length', 'Has_Cabin', 'FamilySize', 'IsAlone', 'Title'] ['Name', 'Ticket', 'Cabin']


  return self.partial_fit(X, y)


In [112]:
""" -------------------------------------- Cross validation ---------------------------------------- """

# split the data into train and test
# x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2)

' -------------------------------------- Cross validation ---------------------------------------- '

In [115]:
""" ---------------------------------- Grid params initialization ----------------------------------- """
cv = 10
n_iter = 100

MODELS = {
#     'lr': {
#         'model': LogisticRegression,
#         'params': {
#             'fit_intercept': [True, False],
#             'multi_class': ['ovr'],
#             'penalty': ['l2'],
#             'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
#             'tol': [0.01, 0.05, 0.1, 0.5, 1, 5]
#         }
#     },
    'lrcv': {
        'model': LogisticRegressionCV,
        'params': {
            'Cs': [1, 2, 4, 8, 16, 32],
            'fit_intercept': [True, False],
            'refit': [True, False],
            'multi_class': ['ovr'],
            'penalty': ['l2'],
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
            'tol': [0.01, 0.05, 0.1, 0.5, 1, 5],
            'cv': [cv]
        },
#         'best_params': {'tol': 0.05, 'solver': 'newton-cg', 'refit': True, 'penalty': 'l2', 'multi_class': 'ovr', 'fit_intercept': False, 'cv': 4, 'Cs': 2},
#         'best_score': 0.8428731762065096
    },
    'svc': {
        'model': SVC,
        'params': {
            'C': [0.1, 0.5, 1., 2., 4.],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'gamma': ['auto', 'scale'],
            'degree': range(5),
            'shrinking': [True, False],
            'probability': [True, False],
            'tol': [0.01, 0.05, 0.1, 0.5, 1, 5],
        },
#         'best_params': {'tol': 0.5, 'shrinking': False, 'probability': True, 'kernel': 'rbf', 'gamma': 'scale', 'degree': 2, 'C': 4.0},
#         'best_score': 0.8496071829405163
    },
    'dt': {
        'model': DecisionTreeClassifier,
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': range(6, 10),
            'max_features': ['auto', 'sqrt', 'log2', None],
            'min_samples_split': [2, 5, 10], # Minimum number of samples required to split a node
            'min_samples_leaf': [1, 2, 4], # Minimum number of samples required at each leaf node
        }
    },
    'rf': {
        'model': RandomForestClassifier,
        'params': {
            'n_estimators': range(5, 20),
            'max_features': ['auto', 'sqrt', 'log2', None],
            'max_depth': range(5, 20),
            'min_samples_split': [2, 5, 10], # Minimum number of samples required to split a node
            'min_samples_leaf': [1, 2, 4], # Minimum number of samples required at each leaf node
            'bootstrap': [True, False] # Method of selecting samples for training each tree
        }
    },
#     'xgb': {
#         'model': XGBClassifier,
#         'params': {
#             'n_estimators': range(8, 20),
#             'max_depth': range(6, 10),
#             'learning_rate': [.4, .45, .5, .55, .6],
#             'colsample_bytree': [.6, .7, .8, .9, 1]
#         }
#     }
}

In [None]:
""" -------------------------------------- Best Linear Regression search ---------------------------------------- """

for k, model in MODELS.items():
    
    if 'best_score' in model:
        # Initialize with best parameters & fit to data
        print(f'Fitting {k}...')
        model['best_estimator'] = model['model'](**model['best_params']).fit(x_train, y_train)
        
        scores = cross_val_score(model['best_estimator'], x_train, y_train, cv=cv)
        score = sum(scores) / len(scores)
        diff = score - model['best_score']
        if diff > 0:
            print(f'Accuracy of model {k}: {score} (BIGGER for {diff})')
            
        elif diff < 0:
            print(f'Accuracy of model {k}: {score} (SMALLER for {-diff})')
        else:
            print(f'Accuracy of model {k}: {score} (SAME)')
    else:
        # Perform random search
        searcher = RandomizedSearchCV(param_distributions=model['params'],
                                      estimator=model['model'](), scoring="accuracy",
                                      verbose=1, n_iter=n_iter, cv=cv)
        # Fit to data
        print(f'Fitting {k}...')    
        searcher.fit(x_train, y_train)

        # Print the best parameters and best accuracy
        print(f'Best parameters found for {k}: {searcher.best_params_}')
        print(f'Best accuracy found {k}: {searcher.best_score_}')

        model['best_estimator'] = searcher.best_estimator_
        model['best_params'] = searcher.best_params_
        model['best_score'] = searcher.best_score_
    

Fitting lrcv...
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
""" -------------------------------------- Fitting Voting classifier ---------------------------------------- """

# vc = VotingClassifier(estimators=[(k, v['best_estimator']) for k, v in MODELS.items()], voting='soft')
vc = VotingClassifier(estimators=[('lrcv', MODELS['lrcv']['best_estimator']), 
                                  ('svc', MODELS['svc']['best_estimator']),
                                  ('rf', MODELS['rf']['best_estimator'])], voting='soft')
vc = vc.fit(x_train, y_train)

scores = cross_val_score(vc, x_train, y_train, cv=cv)
score = sum(scores) / len(scores)
print(f'Accuracy of voting classifier: {score}')

In [None]:
""" -------------------------------------- Fitting Bagging classifier ---------------------------------------- """

vc = BaggingClassifier([(k, v['best_estimator']) for k, v in MODELS.items()], voting='soft')
# vc = BaggingClassifier(estimators=[('lrcv', MODELS['lrcv']['best_estimator']), 
#                                   ('svc', MODELS['svc']['best_estimator']),
#                                   ('rf', MODELS['rf']['best_estimator'])], voting='soft')
vc = vc.fit(x_train, y_train)

scores = cross_val_score(vc, x_train, y_train, cv=cv)
score = sum(scores) / len(scores)
print(f'Accuracy of voting classifier: {score}')

In [None]:
pred = MODELS[max(MODELS, key=lambda k: MODELS[k]['best_score'])]['best_estimator'].predict(x_test)
submission = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': pred})
submission.to_csv('gender_submission.csv', index=False)