In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline, make_union
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, Imputer, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import re
from sklearn.linear_model import LogisticRegression
from utils import ColumnSelector

  from numpy.core.umath_tests import inner1d


In [2]:
# Read the Titanic training and test CSV files into two DataFrames.
# NOTE(review): both paths are absolute and specific to one machine; the
# notebook will not run elsewhere without editing them.
train_csv = '/mnt/c/Users/sahib/Documents/Titanic Project/train.csv'
test_csv = '/mnt/c/Users/sahib/Documents/Titanic Project/test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

### Add more features

In [3]:
# Keep both frames in one list so every feature step below can be applied to
# train and test with the same loop. The list holds references, so columns
# assigned through it land on `train` and `test` themselves.
full_data = [train,test]
# Single LabelEncoder instance reused for every encoded column below; each
# fit_transform call re-fits it, so it only ever remembers the last column.
lb_make = LabelEncoder()

def get_title(name):
    """Extract the honorific (e.g. ``"Mr"``) from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period.

    Parameters
    ----------
    name : str
        Raw passenger name, e.g. ``"Braund, Mr. Owen Harris"``.

    Returns
    -------
    str
        The matched title without its trailing period, or ``""`` when the
        name contains no such token or is not a string (a missing value).
    """
    # A missing name is not a string (pandas hands it over as float NaN);
    # report "no title" instead of letting re.search raise TypeError.
    if not isinstance(name, str):
        return ""
    # Raw string: "\." inside a plain literal is an invalid escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""
# Add the engineered columns to both frames, one pass per frame.
#
# NOTE(review): every LabelEncoder fit and every pd.cut below is computed on
# each frame separately, so the same integer code or bucket is not guaranteed
# to mean the same thing in train and test. Confirm before scoring `test`
# with a model trained on `train`.

# Titles that are grouped into the single bucket "Rare".
rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Spelling variants folded into their common equivalent.
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
sex_codes = {"male": 1, "female": 0}

for dataset in full_data:
    # Title: honorific pulled out of the passenger name, then normalised.
    dataset['Title'] = dataset['Name'].apply(get_title)
    dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare')
    dataset['Title'] = dataset['Title'].replace(title_synonyms)

    # FamilySize: siblings/spouses + parents/children + the passenger.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    # IsAlone: 1 when the passenger travels without any family, else 0.
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

    dataset['Title_Code'] = lb_make.fit_transform(dataset["Title"])
    # Encode port of embarkation. astype(str) runs before encoding, so a
    # missing port is encoded as a category of its own rather than staying
    # missing. NOTE(review): that presumably leaves the most_frequent imputer
    # in the modelling pipelines with nothing to impute -- confirm intent.
    dataset['Embarked_Code'] = lb_make.fit_transform(dataset["Embarked"].astype(str))
    # Create 4 equal-width fare buckets, then label-encode the bucket names.
    dataset['FareCat'] = pd.cut(dataset['Fare'], 4)
    dataset['FareCat'] = lb_make.fit_transform(dataset["FareCat"].astype(str))
    # Has_Cabin: 0 when the Cabin cell is a float (how a missing value shows
    # up in this column), 1 otherwise.
    dataset['Has_Cabin'] = dataset["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
    # Create 3 equal-width Parch buckets, then label-encode the bucket names.
    dataset['ParchCat'] = pd.cut(dataset['Parch'], 3)
    dataset['ParchCat'] = lb_make.fit_transform(dataset["ParchCat"].astype(str))
    # Encode Sex only while it still holds the raw strings. Mapping a column
    # that is already 0/1 would turn every value into NaN, which is what
    # happened when this cell was run a second time.
    if dataset['Sex'].isin(list(sex_codes)).any():
        dataset['Sex'] = dataset['Sex'].map(sex_codes)

In [None]:
# Inspect the column dtypes of the training frame (display-only cell).
train.dtypes

## Split into training & validation set

In [4]:
# Baseline design matrix: original columns only, with the port of embarkation
# in its label-encoded form. No engineered features are included.
features = ["Sex", "Age", "Fare", "Pclass", "SibSp", "Parch", "Embarked_Code"]
X = train.loc[:, features].copy()
y = train["Survived"]

# Hold out one third of the rows for validation; the seed is fixed so the
# split is reproducible.
baseline_split = train_test_split(X, y, test_size=1/3., random_state=42)
X_train, X_validation, y_train, y_validation = baseline_split

In [5]:
# Engineered design matrix: bucketed fare / parch, title code and IsAlone
# replace the raw Fare, SibSp and Parch columns.
features2 = ["Sex", "Age", "FareCat", "IsAlone", "ParchCat", "Title_Code", "Pclass", "Embarked_Code"]
X_ = train.loc[:, features2].copy()

# Same test_size and seed as the baseline split. Note that y_train and
# y_validation are re-assigned by this call.
engineered_split = train_test_split(X_, y, test_size=1/3., random_state=42)
X_train2, X_validation2, y_train, y_validation = engineered_split

## Logistic Regression

### Logistic Regression w/o feature engineering

In [6]:
# Logistic Regression - no feature engineering.
#
# The "features" step concatenates four branches side by side:
#   * Pclass, Sex      -> one-hot encoded
#   * Age              -> mean-imputed, then standardised
#   * Embarked_Code    -> most-frequent-imputed, then one-hot encoded
#   * SibSp, Parch     -> selected as-is
# ColumnSelector comes from the local `utils` module; it presumably returns
# just the named columns -- verify there.
pipeline = Pipeline(steps = [
        ("features", make_union(
                make_pipeline(ColumnSelector(["Pclass","Sex"]), OneHotEncoder()),
                make_pipeline(ColumnSelector(["Age"]),
                             Imputer(strategy="mean"),
                             StandardScaler()),
                make_pipeline(ColumnSelector(["Embarked_Code"]),
                             Imputer(strategy="most_frequent"),
                             # Ignore port codes not seen during fit instead
                             # of raising at score time; the Random Forest
                             # pipelines in this notebook already do this.
                             OneHotEncoder(handle_unknown="ignore")),
                ColumnSelector(["SibSp","Parch"]),
                )),
        ("model",LogisticRegression(random_state=42))
])

pipeline.fit(X_train, y_train)

# Mean accuracy on the held-out validation rows (displayed as cell output).
pipeline.score(X_validation, y_validation)

0.8080808080808081

### Logistic Regression w/ feature engineering

In [7]:
# Logistic Regression - with feature engineering.
#
# The "features" step concatenates three branches side by side:
#   * Pclass, Sex, Title_Code, ParchCat, IsAlone, FareCat -> one-hot encoded
#   * Age              -> mean-imputed, then standardised
#   * Embarked_Code    -> most-frequent-imputed, then one-hot encoded
# ColumnSelector comes from the local `utils` module; it presumably returns
# just the named columns -- verify there.
pipeline = Pipeline(steps = [
        ("features", make_union(
                make_pipeline(ColumnSelector(["Pclass","Sex","Title_Code","ParchCat","IsAlone","FareCat"]), OneHotEncoder()),
                make_pipeline(ColumnSelector(["Age"]),
                             Imputer(strategy="mean"),
                             StandardScaler()),
                make_pipeline(ColumnSelector(["Embarked_Code"]),
                             Imputer(strategy="most_frequent"),
                             # Ignore port codes not seen during fit instead
                             # of raising at score time; the Random Forest
                             # pipelines in this notebook already do this.
                             OneHotEncoder(handle_unknown="ignore")),
                )),
        ("model",LogisticRegression(random_state=42))
])

pipeline.fit(X_train2, y_train)

# Mean accuracy on the held-out validation rows (displayed as cell output).
pipeline.score(X_validation2, y_validation)

0.8383838383838383

## Random Forest

### RF w/o feature engineering

In [17]:
# Random Forest on the baseline (non-engineered) feature set.
# Each branch of the feature union is named first, then assembled in the same
# order as before so the auto-generated step names are unchanged.
# ColumnSelector comes from the local `utils` module; it presumably returns
# just the named columns -- verify there.
onehot_branch = make_pipeline(ColumnSelector(["Pclass","Sex"]), OneHotEncoder())
age_branch = make_pipeline(
    ColumnSelector(["Age"]),
    Imputer(strategy="mean"),
    StandardScaler(),
)
embarked_branch = make_pipeline(
    ColumnSelector(["Embarked_Code"]),
    Imputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)
family_branch = ColumnSelector(["SibSp","Parch"])

pipeline = Pipeline(steps=[
    ("features", make_union(onehot_branch, age_branch, embarked_branch, family_branch)),
    ("model", RandomForestClassifier(random_state=42)),
])
pipeline.fit(X_train, y_train)

# Validation accuracy of the forest with default hyperparameters.
score_before_cv = pipeline.score(X_validation, y_validation)
print("RF Score before CV: %s" % score_before_cv)

# 2 x 2 grid over tree depth and leaf size; the "model__" prefix routes each
# parameter to the pipeline step named "model".
hyperparameters = {
    'model__max_depth': [50, 70],
    'model__min_samples_leaf': [1,2],
}

# 5-fold cross-validated search over the grid, run on the training split only.
clf = GridSearchCV(pipeline, hyperparameters, cv=5)
clf.fit(X_train, y_train)

score_after_cv = clf.score(X_validation, y_validation)
print("RF Score after CV: %s" % score_after_cv)

RF Score before CV: 0.797979797979798
RF Score after CV: 0.8047138047138047


### RF w/ feature engineering

In [19]:
# Random Forest on the engineered feature set.
# Each branch of the feature union is named first, then assembled in the same
# order as before so the auto-generated step names are unchanged.
# ColumnSelector comes from the local `utils` module; it presumably returns
# just the named columns -- verify there.
onehot_branch = make_pipeline(
    ColumnSelector(["Pclass","Sex","Title_Code","ParchCat","IsAlone","FareCat"]),
    OneHotEncoder(),
)
age_branch = make_pipeline(
    ColumnSelector(["Age"]),
    Imputer(strategy="mean"),
    StandardScaler(),
)
embarked_branch = make_pipeline(
    ColumnSelector(["Embarked_Code"]),
    Imputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

pipeline = Pipeline(steps=[
    ("features", make_union(onehot_branch, age_branch, embarked_branch)),
    ("model", RandomForestClassifier(random_state=42)),
])
pipeline.fit(X_train2, y_train)

# Validation accuracy of the forest with default hyperparameters.
score_before_cv = pipeline.score(X_validation2, y_validation)
print("RF Score before CV: %s" % score_before_cv)

# 2 x 2 grid over tree depth and leaf size; the "model__" prefix routes each
# parameter to the pipeline step named "model".
hyperparameters = {
    'model__max_depth': [50, 70],
    'model__min_samples_leaf': [1,2],
}

# 5-fold cross-validated search over the grid, run on the training split only.
clf = GridSearchCV(pipeline, hyperparameters, cv=5)
clf.fit(X_train2, y_train)

score_after_cv = clf.score(X_validation2, y_validation)
print("RF Score after CV: %s" % score_after_cv)

RF Score before CV: 0.7946127946127947
RF Score after CV: 0.8080808080808081
