In [1]:
# imports
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Notebook variables

# Seed handed to the classifier as random_state so that training is repeatable.
SEED_VALUE = 72
# NOTE(review): presumably meant for a train/validation split; neither value is
# referenced by the cells visible in this notebook - confirm before removing.
TRAIN_SIZE = 0.8
TEST_SIZE = 0.2

# Passed to the classifier as max_depth and n_estimators respectively.
DEPTH = 5
ESTIMATORS = 5


# CSV files read by the loading cell and by the prediction cell.
trainDataPath = "../input/train.csv"
testDataPath = "../input/test.csv"

# Destination of the prediction CSV written by the final cell.
outputPath = "../output/submission-02.csv"

In [3]:
# Functions

def create_fare_bins(data, replacementValue):
    """Return a copy of ``data`` with ``Fare`` imputed and an ``f_Fare`` bin column.

    Missing fares are replaced by ``replacementValue``. ``f_Fare`` is 0 for
    fares in [0, 25], then 1, 2, 3 and 4 for fares above 25, 50, 75 and 100.
    Rows that satisfy none of the conditions (e.g. negative fares) stay NaN.
    The input frame is not modified.
    """
    binned = data.copy()
    binned["Fare"] = binned["Fare"].fillna(replacementValue)

    fare = binned["Fare"]
    binned.loc[fare >= 0, "f_Fare"] = 0
    # Later (higher) thresholds overwrite earlier ones, so the order matters.
    for label, lowerBound in enumerate((25, 50, 75, 100), start=1):
        binned.loc[fare > lowerBound, "f_Fare"] = label

    return binned

def create_age_bins(data, replacementValue):
    """Return a copy of ``data`` with ``Age`` imputed and an ``f_Age`` decade bin.

    Missing ages are replaced by ``replacementValue``. ``f_Age`` is 0 for ages
    in [0, 10], 1 for (10, 20], ... up to 8 for ages above 80. Rows matching
    no condition stay NaN. The input frame is not modified.
    """
    binned = data.copy()
    binned["Age"] = binned["Age"].fillna(replacementValue)

    age = binned["Age"]
    binned.loc[age >= 0, "f_Age"] = 0
    # Each pass overwrites the previous label for rows above the next decade.
    for label, lowerBound in enumerate(range(10, 90, 10), start=1):
        binned.loc[age > lowerBound, "f_Age"] = label

    return binned

def create_sibsp_bins(data):
    """Return a copy of ``data`` with ``f_SibSp``: 0, 1, or 2 for "more than one".

    Rows whose ``SibSp`` matches none of the conditions (e.g. NaN) stay NaN.
    """
    binned = data.copy()

    sibSp = binned["SibSp"]
    rules = (
        (sibSp == 0, 0),
        (sibSp == 1, 1),
        (sibSp > 1, 2),
    )
    for rowMask, label in rules:
        binned.loc[rowMask, "f_SibSp"] = label

    return binned

def create_parch_bins(data):
    """Return a copy of ``data`` with ``f_ParCh``: 0 when ``Parch`` is 0, 1 when above 0.

    Rows matching neither condition stay NaN.
    """
    binned = data.copy()

    parch = binned["Parch"]
    for rowMask, label in ((parch == 0, 0), (parch > 0, 1)):
        binned.loc[rowMask, "f_ParCh"] = label

    return binned

def create_sex_bins(data):
    """Return a copy of ``data`` with ``Title`` and a numeric ``f_Sex`` column.

    ``Title`` is the word preceding the first "." in ``Name`` that follows a
    space (e.g. "Mr", "Mrs"). ``f_Sex`` is 0 for male and 1 for female. When
    ``Sex`` is missing it is inferred from ``Title`` using the title lists
    below; the inferred value is then encoded exactly like a recorded one, so
    ``f_Sex`` never contains the strings "male"/"female". Rows that cannot be
    resolved are left as NaN. The input frame and its ``Sex`` column are not
    modified.
    """
    binData = data.copy()

    maleTitles = ["Mr", "Master", "Don", "Rev", "Major", "Sir", "Col", "Capt"]
    femaleTitles = ["Mrs", "Miss", "Mme", "Ms", "Lady", "Mlle", "Countess"]

    # Raw string: "\." is a regex escape, not a Python string escape.
    binData["Title"] = binData["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Object dtype so text can be written into rows that were NaN, whatever
    # dtype the Sex column was read as.
    sex = binData["Sex"].astype(object)
    sexMissing = sex.isna()
    sex = sex.mask(sexMissing & binData["Title"].isin(maleTitles), "male")
    sex = sex.mask(sexMissing & binData["Title"].isin(femaleTitles), "female")

    # Encode recorded and inferred values in one step; anything else -> NaN.
    binData["f_Sex"] = sex.map({"male": 0, "female": 1})

    return binData

def create_cabin_code(data):
    """Return a copy of ``data`` with ``f_CabinCode``: first character of ``Cabin``.

    Rows with a missing ``Cabin`` get the placeholder code "U". Like the other
    feature helpers in this notebook, the input frame is left untouched and a
    new frame is returned.
    """
    codedData = data.copy()

    # expand=False yields a Series (one capture group) rather than a DataFrame.
    cabinCode = codedData["Cabin"].str.extract('(^.{0,1})', expand=False)
    codedData["f_CabinCode"] = cabinCode.fillna("U")

    return codedData

def create_cabin_bins(data):
    """Return a copy of ``data`` with ``f_Cabin``: 0 when ``f_CabinCode`` is "U", else 1."""
    binned = data.copy()

    cabinCode = binned["f_CabinCode"]
    for rowMask, label in ((cabinCode == "U", 0), (cabinCode != "U", 1)):
        binned.loc[rowMask, "f_Cabin"] = label

    return binned

def create_embarked_bins(data):
    """Return a copy of ``data`` with ``f_Embarked`` encoding the port of embarkation.

    Codes: "C" -> 0, "S" -> 1, "Q" -> 2; anything else, including a missing
    value, -> 4. In the returned copy missing ``Embarked`` values are replaced
    by "U". The input frame is not modified (the fill is applied to the copy,
    not to the caller's frame).
    """
    binData = data.copy()

    binData["Embarked"] = binData["Embarked"].fillna("U")

    # Default covers "U" and any unexpected port code.
    binData["f_Embarked"] = 4
    for port, code in {"C": 0, "S": 1, "Q": 2}.items():
        binData.loc[binData["Embarked"] == port, "f_Embarked"] = code

    return binData

def create_features(data, meanFare, meanAge):
    """Run every feature helper in sequence and return the engineered frame.

    ``meanFare`` and ``meanAge`` are the replacement values used for missing
    ``Fare`` and ``Age``. The pipeline starts from a copy of ``data``.
    """
    featureData = create_fare_bins(data.copy(), meanFare)
    featureData = create_age_bins(featureData, meanAge)

    # The remaining steps take only the frame; create_cabin_code must run
    # before create_cabin_bins because the latter reads f_CabinCode.
    remainingSteps = (
        create_sibsp_bins,
        create_parch_bins,
        create_sex_bins,
        create_cabin_code,
        create_cabin_bins,
        create_embarked_bins,
    )
    for step in remainingSteps:
        featureData = step(featureData)

    return featureData

In [4]:
# Load initial data

# Read the training CSV indexed by PassengerId, then separate the label so
# that trainData holds only the remaining columns.
trainData = pd.read_csv(trainDataPath, index_col="PassengerId")
trainTarget = trainData["Survived"]
trainData = trainData.drop(columns=["Survived"])


### Pre-process training data

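Raw columns that are not passed to the model directly:

- `PassengerId` - not required as a feature (it is used as the index)
- `Age` - initially dropped; it is now added back in as the binned `f_Age`, with missing values replaced by the training-set mean
- `Ticket`
- `Cabin` - only a has-cabin flag (`f_Cabin`) is kept

We will convert:

- `Sex` - ordinal encoding (`f_Sex`: male = 0, female = 1)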


In [5]:
# Preprocessing

# Replacement values for missing Fare/Age, computed from the training data;
# the same two values are passed again when the test features are built.
meanFare = trainData["Fare"].mean()
meanAge = trainData["Age"].mean()

# Engineered copy of the training frame (adds Title and the f_* columns).
ppData = create_features(trainData, meanFare, meanAge)

# Last expression of the cell: preview the first rows.
ppData.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,f_Fare,f_Age,f_SibSp,f_ParCh,Title,f_Sex,f_CabinCode,f_Cabin,f_Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0.0,2.0,1.0,0.0,Mr,0,U,0.0,1
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2.0,3.0,1.0,0.0,Mrs,1,C,1.0,0
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0.0,2.0,0.0,0.0,Miss,1,U,0.0,1
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2.0,3.0,1.0,0.0,Mrs,1,C,1.0,1
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0.0,3.0,0.0,0.0,Mr,0,U,0.0,1


### Creating model

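Our first attempt at creating a model used `Sex`, `Fare` and `Pclass`. This version uses the engineered features for `Age`, `Fare`, `SibSp`, `Parch`, `Sex`, `Cabin` and `Embarked`, together with the raw `Pclass`.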

In [32]:
# Train model

# Columns fed to the classifier; the same list is reused at prediction time,
# so the order here fixes the column order the model sees.
features = [
    "f_Age",
    "f_Fare",
    "f_SibSp",
    "f_ParCh",
    "f_Sex",
    "f_Cabin",
    "Pclass",
    "f_Embarked",
]

# Hyperparameters and the seed come from the notebook-variables cell.
model = GradientBoostingClassifier(
    max_depth=DEPTH,
    n_estimators=ESTIMATORS,
    random_state=SEED_VALUE,
)
model.fit(ppData[features], trainTarget)

RandomForestClassifier(max_depth=20, n_estimators=200, random_state=72)

### Make predictions and save csv

In [39]:
# Make predictions and save csv

# Build test features with the training-set means so that both frames are
# imputed with the same values.
testData = pd.read_csv(testDataPath, index_col="PassengerId")
predictionData = create_features(testData, meanFare, meanAge)

predictions = model.predict(predictionData.loc[:, features])

# Attach the predictions to the raw test frame and write only that column;
# the PassengerId index becomes the first column of the CSV.
testData = testData.assign(Survived=predictions)
testData["Survived"].to_csv(outputPath)