In [2]:
import warnings
# Must run before the scikit-learn imports so that import-time
# DeprecationWarnings are silenced as well.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# cross_val_score lives in sklearn.model_selection since scikit-learn 0.18, and
# the old sklearn.cross_validation module was removed in 0.20. Bind whichever
# module is available to the name `cross_validation` so every later
# `cross_validation.cross_val_score(...)` call keeps working unchanged.
try:
    from sklearn import model_selection as cross_validation
except ImportError:  # scikit-learn older than 0.18
    from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

titanic = pd.read_csv("train.csv")

# Data cleaning process
# Replace missing ages with the median of the ages that are present.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())


# Encode Sex as 0 (male) / 1 (female). A dictionary lookup produces a real
# integer column in one step, instead of writing integers into a column of
# strings through boolean masks. map() turns any value missing from the
# dictionary into NaN, so the integer cast fails loudly here rather than
# letting an unencoded value reach the model.
titanic["Sex"] = titanic["Sex"].map({"male": 0, "female": 1}).astype("int64")

# Missing embarkation ports are filled with "S" first, then encoded the same
# way: S -> 0, C -> 1, Q -> 2.
embarked_codes = {"S": 0, "C": 1, "Q": 2}
titanic["Embarked"] = titanic["Embarked"].fillna("S").map(embarked_codes).astype("int64")


# Data Analysis
# Columns fed to the classifier in the cells below.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

In [3]:


# Initialize our algorithm - Random Forest
# random_state : seed for the forest's own randomness. Fixing it makes the
#      results reproducible, which is useful for testing or for consistency
#      in the documentation (so that everybody can see the same numbers).
# n_estimators : number of trees we want to build
# min_samples_split : minimum number of rows a node needs before it can be split
# min_samples_leaf : minimum number of samples at the end of a branch (a leaf)
# cv=3 : score the model with 3-fold cross-validation
alg = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=2, min_samples_leaf=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Only the last expression of a cell is echoed, so a bare `scores.mean()` here
# would be computed and thrown away. Print it so it can be compared with the
# second configuration below.
print("Baseline forest, mean CV score: {0}".format(scores.mean()))


# Second configuration: more trees, and larger minimum split/leaf sizes.
alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=4, min_samples_leaf=2)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
scores.mean()

0.81930415263748602

In [25]:
### Generate new features
# FamilySize is the element-wise sum of the SibSp and Parch columns.
titanic['FamilySize'] = titanic['SibSp'].add(titanic['Parch'])
# NameLength is the number of characters in each passenger's Name.
titanic['NameLength'] = titanic["Name"].map(len)

In [5]:
### Using Titles
import re

def get_title(name):
    """Extract the title (e.g. "Mr", "Miss") from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period.

    Parameters
    ----------
    name : str
        Full name as stored in the "Name" column.

    Returns
    -------
    str
        The matched title without the period, or "" when nothing matches.
    """
    # Raw string: in an ordinary literal "\." is an invalid escape sequence,
    # which newer Python versions warn about. The pattern itself is unchanged.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search is None:
        return ""
    return title_search.group(1)

titles = titanic["Name"].apply(get_title)
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print(titles.value_counts())

# Map each title to an integer.  Some titles are very rare, and are compressed into the same codes as other titles.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, 
                 "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8,
                 "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}

# Translate every title in a single pass. A title that is not a key of
# title_mapping is kept as it is, which is what the previous per-key
# masked assignment did as well.
titles = titles.map(lambda title: title_mapping.get(title, title))

print(titles.value_counts())

titanic["Title"] = titles
                       

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Col           2
Major         2
Capt          1
Jonkheer      1
Don           1
Ms            1
Lady          1
Mme           1
Countess      1
Sir           1
Name: Name, dtype: int64
1     517
2     183
3     125
4      40
5       7
6       6
7       5
10      3
8       3
9       2
Name: Name, dtype: int64


In [6]:
### Grouping by family name
import operator

# A dictionary mapping a family key (last name + family size) to its id
family_id_mapping = {}

# A function to get the id given a row
def get_family_id(row):
    """Return the integer family id for one passenger row.

    The family key is the text before the first comma of row["Name"] joined
    with row["FamilySize"]. The first time a key is seen it is registered in
    the module-level ``family_id_mapping`` with the next unused id; later
    rows with the same key get the same id back.

    Parameters
    ----------
    row : mapping
        Anything indexable by "Name" (a str) and "FamilySize".

    Returns
    -------
    int
        The id stored in ``family_id_mapping`` for this row's family key.
    """
    last_name = row["Name"].split(',')[0]
    # Create a family id
    family_id = "{0}{1}".format(last_name, row["FamilySize"])
    # Look up the id in the mapping
    if family_id not in family_id_mapping:
        # Ids are only ever assigned here, consecutively from 1, so the
        # largest id in use always equals the number of entries. That makes
        # the next id len + 1 without scanning the whole mapping for its
        # maximum on every new family.
        family_id_mapping[family_id] = len(family_id_mapping) + 1
    return family_id_mapping[family_id]

family_ids = titanic.apply(get_family_id, axis=1)

# Every row whose FamilySize is below 3 is given the shared id -1.
family_ids = family_ids.mask(titanic['FamilySize'] < 3, -1)

# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print(family_ids.value_counts())
titanic['FamilyId'] = family_ids

-1      800
 14       8
 149      7
 63       6
 50       6
 59       6
 17       5
 384      4
 27       4
 25       4
 162      4
 8        4
 84       4
 340      4
 43       3
 269      3
 58       3
 633      2
 167      2
 280      2
 510      2
 90       2
 83       1
 625      1
 376      1
 449      1
 498      1
 588      1
dtype: int64


In [22]:
########### Feature Engineering
###### Selecting the best features
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

# Original columns plus the engineered FamilySize, Title and FamilyId.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked",
              "FamilySize", "Title", "FamilyId"]

# Feature Selection
# Fit a univariate selector that scores each predictor with f_classif.
# Only the fitted p-values are read below.
selector = SelectKBest(score_func=f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])

# Put the p-values on a -log10 scale, so a smaller p-value gives a larger score.
scores = np.negative(np.log10(selector.pvalues_))
print(scores)

[ 24.59567142  68.85199425   1.27768955   0.5342545    1.82976043
  14.21323514   2.85130099   0.20768458  26.98338607   1.87160041]


In [23]:
# One bar per predictor, in the order of the predictors list.
positions = range(len(predictors))
plt.bar(positions, scores)
plt.xticks(positions, predictors, rotation="vertical")
# Label the figure so it can be read without the surrounding code.
plt.ylabel("-log10(p-value)")
plt.title("Feature scores from SelectKBest (f_classif)")
plt.show()

In [24]:
# The reduced feature list below is commented out, so the forest in this cell
# is still evaluated on every column currently in `predictors`.
# predictors = ["Pclass", "Sex", "Fare", "Title","Embarked","Parch"]

# Same forest as before with larger minimum split and leaf sizes, scored with
# 3-fold cross-validation.
alg = RandomForestClassifier(
    random_state=1,
    n_estimators=150,
    min_samples_split=8,
    min_samples_leaf=4,
)
scores = cross_validation.cross_val_score(
    alg,
    titanic[predictors],
    titanic["Survived"],
    cv=3,
)
scores.mean()

0.83277216610549942