# Titanic

Kaggle knowledge competition: [predict whether a passenger survived the sinking of the Titanic](https://www.kaggle.com/c/titanic/).

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML, clear_output

# Load the Kaggle train/test splits, indexed by passenger id.
# `pd.read_csv` is the supported replacement for the deprecated `DataFrame.from_csv`.
data_train = pd.read_csv("data/train.csv", index_col='PassengerId')
data_test  = pd.read_csv("data/test.csv",  index_col='PassengerId')

In [2]:
# A parenthesised single-argument print behaves the same on Python 2 and Python 3.
print(data_train.shape)
data_train.head()

(891, 11)


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


There is not a lot of data. On top of this, some columns (`Name`, `Cabin`, `Ticket`) will most probably not be helpful. Next, let's investigate the missing values.

In [3]:
def count_missing(data):
    """Summarise how many null values each column of ``data`` contains.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null value, indexed by
        ``Column`` with a single ``Num of Nan`` column. Columns without
        nulls are omitted.
    """
    null_counts = [(name, data[name].isnull().sum()) for name in data]
    # Keep only the columns that actually miss something.
    rows = [(name, count) for name, count in null_counts if count]
    summary = pd.DataFrame(rows, columns=['Column', 'Num of Nan'])
    return summary.set_index(['Column'])

# Show the missing-value summary for the training split, then for the test split.
for frame in (data_train, data_test):
    display(count_missing(frame))

Unnamed: 0_level_0,Num of Nan
Column,Unnamed: 1_level_1
Age,177
Cabin,687
Embarked,2


Unnamed: 0_level_0,Num of Nan
Column,Unnamed: 1_level_1
Age,86
Fare,1
Cabin,327


It looks like the `Cabin` information is extremely sparse. On top of this, I do not believe it will be helpful, so the column is dropped. I will also impute some of the missing `Age` values using an [online encyclopedia](https://www.encyclopedia-titanica.org/).

In [4]:
# Remove the sparse `Cabin` column from both splits, in place.
for frame in (data_test, data_train):
    frame.drop('Cabin', axis=1, inplace=True)

So now a lot of `Age` values are missing, along with a few `Embarked` values and one `Fare`. A lot of passenger data is [available here](https://www.encyclopedia-titanica.org/), so let's start by filling in the missing `Embarked` fields and the missing `Fare` by hand.

In [5]:
# The two training passengers without an embarkation port are assigned 'S'.
for passenger_id in (62, 830):
    data_train.loc[passenger_id, 'Embarked'] = 'S'

# The single test passenger without a fare gets a fare of 0.
data_test.loc[1044, 'Fare'] = 0

In [6]:
# Manually collected (PassengerId, age) pairs for training passengers.
# The loop variables are named so that they do not shadow the builtin `id`
# in the notebook namespace after the cell has run.
known_ages = [
    (6, 28), (18, 23), (20, 22), (27, 26), (29, 24), (30, 23), (32, 48), (33, 18), (37, 18),
    (43, 50), (46, 30), (47, 20), (48, 27), (49, 16), (56, 46), (65, 64), (66, 7), (77, 23),
    (78, 28), (83, 31), (88, 26), (96, 22), (102, 29), (108, 29), (110, 28), (122, 19), (127, 16), (129, 2), (141, 40),
    (155, 27), (159, 37), (160, 5), (167, 48), (169, 48), (177, 5), (181, 8), (182, 39), (186, 39), (187, 27),
    (197, 28), (199, 21), (202, 17), (215, 22), (224, 22), (236, 12)
]
for passenger_id, known_age in known_ages:
    data_train.loc[passenger_id, 'Age'] = known_age

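Looking ages up one by one is actually super boring, so I will try to guess the remaining ages from the name instead. For example, the title `Miss.` suggests a young passenger (the code below assumes an age of 16).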

In [7]:
# Per-sex fallback ages: the mean (not the median, despite the variable names)
# of the known training ages, truncated to an integer.
medial_male = int(data_train.loc[data_train['Sex'] == 'male', 'Age'].mean())
medial_female = int(data_train.loc[data_train['Sex'] == 'female', 'Age'].mean())
def guessAge(name, sex):
    """Guess a passenger's age from the name and sex.

    Rules, in order of precedence:

    * a name containing ``jr`` (case-insensitive) gives 20;
    * otherwise a name containing ``miss`` (case-insensitive) gives 16;
    * otherwise the notebook-level ``medial_female`` is used when ``sex`` is
      ``'female'`` and ``medial_male`` for any other value.

    Whatever the rule, 5 is subtracted when the name contains at least two
    double-quote characters.

    Parameters
    ----------
    name : str
        Passenger name as found in the ``Name`` column.
    sex : str
        Passenger sex; only the exact value ``'female'`` selects the female fallback.

    Returns
    -------
    int
        The guessed age.
    """
    fallback = medial_female if sex == 'female' else medial_male
    lowered = name.lower()

    if 'jr' in lowered:
        guess = 20
    elif 'miss' in lowered:
        guess = 16
    else:
        guess = fallback

    # Two or more quote characters, i.e. at least one quoted segment in the name.
    has_quoted_part = name.count('"') >= 2
    return guess - 5 if has_quoted_part else guess

# Replace every still-missing age, in both splits, with the name-based guess.
for frame in (data_train, data_test):
    age_missing = frame['Age'].isnull()
    for passenger_id, passenger in frame.loc[age_missing, ['Name', 'Sex']].iterrows():
        frame.loc[passenger_id, 'Age'] = guessAge(passenger['Name'], passenger['Sex'])

Translate `Sex` into 0/1 values (stored in a column renamed to `IsMale`). The `Embarked` column is categorized later, together with the other categorical features.

In [8]:
# Encode `Sex` as 1 for 'male' and 0 for anything else, then rename the column
# to `IsMale`; the same is done for both splits.
for frame in (data_train, data_test):
    frame['Sex'] = frame['Sex'].apply(lambda value: 1 if value == 'male' else 0)
    frame.rename(columns={'Sex': 'IsMale'}, inplace=True)

In [9]:
# Coarse grouping of the honorifics that appear in the `Name` column.
# NOTE(review): "Master" is grouped with "royalty" here — confirm this is intended.
titles = {
    # military, medical and clerical titles
    "Capt": "officer",
    "Col": "officer",
    "Major": "officer",
    "Dr": "officer",
    "Rev": "officer",

    # nobility-style titles
    "Jonkheer": "royalty",
    "Don": "royalty",
    "Sir": "royalty",
    "the Countess": "royalty",
    "Dona": "royalty",
    "Master": "royalty",
    "Lady": "royalty",

    "Mme": "young",
    "Mlle": "young",
    "Miss": "young",

    "Ms": "other",
    "Mr": "other",
    "Mrs": "other",
}


def _title_group(full_name):
    """Return the group of the honorific found in ``full_name``.

    The honorific is the text between the first comma and the next period,
    stripped of surrounding whitespace. Raises ``KeyError`` when it is not a
    key of ``titles``.
    """
    honorific = full_name.split(',')[1].split('.')[0].strip()
    return titles[honorific]


data_train['Title'] = data_train['Name'].apply(_title_group)
data_test['Title'] = data_test['Name'].apply(_title_group)

In [10]:
# Number of relatives aboard: siblings/spouses plus parents/children.
for frame in (data_test, data_train):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch']

Remove all unnecessary columns.

In [11]:
# `Name`, `SibSp` and `Parch` have been folded into `Title` / `FamilySize`;
# `Ticket` is not used as a feature.
unused_columns = ['Name', 'Ticket', 'SibSp', 'Parch']
for frame in (data_test, data_train):
    frame.drop(unused_columns, axis=1, inplace=True)

In [12]:
# Preview the engineered features before the categorical columns are encoded.
data_train.head()

Unnamed: 0_level_0,Survived,Pclass,IsMale,Age,Fare,Embarked,Title,FamilySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,1,22.0,7.25,S,other,1
2,1,1,0,38.0,71.2833,C,other,1
3,1,3,0,26.0,7.925,S,young,0
4,1,1,0,35.0,53.1,S,other,1
5,0,3,1,35.0,8.05,S,other,0


Transform categorical features

In [13]:
# Split the target off the features: `pop` returns the `Survived` column and
# removes it from `data_train` in place.
Y = data_train.pop('Survived').values

In [14]:
# Stack the training rows on top of the test rows so both are encoded together.
tmp = pd.concat([data_train, data_test], axis=0)

In [15]:
# http://stackoverflow.com/q/32256415/1090562
# One-hot encode each categorical column (prefixed with its own name), append
# the indicator columns to the frame and drop the original columns.
categorical_columns = ['Embarked', 'Title', 'Pclass']
indicator_frames = [
    pd.get_dummies(tmp[column], prefix=column)
    for column in categorical_columns
]
tmp = pd.concat([tmp] + indicator_frames, axis=1)
tmp.drop(categorical_columns, axis=1, inplace=True)

In [16]:
# Undo the stacking: the first `train_rows` rows are the training split,
# everything after them is the test split.
train_rows = len(data_train)
data_train, data_test = tmp.iloc[:train_rows], tmp.iloc[train_rows:]

[Example of nice plots](http://nbviewer.jupyter.org/github/agconti/kaggle-titanic/blob/master/Titanic.ipynb#Let's-take-a-Look-at-our-data-graphically:)

### Extract the data and preprocessing

In [17]:
# Plain array views of the features: `X` for the training rows, `X_` for the test rows.
X = data_train.values
X_ = data_test.values

In [18]:
# The features are used as they are; `X1` / `X1_` are just aliases of `X` / `X_`.
X1 = X
X1_ = X_
# Disabled experiment: min-max scaling fitted on the training rows and applied
# to both splits. Uncomment to feed scaled features to the models instead.
#from sklearn import preprocessing

#min_max_scaler = preprocessing.MinMaxScaler()
#X1 = min_max_scaler.fit_transform(X)
#X1_= min_max_scaler.transform(X_)

In [19]:
from sklearn import linear_model, svm, neighbors, ensemble, naive_bayes
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from datetime import datetime
import xgboost as xgb

# Hold out 40% of the labelled rows for a quick model comparison. The split is
# seeded so that every run compares the estimators on the same rows.
X_train, X_test, y_train, y_test = train_test_split(X1, Y, test_size=0.40, random_state=0)

# Candidate classifiers, all with default hyperparameters except the sigmoid-kernel
# SVC, as (estimator, display name) pairs; the name is what `analyse_many` reports.
estimators = [
    (linear_model.LogisticRegression(),             'Logistic'),
    (linear_model.RidgeClassifier(),                'Ridge'),
    (linear_model.PassiveAggressiveClassifier(),    'Passive aggressive'),
    (svm.SVC(),                                     'SVC rbf'),
    (svm.SVC(kernel="sigmoid"),                     'SVC sigmoid'),
    (svm.LinearSVC(),                               'SVC linear'),
    (svm.NuSVC(),                                   'SVC Nu'),
    (linear_model.SGDClassifier(),                  'SGD'),
    (neighbors.NearestCentroid(),                   'Nearest centroid'), 
    (ensemble.RandomForestClassifier(),             'Random forest'),
    (ensemble.AdaBoostClassifier(),                 'Ada Boost'),
    (ensemble.GradientBoostingClassifier(),         'Gradient Boosting'),
    (ensemble.BaggingClassifier(),                  'Bagging'),
    (ensemble.ExtraTreesClassifier(),               'Extra tree'),
    (xgb.XGBClassifier(),                           'XGB'),
    (naive_bayes.GaussianNB(),                      'GaussianNB'),
]

def analyse_many(estimators):
    """Fit every estimator on the hold-out split and rank them by accuracy.

    Uses the notebook-level ``X_train`` / ``y_train`` for fitting and
    ``X_test`` / ``y_test`` for scoring. Progress is printed while the loop
    runs and cleared from the cell output once it has finished.

    Parameters
    ----------
    estimators : iterable of (estimator, str)
        Pairs of an unfitted estimator and the name to report it under.

    Returns
    -------
    (float, pandas.DataFrame)
        The best accuracy, and a table indexed by ``Score`` (sorted from best
        to worst) with the columns ``Name`` and ``Time seconds``.

    Raises
    ------
    IndexError
        If ``estimators`` is empty.
    """
    all_values = []
    for estimator, name in estimators:
        startTime = datetime.now()
        clf = estimator.fit(X_train, y_train)
        score = accuracy_score(y_test, clf.predict(X_test))
        # Measure once, so the recorded time covers fitting and scoring only
        # and is the same value that is printed.
        elapsed = datetime.now() - startTime

        # Single-argument print calls behave the same on Python 2 and Python 3.
        print(name)
        print('   %s \t%s' % (score, elapsed))
        all_values.append((score, name, elapsed.total_seconds()))

    clear_output()
    all_values.sort(reverse=True)
    ranking = pd.DataFrame(all_values, columns=['Score', 'Name', 'Time seconds'])
    return all_values[0][0], ranking.set_index(['Score'])

In [20]:
# Rank all candidate estimators; the resulting table is shown as the cell output.
best_score, estimator_data = analyse_many(estimators)
estimator_data

Unnamed: 0_level_0,Name,Time seconds
Score,Unnamed: 1_level_1,Unnamed: 2_level_1
0.826331,Logistic,0.00413
0.823529,XGB,0.039281
0.823529,Ridge,0.001741
0.817927,Gradient Boosting,0.093008
0.812325,Ada Boost,0.106426
0.795518,Random forest,0.026536
0.770308,Bagging,0.032119
0.767507,Extra tree,0.059667
0.756303,GaussianNB,0.00144
0.708683,SVC rbf,0.023587


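It looks like `Gradient Boosting`, `XGB` and `Ada Boost` perform reasonably well, together with `Logistic L2` and `SVC linear`. Let's see whether **0.851541** (the best score at the time of writing; the table above may differ because scores vary between runs) can be pushed further.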

In [21]:
# Try a much larger, deeper boosted ensemble on the same hold-out split.
clf = xgb.XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=10000).fit(X_train, y_train)
score = accuracy_score(y_test, clf.predict(X_test))
# A parenthesised single-argument print behaves the same on Python 2 and Python 3.
print(score)

0.792717086835


It looks like the ensemble with 100 elements had a lot of variance and we just got lucky.

----
### Hyperparameters tuning.

In [22]:
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.grid_search import GridSearchCV

In [23]:
def kFold_gridSearch(model, X, Y, params, folds=50):
    """Grid-search ``params`` for ``model`` using shuffled K-fold cross-validation.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``GridSearchCV``.
    X, Y : array-like
        Features and labels; ``len(Y)`` determines the number of samples that
        are split into folds.
    params : dict
        Parameter grid, mapping parameter names to lists of candidate values.
    folds : int, optional
        Number of folds (default 50). The folds are shuffled with a fixed seed.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination, indexed by accuracy ``Score`` and
        sorted from best to worst, with one column per tuned parameter.

    Notes
    -----
    The elapsed time is printed only when the search took more than 5 seconds.
    ``grid_scores_`` is the result attribute of the legacy ``sklearn.grid_search``
    implementation that this notebook imports.
    """
    startTime = datetime.now()

    k_fold = KFold(len(Y), n_folds=folds, shuffle=True, random_state=0)
    clf = GridSearchCV(model, params, cv=k_fold, scoring='accuracy')
    clf.fit(X, Y)

    # Materialise the keys as a list: on Python 3 `dict.keys()` is a view and
    # cannot be concatenated to the ['Score'] list below.
    param_keys = list(params)

    # Each grid entry carries the parameter mapping at position 0 and the score
    # used for ranking at position 1.
    data_scores = [
        [info[1]] + [info[0][k] for k in param_keys]
        for info in clf.grid_scores_
    ]
    data_scores.sort(reverse=True)

    time_took = datetime.now() - startTime
    if time_took.total_seconds() > 5:
        # Single-argument print call: same output on Python 2 and Python 3.
        print(time_took)

    return pd.DataFrame(data_scores, columns=['Score'] + param_keys).set_index(['Score'])

In [24]:
# (estimator, display name, parameter grid) for the models worth tuning.
grid_search_data = [
    (ensemble.GradientBoostingClassifier(), 'Gradient Boosting', {
        'loss': ['deviance', 'exponential'],
        'learning_rate': [0.01, 0.1],
        'n_estimators': [100, 500],
        'max_depth': [3, 5]
    }),
    (ensemble.AdaBoostClassifier(), 'Ada Boost', {
        'n_estimators': [50, 100],
        'learning_rate': [0.1, 1.0]
    }),
    (linear_model.LogisticRegression(), 'Logistic', {
        'penalty': ['l1', 'l2'],
        'C': [0.1, 0.5, 1.]
    }),
    (xgb.XGBClassifier(), 'XGB', {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1],
        'n_estimators': [100, 300]
    })
]

# Show the five best parameter combinations per model.
for model, model_name, params in grid_search_data:
    # A parenthesised single-argument print behaves the same on Python 2 and Python 3.
    print(model_name)
    display(kFold_gridSearch(model, X, Y, params).head())

Gradient Boosting
0:05:40.833064


Unnamed: 0_level_0,n_estimators,loss,learning_rate,max_depth
Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.841751,100,exponential,0.1,5
0.839506,500,exponential,0.01,5
0.838384,500,deviance,0.01,5
0.833895,100,deviance,0.01,5
0.832772,100,deviance,0.1,5


Ada Boost
0:00:33.833903


Unnamed: 0_level_0,n_estimators,learning_rate
Score,Unnamed: 1_level_1,Unnamed: 2_level_1
0.821549,100,0.1
0.813692,50,1.0
0.81257,50,0.1
0.811448,100,1.0


Logistic


Unnamed: 0_level_0,penalty,C
Score,Unnamed: 1_level_1,Unnamed: 2_level_1
0.826038,l1,0.5
0.824916,l1,1.0
0.819304,l2,1.0
0.819304,l2,0.5
0.804714,l2,0.1


XGB
0:01:32.118666


Unnamed: 0_level_0,n_estimators,learning_rate,max_depth
Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.843996,300,0.01,7
0.842873,100,0.01,7
0.839506,300,0.01,5
0.835017,100,0.01,5
0.830527,300,0.1,3


In [25]:
# Finer grid for XGBoost only.
grid_search_data = [
    (xgb.XGBClassifier(), 'XGB', {
        'max_depth': [4, 5, 6, 7],
        'learning_rate': [0.01, 0.05, 0.001],
        'n_estimators': [300, 400, 350, 450]
    })
]

for model, model_name, params in grid_search_data:
    # Parenthesised single-argument prints behave the same on Python 2 and Python 3.
    print(model_name)
    display(kFold_gridSearch(model, X, Y, params).head())
    # Blank separator line (a bare `print` only does this on Python 2).
    print('')

XGB
0:13:11.715955


Unnamed: 0_level_0,n_estimators,learning_rate,max_depth
Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.843996,300,0.01,7
0.842873,350,0.01,7
0.842873,300,0.01,6
0.841751,450,0.001,5
0.841751,400,0.01,7





-----
### Preparing for submission
It looks like XGBoost with `n_estimators=300`, `learning_rate=0.01`, `max_depth=7` has the best result.

In [26]:
# Refit the chosen configuration on all labelled rows and predict the test rows.
best_params = {'n_estimators': 300, 'learning_rate': 0.01, 'max_depth': 7}
clf = xgb.XGBClassifier(**best_params)
clf.fit(X1, Y)
res = clf.predict(X1_).astype(int)

In [27]:
# Take the passenger ids from the test frame's index instead of assuming they
# start at 892 and are contiguous; this also avoids the Python 2-only `xrange`.
submission = pd.DataFrame({"PassengerId": data_test.index.values, "Survived": res})
submission.to_csv("output.csv", index=False)