# Titanic: Machine Learning from Disaster

In [None]:
# https://www.kaggle.com/c/titanic

In [1]:
import sklearn
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('./titanic/train.csv')
test = pd.read_csv('./titanic/test.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_dummies = pd.get_dummies(train, columns=['Sex', 'Pclass', 'Embarked'])
test_dummies = pd.get_dummies(test, columns=['Sex', 'Pclass', 'Embarked'])
combined = [train_dummies, test_dummies]
# Добавим признаки Размер семьи и путешествовал ли человек один
for dataset in combined:   
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['IsAlone']  = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

## Preprocess data

In [5]:
# Extract features
X_train = train_dummies.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
X_test = test_dummies.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [6]:
# Extract targets
y_train = train_dummies['Survived']

In [7]:
# Fill NA values
from sklearn.preprocessing import Imputer
imp = Imputer()
imp.fit(X_train)
X_train_imp = imp.transform(X_train)
X_test_imp = imp.transform(X_test)
# Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)

In [8]:
# Scale features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train_imp)
X_train_scaled = scaler.transform(X_train_imp)
X_test_scaled = scaler.transform(X_test_imp)

# StandardScaler(copy=True, with_mean=True, with_std=True)

## Train model and make predictions

In [9]:
# Fit logistic regression
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10000, max_depth=5, min_samples_split=6)

clf.fit(X_train_scaled, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=6, min_weight_fraction_leaf=0.0,
            n_estimators=10000, n_jobs=1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [10]:
# TODO: make predictions

predicted = clf.predict(X_test_scaled)

## Estimate quality

In [12]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, accuracy_score

kfold = KFold(n_splits=4, shuffle=True, random_state=321)
score = cross_val_score(clf,
                       X_train_scaled, y_train, groups=None,
                       scoring = make_scorer(accuracy_score), cv = kfold)
np.mean(score)

# Kaggle score 0,79426


0.8237839857795014

## Create submission 

In [11]:
with open('submission.txt', 'w') as out:
    out.write('PassengerId,Survived\n')
    for passenger, y in zip(test['PassengerId'], predicted):
        out.write('%s,%s\n' % (passenger, y))