# Titanic Survival Predictions

In [1]:
import warnings
# Silence every warning raised from this point on, for the whole kernel session.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings from pandas / scikit-learn — consider narrowing it to a category.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.metrics import accuracy_score
# Presumably a project-local helper module; the notebook calls utils.clean()
# and utils.model() from it. Its implementation is not visible here.
import utils

# Seed handed to train_test_split as random_state so the hold-out split is
# the same on every run.
RANDOMSEED = 0

## 1. Evaluation with only the training data

#### Import data and test/train split

In [2]:
# Read the training CSV. The path uses forward slashes because they resolve on
# Windows, Linux and macOS alike, whereas a hard-coded backslash separator is
# only understood on Windows.
dataset_train = pd.read_csv('data/train.csv')

# Separate the feature columns from the "Survived" target column.
X = dataset_train.drop("Survived", axis=1)
y = dataset_train["Survived"]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=RANDOMSEED
)

#### Clean the data

In [3]:
# Pass both splits through the project's cleaning helper and rebind the
# original names to whatever it returns.
# NOTE(review): because the inputs are overwritten, re-running this cell feeds
# already-cleaned frames back into utils.clean — restart from the load cell
# instead. What utils.clean does internally is not visible here.
cleaned_train, cleaned_test = utils.clean(X_train, X_test)
X_train, X_test = cleaned_train, cleaned_test

# Display the resulting feature columns.
X_train.columns

Index(['SibSp', 'Embarked_S', 'Sex_female', 'Sex_male', 'PassengerId', 'Age',
       'Pclass', 'Fare', 'Parch', 'Embarked_C', 'Embarked_Q'],
      dtype='object')

#### Train the model

In [4]:
# Obtain an estimator from the project helper (it receives the training
# features and labels), then fit it on the training split.
# NOTE(review): how utils.model uses its arguments is not visible here.
model = utils.model(X_train, y_train)
fit_result = model.fit(X_train, y_train)

# Show what fit() returned as the cell output.
fit_result

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': [0.6, 0.7, 0.8, 0.9], 'min_samples_leaf': [2, 3, 4, 5], 'min_samples_split': [10, 12, 18, 25, 30], 'n_estimators': [100, 150, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

#### Evaluate the model

In [5]:
# Predict on the held-out split and report the fraction of correct labels.
predictions = model.predict(X_test)

# accuracy_score is documented as accuracy_score(y_true, y_pred): ground truth
# goes first. Plain accuracy is symmetric, so the value is unchanged, but the
# correct order keeps this call safe if it is later swapped for an asymmetric
# metric such as precision or recall.
accuracy = accuracy_score(y_test, predictions)
print(accuracy)

0.849162011173


## 2. Build the model on all of the training data and make predictions for the test data

#### Import the data

In [6]:
# Reload the full training CSV; this time no rows are held out. The paths use
# forward slashes because they resolve on Windows, Linux and macOS alike,
# whereas a hard-coded backslash separator is only understood on Windows.
dataset_train = pd.read_csv('data/train.csv')

# These assignments replace the X_train / y_train / X_test created for the
# hold-out evaluation earlier in the notebook.
X_train = dataset_train.drop("Survived", axis=1)
y_train = dataset_train["Survived"]

# The test CSV is loaded as-is; no "Survived" column is dropped from it.
X_test = pd.read_csv('data/test.csv')

#### Clean the data

In [7]:
# Clean the full training frame and the test frame together with the project
# helper, rebinding the original names to its two return values.
# NOTE(review): the inputs are overwritten, so re-running this cell would pass
# already-cleaned frames to utils.clean — re-run the import cell first.
# What utils.clean does internally is not visible here.
cleaned_train, cleaned_test = utils.clean(X_train, X_test)
X_train, X_test = cleaned_train, cleaned_test

# Display the resulting feature columns.
X_train.columns

Index(['SibSp', 'Embarked_S', 'Sex_female', 'Sex_male', 'PassengerId', 'Age',
       'Pclass', 'Fare', 'Parch', 'Embarked_C', 'Embarked_Q'],
      dtype='object')

#### Train the model

In [8]:
# Build a fresh estimator via the project helper and fit it on the full
# training data; this rebinds `model`, replacing any earlier fitted model.
# NOTE(review): how utils.model uses its arguments is not visible here.
model = utils.model(X_train, y_train)
fit_result = model.fit(X_train, y_train)

# Show what fit() returned as the cell output.
fit_result

GridSearchCV(cv=5, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': [0.6, 0.7, 0.8, 0.9], 'min_samples_leaf': [2, 3, 4, 5], 'min_samples_split': [10, 12, 18, 25, 30], 'n_estimators': [100, 150, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

#### Make predictions

In [9]:
# Predict labels for the cleaned test frame.
predictions = model.predict(X_test)

# Wrap the predictions in a DataFrame and write them to predictions.csv in the
# current working directory, using pandas' default to_csv settings.
# NOTE(review): assuming predict() returns a 1-D array, the file will contain
# only a positional index and a single column labelled 0 — no passenger
# identifier is written alongside each prediction. Confirm that is intended.
predictions_frame = pd.DataFrame(predictions)
predictions_frame.to_csv("predictions.csv")