# MAIN PYTHON DEPLOYMENT PIPELINE

#### LOAD LIBRARIES

In [3]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import sklearn.metrics as metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier

# Use matplotlib's 'fivethirtyeight' style sheet for the figures in this notebook.
plt.style.use('fivethirtyeight')

# Ask the inline backend for 'retina' (high-resolution) figure output and
# render matplotlib figures inline in the notebook.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [4]:
import json

#### READ DATA

In [5]:
# Load the Titanic passenger table from the hosted CSV file.
# NOTE: this cell needs network access every time it runs.
TITANIC_CSV_URL = 'http://collaboratescience.com/cheatsheets/titanic.csv'
titanic = pd.read_csv(TITANIC_CSV_URL)

#### GET RANDOM PASSENGER

In [11]:
# GET /get_random_passenger

# Draw one passenger at random (a one-row DataFrame).
random_passenger = titanic.sample(n=1)

# Serialise once with pandas (missing values become JSON null), then parse the
# text back to get plain Python scalars for the summary dict below.
passenger_json = random_passenger.to_json(orient='records')
passenger_record = json.loads(passenger_json)[0]

# Compact summary of the sampled passenger. The values are plain scalars
# rather than pandas Series, so the dict can be passed to json.dumps directly.
res = {
    field: passenger_record[field]
    for field in ("name", "parch", "fare", "age", "sibsp")
}

# to_json already returns JSON text; running it through json.dumps again would
# wrap the whole array in one quoted, backslash-escaped string.
print(passenger_json)

[{"pclass":3.0,"survived":1.0,"name":"Nicola-Yarred, Miss. Jamila","sex":"female","age":14.0,"sibsp":1.0,"parch":0.0,"ticket":"2651","fare":11.2417,"cabin":null,"embarked":"C","boat":"C","body":null,"home.dest":null}]


In [7]:
# The engineered column is called 'family_count' and is only created in the
# DATA PREPARATION section, so indexing titanic['family count'] here raised a
# KeyError and stopped a top-to-bottom run. Test for the column without raising.
'family_count' in titanic.columns

False

### TEAM # Submission

#### DATA PREPARATION

In [None]:
# 'cabin' and 'body' are missing in more than 50% of the rows, so remove both columns.
sparse_columns = ['cabin', 'body']
titanic.drop(columns=sparse_columns, inplace=True)

In [None]:
# Remove 'boat' and 'home.dest', then do a complete case analysis:
# discard every row that still has a missing value in any remaining column.
titanic.drop(columns=['boat', 'home.dest'], inplace=True)
titanic.dropna(axis=0, how='any', inplace=True)

In [None]:
# Add a 'family_count' column: the row-wise sum of 'sibsp' and 'parch'.
titanic["family_count"] = titanic['sibsp'].add(titanic['parch'])

In [None]:
# Create a 'title' column from 'name' with a regex: capture the text between
# ", " and the next "." (e.g. "Nicola-Yarred, Miss. Jamila" -> "Miss").
# expand=False makes str.extract return a Series for the single capture group,
# so a plain column is assigned instead of a one-column DataFrame.
title = titanic['name'].str.extract(r', ([^\.]+)\.', expand=False)
titanic['title'] = title

In [None]:
# Replace 'fare' with log(1 + fare) to make it more normally distributed.
# np.log1p is applied to the whole column at once rather than element by
# element through map + lambda.
# NOTE: this overwrites 'fare', so running the cell again transforms it again.
titanic['fare'] = np.log1p(titanic['fare'])

In [None]:
# One hot encode the categorical columns. drop_first=True omits the first
# level of each, leaving k-1 indicator columns per categorical.
categorical_columns = ['embarked', 'sex', 'title']
titanic = pd.get_dummies(titanic, columns=categorical_columns, drop_first=True)

In [None]:
# Target: the 'survived' column.
y = titanic['survived']
# Features: every other column except the 'name' and 'ticket' columns.
X = titanic.drop(columns=['name', 'ticket', 'survived'])

In [None]:
# Standardize the predictors in X
ss = StandardScaler()
ss.fit(X) 
X_scaled = ss.transform(X)

In [None]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y , test_size = 0.3)

#### MODEL BUILDING

In [None]:
# Gridsearch on logistic regression model: penalty type crossed with ten
# values of C spaced logarithmically between 1e-5 and 1e2.
lr_params = {'penalty':['l1', 'l2'],
             'C': np.logspace(-5, 2, 10)}
# solver='liblinear' supports both 'l1' and 'l2'. The default solver in current
# scikit-learn ('lbfgs') rejects 'l1', so every l1 candidate would fail to fit.
gs = GridSearchCV(LogisticRegression(solver = 'liblinear'), param_grid = lr_params)
gs.fit(X_train, y_train)

# Results: best mean cross-validated score and the parameters that produced it
gs.best_score_, gs.best_params_