# Pre-requisites

1. [Install conda or miniconda](https://conda.io/docs/user-guide/install/index.html)

2. Create an environment
```
$ conda create -n ENV_NAME python=3.6
$ source activate ENV_NAME
# check your work - paths should be in ENV_NAME
$ which jupyter
...
$ which ipython
...
```

3. Install deps
```
$ pip install quilt # data package manager
$ conda install scikit-learn # handles numpy scipy
```

4. Install data, tag "original"
```
$ quilt install akarve/pydata_book/titanic:original
```

5. Fire up Jupyter
```
jupyter notebook
```
In the Jupyter Notebook UI: `New > Python 3`

# Acquire raw training data
[How to build your own data package](https://docs.quiltdata.com/make-a-package.html)

In [None]:
import quilt

# Install the "original"-tagged version of the Titanic data package.
# NOTE(review): force=True presumably suppresses the overwrite prompt when a
# local copy already exists -- confirm against the quilt docs.
quilt.install(
    "akarve/pydata_book/titanic",
    tag="original",
    force=True,
)

In [None]:
# Import the installed package; it is addressed as quilt.data.<owner>.<package>.
from quilt.data.akarve import pydata_book as pb
# Bare expression: the notebook shows the titanic node's repr as cell output.
pb.titanic

# Feature engineering

In [None]:
# Calling a package node returns its data; load both splits in one statement.
train, test = pb.titanic.train(), pb.titanic.test()

In [None]:
# Count missing values per column in the training split
# (isna is the pandas alias of isnull).
train.isna().sum()

In [None]:
# Count missing values per column in the test split
# (isna is the pandas alias of isnull).
test.isna().sum()

In [None]:
# Add the same derived columns to both splits so their schemas stay aligned.
for frame in (test, train):
    # Encode sex as an integer flag: 1 where Sex == 'male', else 0.
    frame['IsMale'] = (frame['Sex'] == 'male').astype(int)
    # Sum of the SibSp and Parch columns (presumably siblings/spouses and
    # parents/children aboard -- verify against the dataset description).
    frame['NumRelatives'] = frame['SibSp'] + frame['Parch']
# Candidate feature list; note that raw 'Age' still contains missing values.
features = ['Pclass', 'IsMale', 'Age', 'NumRelatives']

In [None]:
# Imputation statistics are computed from the training split only.
age_column = train['Age']
age_median, age_mean = age_column.median(), age_column.mean()

In [None]:
# Display the training-set mean age computed above.
age_mean

In [None]:
# Fill missing ages in both splits with the training-set mean, writing the
# result to a new column so the raw 'Age' column is left untouched.
for frame in (train, test):
    frame['AgeImputeMean'] = frame['Age'].fillna(age_mean)

In [None]:
import pandas as pd

# The four model inputs; the mean-imputed age column is used instead of 'Age'.
features = [
    'Pclass',
    'IsMale',
    'NumRelatives',
    'AgeImputeMean',
]
# Wrap the names in a one-row DataFrame and attach it to the package tree
# at titanic/features.
feature_table = pd.DataFrame([features])
pb._set(['titanic', 'features'], feature_table)

In [None]:
# Build a package named "USER/PKG" from the in-memory package object pb.
# "USER/PKG" is a placeholder -- substitute your own user and package name.
quilt.build("USER/PKG", pb)
# send the latest to quilt (left commented out; uncomment to publish)
# quilt.push("USER/PKG")

# Training

In [None]:
# Re-read the stored feature list and the training split from the package.
features, train = pb.titanic.features(), pb.titanic.train()
# The feature names were stored as the single row of a DataFrame, so row 0
# of the underlying array holds the column names to select.
selected_columns = features.values[0]
trainsub = train[selected_columns]

In [None]:
# Convert the feature frame and the 'Survived' label column to NumPy arrays.
trainvecs, trainlabels = trainsub.values, train['Survived'].values

## train

In [None]:
from sklearn.model_selection import cross_val_score as cvs
from sklearn.ensemble import RandomForestClassifier

# Shallow forest with a fixed seed so results are reproducible.
rfc = RandomForestClassifier(max_depth=3, random_state=0)
# 5-fold cross-validation. cross_val_score fits clones of the estimator, so
# rfc itself is still unfitted after this call.
scores = cvs(rfc, trainvecs, trainlabels, cv=5)
# Fit on the full training set so that the estimator serialized in the
# following section is a trained model rather than an empty one.
rfc.fit(trainvecs, trainlabels)
# Mean accuracy across the five folds (shown as the cell output).
scores.mean()

## serialize

In [None]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package (installed as a scikit-learn
# dependency) and fall back to the bundled copy on older installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the estimator to rfc.pkl in the current working directory.
joblib.dump(rfc, 'rfc.pkl')

## load (and infer)

In [None]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package and fall back to the bundled
# copy on older installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Restore the estimator from disk. Only load pickle files you created or
# trust, since unpickling can execute arbitrary code.
model = joblib.load("rfc.pkl")
# Re-score the restored estimator with the same 5-fold cross-validation.
cvs(model, trainvecs, trainlabels, cv=5).mean()

# Data package construction