In [None]:
# Download init.py from the rramosp/20201.ai4eng GitHub repository into the
# working directory (-O: output file name, -q: quiet, --no-cache: ask for a fresh copy).
!wget --no-cache -O init.py -q https://raw.githubusercontent.com/rramosp/20201.ai4eng/master/init.py
# Must run after the download above, since it imports the file just fetched.
# NOTE(review): presumably init.init() provides the `local.lib` package imported
# in the next cell -- confirm against init.py.
import init; init.init(force_download=False); init.get_weblink()

In [49]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import local.lib.mlutils
import pandas as pd
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
%matplotlib inline

# We use [Titanic Data at Kaggle](https://www.kaggle.com/c/titanic)

- Register to Kaggle
- Download the `train.csv` and `test.csv` files

In [50]:
d = pd.read_csv("train.csv")
print (d.shape)

In [51]:
d.head()

**Understand which `NaN` values are present**

In [52]:
# Print, for every column, its right-aligned name and its number of missing entries.
for col in d.columns:
    n_missing = d[col].isna().sum()
    print("%20s" % col, n_missing)

In [53]:
d.Embarked.value_counts()

In [54]:
plt.hist(d.Age.dropna().values, bins=30);

**Remove uninformative columns**

In [55]:
# Remove the columns this notebook treats as uninformative.
# A new frame is assigned instead of `del`-ing columns in place, and labels
# that are already gone are ignored, so re-running this cell does not raise
# KeyError.
d = d.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore")

**Fix `NaN` values**

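- observe that we choose a different filling policy for each column: a new `N` category for missing `Embarked` values, and the column mean for missing `Age` values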

In [56]:
# Two filling policies: the categorical column gets an explicit "N" marker for
# missing entries, the numeric column gets its own mean.
d = d.assign(
    Embarked=d["Embarked"].fillna("N"),
    Age=d["Age"].fillna(d["Age"].mean()),
)
d.head()

In [57]:
plt.hist(d.Age.dropna().values, bins=30);

**Turn categorical columns to a `one_hot` encoding**

In [58]:
def to_onehot(x):
    """One-hot encode a 1-D array of category labels.

    The categories are the sorted unique values of `x` as returned by
    `np.unique`; column `j` of the result flags membership in the `j`-th
    category.

    Parameters
    ----------
    x : array-like, 1-D
        Category label of each sample.

    Returns
    -------
    numpy.ndarray of int, shape (len(x), n_categories)
        Exactly one 1 per row. Empty input yields an array of shape (0, 0).
    """
    # `return_inverse` gives, for every element, its position among the sorted
    # unique values. This is the same lookup as scanning the unique values once
    # per element, but it is done in a single pass and the codes are always
    # integer-typed, so an empty input can still be used as an index.
    values, codes = np.unique(x, return_inverse=True)
    return np.eye(len(values))[codes].astype(int)
    
k = to_onehot(d.Embarked.values)
k[:5]

In [59]:
def replace_columns_with_onehot(d, col):
    """Return a frame where column `col` is replaced by its one-hot encoding.

    The encoded columns are named ``<col>_0``, ``<col>_1``, ... and are placed
    before the remaining columns of `d`; the row index of `d` is kept.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame containing the column `col`.
    col : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        One-hot columns followed by the columns of `d` other than `col`.
    """
    encoded = to_onehot(d[col].values)
    onehot_names = ["%s_%d" % (col, j) for j in range(encoded.shape[1])]
    onehot_frame = pd.DataFrame(encoded, columns=onehot_names, index=d.index)
    # Joining on the shared index puts the new columns first; the original
    # categorical column is then dropped from the joined frame.
    return onehot_frame.join(d).drop(columns=[col])

In [60]:
d.head()

In [61]:
d = replace_columns_with_onehot(d, "Embarked")
d.head()

In [62]:
d = replace_columns_with_onehot(d, "Sex")
d.head()

In [63]:
d.shape, d.values.sum()

### Put all transformations together

In [64]:
def clean_titanic(d):
    """Turn a raw Titanic frame into an all-numeric feature table.

    Steps, in order:
      1. drop the PassengerId, Name, Ticket and Cabin columns;
      2. fill missing Embarked values with the marker "N";
      3. fill missing Fare and Age values with the mean of that column,
         computed over the frame passed in;
      4. one-hot encode Embarked, then Sex.

    The caller's frame is left untouched: every step builds a new frame
    instead of deleting or assigning columns on the argument, so the function
    can be called more than once on the same raw data.

    Parameters
    ----------
    d : pandas.DataFrame
        Must contain the columns PassengerId, Name, Ticket, Cabin, Embarked,
        Fare, Age and Sex (a missing one raises KeyError).

    Returns
    -------
    pandas.DataFrame
        New frame with the one-hot columns in place of Embarked and Sex.
    """
    d = d.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])
    d = d.assign(
        Embarked=d["Embarked"].fillna("N"),
        Fare=d["Fare"].fillna(d["Fare"].mean()),
        Age=d["Age"].fillna(d["Age"].mean()),
    )
    d = replace_columns_with_onehot(d, "Embarked")
    d = replace_columns_with_onehot(d, "Sex")
    return d

**transform train and test data together**

- observe that test data **does not have** a `Survived` column. Predicting it is the result we submit to Kaggle

In [67]:
# Read the raw train and test CSV files from the working directory.
dtr = pd.read_csv("train.csv")
dts = pd.read_csv("test.csv")
# Number of training rows; used further down to split the concatenated
# train+test frame back into its two parts.
lentr = len(dtr)
dtr.shape, dts.shape

In [68]:
dts.head()

**get data ready for training**

In [70]:
# Feature columns are every training column except the target.
source_cols = [c for c in dtr.columns if c != "Survived"]

# Stack train rows on top of test rows with a fresh 0..n-1 index, so both
# parts go through the same cleaning and end up with the same columns.
all_data = pd.concat((dtr[source_cols], dts[source_cols]), ignore_index=True)
all_data = clean_titanic(all_data)

# The first `lentr` rows are the training samples; the rest is the test set.
Xtr = all_data.iloc[:lentr].values
ytr = dtr["Survived"].values
Xts = all_data.iloc[lentr:].values

print(Xtr.shape, ytr.shape)
print(Xts.shape)

**cross validate for model selection**

In [72]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Seed for the random forest. Without it every run grows different trees, so
# the scores printed here (and the predictions made with `rf` afterwards)
# would change from one execution of the notebook to the next.
RANDOM_STATE = 0

# Each print shows one score per cross-validation fold.
rf = RandomForestClassifier(random_state=RANDOM_STATE)
print (cross_val_score(rf, Xtr, ytr))

svc = SVC()
print (cross_val_score(svc, Xtr, ytr))

**now train with full dataset and generate submission for Kaggle**

In [73]:
# Fit the forest on the full training set, then predict the unlabeled test rows.
preds_ts = rf.fit(Xtr, ytr).predict(Xts)
preds_ts

**get predictions ready to submit to Kaggle**

- see https://www.kaggle.com/c/titanic#evaluation for file format

In [74]:
# Build the two submission columns directly, pairing ids and predictions by
# position. Stacking the two Series as rows and transposing would push both
# columns through one common dtype and depend on the index of `dts` lining
# up with the index of the predictions.
submission = pd.DataFrame({
    "PassengerId": dts["PassengerId"].values,
    "Survived": preds_ts,
})
submission.head()

In [77]:
submission.to_csv("titanic_kaggle.csv", index=False)

In [78]:
!head titanic_kaggle.csv