# Pipelining with Titanic Data

### Data loading ...

In [None]:
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import fcluster, single, complete
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform 
from collections.abc import Iterable
from mlpipes.pfunc import get_ohe
import pandas as pd
import re
import warnings
import numpy as np
from mlpipes.pfunc import *

# Load both Kaggle splits and stack them (train rows first) into a single
# frame with a fresh 0..n-1 index, so the feature engineering below is applied
# to all passengers at once.
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
combined = pd.concat([train, test]).reset_index(drop=True)

#### Some preprocessing of the data

In [None]:
def parse_name(s):
    """Split a passenger name written as ``"Family, Title. First ..."``.

    Returns a tuple ``(first_name, title, family_name)``. The first name is
    the first whitespace-separated token after the title's dot, with any
    parentheses removed.

    Raises ValueError when the name does not contain exactly one comma and
    IndexError when there is no dot or nothing after the dot.
    """
    surname_part, remainder = s.split(',')
    dotted_pieces = remainder.split('.')
    title = dotted_pieces[0].strip()
    # str.split() without arguments already yields whitespace-free tokens.
    first_name = dotted_pieces[1].split()[0]
    for paren in ('(', ')'):
        first_name = first_name.replace(paren, '')
    return (first_name, title, surname_part.strip())

def parse_cabin_letter(column):
    """Extract the deck letter of the first cabin listed in each entry.

    Parameters
    ----------
    column : pandas.Series (or any object exposing ``.values.tolist()``)
        Raw cabin strings such as ``"C85"`` or ``"B57 B59"``; missing values
        are allowed.

    Returns
    -------
    list
        One item per entry: the letter that directly precedes the first run
        of digits, or ``numpy.nan`` when the entry has no letter+digits
        pattern (missing cabin, or a bare letter such as ``"T"``).
    """
    # Raw string avoids the invalid-escape warning for ``\d``.
    letter_pat = re.compile(r'([A-Za-z])\d+')
    letters = []
    for value in column.values.tolist():
        # str() turns NaN/None into text that simply does not match.
        match = letter_pat.search(str(value))
        # ``pd.np`` no longer exists in current pandas; use numpy directly.
        letters.append(match.group(1) if match else np.nan)
    return letters

def parse_ticket_number(column):
    """Extract the numeric part of each ticket string.

    Parameters
    ----------
    column : iterable of str
        Raw ticket identifiers, e.g. ``"A/5 21171"`` or ``"113803"``.

    Returns
    -------
    pandas.Series
        For every ticket, the first run of three or more digits (as a
        string), or ``numpy.nan`` when the ticket has no such run. The
        result carries a default ``0..n-1`` index, not the index of
        ``column``.

    Raises
    ------
    TypeError
        If an entry is not a string (e.g. a missing value).
    """
    # Raw string avoids the invalid-escape warning for ``\d``.
    number_pat = re.compile(r'\d{3,}')
    numbers = []
    for ticket in column:
        match = number_pat.search(ticket)
        # ``pd.np`` no longer exists in current pandas; use numpy directly.
        numbers.append(match.group(0) if match else np.nan)
    return pd.Series(numbers)


def get_friendship_group(df):
    """Start of a routine that groups passengers by family name (unfinished).

    Reads ``df.family_name`` when that column exists, otherwise derives the
    family names from ``df.Name`` via ``parse_name``.

    NOTE(review): as written the function has no ``return`` statement, so it
    returns ``None``. ``ticket_grouping`` only receives an id for family
    names that occur exactly once; names shared by several passengers are
    not handled, and ``cabins`` / ``family_mask`` are computed but never
    used. Confirm the intended behaviour before relying on it.
    """
    friendship_group_counter = 0
    if 'family_name' not in df.columns:
        family_names = pd.Series(map(lambda x: parse_name(x)[-1], df.Name))
    else:
        family_names = df.family_name
    # NOTE(review): parse_cabin_letter reads ``column.values``, but here it is
    # mapped over the individual Cabin entries -- verify this call.
    cabins = pd.Series(map(parse_cabin_letter, df.Cabin))
    ticket_grouping = []
    # value_counts() yields (family name, number of passengers with it).
    for family, count in family_names.value_counts().items():
        family_mask = family_names == family
        
        # A family name seen once gets its own fresh group id.
        if count == 1:
            ticket_grouping.append(friendship_group_counter)
            friendship_group_counter += 1
            continue


def get_family_name(df):
    """Return a copy of ``df`` with a ``family_name`` column.

    The family name is the last element of ``parse_name`` applied to each
    entry of ``df.Name``. The input frame is not modified.

    The values are computed with ``Series.map`` so they keep the index of
    ``df``; building a separate Series with a fresh ``0..n-1`` index would be
    aligned on labels during assignment and produce NaN or misplaced names
    for frames whose index is not the default range.
    """
    df_ = df.copy()
    df_['family_name'] = df['Name'].map(lambda name: parse_name(name)[-1])
    return df_
            
def get_ticket_group(df):
    """Return a copy of ``df`` with a ``ticket_group`` column.

    Passengers are grouped by the ticket number extracted with
    ``parse_ticket_number``. Each number shared by more than one passenger
    gets the running position of that number among all groups as its id;
    numbers held by a single passenger get ``-1``.

    NOTE(review): the positions reported by the groupby are used as row
    labels with ``.loc``, which presumably relies on ``df`` having a default
    ``0..n-1`` index -- confirm for other callers.
    """
    result = df.copy()
    ticket_numbers = parse_ticket_number(df.Ticket)
    positions_by_number = df.Ticket.groupby(ticket_numbers).indices
    for group_id, positions in enumerate(positions_by_number.values()):
        shared = len(positions) > 1
        result.loc[positions, 'ticket_group'] = group_id if shared else -1
    return result

def get_cabin_letter(df):
    """Return a copy of ``df`` with ``cabin_na`` and ``cabin`` columns.

    ``cabin_na`` flags rows whose ``Cabin`` value is missing; ``cabin`` holds
    whatever ``parse_cabin_letter`` extracts from the ``Cabin`` column.
    """
    result = df.copy()
    cabin_column = df.Cabin
    result.loc[:, 'cabin_na'] = cabin_column.isnull()
    result.loc[:, 'cabin'] = parse_cabin_letter(cabin_column)
    return result

def get_is_alone(df):
    """Return a copy of ``df`` with an integer ``is_alone`` flag.

    The flag is 1 when ``Parch + SibSp + 1`` equals 1 (the passenger has no
    relatives aboard) and 0 otherwise.
    """
    result = df.copy()
    party_size = df['Parch'] + df['SibSp'] + 1
    result.loc[:, 'is_alone'] = party_size.eq(1).astype(int)
    return result

def get_family_size(df):
    """Return a copy of ``df`` with a ``family_size`` column.

    ``family_size`` is ``Parch + SibSp + 1``, i.e. the relatives aboard plus
    the passenger themselves.
    """
    result = df.copy()
    relatives = result['Parch'] + result['SibSp']
    result.loc[:, 'family_size'] = relatives + 1
    return result


def get_titles(df):
    """Return a copy of ``df`` with a ``title`` column.

    The title is the second element of ``parse_name`` applied to each entry
    of ``df.Name`` (the text between the comma and the first dot). The input
    frame is not modified.

    The values are computed with ``Series.map`` so they keep the index of
    ``df``; building a separate Series with a fresh ``0..n-1`` index would be
    aligned on labels during assignment and produce NaN or misplaced titles
    for frames whose index is not the default range.
    """
    df_ = df.copy()
    df_['title'] = df['Name'].map(lambda name: parse_name(name)[1])
    return df_

def discretize_faries(df, ngroups=3):
    """Return a copy of ``df`` with a binned ``fares`` column.

    ``Fare`` is cut into ``ngroups`` bins with ``pd.cut`` and the bin number
    (``labels=False``) is stored in the new ``fares`` column; ``Fare`` itself
    is left untouched.
    """
    result = df.copy()
    fare_bins = pd.cut(x=result['Fare'], bins=ngroups, labels=False)
    result.loc[:, 'fares'] = fare_bins
    return result

def discretize_ages(df, ngroups=3):
    """Return a copy of ``df`` whose ``Age`` column is replaced by bin numbers.

    ``Age`` is cut into ``ngroups`` bins with ``pd.cut`` (``labels=False``)
    and the result overwrites the ``Age`` column of the copy.
    """
    result = df.copy()
    age_bins = pd.cut(x=result['Age'], bins=ngroups, labels=False)
    result.loc[:, 'Age'] = age_bins
    return result

def estimate_age(df):
    """Return a copy of ``df`` with missing ``Age`` values filled by medians.

    Requires the columns ``Age``, ``title``, ``Sex`` and ``Pclass``.

    * Rows whose title is one of Master, Mr, Miss, Rev or Dr receive the
      median age of their ``(title, Sex)`` group.
    * All other rows receive the median age of their ``(Sex, Pclass)`` group.
    * If the chosen group has no known age at all, the ``(Sex, Pclass)``
      median and finally the overall median are used as fallbacks.

    The group medians are looked up with both grouping keys. Passing the two
    keys as a list to ``.loc`` selects on the first index level only and
    silently ignores the second key (Sex / Pclass).
    """
    df_ = df.copy()
    missing = df_['Age'].isnull()
    if not missing.any():
        return df_

    titles_with_own_median = ['Master', 'Mr', 'Miss', 'Rev', 'Dr']

    # transform('median') computes every group median once and broadcasts it
    # back to the rows, instead of regrouping for every missing value.
    by_title = df_.groupby(['title', 'Sex'])['Age'].transform('median')
    by_class = df_.groupby(['Sex', 'Pclass'])['Age'].transform('median')

    use_title = df_['title'].isin(titles_with_own_median)
    estimates = by_title.where(use_title, by_class)
    estimates = estimates.fillna(by_class).fillna(df_['Age'].median())

    # Positional assignment: mask and values come from the same frame.
    df_.loc[missing, 'Age'] = estimates[missing].values
    return df_


def get_cabin_groups(df):
    """Return a copy of ``df`` with a ``cabin_group`` cluster id per passenger.

    Each ``Cabin`` entry is split into its whitespace-separated cabin codes
    (missing values stay as float NaN). A hand-written distance between two
    entries is passed to ``pdist``, the condensed distances are clustered
    with ``complete`` linkage, and ``fcluster`` cuts the tree at
    ``SMALL_DISTANCE``. The final ``transform`` is intended to relabel
    clusters containing a single passenger as ``-1``.

    NOTE(review): ``pdist`` is called on an object array of lists with a
    Python callable as metric -- confirm the installed SciPy accepts this.
    """
    num_pat = re.compile('\d+')
    let_pat = re.compile('[a-zA-Z]')
    # Ordinal distance levels returned by cabin_distance, largest to smallest.
    LONG_DISTANCE = 5
    MEDIUM_DISTANCE = 4
    NORMAL_DISTANCE = 3
    SMALL_DISTANCE = 2
    LOW_DISTANCE = 1
    EQUAL = 0
    def cabin_distance(u, v):
        # u and v are one-element rows; the payload is the list of cabin
        # codes, or a float for a missing cabin.
        _u, _v = u[0], v[0]
        # A missing cabin on either side is as far away as possible.
        if not isinstance(_u, Iterable) or not isinstance(_v, Iterable):
            return LONG_DISTANCE
        # All digit runs (as ints) and all letters found in the cabin codes.
        unums = list(map(int, sum(map(num_pat.findall, _u), [])))
        vnums = list(map(int, sum(map(num_pat.findall, _v), [])))
        ulets = list(sum(map(let_pat.findall, _u), []))
        vlets = list(sum(map(let_pat.findall, _v), []))
        # At least one side has no cabin number: compare letters only.
        if not(unums and vnums):
            if set(ulets).intersection(vlets):
                return EQUAL
            else:
                return MEDIUM_DISTANCE
        if u == v:
            return EQUAL
        # The two entries share at least one full cabin code.
        if set(_u).intersection(set(_v)):
            return LOW_DISTANCE
        # No letter in common.
        if not set(ulets).intersection(set(vlets)):
            return MEDIUM_DISTANCE
        else:
            # Shared letter: look for a pair of codes with the same first
            # character whose first numbers differ by at most 2.
            for p in _u:
                for q in _v:
                    try:
                        pval = list(map(int, num_pat.findall(p)))[0]
                        qval = list(map(int, num_pat.findall(q)))[0]
                        if p[0] == q[0] and (abs(pval - qval) <= 2):
                            return SMALL_DISTANCE
                    except IndexError:
                        # A code without any digits has no number to compare.
                        pass
            return NORMAL_DISTANCE
        # Unreachable: both branches of the if/else above return.
        return MEDIUM_DISTANCE
    # Column vector of per-passenger cabin-code lists (floats left untouched).
    distances = pdist(df.Cabin.apply(lambda x: x.split() if not isinstance(x, float) else x).values[:, np.newaxis], cabin_distance)
    df_ = df.copy()
    df_.loc[:, 'cabin_group'] = fcluster(complete(distances), SMALL_DISTANCE, criterion='distance')
    # Keep ids of clusters with more than one member; others become -1.
    df_.cabin_group = df_.groupby('cabin_group')['cabin_group'].transform(lambda x: x if len(x)>1 else pd.Series([-1]*len(x)))
    return df_


def combine_titles(df):
    """Return a copy of ``df`` with spelling variants and rare titles merged.

    ``Mlle`` and ``Ms`` become ``Miss``, ``Mme`` becomes ``Mrs``, and the
    listed nobility, military, clerical and academic titles are collapsed
    into the single value ``rare``. Other titles are left as they are.
    """
    merges = (
        (['Mlle'], 'Miss'),
        (['Ms'], 'Miss'),
        (['Mme'], 'Mrs'),
        (['Lady', 'the Countess', 'Capt', 'Col',
          'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'rare'),
    )
    result = df.copy()
    titles = result['title']
    for variants, canonical in merges:
        titles = titles.replace(variants, canonical)
    result['title'] = titles
    return result


def simple_encoder(df):
    """Return a copy of ``df`` with ``Sex`` and ``Embarked`` as integer codes.

    ``Sex``: male -> 0, female -> 1.
    ``Embarked``: S -> 0, Q -> 1, C -> 2.

    Values outside these mappings (including missing values) become NaN, as
    is usual for ``Series.map``. The input frame is not modified.
    """
    df_ = df.copy()
    sex_mapping = {'male': 0, 'female': 1}
    # The three ports each need their own key: repeating 'S' would overwrite
    # its code and leave 'C' unmapped.
    embarked_mapping = {'S': 0, 'Q': 1, 'C': 2}
    df_['Embarked'] = df_['Embarked'].map(embarked_mapping)
    df_['Sex'] = df_['Sex'].map(sex_mapping)
    return df_

def label_encode(df, **kwargs):
    """Pipeline-friendly wrapper around ``get_le``.

    Forwards ``df`` and all keyword arguments to ``get_le`` and returns only
    the first item of its result (presumably the encoded frame -- confirm
    against ``mlpipes.pfunc.get_le``).
    """
    encoding_result = get_le(df, **kwargs)
    return encoding_result[0]

## Building pipelines

In [None]:
# Ordered feature-engineering steps consumed by ``process``. Each entry is
# (step name, function taking a DataFrame as first argument, keyword arguments).
# Order matters: titles must exist before ages are estimated and before titles
# are combined, and derived columns must exist before the raw ones are dropped.
# ``get_ticket_group`` is registered once only; a second run would recompute
# exactly the same ``ticket_group`` column.
preprocessing_pipeline = (('add_ticket_group', get_ticket_group, {}),
                          ('add_family_size', get_family_size, {}),
                          ('add_titles', get_titles, {}),
                          ('convert_fares', discretize_faries, {'ngroups': 3}),
                          # Missing ports are filled with the most frequent value.
                          ('fill_embarked', fill_na_simple, {'colnames': ('Embarked',),
                                                             'methods': (lambda x: pd.Series(x).mode()[0],)}),
                          ('add_family_name', get_family_name, {}),
                          ('add_ages', estimate_age, {}),
                          ('convert_ages', discretize_ages, {'ngroups': 5}),
                          ('add_cabin_groups', get_cabin_groups, {}),
                          ('combine_titles', combine_titles, {}),
                          ('sex_encoder', simple_encoder, {}),
                          # Drop the target, identifiers and raw columns that
                          # were replaced by engineered features.
                          ('drop_columns', drop_columns, {'colnames': ('Survived',
                                                                       'PassengerId',
                                                                       'SibSp',
                                                                       'Parch',
                                                                       'Ticket',
                                                                       'Fare',
                                                                       'family_name',
                                                                       'Name',
                                                                       'Cabin',
                                                                       )}),
                          ('get_le', label_encode, {'colnames': ('ticket_group', 'cabin_group',
                                                                 'title', 'Embarked')})
                         )

def process(pipeline, data):
    """Run ``data`` through every step of ``pipeline`` and return the result.

    ``pipeline`` is an iterable of ``(name, func, kwargs)`` triples. Starting
    from a copy of ``data``, each ``func`` is called with the current value
    and its ``kwargs``; the return value feeds the next step. A banner with
    the step name is printed before each step and a separator line after it.
    """
    separator = "=" * 40
    current = data.copy()
    for step in pipeline:
        step_name, transform, options = step
        print("=========== Step: %s ===========" % step_name)
        current = transform(current, **options)
        print(separator)
    return current


# Preprocessing steps (feature engineering)

In [None]:
# Apply every preprocessing step to the stacked train+test frame.
processed  = process(preprocessing_pipeline, combined)

In [None]:
# Quick visual check of the engineered feature matrix.
processed.head()

In [None]:
# Estimator tuned by the randomized search below (seeded for reproducibility).
clf = GradientBoostingClassifier(random_state=42)
#clf = SVC(class_weight='balanced', cache_size=8000)


# Training features: the first train.shape[0] rows of the processed frame.
X = processed.iloc[:train.shape[0]].values

# Search spaces per model family; only parameters_GB is used below.
# 'sqrt' is used instead of 'auto': for these classifiers 'auto' meant
# sqrt(n_features) and is no longer accepted by current scikit-learn.
parameters_RF = {'n_estimators': [20,60,80,100,120,200,300,800],
              'max_depth': (None, 2, 3, 5, 7, 10, 15),
              'criterion': ('gini', 'entropy'),
              'max_features': ('sqrt', 'log2', None),
              'oob_score': (True, False)
             }


parameters_SVC = {'kernel': ['linear', ],
                  'C': randint(1, 1000),
                  'shrinking': [True, False],
                 }

# scipy's uniform(loc, scale) samples from [loc, loc + scale], so subsample is
# drawn from [0.7, 1.0] and learning_rate from [0.0001, 0.3001].
parameters_GB = {'n_estimators': randint(20, 300),
                 'max_depth': randint(3, 20),
                 'subsample': uniform(0.7, 0.3),
                 'learning_rate': uniform(0.0001, 0.3),
                 'max_features': ('sqrt', 'log2', None),
                 'min_samples_leaf': randint(3, 10),
                 'min_samples_split' : randint(2, 10)
                }

# Target: survival labels of the training passengers.
y = train.Survived.values

# Randomized search over parameters_GB, scored by F1 with 3-fold CV.
# random_state makes the sampled candidates repeatable, matching the seeded
# estimator. Warnings raised during fitting are silenced for readability.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    clfcv = RandomizedSearchCV(clf, param_distributions=parameters_GB, n_iter=1000, scoring='f1',
                               cv=3, verbose=1, n_jobs=1, random_state=42)
    clfcv.fit(X, y)

In [None]:
# 5-fold F1 of the best estimator found by the search.
# NOTE(review): X, y are the same data the search was tuned on, so this
# estimate is likely optimistic.
cross_val_score(clfcv.best_estimator_, X, y, cv=5, scoring='f1')

In [None]:
# Best cross-validated score reached during the randomized search.
clfcv.best_score_

In [None]:
# Predict survival for the test passengers, i.e. the rows of `processed`
# after the first train.shape[0] rows, and pair them with their PassengerId.
result_df = pd.DataFrame({'PassengerId':test.PassengerId, 'Survived': clfcv.best_estimator_.predict(processed.iloc[train.shape[0]:].values)})

In [None]:
# Write the predictions to output.csv without the DataFrame index column.
result_df.to_csv('output.csv', index=False)

# Simple Neural Network 

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras import backend as K
from keras.utils import plot_model
from sklearn.model_selection import train_test_split
from sklearn import datasets
# NOTE(review): iris is loaded but not used anywhere in the visible code.
iris = datasets.load_iris()


# Training hyper-parameters and input width (one input per processed column).
BATCH_SIZE = 80
NUM_CLASSES = 2
INPUT_SHAPE = processed.shape[1]
EPOCHS = 2000

# ------------- Building the model ----------------
# Fully connected stack; the last layer has NUM_CLASSES-1 = 1 sigmoid unit,
# i.e. a single output for the binary Survived target.
model = Sequential()
model.add(Dense(20, input_shape=(INPUT_SHAPE,), activation='sigmoid'))
model.add(Dropout(0.1))
model.add(Dense(10, activation='relu'))
model.add(Dense(7, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(NUM_CLASSES-1, activation='sigmoid'))
# -------------------------------------------------


# ---- Building the training and test data --------
# Features are the training rows of `processed`; 10% is held out, stratified
# on the label so both parts keep the same class balance.
X = processed.iloc[:train.shape[0]].values
Y = train.Survived.values
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42, stratify=Y)

# One-hot encoding is disabled: the single sigmoid output takes 0/1 labels.
#y_train = keras.utils.to_categorical(y_train, 3)
#y_test = keras.utils.to_categorical(y_test, 3)
# -------------------------------------------------

#y_train = keras.utils.to_categorical(y_train, 3)
#y_test = keras.utils.to_categorical(y_test, 3)
# -------------------------------------------------



In [None]:
# Print the layer-by-layer overview of the network defined above.
model.summary()

In [None]:
# ----------- Fitting the model -------------------
# Adagrad optimizer with mean-squared-error loss, tracking accuracy.
# validation_split=0.4 asks Keras to hold out part of x_train for validation;
# NOTE(review): x_test / y_test from the earlier split are not used in the
# visible code -- confirm whether a final evaluation step is missing.
model.compile(keras.optimizers.Adagrad(), loss=keras.losses.mse, metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          verbose=0,
          validation_split=0.4)
# -------------------------------------------------