# Pipelining with Titanic Data

### Data loading ...

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import re
# Read train.csv and test.csv from the local ./data directory.
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
# Stack both frames into one and discard the original row labels so the
# combined frame gets a fresh 0..n-1 index (sort=False: do not sort columns).
combined = pd.concat([train, test], sort=False).reset_index(drop=True)

#### Some preprocessing of the data

In [None]:
def parse_name(s):
    """Split a passenger name into ``(first_name, title, family_name)``.

    Names are expected in the form ``"Family, Title. Given names"``, for
    example ``"Braund, Mr. Owen Harris"``.

    Parameters
    ----------
    s : str
        Raw name string.

    Returns
    -------
    tuple of str
        ``(first_name, title, family_name)``. ``first_name`` is the first
        whitespace-separated token found between the first and second period,
        with parentheses removed; it is ``''`` when nothing follows the title.

    Raises
    ------
    ValueError
        If ``s`` contains no comma.
    """
    # Split on the first comma only, so additional commas later in the name
    # cannot break the two-way unpacking.
    family_part, remainder = s.split(',', 1)
    family_name = family_part.strip()

    dot_parts = remainder.split('.')
    title = dot_parts[0].strip()

    # The given names may be missing entirely (e.g. "Smith, Dr."); fall back
    # to an empty first name instead of raising IndexError.
    given_names = dot_parts[1].split() if len(dot_parts) > 1 else []
    first_name = given_names[0] if given_names else ''
    return (first_name.replace('(', '').replace(')', ''), title, family_name)

def parse_cabin_letter(column):
    """Extract the first letter of every cabin entry in ``column``.

    Parameters
    ----------
    column : pandas.Series
        Cabin values; anything exposing ``.values.tolist()`` works.

    Returns
    -------
    list
        One item per input element: the first ASCII letter found in the
        value, or NaN when the value is missing or contains no letter.
    """
    letter_pat = re.compile(r'([A-Za-z])\d?')
    nan = float('nan')

    letters = []
    for value in column.values.tolist():
        # Missing entries must be handled before str(): str(nan) is 'nan',
        # which would otherwise be reported as cabin letter 'n'.
        if pd.isnull(value):
            letters.append(nan)
            continue
        match = letter_pat.search(str(value))
        letters.append(match.group(1) if match else nan)
    return letters

def parse_ticket_number(column):
    number_pat = re.compile('\d{3,}')
    numbers = map(lambda x: number_pat.findall(x)[0] if number_pat.findall(x) else pd.np.nan, column)
    return pd.Series(numbers)


def get_friendship_group(df):
    """Assign group ids to passengers based on family name (unfinished).

    NOTE(review): this function looks incomplete. It only appends an id to
    ``ticket_grouping`` for family names that occur exactly once, it has no
    ``return`` statement (so callers receive None), and ``cabins`` and
    ``family_mask`` are computed but never used.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``Cabin`` column and either a ``family_name`` column or a
        ``Name`` column that ``parse_name`` can parse.
    """
    # Next unused group id.
    friendship_group_counter = 0
    # Reuse a precomputed family_name column when present; otherwise derive
    # the family name (last element of parse_name's tuple) from Name.
    if 'family_name' not in df.columns:
        family_names = pd.Series(map(lambda x: parse_name(x)[-1], df.Name))
    else:
        family_names = df.family_name
    # NOTE(review): parse_cabin_letter is mapped over individual Cabin values
    # here, but it accesses `column.values` on its argument, so this likely
    # raises AttributeError for scalar entries -- confirm whether
    # parse_cabin_letter(df.Cabin) was intended.
    cabins = pd.Series(map(parse_cabin_letter, df.Cabin))
    ticket_grouping = []
    # Iterate over each distinct family name with its number of occurrences.
    for family, count in family_names.value_counts().items():
        family_mask = family_names == family
        
        # A family name seen once gets a group id of its own.
        if count == 1:
            ticket_grouping.append(friendship_group_counter)
            friendship_group_counter += 1
            continue
    
def get_ticket_group(df):
    """Return a copy of ``df`` with a ``ticket_group`` column added.

    Rows whose ``Ticket`` values share the same ticket number (as extracted
    by ``parse_ticket_number``) receive the same label: the first ``Ticket``
    string found for that number. Rows without a parsable ticket number get
    NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Ticket`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` including the new ``ticket_group`` column.
    """
    # Attach the parsed numbers positionally to df's own index so the
    # grouping lines up row by row even when df has a non-default index.
    ticket_numbers = pd.Series(parse_ticket_number(df.Ticket).values,
                               index=df.index)
    # One representative ticket string per ticket number, indexed by number.
    representatives = df.Ticket.groupby(ticket_numbers).first()

    df_ = df.copy()
    # Map every row's number back to its representative. Assigning the
    # per-group Series directly would align on the ticket-number index and
    # leave the whole column NaN. `.values` makes the assignment positional.
    df_.loc[:, 'ticket_group'] = ticket_numbers.map(representatives).values
    return df_

def get_cabin_letter(df):
    """Return a copy of ``df`` extended with two cabin-derived columns.

    ``cabin_na`` is True where ``Cabin`` is null, and ``cabin`` holds the
    per-row output of ``parse_cabin_letter`` for the ``Cabin`` column. The
    input frame itself is left untouched.
    """
    cabin_column = df['Cabin']
    cabin_letters = parse_cabin_letter(cabin_column)

    result = df.copy()
    result.loc[:, 'cabin_na'] = cabin_column.isnull()
    result.loc[:, 'cabin'] = cabin_letters
    return result

def get_is_alone(df):
    """Return a copy of ``df`` with a boolean ``is_alone`` column.

    A row is flagged True when ``Parch + SibSp + 1`` equals 1, i.e. when the
    party size computed from those two columns is exactly one.
    """
    party_size = df['Parch'] + df['SibSp'] + 1

    result = df.copy()
    result.loc[:, 'is_alone'] = party_size == 1
    return result

def get_titles(df):
    """Return a copy of ``df`` with a ``title`` column parsed from ``Name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Name`` column whose entries ``parse_name`` can parse.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where ``title`` holds the second element of
        ``parse_name(name)`` for each row, or NaN when the name is missing.
    """
    nan = float('nan')
    # Build a plain list so the assignment below is positional. Assigning a
    # freshly built Series would align on a 0..n-1 index and misplace or
    # drop titles whenever df carries a different index.
    titles = [nan if pd.isnull(name) else parse_name(name)[1]
              for name in df['Name']]

    df_ = df.copy()
    df_.loc[:, 'title'] = titles
    return df_

def discretize_faries(df, ngroups=3):
    """Return a copy of ``df`` with ``Fare`` binned into a ``fares`` column.

    ``Fare`` is cut into ``ngroups`` equal-width intervals with ``pd.cut``;
    ``fares`` stores the integer bin number of each row rather than an
    interval label.
    """
    result = df.copy()
    fare_bins = pd.cut(result['Fare'], bins=ngroups, labels=False)
    result.loc[:, 'fares'] = fare_bins
    return result


## Building pipelines

In [None]:
from mlpipes.pfunc import *

# Ordered preprocessing steps, later passed to process(). Each entry is
# (step name, callable taking the data first, keyword arguments for it).
# NOTE(review): drop_columns is not defined in this file; presumably it is
# provided by the `mlpipes.pfunc` star import -- confirm.
preprocessing_pipeline = (('add_groups', get_ticket_group, {}),
                          ('add_cabins', get_cabin_letter, {}),
                          ('add_isalone', get_is_alone, {}),
                          ('add_titles', get_titles, {}),
                          ('convert_fares', discretize_faries, {'ngroups': 3}),
                          ('drop_columns', drop_columns, {'colnames': ('Survived',
                                                                      'PassengerId',
                                                                      'SibSp',
                                                                      'Parch',
                                                                      'Ticket',
                                                                      'Fare',
                                                                     )})
                         )

def process(pipeline, data):
    """Run ``data`` through every step of ``pipeline`` and return the result.

    ``pipeline`` is an iterable of ``(name, func, kwargs)`` triples. Starting
    from ``data.copy()``, each ``func`` is called with the current value and
    its keyword arguments, and its return value feeds the next step. A
    banner with the step name is printed before each step and a separator
    line after it.
    """
    separator = "=" * 40
    current = data.copy()
    for step in pipeline:
        step_name, transform, options = step
        print("=========== Step: %s ===========" % step_name)
        current = transform(current, **options)
        print(separator)
    return current


# Preprocessing steps (feature engineering)

In [None]:
# Run every preprocessing step on the combined frame. The result is not
# stored in a variable.
process(preprocessing_pipeline, combined)

In [None]:
# Count the rows of `combined` whose computed ticket_group value is null.
get_ticket_group(combined).loc[:, 'ticket_group'].isnull().sum()

In [None]:
# Group the raw Ticket strings by the number parsed out of each ticket.
grouped = combined.Ticket.groupby(parse_ticket_number(combined.Ticket))

In [None]:
# Take the first Ticket string of every group.
groups = grouped.apply(lambda x: x.iloc[0])

In [None]:
# Inspect the index of the per-group result.
groups.index