# Pipelining with Titanic Data

### Data loading ...

In [5]:
import matplotlib.pyplot as plt
import pandas as pd
import re
# Load both Kaggle splits and stack them (train rows first) into a single
# frame whose index is renumbered from 0.
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
combined = pd.concat((train, test), ignore_index=True)

#### Some preprocessing of the data

In [3]:
def parse_name(s):
    """Split a passenger name of the form ``"Family, Title. First ..."``.

    Args:
        s: Raw name string, e.g. ``"Braund, Mr. Owen Harris"``.

    Returns:
        Tuple ``(first_name, title, family_name)``. ``first_name`` is the first
        whitespace-separated token between the first and second period, with
        parentheses removed; it is ``''`` when nothing follows the title.

    Raises:
        ValueError: If ``s`` contains no comma.
    """
    # Split on the first comma only, so a later comma (e.g. inside a
    # parenthesised alias) does not break the two-way unpacking.
    family_part, rest = s.split(',', 1)
    segments = rest.split('.')
    title = segments[0].strip()
    # Names that stop right after the title ("Doe, Mr.") or carry no period
    # at all have no first-name token; return '' instead of raising IndexError.
    tokens = segments[1].split() if len(segments) > 1 else []
    first_name = tokens[0] if tokens else ''
    return (first_name.replace('(', '').replace(')', ''), title, family_part.strip())

def parse_cabin_letter(column):
    """Extract the first letter of every cabin entry.

    Args:
        column: Iterable of cabin values (typically ``df.Cabin``). Entries may
            be strings or missing values.

    Returns:
        A list with one element per input entry: the first ASCII letter found
        in the entry, or NaN when the entry is not a string or has no letter.
    """
    letter_pat = re.compile(r'([A-Za-z])\d?')
    missing = float('nan')

    def first_letter(value):
        # Missing cabins arrive as NaN/None; the regex would raise TypeError
        # on them, so map every non-string straight to NaN.
        if not isinstance(value, str):
            return missing
        match = letter_pat.search(value)
        return match.group(1) if match else missing

    return [first_letter(value) for value in column]

def parse_ticket_number(column):
    """Extract the first run of three or more digits from every ticket.

    Args:
        column: Iterable of ticket values (typically ``df.Ticket``). Entries
            may be strings or missing values.

    Returns:
        pandas.Series with one element per input entry: the digit run as a
        string, or NaN when the entry is not a string or has no such run.
        When ``column`` is a Series its index is kept, so the result lines up
        with the rows it was computed from; otherwise the index is 0..n-1.
    """
    number_pat = re.compile(r'\d{3,}')
    missing = float('nan')

    def first_number(value):
        # Non-strings (NaN/None) cannot be searched by the regex.
        if not isinstance(value, str):
            return missing
        match = number_pat.search(value)
        return match.group(0) if match else missing

    # Only a Series has a row index; a plain list exposes `.index` as a method.
    index = column.index if isinstance(column, pd.Series) else None
    return pd.Series([first_number(value) for value in column], index=index)


def get_friendship_group(df):
    """Start assigning group ids to passengers by family name (unfinished).

    As written, the function only appends one fresh id to a local list for
    every family name that occurs exactly once, then falls off the end, so
    callers receive ``None``.

    Args:
        df: DataFrame with a ``Cabin`` column and either a ``family_name``
            column or a ``Name`` column that ``parse_name`` can split.

    NOTE(review): families with more than one member are never handled and
    ``ticket_grouping`` is never returned — this looks like work in progress.
    """
    friendship_group_counter = 0
    # Reuse an existing family_name column when there is one; otherwise derive
    # the family name (last element of parse_name's tuple) from Name.
    if 'family_name' not in df.columns:
        family_names = pd.Series(map(lambda x: parse_name(x)[-1], df.Name))
    else:
        family_names = df.family_name
    # NOTE(review): parse_cabin_letter is mapped over individual cabin values
    # here, whereas get_cabin_letter passes it the whole column — confirm which
    # call shape is intended. `cabins` is not used below.
    cabins = pd.Series(map(parse_cabin_letter, df.Cabin))
    ticket_grouping = []
    for family, count in family_names.value_counts().items():
        # Rows belonging to this family; computed but not used yet.
        family_mask = family_names == family
        
        # A family name seen once gets its own new group id.
        if count == 1:
            ticket_grouping.append(friendship_group_counter)
            friendship_group_counter += 1
            continue
    
def get_ticket_group(df):
    """Add a ``ticket_group`` column shared by rows with the same ticket number.

    Rows are keyed by the value ``parse_ticket_number`` extracts from their
    ``Ticket``. Every row receives the first ``Ticket`` string seen for its
    key, so passengers travelling on the same ticket number end up with the
    same ``ticket_group`` value. Rows without a parsable number get NaN.

    Args:
        df: DataFrame with a ``Ticket`` column.

    Returns:
        A copy of ``df`` with the extra ``ticket_group`` column.
    """
    # Key on df's own tickets (not on a global frame). Taking `.values` and
    # re-indexing on df.index keeps the keys aligned with df's rows whatever
    # index the helper's result carries.
    keys = pd.Series(parse_ticket_number(df.Ticket).values, index=df.index)
    # One representative ticket per key, indexed by key ...
    representatives = df.Ticket.groupby(keys).first()
    df_ = df.copy()
    # ... mapped back to one value per row, so the assignment is row-aligned.
    df_['ticket_group'] = keys.map(representatives)
    return df_

def get_cabin_letter(df):
    """Add the cabin letter and a missing-cabin flag.

    Args:
        df: DataFrame with a ``Cabin`` column.

    Returns:
        A copy of ``df`` with two extra columns: ``cabin`` (first letter of
        the cabin entry, NaN when there is none) and ``cabin_na`` (True where
        ``cabin`` is NaN).
    """
    df_ = df.copy()
    # parse_cabin_letter returns a plain list, which has no .isna(); wrap it in
    # a Series on df's index so the flag can be computed and rows stay aligned.
    # Missing cabins are passed as '' — an entry without a letter comes back
    # as NaN — so the helper only ever sees strings.
    cabins = pd.Series(parse_cabin_letter(df.Cabin.fillna('')), index=df.index)
    df_['cabin_na'] = cabins.isna()
    df_['cabin'] = cabins
    return df_

def get_is_alone(df):
    """Add an ``is_alone`` flag for passengers travelling without relatives.

    Family size is ``Parch + SibSp + 1`` (the passenger included). The flag is
    1 when that size is exactly 1 and 0 otherwise; the raw family size itself
    is not stored under this name. Rows where ``Parch`` or ``SibSp`` is
    missing get 0.

    Args:
        df: DataFrame with ``Parch`` and ``SibSp`` columns.

    Returns:
        A copy of ``df`` with the extra integer column ``is_alone``.
    """
    df_ = df.copy()
    family_size = df['Parch'] + df['SibSp'] + 1
    df_['is_alone'] = (family_size == 1).astype(int)
    return df_

def get_titles(df):
    """Add a ``title`` column parsed from ``Name``.

    Args:
        df: DataFrame with a ``Name`` column whose values ``parse_name`` can
            split.

    Returns:
        A copy of ``df`` with the extra ``title`` column (second element of
        ``parse_name``'s result).
    """
    df_ = df.copy()
    # Series.map keeps df's index, so titles stay attached to their rows even
    # when df is a filtered or re-indexed frame.
    df_['title'] = df['Name'].map(lambda name: parse_name(name)[1])
    return df_

def discretize_faries(df, ngroups=3):
    """Add a ``fares`` column holding the equal-width bin of each ``Fare``.

    Args:
        df: DataFrame with a numeric ``Fare`` column.
        ngroups: Number of equal-width bins passed to ``pandas.cut``.

    Returns:
        A copy of ``df`` with the extra ``fares`` column of bin codes
        (0 .. ngroups-1; NaN where ``Fare`` is missing).
    """
    df_ = df.copy()
    # Select the column: `.loc['Fare']` would look up a *row* labelled 'Fare'.
    df_['fares'] = pd.cut(df_['Fare'], ngroups, labels=False)
    return df_


In [30]:
# Cut Fare into 4 bins and count the rows per bin code; the output below
# shows almost every row landing in bin 0.
pd.cut(combined.Fare, bins=4, labels=False).value_counts()

0.0    1241
1.0      50
2.0      13
3.0       4
Name: Fare, dtype: int64

## Building pipelines

In [None]:
from mlpipes.pfunc import *


# Ordered preprocessing steps as (step name, function, keyword arguments).
# The keyword arguments are a plain dict in third position: writing
# `kwargs={}` inside a tuple literal is a SyntaxError.
# NOTE(review): the (name, function, kwargs) layout is an assumption — confirm
# it against what mlpipes.pfunc expects.
preprocessing_pipeline = (
    ('add_groups', get_ticket_group, {}),
    ('add_cabins', get_cabin_letter, {}),
    ('add_isalone', get_is_alone, {}),
    ('add_titles', get_titles, {}),
    ('convert_fares', discretize_faries, {'ngroups': 3}),
)



    


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from IPython.core.display import display, HTML

class AbstractPreprocessor(BaseEstimator, TransformerMixin):
    """Base class for the DataFrame preprocessing steps defined in this file.

    Subclasses supply ``transform``; ``fit`` here only validates the input.
    """

    def fit(self, X, y=None):
        """Check that ``X`` is a DataFrame; nothing is learned.

        Args:
            X: Input data.
            y: Ignored.

        Returns:
            ``self``.

        Raises:
            TypeError: If ``X`` is not a ``pandas.DataFrame``. (``TypeError``
                is an ``Exception`` subclass, so ``except Exception`` handlers
                still catch it.)
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input data should be a DataFrame instance")
        return self

class SelectFeatures(AbstractPreprocessor):
    """Keep the columns selected by the binary digits of an integer.

    ``k`` is written in binary and left-padded with zeros to ``n`` digits.
    Column ``j`` (counting from the left, starting at 0) is kept when digit
    ``j`` is 1.
    """

    def __init__(self, k, n):
        self.n = n
        self.k = k

    def transform(self, X, y=None):
        """Return the columns of ``X`` whose mask digit is set."""
        digits = list(map(int, bin(self.k)[2:]))
        padding = [0] * (self.n - len(digits))
        flags = padding + digits
        keep = [pos for pos in range(self.n) if flags[pos]]
        return X.iloc[:, keep]
  
        

class GetCategoriesAndDrop(AbstractPreprocessor):
    """Replace a numeric column by its quantile-bin codes.

    Args:
        name: Column to discretise.
        bins: ``q`` argument for ``pandas.qcut`` (number of quantiles or a
            list of quantile edges). ``None`` disables the step.
    """

    def __init__(self, name=None, bins=None):
        self.name = name
        self.bins = bins

    def transform(self, X, y=None):
        """Return ``X`` with column ``name`` replaced by its quantile codes.

        ``X`` is returned unchanged when ``bins`` is ``None`` or ``name`` is
        not one of its columns. Otherwise the result is a new frame in which
        the binned column has been moved to the last position.
        """
        # Test for the column explicitly. `hasattr(X, name)` is also true for
        # DataFrame attributes such as 'shape', false for column names that
        # are not valid identifiers, and raises TypeError for name=None.
        if self.bins is None or self.name not in X.columns:
            return X
        binned = pd.qcut(X[self.name], q=self.bins, labels=False)
        _X = X.drop(self.name, axis=1)
        _X[self.name] = binned
        return _X
   
    
from sklearn.ensemble import RandomForestRegressor
class FillNaValues(AbstractPreprocessor):
    """Fill the missing values of one column with predictions from a model.

    The model is trained, inside ``transform``, on the rows where the target
    column is present, using all other columns as features: non-numeric
    columns are label-encoded (missing values become the label ``'N-a-N'``),
    numeric columns with gaps are median-filled and get a ``<col>_nan``
    indicator column. If the estimator exposes ``feature_importances_`` it is
    refitted on the most important features only.

    Args:
        name: Column whose missing values are predicted. ``None`` disables
            the step.
        train: Optional DataFrame to train on. When ``None`` the frame passed
            to ``transform`` is used.
        n_features: Number of top-ranked features kept for the refit. When
            not given, 30% of the number of columns of ``X`` (rounded up).
        clf: Estimator with ``fit``/``predict``. Defaults to a new
            ``RandomForestRegressor`` per instance.
    """

    def __init__(self, name=None, train=None, n_features=None, clf=None):
        self.train = train
        self.name = name
        # A default written as `clf=RandomForestRegressor()` is evaluated once
        # and the same estimator (fitted state included) would be shared by
        # every instance; build a fresh one per instance instead.
        self.clf = RandomForestRegressor() if clf is None else clf
        self.n_features = n_features

    @staticmethod
    def _is_numeric(series):
        """Return True for number-typed columns; booleans count as categorical."""
        dtype = series.dtype
        return (pd.api.types.is_numeric_dtype(dtype)
                and not pd.api.types.is_bool_dtype(dtype))

    def transform(self, X, y=None):
        """Return ``X`` with the gaps of column ``name`` filled by predictions.

        ``X`` itself is returned (not a copy) when ``name`` is ``None`` or
        the column has no missing values. ``y`` is ignored.
        """
        import math  # function scope: the file has no top-level `math` import

        if self.name is None:
            return X

        # Rows of X that need a prediction. Kept separate from the training
        # mask, because `train` may be a different frame than X.
        target_missing = X[self.name].isnull()
        if not target_missing.any():
            return X

        _train = self.train.copy() if self.train is not None else X.copy()
        train_missing = _train[self.name].isnull()
        target = _train.loc[~train_missing, self.name]
        _train = _train.drop(self.name, axis=1)

        # An explicit n_features wins; the 30% rule is only the fallback.
        n_features = int(self.n_features or math.ceil(X.shape[1] * 0.3))

        # --- encode the training features --------------------------------
        encoders = {}
        medians = {}
        for key in _train.columns.tolist():
            column = _train[key]
            if self._is_numeric(column):
                gaps = column.isnull()
                # Remember the median so X is imputed with the same value.
                medians[key] = column[~gaps].median()
                if gaps.any():
                    _train['%s_nan' % key] = gaps.astype(float)
                    _train.loc[gaps, key] = medians[key]
            else:
                _train.loc[column.isnull(), key] = 'N-a-N'
                le = LabelEncoder()
                _train[key] = le.fit_transform(_train[key])
                encoders[key] = le

        self.clf.fit(_train[~train_missing], target)

        # Keep only the n_features most important columns and retrain.
        if hasattr(self.clf, 'feature_importances_'):
            ranking = self.clf.feature_importances_.argsort()[::-1]
            features_to_drop = _train.columns[ranking].tolist()[n_features:]
            self.clf.fit(_train.drop(features_to_drop, axis=1)[~train_missing], target)
        else:
            features_to_drop = []

        # --- bring X into the same shape as the training features --------
        _X = X.copy()
        for key in _train.columns:
            if key not in _X.columns:
                # Columns only known from training (e.g. the indicators).
                _X[key] = 0.0
        _X = _X[_train.columns].copy()

        for key, encoder in encoders.items():
            if self._is_numeric(_X[key]):
                # Categorical in training but numeric here (e.g. the 0.0
                # placeholder): nothing to encode, just close any gaps.
                gaps = _X[key].isnull()
                if gaps.any():
                    _X.loc[gaps, key] = _X.loc[~gaps, key].median()
                continue
            _X.loc[_X[key].isnull(), key] = 'N-a-N'
            _X[key] = encoder.transform(_X[key])

        # Impute numeric gaps exactly as during training; otherwise NaNs
        # would reach predict() with their indicator still at 0.
        for key, median in medians.items():
            gaps = _X[key].isnull()
            if not gaps.any():
                continue
            indicator = '%s_nan' % key
            if indicator in _X.columns:
                _X.loc[gaps, indicator] = 1.0
            _X.loc[gaps, key] = median

        na_replacements = self.clf.predict(
            _X.drop(features_to_drop, axis=1)[target_missing])
        result = X.copy()
        result.loc[target_missing, self.name] = na_replacements
        return result




# Preprocessing steps (feature engineering)

In [None]:
from sklearn.pipeline import Pipeline
# Ordered (step name, transformer) pairs for the sklearn Pipeline built below.
# Commented-out entries are alternative steps that are currently disabled.
# NOTE(review): DropColumns, AddTitleColumn, AddFamilySize, LabelEncodeAndDrop,
# FillNaSimple, CombineCategoricalValues and ShowDataHead are not defined in
# the visible part of this file — confirm where they come from.
# NOTE(review): 'Survived' is in the first step's `names` list; presumably the
# label column is removed from the features here — confirm it is kept elsewhere.
preprocessing_steps = [('drop_columns', DropColumns(names=['Ticket', 'PassengerId', 'Cabin', 'Survived'])),
                       ('add_title', AddTitleColumn()),
                       # ('add_first_name', AddFirstNameColumn()),
                       #('add_family_name', AddFamilyNameColumn()),
                       ('add_family_size', AddFamilySize()),
                       ('encode_sex', LabelEncodeAndDrop(name='Sex')),
                       ('fillna_embarked', FillNaSimple(name='Embarked')),
                       #('combine_embarked', CombineCategoricalValues(name='Embarked', rule={'what': ['C', 'Q'], 'to': 'Q'})),
                       ('encode_embarked', LabelEncodeAndDrop(name='Embarked')),
                       
                       # predicting nan-values
                       ('predict_fare', FillNaSimple(name='Fare')),
                       ('predict_age', FillNaSimple(name='Age')),
                    #  ('cat_ages', GetCategoriesAndDrop(name='Age', bins=[0, 0.3, 0.6, 1.0])),
                    #  ('cat_fares', GetCategoriesAndDrop(name='Fare', bins=[0, 0.3, 0.6, 1.0])),
                       
                       # combine & encode titles
                       #('combine_title_Mrs', CombineCategoricalValues(name='title', rule={'what': ['Dona', 'Mme', 'the Countess'], 'to': 'Mrs'})),
                       #('combine_title_Mr', CombineCategoricalValues(name='title', rule={'what': ['Jonkheer', 'Major', 'Col', 'Rev', 'Mr'], 'to': 'Mr1'})),
                       #('combine_title_Mr', CombineCategoricalValues(name='title', rule={'what': ['Master', 'Sir', 'Don', 'Dr'], 'to': 'Mr2'})),
                       #('combine_title_Miss', CombineCategoricalValues(name='title', rule={'what': ['Mlle','Lady','Ms'], 'to': 'Miss'})),
                       ('combine_title_rare', CombineCategoricalValues(name='title', rule={'what': ['Rev', 'Dr', 'Col', 'Ms', 'Mlle', 'Major', 'Sir', 'Mme','Lady','Capt','the Countess','Jonkheer','Don','Dona'], 'to': 'rare'})),
                       ('encode_title', LabelEncodeAndDrop(name='title')),
                       
                       #
                       #('add_is_alone', AddIsAlone()),
                       
                       # last one-hots... 
                       #('encode_fare', OneHotEncodeAndDrop(name='Fare')),
                       #('encode_age', OneHotEncodeAndDrop(name='Age')),
                       #('encode_Pclass', OneHotEncodeAndDrop(name='Pclass')),
                       #('combine_fsize', CombineCategoricalValues(name='family_size', rule={'what': ['5','6', '7','8','9','10','11'], 'to': '5'})),
                       #('encode_fsize', OneHotEncodeAndDrop(name='family_size')),
                       
                       #drop first name, family name
                       ('drop_last', DropColumns(names=['first_name','family_name', 'Name'])),
                       ('show_data', ShowDataHead())
                      ]

# Rebinds `preprocessing_pipeline` (an earlier cell assigns the same name) to
# an sklearn Pipeline, then fits it on the combined train+test frame and
# transforms that frame in one call.
preprocessing_pipeline = Pipeline(steps=preprocessing_steps)
combined_processed = preprocessing_pipeline.fit_transform(combined)