# Pipelining with Titanic Data

### Data loading ...

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Read the training and test CSV files from the local ./data directory.
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

In [None]:
# Show the first rows of the training frame.
train.head()

In [None]:
# Show the first rows of the test frame.
test.head()

#### Some preprocessing of the data

It is unlikely that `PassengerId` or the `Ticket` number affects passenger survival, so let's drop these columns at the beginning.

In [None]:
# Stack train on top of test and renumber the rows 0..n-1 so both splits
# can be pushed through the same preprocessing steps.
combined = pd.concat([train, test], ignore_index=True)

In [None]:
# Show the (rows, columns) of the stacked train + test frame.
combined.shape


## Utility functions and classes

In [None]:
def parse_name(s):
    '''
    Split a passenger name of the form "Family, Title. First ...".

    Parameters
    ----------
    s : str
        Raw name, e.g. "Braund, Mr. Owen Harris".

    Returns
    -------
    tuple of str
        (first_name, title, family_name). The first name is the first
        whitespace-separated token between the first and second '.' after
        the comma; it is '' when nothing follows the title.

    Raises
    ------
    ValueError
        If `s` contains no comma.
    '''
    # Split on the first comma only, so a comma later in the name cannot
    # break the two-way unpacking.
    family_part, rest = s.split(',', 1)
    pieces = rest.split('.')
    title = pieces[0].strip()
    # A name may stop right after the title ("Smith, Mr."): return an empty
    # first name instead of failing with IndexError.
    tokens = pieces[1].split() if len(pieces) > 1 else []
    first_name = tokens[0] if tokens else ''
    return (first_name, title, family_part.strip())


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from IPython.core.display import display, HTML

class AbstractPreprocessor(BaseEstimator, TransformerMixin):
    '''
    Base class for the DataFrame preprocessing steps defined in this file.

    It only supplies a `fit` that validates the input type; subclasses are
    expected to provide `transform`.
    '''

    def fit(self, X, y=None):
        '''
        Check that `X` is a DataFrame and return `self`; `y` is ignored.

        Raises
        ------
        TypeError
            If `X` is not a `pandas.DataFrame`.
        '''
        if not isinstance(X, pd.DataFrame):
            # TypeError is a subclass of Exception, so existing
            # `except Exception` handlers keep working.
            raise TypeError("Input data should be a DataFrame instance")
        return self


class DropColumns(AbstractPreprocessor):
    '''
    Drops specified columns from a DataFrame.

    Input data assumed to be a DataFrame. Names that are not present in the
    frame are ignored.

    Parameters
    ----------
    names : list of str, str or None
        Column name(s) to drop. `None` (the default) drops nothing; a single
        string is treated as one column name.

    Usage
    -----
        dropper = DropColumns(names=['PassengerId', 'Ticket'])
        dropped_df = dropper.fit_transform(source_dataframe)
    '''

    def __init__(self, names=None):
        # Stored untouched; normalisation happens in `transform`.
        self.names = names

    def transform(self, X, y=None):
        '''Return a copy of `X` without the configured columns.'''
        names = self.names
        if names is None:
            names = []
        elif isinstance(names, str):
            # Iterating a bare string would test each character as a column
            # name and silently drop nothing.
            names = [names]
        present = [col for col in names if col in X.columns]
        return X.drop(present, axis=1)

class AddFirstNameColumn(AbstractPreprocessor):
    '''
    Adds a `first_name` column built from the `Name` column.

    The first element returned by `parse_name` is used, with any '(' and ')'
    characters removed.
    '''

    def transform(self, X, y=None):
        def extract(raw_name):
            first = parse_name(raw_name)[0]
            return first.replace('(', '').replace(')', '')

        out = X.copy()
        out['first_name'] = X['Name'].apply(extract)
        return out

class AddFamilyNameColumn(AbstractPreprocessor):
    '''
    Adds a `family_name` column: the last element returned by `parse_name`
    for each value of the `Name` column.
    '''

    def transform(self, X, y=None):
        def extract(raw_name):
            return parse_name(raw_name)[-1]

        out = X.copy()
        out['family_name'] = X['Name'].apply(extract)
        return out
                
class AddTitleColumn(AbstractPreprocessor):
    '''
    Adds a `title` column: the second element returned by `parse_name` for
    each value of the `Name` column.
    '''

    def transform(self, X, y=None):
        def extract(raw_name):
            return parse_name(raw_name)[1]

        out = X.copy()
        out['title'] = X['Name'].apply(extract)
        return out

class AddFamilySize(AbstractPreprocessor):
    '''Adds a `family_size` column equal to `SibSp` + `Parch` + 1.'''

    def transform(self, X, y=None):
        out = X.copy()
        relatives = X['SibSp'].add(X['Parch'])
        out['family_size'] = relatives.add(1)
        return out

class AddIsAlone(AbstractPreprocessor):
    '''
    Adds an integer `is_alone` column: 1 where `family_size` equals 1,
    otherwise 0. The `family_size` column must already exist.
    '''

    def transform(self, X, y=None):
        out = X.copy()
        travels_alone = out['family_size'].eq(1)
        out['is_alone'] = travels_alone.astype(int)
        return out

class OneHotEncodeAndDrop(AbstractPreprocessor):
    '''
    Replaces the column `name` with its `pd.get_dummies` indicator columns,
    which are prefixed with `name` and appended after the remaining columns.
    '''

    def __init__(self, name=None):
        self.name = name

    def transform(self, X, y=None):
        column = self.name
        indicators = pd.get_dummies(X.loc[:, column], prefix=column)
        remainder = X.drop(columns=column)
        return pd.concat([remainder, indicators], axis=1)

class LabelEncodeAndDrop(AbstractPreprocessor):
    '''
    Replaces the values of column `name` with `LabelEncoder` codes.

    Despite the class name no column is removed: the column keeps its name
    and position, only its values are overwritten with the encoded labels.

    Parameters
    ----------
    name : str
        Column to encode.
    '''

    def __init__(self, name=None):
        self.name = name
        self.enc = LabelEncoder()

    def fit(self, X, y=None):
        '''Learn the label set from column `name` of `X`; `y` is ignored.'''
        self.enc.fit(X.loc[:, self.name].values)
        return self

    def transform(self, X, y=None):
        '''Return a copy of `X` with column `name` replaced by its codes.'''
        _X = X.copy()
        new_values = self.enc.transform(_X.loc[:, self.name].values)
        # Replace the whole column instead of writing through `.loc[:, name]`:
        # recent pandas performs that write in place, so the column can keep
        # its original (object) dtype even though it now holds integers, and
        # later numeric-dtype checks would misclassify it.
        _X[self.name] = new_values
        return _X

    
class CombineCategoricalValues(AbstractPreprocessor):
    '''
    Replaces a set of values in column `name` with a single value.

    Parameters
    ----------
    name : str
        Column to modify.
    rule : dict or None
        Mapping with two keys: 'what' (iterable of values to replace) and
        'to' (the replacement). `None` (the default) behaves like
        {'what': [], 'to': None}, i.e. nothing is replaced.

    Usage
    -----
        merger = CombineCategoricalValues(name='title',
                                          rule={'what': ['Mlle', 'Ms'], 'to': 'Miss'})
        merged_df = merger.fit_transform(source_dataframe)
    '''

    def __init__(self, name=None, rule=None):
        self.name = name
        self.rule = rule

    def transform(self, X, y=None):
        '''Return a copy of `X` with the configured values merged.'''
        rule = self.rule if self.rule is not None else {'what': [], 'to': None}
        _X = X.copy()
        mask = _X[self.name].isin(rule['what'])
        # A single .loc call with both row mask and column label. The former
        # chained form `_X.loc[:, name][mask] = ...` assigns to an
        # intermediate object and is not guaranteed to reach `_X`.
        _X.loc[mask, self.name] = rule['to']
        return _X
        
class DropByValue(AbstractPreprocessor):
    '''
    Keeps only the rows whose value in column `name` is not equal to
    `value`.
    '''

    def __init__(self, name=None, value=None):
        self.name = name
        self.value = value

    def transform(self, X, y=None):
        keep_mask = X.loc[:, self.name].ne(self.value)
        return X[keep_mask]
    
class GetCategoriesAndDrop(AbstractPreprocessor):
    '''
    Bins column `name` with `pd.cut(..., bins=bins, labels=False)` and
    replaces it with indicator columns of the resulting bin numbers,
    prefixed with `name`.
    '''

    def __init__(self, name=None, bins=None):
        self.name = name
        self.bins = bins

    def transform(self, X, y=None):
        column = self.name
        bin_codes = pd.cut(X.loc[:, column], bins=self.bins, labels=False)
        indicators = pd.get_dummies(bin_codes, prefix=column)
        remainder = X.drop(columns=column)
        return remainder.join(indicators)

    
    
from sklearn.ensemble import RandomForestRegressor
class FillNaValues(AbstractPreprocessor):
    '''
    Fills missing values of one column with predictions of a regressor
    trained on the remaining columns.

    The model is fitted inside `transform` (on every call) using the rows
    whose target value is known. If the fitted model exposes
    `feature_importances_`, only the `n_features` most important columns are
    kept and the model is refitted on them.

    Parameters
    ----------
    name : str or None
        Column whose missing values are filled. `None` makes `transform`
        return its input unchanged.
    train : DataFrame or None
        Frame used to fit the model. `None` means the frame passed to
        `transform` is used.
    n_features : int or None
        Number of most important features to keep. `None` (or 0) means 30%
        of the number of columns of the transformed frame, rounded up.
    clf : estimator or None
        Object with `fit` / `predict`. `None` creates a fresh
        `RandomForestRegressor` on every `transform` call, so instances do
        not share one estimator.

    Usage
    -----
        filler = FillNaValues(name='Age')
        filled_df = filler.fit_transform(source_dataframe)
    '''

    def __init__(self, name=None, train=None, n_features=None, clf=None):
        self.train = train
        self.name = name
        self.clf = clf
        self.n_features = n_features

    def transform(self, X, y=None):
        '''Return `X` with the missing values of column `name` predicted.

        `X` itself is returned (not a copy) when `name` is None or the
        column has no missing values.
        '''
        if self.name is None:
            return X

        target_missing = X[self.name].isnull()
        if not target_missing.any():
            return X

        source = self.train.copy() if self.train is not None else X.copy()
        target_known = source[self.name].notnull()
        target = source.loc[target_known, self.name]
        features = source.drop(self.name, axis=1)
        base_columns = features.columns.tolist()

        # An explicit n_features wins; otherwise keep 30% of the columns.
        n_features = int(self.n_features or np.ceil(X.shape[1] * 0.3))

        # The same placeholder must be used for fitting and for predicting.
        sentinel = -9999.0

        # Encode the training features: label-encode non-numeric columns,
        # and for numeric columns with gaps add a "<col>_nan" indicator and
        # put the sentinel into the gaps.
        encoders = dict()
        for key in base_columns:
            missing = features[key].isnull()
            if not np.issubdtype(features[key].dtype, np.number):
                features.loc[missing, key] = 'N-a-N'
                encoder = LabelEncoder()
                features[key] = encoder.fit_transform(features[key])
                encoders[key] = encoder
            elif missing.any():
                flag = '%s_nan' % key
                features[flag] = 0.0
                features.loc[missing, flag] = 1.0
                features.loc[missing, key] = sentinel

        model = self.clf if self.clf is not None else RandomForestRegressor()
        model.fit(features[target_known], target)

        features_to_drop = []
        if hasattr(model, 'feature_importances_'):
            # Keep the n_features most important columns and refit on them.
            ranked = np.argsort(model.feature_importances_)[::-1]
            features_to_drop = features.columns[ranked].tolist()[n_features:]
            model.fit(features.drop(features_to_drop, axis=1)[target_known], target)

        # Bring X into exactly the layout the model was fitted on.
        _X = X.copy()
        for key in features.columns:
            if key not in _X.columns:
                _X[key] = 0.0
        _X = _X[features.columns].copy()
        for key in base_columns:
            missing = _X[key].isnull()
            if key in encoders and not np.issubdtype(_X[key].dtype, np.number):
                _X.loc[missing, key] = 'N-a-N'
                _X[key] = encoders[key].transform(_X[key])
            elif missing.any():
                flag = '%s_nan' % key
                # Only set an indicator the model has actually seen.
                if flag in _X.columns:
                    _X.loc[missing, flag] = 1.0
                _X.loc[missing, key] = sentinel

        # Rows are selected with the mask computed on X itself, which may
        # differ from the training frame when `train` is given.
        na_replacements = model.predict(
            _X.drop(features_to_drop, axis=1)[target_missing])
        result = X.copy()
        result.loc[target_missing, self.name] = na_replacements
        return result

class FillNaEmbarkedSimple(AbstractPreprocessor):
    '''
    Fills missing `Embarked` values with the first mode of the non-missing
    values of that column.
    '''

    def transform(self, X, y=None):
        out = X.copy()
        embarked = out['Embarked']
        most_common = embarked.dropna().mode()[0]
        out['Embarked'] = embarked.fillna(most_common)
        return out
    
class DropRows(AbstractPreprocessor):
    '''
    Keeps only the rows selected by a user-supplied condition.

    Parameters
    ----------
    condition : callable or None
        Called as `condition(X, y)`; its result is used to index `X`
        (e.g. a boolean array of the same length as `X`). `None` makes
        `transform` return `X` unchanged.

    Usage
    -----
        dropper = DropRows(condition=lambda X, y: X['Fare'] > 0)
        kept_df = dropper.fit_transform(source_dataframe)
    '''

    def __init__(self, condition=None):
        self.condition = condition

    def fit(self, X, y=None):
        '''Do nothing and return `self` (no input validation here).'''
        return self

    def transform(self, X, y=None):
        '''Return the rows of `X` selected by `condition`, or `X` itself.'''
        if self.condition is None:
            return X
        return X[self.condition(X, y)]

    # The method used to be misspelled, which left the class without a
    # `transform`; the old name is kept as an alias for existing callers.
    trasform = transform


class ShowDataHead(AbstractPreprocessor):
    '''
    Debugging step: renders the head of the frame as HTML when fitted and
    passes the data through unchanged.
    '''

    def fit(self, X, y=None):
        preview = X.head().to_html()
        display(HTML(preview))
        return self

    def transform(self, X, y=None):
        # Pass-through: the step only exists for its display side effect.
        return X

# Preprocessing steps (feature engineering)

In [None]:
from sklearn.pipeline import Pipeline
# Title consolidation rules, applied in this order:
# (replacement title, titles folded into it).
title_merges = [
    ('Mrs', ['Dona', 'Lady', 'Mme', 'the Countess']),
    ('Mr', ['Master', 'Jonkheer', 'Major', 'Col', 'Sir', 'Rev', 'Don', 'Capt', 'Dr']),
    ('Miss', ['Mlle', 'Ms']),
]

# Column clean-up, name-derived features and simple encodings.
preprocessing_steps = [
    ('drop_columns', DropColumns(names=['Ticket', 'PassengerId', 'Cabin', 'Survived'])),
    ('add_title', AddTitleColumn()),
    ('add_first_name', AddFirstNameColumn()),
    ('add_family_name', AddFamilyNameColumn()),
    ('add_family_size', AddFamilySize()),
    ('drop_name', DropColumns(names=['Name'])),
    ('encode_sex', LabelEncodeAndDrop(name='Sex')),
    ('fillna_embarked', FillNaEmbarkedSimple()),
    ('encode_embarked', LabelEncodeAndDrop(name='Embarked')),
]

# Predict the missing Fare and Age values.
preprocessing_steps += [
    ('predict_fare', FillNaValues(name='Fare')),
    ('predict_age', FillNaValues(name='Age')),
]

# Combine rare titles into the common ones, then encode the title column.
preprocessing_steps += [
    ('combine_title_%s' % target,
     CombineCategoricalValues(name='title', rule={'what': sources, 'to': target}))
    for target, sources in title_merges
]
preprocessing_steps.append(('encode_title', LabelEncodeAndDrop(name='title')))

# is_alone needs family_size, which is still present at this point.
preprocessing_steps.append(('add_is_alone', AddIsAlone()))

# Drop the helper columns (first name, family name) and the raw counts.
preprocessing_steps.append(
    ('drop_last', DropColumns(names=['first_name', 'family_name', 'SibSp', 'Parch'])))

preprocessing_pipeline = Pipeline(steps=preprocessing_steps)
combined_processed = preprocessing_pipeline.fit_transform(combined)

# It seems that everything is ready. Let's build a classifier!

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV


In [None]:
# The combined frame was built as train followed by test, so the first
# len(train) rows are the training rows and the rest are the test rows.
n_train_rows = len(train)
train_processed = combined_processed.iloc[:n_train_rows]
test_processed = combined_processed.iloc[n_train_rows:]
y = train['Survived'].values

In [None]:
# liblinear is requested explicitly: the L1 penalty is not supported by the
# lbfgs solver that newer scikit-learn releases use by default.
clf = LogisticRegression(penalty='l1', solver='liblinear', tol=1.e-7, C=0.1)

In [None]:
# Mean score of `clf` over 20 cross-validation folds on the training rows.
cross_val_score(clf, train_processed.values, y, cv=20).mean()

In [None]:
# Candidate models with the hyper-parameter grid to search for each of them.
classifiers = [
    # liblinear supports both penalties in the grid; the default lbfgs
    # solver of newer scikit-learn releases rejects 'l1'.
    {'clf': LogisticRegression(solver='liblinear'),
     'params': {'penalty': ('l1', 'l2'), 'C': np.linspace(1.e-6, 3, 20)}},
    {'clf': SVC(),
     'params': {'kernel': ('rbf', 'linear'), 'C': [1, 10, 100, 1000]}},
    # 'auto' is left out of max_features: for classifiers it was an alias
    # of 'sqrt' and newer scikit-learn releases no longer accept it.
    {'clf': RandomForestClassifier(),
     'params': {'n_estimators': [10, 50, 100, 500],
                'max_depth': [2, 3, 5, 7, 11],
                'max_features': ['log2', 'sqrt']}},
    {'clf': KNeighborsClassifier(),
     'params': {'n_neighbors': [1, 3, 5, 7], 'p': [1, 2]}},
    {'clf': GaussianNB(),
     'params': {'priors': [[0.7, 0.3], [0.8, 0.2]]}},
    {'clf': AdaBoostClassifier(), 'params': {}},
]

# Fitted GridSearchCV objects, keyed by the estimator instance searched over.
results = dict()
for c in classifiers:
    print("Performing grid searching for ", c['clf'])
    clf = GridSearchCV(c['clf'], c['params'], cv=10, scoring='accuracy', verbose=True, n_jobs=-1)
    clf.fit(train_processed, y)
    results[c['clf']] = clf
    print("Best score is ", clf.best_score_)
    