# 0 Packages and Data Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler, quantile_transform , minmax_scale
import scipy.stats as stats
from scipy.stats import norm

# Show up to 500 columns when displaying wide frames.
pd.set_option('display.max_columns', 500)

# Two independent reads of the training data, both indexed by PassengerId:
# `train_original` is kept as a reference, `train_full` is the working frame.
train_original = pd.read_csv('train.csv')
train_original = train_original.set_index('PassengerId')
train_full = pd.read_csv('train.csv')
train_full = train_full.set_index('PassengerId')

# Split the column names by dtype: non-object columns are treated as
# quantitative (minus the target), object columns as qualitative.
column_dtypes = train_full.dtypes
quantitative = [column for column in train_full.columns if column_dtypes[column] != 'object']
quantitative.remove('Survived')
qualitative = [column for column in train_full.columns if column_dtypes[column] == 'object']

# Pipeline

Since data type 3 needs the full list of names and tickets, we focus only on data types 1 and 2 for now. This avoids further engineering to prevent a mismatched number of features between the train and test sets.

## missing values

In [2]:
#filling in missing values
def clean_age(cols):
    """Return the age from a two-value row, substituting a class-based default when missing.

    Parameters
    ----------
    cols : sequence or pandas.Series
        Two values in order: the age (possibly missing) and the passenger class.
        Values are read by position, so the labels of a Series are irrelevant.

    Returns
    -------
    The original age when it is not null; otherwise 38 for class 1, 30 for
    class 2 and 25 for any other class.
    """
    # Unpack by position instead of `cols[0]` / `cols[1]`: integer keys on a
    # label-indexed Series are deprecated positional fallbacks in pandas.
    age, pclass = tuple(cols)[:2]

    if not pd.isnull(age):
        return age

    if pclass == 1:
        return 38
    if pclass == 2:
        return 30
    return 25

def clean_fare(cols):
    """Return the fare from a two-value row, substituting a class-based default when missing.

    Parameters
    ----------
    cols : sequence or pandas.Series
        Two values in order: the fare (possibly missing) and the passenger class.
        Values are read by position, so the labels of a Series are irrelevant.

    Returns
    -------
    The original fare when it is not null; otherwise 84.15 for class 1,
    20.66 for class 2 and 13.68 for any other class.
    """
    # Unpack by position instead of `cols[0]` / `cols[1]`: integer keys on a
    # label-indexed Series are deprecated positional fallbacks in pandas.
    fare, pclass = tuple(cols)[:2]

    if not pd.isnull(fare):
        return fare

    if pclass == 1:
        return 84.15
    if pclass == 2:
        return 20.66
    return 13.68
    
def set_age(data_set):
    """Fill missing ``Age`` values in place using ``clean_age`` on each row.

    Modifies ``data_set`` directly and returns ``None``.
    """
    age_and_class = data_set[['Age', 'Pclass']]
    filled_ages = age_and_class.apply(clean_age, axis=1)
    data_set['Age'] = filled_ages
    
def set_fare(data_set):
    """Fill missing ``Fare`` values in place using ``clean_fare`` on each row.

    Modifies ``data_set`` directly and returns ``None``.
    """
    # Use the fare-specific imputer: applying `clean_age` here would fill
    # missing fares with the age defaults (38 / 30 / 25).
    data_set['Fare']=data_set[['Fare','Pclass']].apply(clean_fare,axis=1)
    
def add_deck(data_set):
    """Add a ``Deck`` column in place, derived from the first character of ``Cabin``.

    Rows with a null cabin get the label ``'Missing_Deck'``; all others get
    ``<first character>_deck``. Returns ``None``.
    """
    def deck_label(cabin):
        if pd.isnull(cabin):
            return 'Missing_Deck'
        return cabin[0] + '_deck'

    data_set['Deck'] = data_set['Cabin'].apply(deck_label)

def set_embark(data_set):
    """Replace null ``Embarked`` values in place with ``'Missing_Embark'``.

    Non-null values are left untouched. Returns ``None``.
    """
    def port_or_placeholder(port):
        if pd.isnull(port):
            return 'Missing_Embark'
        return port

    data_set['Embarked'] = data_set['Embarked'].apply(port_or_placeholder)

## feature engineering

In [3]:
# feature engineering
def get_fare_and_family(data_set):
    """Build fare/family indicator features from ``Fare``, ``SibSp`` and ``Parch``.

    Returns a DataFrame indexed like ``data_set`` with the columns
    ``paid_fare`` (1 when the fare is positive), ``has_family`` (1 when either
    ``SibSp`` or ``Parch`` is positive), ``family_size`` and
    ``fare_per_fam_member``.
    """
    fares = data_set['Fare']
    siblings_spouses = data_set['SibSp']
    parents_children = data_set['Parch']

    # 1 when a fare was paid (crew members would not have paid), else 0.
    paid_fare = (fares > 0).astype('int64')

    # 1 when the passenger has any relative on board, else 0.
    has_family = ((siblings_spouses > 0) | (parents_children > 0)).astype('int64')

    # A lone passenger counts as a family of one, which avoids dividing by 0.
    family_size = has_family * (siblings_spouses + parents_children) + 1

    fare_per_fam_member = fares / family_size

    feature_columns = {
        'paid_fare': paid_fare,
        'has_family': has_family,
        'family_size': family_size,
        'fare_per_fam_member': fare_per_fam_member,
    }
    return pd.DataFrame(feature_columns, index=data_set.index)
    
def _extract_surname(name):
    """Return the surname from a ``"Surname, rest"`` string, skipping a leading ``'van'``."""
    words = name.split(',', 1)[0].split()
    if not words:
        return ''
    if words[0] == 'van' and len(words) > 1:
        return words[1]
    return words[0]


def get_duplicated_surnames_tickets_cabins(data_set):
    """Flag and label surnames, tickets and cabins shared by more than one row.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Must contain the columns ``Name``, ``Ticket`` and ``Cabin``. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``data_set`` with the columns, in order:
        ``Name``, ``Ticket`` (copied unchanged), ``Cabin`` (the cabin when
        shared, else ``'Unique_Cabin'``), ``DuplicatedName``,
        ``DuplicatedTicket``, ``DuplicatedCabin`` (1/0 flags), ``Surname``
        (the surname when shared, else ``'Unique_Surname'``) and ``Tickets``
        (the ticket when shared, else ``'Unique_Ticket'``).
    """
    # Work on an explicit copy so the new columns are never written into a
    # slice of the caller's frame.
    result = data_set[['Name', 'Ticket', 'Cabin']].copy()

    # All values come from `data_set` itself (not from a module-level frame),
    # so every row of the input gets a label regardless of which split it is.
    surnames = result['Name'].apply(_extract_surname)
    tickets = result['Ticket'].copy()
    cabins = result['Cabin'].copy()

    def shared(values):
        # Missing values are not "shared": without the notna() guard every
        # row with a missing cabin would be flagged as a duplicate.
        return values.notna() & values.duplicated(keep=False)

    # The same surname rule is used for detection and for labelling.
    name_is_shared = shared(surnames)
    ticket_is_shared = shared(tickets)
    cabin_is_shared = shared(cabins)

    # Booleans become 1/0 flags.
    result['DuplicatedName'] = name_is_shared.astype('int64')
    result['DuplicatedTicket'] = ticket_is_shared.astype('int64')
    result['DuplicatedCabin'] = cabin_is_shared.astype('int64')

    # Keep only the shared values (to one-hot them later); everything else
    # collapses into a single placeholder category.
    result['Surname'] = surnames.where(name_is_shared, 'Unique_Surname')
    result['Tickets'] = tickets.where(ticket_is_shared, 'Unique_Ticket')
    result['Cabin'] = cabins.where(cabin_is_shared, 'Unique_Cabin')
    return result

def get_titles(data_set):
    """Extract a normalised honorific from each passenger name.

    The title is the run of letters that follows a space and precedes a
    period in ``Name``. A fixed list of uncommon titles is collapsed into
    ``'Rare'``; ``'Mlle'`` and ``'Ms'`` become ``'Miss'`` and ``'Mme'``
    becomes ``'Mrs'``.

    Returns a Series named ``'Title'`` indexed like ``data_set``.
    """
    rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    # Raw string: `\.` inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    titles = data_set['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    titles = titles.replace(rare_titles, 'Rare')
    titles = titles.replace('Mlle', 'Miss')
    titles = titles.replace('Ms', 'Miss')
    titles = titles.replace('Mme', 'Mrs')
    return titles.rename('Title')

def get_dummies(data_set):
    """One-hot encode the categorical features of ``data_set``.

    Returns a 7-tuple of DataFrames, in order: sex, embarkation port, cabin,
    deck, title, shared-surname and shared-ticket dummies. The sex, title,
    surname and ticket encodings drop their first category; embarkation,
    cabin and deck keep every category.
    """
    shared_groups = get_duplicated_surnames_tickets_cabins(data_set)
    passenger_titles = get_titles(data_set)

    def port_or_placeholder(port):
        return 'Missing_Embark' if pd.isnull(port) else port

    def deck_label(cabin):
        return 'Missing_Deck' if pd.isnull(cabin) else cabin[0] + '_deck'

    sex_dummies = pd.get_dummies(data_set['Sex'], drop_first=True)

    # Embarked and Cabin have missing entries; dropping the first category
    # would make those rows indistinguishable from the dropped one.
    embarked_filled = data_set['Embarked'].apply(port_or_placeholder)
    embark_dummies = pd.get_dummies(embarked_filled, drop_first=False)
    cabin_dummies = pd.get_dummies(data_set['Cabin'], drop_first=False)

    # Every deck category is kept, as the deck is meant to be predicted later.
    deck_labels = data_set['Cabin'].apply(deck_label)
    deck_dummies = pd.get_dummies(deck_labels, drop_first=False)

    titles_dummies = pd.get_dummies(passenger_titles, drop_first=True)
    surname_dummies = pd.get_dummies(shared_groups['Surname'], drop_first=True)
    tickets_dummies = pd.get_dummies(shared_groups['Tickets'], drop_first=True)

    return (
        sex_dummies,
        embark_dummies,
        cabin_dummies,
        deck_dummies,
        titles_dummies,
        surname_dummies,
        tickets_dummies,
    )

## Feature Scaling

In [4]:
# get scaled and transformed features
def logtransform(data):
    """Return the base-10 logarithm of ``data + 1``.

    The +1 shift keeps zero-valued entries finite (they map to 0).
    """
    shifted = data + 1
    return np.log10(shifted)
def get_scalled_features(data_set):
    """Return scaled versions of the numeric features of ``data_set``.

    ``Age``, ``Fare`` and the fare per family member are mapped to a normal
    output distribution with ``quantile_transform`` (25 quantiles) and
    returned as ``AgeStandard``, ``FareStandard`` and
    ``fare_per_fam_member_Standard``. ``SibSp`` and ``Parch`` go through
    ``logtransform`` and ``family_size`` through the natural logarithm; these
    three keep their original column names.
    """
    enriched = pd.concat([data_set, get_fare_and_family(data_set)], axis=1)

    def normal_quantiles(source_column, target_column):
        # quantile_transform expects a 2-D array, hence the reshape.
        column_values = enriched[source_column].values.reshape(-1, 1)
        transformed = quantile_transform(
            column_values,
            output_distribution='normal',
            n_quantiles=25,
            copy=True,
        )
        return pd.DataFrame(transformed, index=enriched.index, columns=[target_column], copy=True)

    scaled_parts = [
        normal_quantiles('Age', 'AgeStandard'),
        normal_quantiles('Fare', 'FareStandard'),
        normal_quantiles('fare_per_fam_member', 'fare_per_fam_member_Standard'),
        enriched['SibSp'].apply(logtransform),
        enriched['Parch'].apply(logtransform),
        # family_size is always at least 1, so a plain log is safe here.
        enriched['family_size'].apply(np.log),
    ]
    return pd.concat(scaled_parts, axis=1)

## Generate data

In [5]:
# Read the raw train and test sets, each indexed by PassengerId.
train_original = pd.read_csv('train.csv').set_index('PassengerId')
test_original = pd.read_csv('test.csv').set_index('PassengerId')

In [9]:
def transform_data(data_set,data_type):
    """Run the cleaning, scaling and encoding pipeline and assemble one feature frame.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Raw passenger data. It is copied first, so the caller's frame is left
        untouched.
    data_type : str
        ``'1'`` for the basic feature set (scaled numeric features plus sex,
        embarkation and deck dummies) or ``'2'`` for the extended set, which
        also adds ``paid_fare``, the fare-per-family-member and family-size
        features, the duplicated name/ticket/cabin flags and title dummies.

    Returns
    -------
    pandas.DataFrame
        The transformed features, indexed like ``data_set``.

    Raises
    ------
    ValueError
        If ``data_type`` is neither ``'1'`` nor ``'2'``.
    """
    # Fail early with a clear message instead of reaching the return with
    # no result assigned.
    if data_type not in ('1', '2'):
        raise ValueError("data_type must be '1' or '2', got %r" % (data_type,))

    # Work on a real copy: the imputation helpers below modify their argument
    # in place.
    working_df = data_set.copy()
    set_age(working_df)
    set_fare(working_df)
    add_deck(working_df)
    set_embark(working_df)

    scalled_features = get_scalled_features(working_df)
    sex_dummies, embark_dummies, _, deck_dummies, titles_dummies, _, _ = get_dummies(working_df)

    # Raw columns that are replaced by scaled or one-hot encoded versions.
    replaced_columns = ['Age','Fare','SibSp','Parch','Name','Sex','Ticket','Cabin','Embarked','Deck']
    base_features = working_df.drop(replaced_columns, axis=1)

    if data_type == '1':
        basic_scaled = scalled_features.drop(['fare_per_fam_member_Standard','family_size'], axis=1)
        return pd.concat([base_features, basic_scaled, sex_dummies, embark_dummies, deck_dummies], axis=1)

    # The remaining features are only needed for the extended set.
    fare_and_family = get_fare_and_family(working_df)
    duplicated_surnames_tickets_cabins = get_duplicated_surnames_tickets_cabins(working_df)
    duplicate_flags = duplicated_surnames_tickets_cabins.drop(["Name","Ticket","Cabin","Surname","Tickets"], axis=1)
    return pd.concat(
        [
            base_features,
            scalled_features,
            fare_and_family['paid_fare'],
            sex_dummies,
            embark_dummies,
            deck_dummies,
            duplicate_flags,
            titles_dummies,
        ],
        axis=1,
    )

In [12]:
data_to_prepare = ['1','2']

for types in data_to_prepare:
    # Start each pass from fresh copies of the raw frames.
    train_working = train_original.copy()
    test_working = test_original.copy()
    # Test rows carry no label; 2 is a sentinel used to split them out again.
    test_working['Survived'] = 2
    full_working = pd.concat([train_working, test_working])

    # Transform train and test together so both end up with the same columns.
    full_transformed = transform_data(full_working, types)
    survived_flag = full_transformed['Survived']
    train_transformed = full_transformed[survived_flag < 2]
    test_transformed = full_transformed[survived_flag == 2].drop('Survived', axis=1)

    full_transformed.to_csv(f"./data_raw_{types}.csv")
    train_transformed.to_csv(f"./data_train_{types}.csv")
    test_transformed.to_csv(f"./data_test_{types}.csv")



### data of type 1 (no new features)

In [16]:
# Build the type-1 data set (no new features) from fresh copies of the raw frames.
train_working= train_original.copy()
test_working = test_original.copy()
# Test rows carry no label; 2 is a sentinel used to split them out again.
test_working['Survived']=2

full_working = pd.concat([train_working,test_working])

# `get_data_type_1` is not defined anywhere in this notebook, so the cell
# failed on a fresh kernel; `transform_data(..., '1')` is the pipeline entry
# point for this data type.
full_working_1=transform_data(full_working,'1')

train_1=full_working_1[full_working_1['Survived']<2]
test_1=full_working_1[full_working_1['Survived']==2].drop('Survived',axis=1)

full_working_1.to_csv("./data_raw_1.csv")
train_1.to_csv("./data_train_1.csv")
test_1.to_csv("./data_test_1.csv")

### data of type 3 (new features and titles)

In [20]:
# Build the type-3 data set from fresh copies of the raw frames and write it to CSV.
train_working= train_original.copy()
test_working = test_original.copy()
# Test rows carry no label; 2 is a sentinel used to split them out again below.
test_working['Survived']=2

full_working = pd.concat([train_working,test_working])

# NOTE(review): `get_data_type_3` is not defined in the visible part of this
# notebook, so this cell presumably fails on Restart & Run All - confirm
# whether the definition lives in a deleted cell or another module.
full_working_3=get_data_type_3(full_working)

# Rows below the sentinel are training rows; sentinel rows are test rows.
train_3=full_working_3[full_working_3['Survived']<2]
test_3=full_working_3[full_working_3['Survived']==2].drop('Survived',axis=1)

full_working_3.to_csv("./data_raw_3.csv")
train_3.to_csv("./data_train_3.csv")
test_3.to_csv("./data_test_3.csv")