In [1]:
%load_ext autoreload
%autoreload 2


# Read Data

In [2]:
import pandas as pd
import numpy as np

train = pd.read_csv('../data/titanic-train.csv')
test = pd.read_csv('../data/titanic-test.csv')

In [3]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Notes on Feature Engineering

- [x] Name Length
- [x] Has Cabin (__ or NA)
    - Convert to numeric
- [x] FamilySize
    - Sibling + Spouse + Parent + Children + 1
- IsAlone
    - FamilySize = 1
- Embarked
    - Fill NA with S (Southampton)
    - Change to numerical
- Fare
    - Fill NA with median (middle element after ascending sort)
    - Change to categorical variable
- Age
    - Change Age to integer
    - Fill NA with random from (mean +/- sd)
    - Change to categorical variable
        - Change to numerical
- Name
    - New Feature: Title (Mr., Mrs., ...)
    - Bring everything under 4 categories
        - Miss, Mr, Mrs, Rare
- Sex
    - Change to numerical
- Fare
    - Change to categorical (4)
    - Change to integer
    
---

- Drop useless columns - Cabin, pid, pname, ticket, sibsp

In [4]:
def create_new_feature(df, colname):
    return 

In [5]:
df = train

In [6]:
def create_feat_name_length(df, colname):
    return df[colname].apply(len)

df['NameLength'] = create_feat_name_length(df, 'Name')

In [7]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NameLength
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,23
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,51
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,22
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,44
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,21
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,28
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,40
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,21


In [8]:
df.Cabin

0       NaN
1       C85
2       NaN
3      C123
4       NaN
       ... 
886     NaN
887     B42
888     NaN
889    C148
890     NaN
Name: Cabin, Length: 891, dtype: object

In [9]:
type(df.Cabin[1])

str

In [10]:
import numpy as np

def create_feat_has_cabin(df, colname):
    # if NA => 0 else 1
    def _is_nan(x):
        if isinstance(x, type(np.nan)):
            return 0
        return 1
        
    return df[colname].apply(_is_nan)
    
df['HasCabin'] = create_feat_has_cabin(df, 'Cabin')

In [11]:
def create_feat_familly_size(df):
    return df['SibSp'] + df['Parch'] + 1
    
df['FamilySize'] = create_feat_familly_size(df)

In [12]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NameLength,HasCabin,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,23,0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,51,1,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,22,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,44,1,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,24,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,21,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,28,1,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,40,0,4
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,21,1,1


In [13]:
def create_feat_isalone(df, colname):
    def _is_alone(x):
        if x==1:
            return 1
        return 0
    
    return df[colname].apply(_is_alone)
    
df['IsAlone'] = create_feat_isalone(df, 'FamilySize')

In [14]:
df['FamilySize']==1

0      False
1      False
2       True
3      False
4       True
       ...  
886     True
887     True
888    False
889     True
890     True
Name: FamilySize, Length: 891, dtype: bool

In [20]:
df.Fare.isna()


0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Fare, Length: 891, dtype: bool

In [16]:
def fill_na_embarked(df, colname):
    
    return df[colname].fillna('S')
    
df['Embarked'] = fill_na_embarked(df, 'Embarked')

In [24]:
def create_feat_categoricalFare(df, colname):
    return pd.qcut(df[colname], 4, labels = [0, 1, 2, 3]).astype(int)

df['CategoricalFare'] = create_feat_categoricalFare(df, 'Fare')

In [25]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NameLength,HasCabin,FamilySize,IsAlone,CategoricalFare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,23,0,2,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,51,1,2,0,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,22,0,1,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,44,1,2,0,3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,24,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,21,0,1,1,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,28,1,1,1,2
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,40,0,4,0,2
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,21,1,1,1,2


In [26]:
df['Age'].isna().sum()

177

In [37]:
def fill_na_age(df, colname):
    mean = df['Age'].mean()
    sd = df['Age'].std()
    def fill_empty(x):
        if np.isnan(x):
            return np.random.randint(mean-sd, mean+sd, ())
        return x
    return df[colname].apply(fill_empty).astype(int)

df['Age'] = fill_na_age(df, 'Age')
        

In [42]:
def create_feat_categoricalAge(df, colname):
    return pd.qcut(df[colname], 5, labels = [0, 1, 2, 3, 4]).astype(int)

df['CategoricalAge'] = create_feat_categoricalAge(df, 'Age')

In [60]:
import re

def create_feat_title(df, colname):
    def find_title(x):
        title_search = re.search(' ([A-Za-z]+)\.', x)
        if title_search:
            title = title_search.group(1)
            if title in ['Mlle', 'Ms']:
                return 'Miss'

            elif title in ['Mme', 'Mrs']:
                return 'Mrs'
            elif title=='Mr':
                return 'Mr'           
            else:
                retur n 'Rare'
        return ""
    
    return_title= df[colname].apply(find_title)
    dict_title = {'Miss': 1, 'Mrs':2, 'Mr':3, 'Rare':4}
    return return_title.replace(dict_title)
    

df['Title'] = create_feat_title(df, 'Name')
    
    

In [61]:
df['Title'].unique()

array([3, 2, 4, 1])

In [65]:
def create_feat_sex(df, colname):
    def sex(x):
        if x=='male':
            return 1
        return 0
    
    return df[colname].apply(sex)
    
df['Sex'] = create_feat_sex(df, 'Sex')

In [66]:
df['Sex'].unique()

array([1, 0])

In [67]:
df.isna().sum()

PassengerId          0
Survived             0
Pclass               0
Name                 0
Sex                  0
Age                  0
SibSp                0
Parch                0
Ticket               0
Fare                 0
Cabin              687
Embarked             0
NameLength           0
HasCabin             0
FamilySize           0
IsAlone              0
CategoricalFare      0
CategoricalAge       0
Title                0
dtype: int64

In [69]:
drop_list = ['PassengerId', 'Cabin', 'Ticket', 'SibSp', 'Name']
df_clean = df.drop(drop_list, axis=1)

In [70]:
df_clean

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,Embarked,NameLength,HasCabin,FamilySize,IsAlone,CategoricalFare,CategoricalAge,Title
0,0,3,1,22,0,7.2500,S,23,0,2,0,0,1,3
1,1,1,0,38,0,71.2833,C,51,1,2,0,3,3,2
2,1,3,0,26,0,7.9250,S,22,0,1,1,1,2,4
3,1,1,0,35,0,53.1000,S,44,1,2,0,3,3,2
4,0,3,1,35,0,8.0500,S,24,0,1,1,1,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,1,27,0,13.0000,S,21,0,1,1,1,2,4
887,1,1,0,19,0,30.0000,S,28,1,1,1,2,0,4
888,0,3,0,28,2,23.4500,S,40,0,4,0,2,2,4
889,1,1,1,26,0,30.0000,C,21,1,1,1,2,2,3


In [71]:
df_clean.to_csv('titanic_clean_data.csv', index=False)