# Feature Extraction

In [1]:
# imports
import pandas as pd
import numpy as np
import os

### Import Data

In [2]:
# Assemble the raw-data directory and the training CSV location with os.path.join
# so the separator matches the host operating system.
raw_path_parts = (os.path.pardir, 'data', 'raw')
raw_data_path = os.path.join(*raw_path_parts)
data_file_path = os.path.join(raw_data_path, 'train.csv')
print(data_file_path)

..\data\raw\train.csv


In [3]:
# Load the raw training set from the path built above into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data_file_path)

## Feature Extraction

### Missing Data

In [4]:
# Count the missing entries in every column (isna is the same check as isnull).
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
# Missing values in Age: replace them with the mean of the observed ages.
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22, so this cell
# uses its replacement, SimpleImputer, which always imputes column-wise (the
# behaviour previously requested with axis=0).
from sklearn.impute import SimpleImputer
age_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Select the column by name instead of by position so a reordered CSV cannot
# silently impute the wrong column. The double brackets keep the 2-D shape
# that the imputer expects.
df[['Age']] = age_imputer.fit_transform(df[['Age']])
print(df.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [6]:
# Missing values in Embarked: inspect how often each port occurs to choose a fill value.
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [7]:
# Fill the missing Embarked entries with 'S', the most frequent value in the counts above.
embarked_missing = df['Embarked'].isna()
df.loc[embarked_missing, 'Embarked'] = 'S'

In [8]:
# Re-count missing entries per column to confirm Age and Embarked are now complete.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

### Categorical Data

In [9]:

# Encode Sex as integer labels (in the recorded output female -> 0, male -> 1).
from sklearn.preprocessing import LabelEncoder
labelencoder_sex = LabelEncoder()
# LabelEncoder expects a 1-D array of labels. Passing the (n, 1) frame produced
# by df.iloc[:, [4]] triggers a DataConversionWarning, and the positional index
# breaks if the column order changes, so pass the Series selected by name.
df['Sex'] = labelencoder_sex.fit_transform(df['Sex'])
print(df.head())  #Trivia : Why did we not use OneHotEncoder?

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    1  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    0  38.0      1      0   
2                             Heikkinen, Miss. Laina    0  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    0  35.0      1      0   
4                           Allen, Mr. William Henry    1  35.0      0      0   

             Ticket     Fare Cabin Embarked  
0         A/5 21171   7.2500   NaN        S  
1          PC 17599  71.2833   C85        C  
2  STON/O2. 3101282   7.9250   NaN        S  
3            113803  53.1000  C123        S  
4            373450   8.0500   NaN        S  


  y = column_or_1d(y, warn=True)


In [10]:

#Encoding Embarked as integer labels (in the recorded output C -> 0, Q -> 1, S -> 2)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_embarked = LabelEncoder()
# Pass the 1-D Series selected by name: the (n, 1) frame from df.iloc[:, [11]]
# triggers a DataConversionWarning, and the positional index is fragile if the
# column order changes.
df['Embarked'] = labelencoder_embarked.fit_transform(df['Embarked'])
print(df.head())

df['Embarked'].value_counts()

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    1  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    0  38.0      1      0   
2                             Heikkinen, Miss. Laina    0  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    0  35.0      1      0   
4                           Allen, Mr. William Henry    1  35.0      0      0   

             Ticket     Fare Cabin  Embarked  
0         A/5 21171   7.2500   NaN         2  
1          PC 17599  71.2833   C85         0  
2  STON/O2. 3101282   7.9250   NaN         2  
3            113803  53.1000  C123         2  
4            373450   8.0500   NaN         2  


  y = column_or_1d(y, warn=True)


2    646
0    168
1     77
Name: Embarked, dtype: int64

In [11]:
# One-hot encode Pclass and Embarked; each source column is replaced by one
# indicator column per distinct value (Pclass_1..3, Embarked_0..2).
# dtype is pinned to uint8 because pandas >= 2.0 defaults to bool, which would
# write True/False instead of 0/1 into the saved CSV.
df = pd.get_dummies(df, columns=['Pclass', 'Embarked'], dtype='uint8')
df.head()

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Pclass_1,Pclass_2,Pclass_3,Embarked_0,Embarked_1,Embarked_2
0,1,0,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,0,0,1,0,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,1,0,0,1,0,0
2,3,1,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,0,0,1,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,1,0,0,0,0,1
4,5,0,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,0,0,1,0,0,1


### Creating new features

In [12]:
# FamilySize is SibSp plus Parch plus one (the extra one presumably counts the
# passenger themself).
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)
print(df.head())

   PassengerId  Survived                                               Name  \
0            1         0                            Braund, Mr. Owen Harris   
1            2         1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3         1                             Heikkinen, Miss. Laina   
3            4         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5         0                           Allen, Mr. William Henry   

   Sex   Age  SibSp  Parch            Ticket     Fare Cabin  Pclass_1  \
0    1  22.0      1      0         A/5 21171   7.2500   NaN         0   
1    0  38.0      1      0          PC 17599  71.2833   C85         1   
2    0  26.0      0      0  STON/O2. 3101282   7.9250   NaN         0   
3    0  35.0      1      0            113803  53.1000  C123         1   
4    1  35.0      0      0            373450   8.0500   NaN         0   

   Pclass_2  Pclass_3  Embarked_0  Embarked_1  Embarked_2  FamilySize  
0         0   

## Feature Selection

In [13]:
# Drop the columns listed as irrelevant; SibSp and Parch are already summarised
# by FamilySize.
irrelevant_features = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin']
df = df.drop(columns=irrelevant_features)
print(df.head())

   Survived  Sex   Age     Fare  Pclass_1  Pclass_2  Pclass_3  Embarked_0  \
0         0    1  22.0   7.2500         0         0         1           0   
1         1    0  38.0  71.2833         1         0         0           1   
2         1    0  26.0   7.9250         0         0         1           0   
3         1    0  35.0  53.1000         1         0         0           0   
4         0    1  35.0   8.0500         0         0         1           0   

   Embarked_1  Embarked_2  FamilySize  
0           0           1           2  
1           0           0           2  
2           0           1           1  
3           0           1           2  
4           0           1           1  


### Saving prepared data to a new CSV

In [14]:
# Save the prepared frame without the index column. The path is built with
# os.path.join / os.path.pardir, matching how the input path is constructed
# earlier in the notebook; the target location (../data/raw) is unchanged.
prepared_file_path = os.path.join(os.path.pardir, 'data', 'raw', 'titanic_prepared_data.csv')
df.to_csv(prepared_file_path, index=False)