In [1]:
# data analysis stack
import numpy as np
import pandas as pd

# machine-learning stack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# miscellaneous
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this presumably also hides pandas' SettingWithCopyWarning,
# which the column assignments on X_train further down would otherwise
# raise -- consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# load the training data; the path is relative to the notebook's working directory
titanic = pd.read_csv('../data/train.csv')

In [3]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# numeric encoding of Sex: female -> 0, male -> 1
titanic['Sex_category'] = titanic['Sex'].map(dict(female=0, male=1))
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_category
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1


In [5]:
# feature matrix: every column except the label
X = titanic.drop(columns='Survived')
# target vector: the survival label
y = titanic['Survived']

In [6]:
# 80/20 split with a fixed seed so the partition is reproducible.
# Each part is copied so that the column assignments made on X_train in the
# cells below act on an independent frame instead of one that pandas tracks
# as derived from X (the situation that triggers SettingWithCopyWarning).
X_train, X_test, y_train, y_test = (
    part.copy()
    for part in train_test_split(X, y, test_size=0.2, random_state=42)
)

In [7]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 12), (179, 12), (712,), (179,))

In [8]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_category
331,332,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S,1
733,734,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S,1
382,383,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.925,,S,1
704,705,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S,1
813,814,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.275,,S,0


In [9]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 331 to 102
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   712 non-null    int64  
 1   Pclass        712 non-null    int64  
 2   Name          712 non-null    object 
 3   Sex           712 non-null    object 
 4   Age           572 non-null    float64
 5   SibSp         712 non-null    int64  
 6   Parch         712 non-null    int64  
 7   Ticket        712 non-null    object 
 8   Fare          712 non-null    float64
 9   Cabin         159 non-null    object 
 10  Embarked      710 non-null    object 
 11  Sex_category  712 non-null    int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 72.3+ KB


Imputation

1. Create a feature named `Title`, extracted from the `Name` column

In [10]:
# Names look like "Surname, Title. Given names": keep the text between the
# first comma and the following period, lower-cased and stripped.
# Vectorised .str accessors replace the per-row Python lambda; a name without
# a comma yields NaN here instead of raising IndexError.
X_train['Title'] = (
    X_train['Name']
    .str.split(',').str[1]
    .str.split('.').str[0]
    .str.lower()
    .str.strip()
)
X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_category,Title
331,332,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5000,C124,S,1,mr
733,734,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0000,,S,1,mr
382,383,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.9250,,S,1,mr
704,705,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S,1,mr
813,814,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.2750,,S,0,miss
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S,0,miss
270,271,1,"Cairns, Mr. Alexander",male,,0,0,113798,31.0000,,S,1,mr
860,861,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S,1,mr
435,436,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0000,B96 B98,S,0,miss


In [11]:
#finding list of unique titles
X_train['Title'].value_counts()

mr              419
miss            143
mrs              96
master           33
rev               5
dr                5
major             2
col               2
mlle              2
capt              1
mme               1
ms                1
the countess      1
lady              1
Name: Title, dtype: int64

In [12]:
# 2. write a function that does the following transformations:
## ['mrs','mr','miss','master','dr','rev'] remain the same
## ['mlle','ms'] become 'miss'
## 'mme' becomes 'mrs'
## ['col','major','capt'] become 'army'
## ['don','lady','the countess','sir','the count','madam','lord'] become 'nobl'
## other titles become 'unknown'


def transform_title(title):
    """
    Collapse a passenger title into one of a small set of bins.

    Matching is exact, so the title is expected to be lower-cased and
    stripped already. Common titles pass through unchanged, 'mlle'/'ms'
    become 'miss', 'mme' becomes 'mrs', military ranks become 'army',
    nobility titles become 'nobl', and anything else becomes 'unknown'.
    """
    passthrough = ('mrs', 'mr', 'miss', 'master', 'dr', 'rev')
    if title in passthrough:
        return title

    # (members, label) pairs are checked in order; the first hit wins
    bins = (
        (('mlle', 'ms'), 'miss'),
        (('mme',), 'mrs'),
        (('col', 'major', 'capt'), 'army'),
        (('don', 'lady', 'the countess', 'sir', 'the count', 'madam', 'lord'), 'nobl'),
    )
    for members, label in bins:
        if title in members:
            return label
    return 'unknown'


In [13]:
# 3. use .apply() method for binning the title column
# Gotcha: this overwrites 'Title' in place. transform_title() does not list
# its own outputs 'army' and 'nobl' among the known titles, so running this
# cell a second time turns those two bins into 'unknown'.


X_train['Title'] = X_train['Title'].apply(transform_title)
X_train['Title']


331      mr
733      mr
382      mr
704      mr
813    miss
       ... 
106    miss
270      mr
860      mr
435    miss
102      mr
Name: Title, Length: 712, dtype: object

In [14]:
X_train['Title'].value_counts()

mr        419
miss      146
mrs        97
master     33
army        5
rev         5
dr          5
nobl        2
Name: Title, dtype: int64

In [15]:
X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_category,Title
331,332,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5000,C124,S,1,mr
733,734,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0000,,S,1,mr
382,383,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.9250,,S,1,mr
704,705,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S,1,mr
813,814,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.2750,,S,0,miss
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S,0,miss
270,271,1,"Cairns, Mr. Alexander",male,,0,0,113798,31.0000,,S,1,mr
860,861,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S,1,mr
435,436,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0000,B96 B98,S,0,miss


**3.3 imputation of age**

In [16]:
# hint:
# .groupby(['Pclass','Sex'])['Age'].mean()
# average age per (Sex, Pclass) group, rounded to one decimal place
mean_age = (
    X_train
    .groupby(['Sex', 'Pclass'])['Age']
    .mean()
    .round(1)
)
mean_age

Sex     Pclass
female  1         34.9
        2         28.4
        3         21.5
male    1         40.6
        2         30.9
        3         26.6
Name: Age, dtype: float64

In [17]:
# Fill each missing Age with the mean age of the passenger's (Sex, Pclass)
# group. mean_age is looked up for every row through a join on those two
# columns (the caller's index is preserved), so no per-row Python lambda is
# needed; rows that already have an Age are left untouched by fillna.
group_mean_age = X_train[['Sex', 'Pclass']].join(
    mean_age.rename('group_mean_age'), on=['Sex', 'Pclass']
)['group_mean_age']
X_train['Age'] = X_train['Age'].fillna(group_mean_age)

In [18]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 331 to 102
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   712 non-null    int64  
 1   Pclass        712 non-null    int64  
 2   Name          712 non-null    object 
 3   Sex           712 non-null    object 
 4   Age           712 non-null    float64
 5   SibSp         712 non-null    int64  
 6   Parch         712 non-null    int64  
 7   Ticket        712 non-null    object 
 8   Fare          712 non-null    float64
 9   Cabin         159 non-null    object 
 10  Embarked      710 non-null    object 
 11  Sex_category  712 non-null    int64  
 12  Title         712 non-null    object 
dtypes: float64(2), int64(5), object(6)
memory usage: 77.9+ KB


In [19]:
X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_category,Title
331,332,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5000,C124,S,1,mr
733,734,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0000,,S,1,mr
382,383,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.9250,,S,1,mr
704,705,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S,1,mr
813,814,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.2750,,S,0,miss
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S,0,miss
270,271,1,"Cairns, Mr. Alexander",male,40.6,0,0,113798,31.0000,,S,1,mr
860,861,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S,1,mr
435,436,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0000,B96 B98,S,0,miss


**3.4 imputation of embarkation**

In [20]:
X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_category,Title
331,332,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5000,C124,S,1,mr
733,734,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0000,,S,1,mr
382,383,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.9250,,S,1,mr
704,705,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S,1,mr
813,814,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.2750,,S,0,miss
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S,0,miss
270,271,1,"Cairns, Mr. Alexander",male,40.6,0,0,113798,31.0000,,S,1,mr
860,861,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S,1,mr
435,436,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0000,B96 B98,S,0,miss


In [21]:
X_train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [22]:
X_train['Embarked'].info()

<class 'pandas.core.series.Series'>
Int64Index: 712 entries, 331 to 102
Series name: Embarked
Non-Null Count  Dtype 
--------------  ----- 
710 non-null    object
dtypes: object(1)
memory usage: 11.1+ KB


In [23]:
X_train['Embarked'].value_counts()

S    525
C    125
Q     60
Name: Embarked, dtype: int64

In [24]:
# hint: use most frequent class
# mode() returns a Series of the most common value(s); label 0 is its first entry
most_frequent = X_train['Embarked'].mode().loc[0]
most_frequent

'S'

In [25]:
# replace missing Embarked values with the most frequent one found above
X_train['Embarked'] = X_train['Embarked'].fillna(value=most_frequent)
X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_category,Title
331,332,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5000,C124,S,1,mr
733,734,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0000,,S,1,mr
382,383,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.9250,,S,1,mr
704,705,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S,1,mr
813,814,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.2750,,S,0,miss
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S,0,miss
270,271,1,"Cairns, Mr. Alexander",male,40.6,0,0,113798,31.0000,,S,1,mr
860,861,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S,1,mr
435,436,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0000,B96 B98,S,0,miss


In [26]:
X_train['Embarked'].value_counts()

S    527
C    125
Q     60
Name: Embarked, dtype: int64

**3.5 imputation of cabin**

In [27]:
# hint: incorporate missing cabin as a class
X_train


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_category,Title
331,332,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5000,C124,S,1,mr
733,734,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0000,,S,1,mr
382,383,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.9250,,S,1,mr
704,705,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S,1,mr
813,814,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.2750,,S,0,miss
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S,0,miss
270,271,1,"Cairns, Mr. Alexander",male,40.6,0,0,113798,31.0000,,S,1,mr
860,861,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S,1,mr
435,436,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0000,B96 B98,S,0,miss


In [28]:
X_train['Cabin']

331       C124
733        NaN
382        NaN
704        NaN
813        NaN
        ...   
106        NaN
270        NaN
860        NaN
435    B96 B98
102        D26
Name: Cabin, Length: 712, dtype: object

In [29]:
X_train['Cabin'].info()

<class 'pandas.core.series.Series'>
Int64Index: 712 entries, 331 to 102
Series name: Cabin
Non-Null Count  Dtype 
--------------  ----- 
159 non-null    object
dtypes: object(1)
memory usage: 11.1+ KB


In [30]:
X_train['Cabin'].value_counts()

C23 C25 C27    4
E101           3
B96 B98        3
C22 C26        3
G6             3
              ..
C104           1
D11            1
C86            1
C7             1
C62 C64        1
Name: Cabin, Length: 117, dtype: int64

In [31]:

# Tally of the raw cabin labels. Not used in the cells visible here; kept in
# case a later cell relies on it.
frequent_cabin = X_train['Cabin'].value_counts()
# Turn Cabin into a binary flag: 1 when a cabin is recorded, 0 when missing.
# notna() is vectorised and also treats None / pd.NA as missing, which the
# row-wise `x != x` NaN trick does not.
# Gotcha: Cabin is overwritten in place, so re-running this cell yields all 1s.
X_train['Cabin'] = X_train['Cabin'].notna().astype('int64')
X_train;

In [32]:
X_train['Cabin'].value_counts()

0    553
1    159
Name: Cabin, dtype: int64

**3.6 engineer fare price**

In [33]:
# hint
# .apply(lambda x: x['Fare']/(x['SibSp']+x['Parch']+1),axis=1)

X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_category,Title
331,332,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5000,1,S,1,mr
733,734,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0000,0,S,1,mr
382,383,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.9250,0,S,1,mr
704,705,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,0,S,1,mr
813,814,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.2750,0,S,0,miss
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,0,S,0,miss
270,271,1,"Cairns, Mr. Alexander",male,40.6,0,0,113798,31.0000,0,S,1,mr
860,861,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,0,S,1,mr
435,436,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0000,1,S,0,miss


In [34]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 331 to 102
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   712 non-null    int64  
 1   Pclass        712 non-null    int64  
 2   Name          712 non-null    object 
 3   Sex           712 non-null    object 
 4   Age           712 non-null    float64
 5   SibSp         712 non-null    int64  
 6   Parch         712 non-null    int64  
 7   Ticket        712 non-null    object 
 8   Fare          712 non-null    float64
 9   Cabin         712 non-null    int64  
 10  Embarked      712 non-null    object 
 11  Sex_category  712 non-null    int64  
 12  Title         712 non-null    object 
dtypes: float64(2), int64(6), object(5)
memory usage: 77.9+ KB


In [35]:
X_train['Fare'].value_counts()

8.0500     35
13.0000    33
7.8958     32
7.7500     26
26.0000    25
           ..
40.1250     1
15.1000     1
61.1750     1
59.4000     1
14.1083     1
Name: Fare, Length: 220, dtype: int64

In [36]:
# Fare divided by SibSp + Parch + 1 (presumably the size of the travelling
# party including the passenger). Column arithmetic replaces the row-wise
# apply and produces the same values.
# Gotcha: Fare is overwritten in place, so re-running this cell divides again.
party_size = X_train['SibSp'] + X_train['Parch'] + 1
X_train['Fare'] = X_train['Fare'] / party_size
X_train;

In [37]:
X_train['Fare'].value_counts()

13.000000    46
8.050000     42
7.895800     32
7.750000     31
10.500000    21
             ..
3.625000      1
6.708350      1
6.437500      1
9.841700      1
4.702767      1
Name: Fare, Length: 249, dtype: int64

**3.7 Scaling: numerical features**

In [38]:
def standardize(series, mean, std):
    """
    Z-score a series: subtract `mean`, then divide by `std`.

    Both statistics are supplied by the caller; nothing is recomputed
    from `series` itself.
    """
    centered = series - mean
    return centered / std

In [39]:
X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_category,Title
331,332,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.500000,1,S,1,mr
733,734,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.000000,0,S,1,mr
382,383,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.925000,0,S,1,mr
704,705,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,3.927100,0,S,1,mr
813,814,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,4.467857,0,S,0,miss
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.650000,0,S,0,miss
270,271,1,"Cairns, Mr. Alexander",male,40.6,0,0,113798,31.000000,0,S,1,mr
860,861,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,4.702767,0,S,1,mr
435,436,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,30.000000,1,S,0,miss


In [40]:
# columns that the scaling loop below standardizes
# NOTE(review): PassengerId looks like a row identifier, and Sex_category /
# Cabin are 0/1 flags at this point -- confirm that z-scoring them is intended.
numerical_features = [
    'PassengerId',
    'Pclass',
    'Sex_category',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Cabin'
]
numerical_features

['PassengerId',
 'Pclass',
 'Sex_category',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Cabin']

In [41]:
# per-feature (mean, std) computed on X_train, keyed by column name
parameters = {}

for feature in numerical_features:
    column = X_train[feature]
    mean, std = column.mean(), column.std()

    # remember the statistics, then overwrite the column with its z-scores
    parameters[feature] = (mean, std)
    X_train[feature] = standardize(column, mean, std)

In [42]:
parameters 

{'PassengerId': (448.23455056179773, 256.7314232246707),
 'Pclass': (2.330056179775281, 0.824584281529625),
 'Sex_category': (0.6558988764044944, 0.47540821857480137),
 'Age': (29.115084269662926, 13.25440539745732),
 'SibSp': (0.5533707865168539, 1.176404155529974),
 'Parch': (0.3792134831460674, 0.7916693178286206),
 'Fare': (20.131139897946152, 38.17424689129282),
 'Cabin': (0.22331460674157302, 0.4167602891072757)}

In [43]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_category,Title
331,-0.452748,-1.613002,"Partner, Mr. Austen",male,1.236186,-0.470392,-0.479005,113043,0.219228,1.863626,S,0.723801,mr
733,1.113091,-0.40027,"Berriman, Mr. William John",male,-0.461362,-0.470392,-0.479005,28425,-0.186805,-0.535835,S,0.723801,mr
382,-0.254096,0.812463,"Tikkanen, Mr. Juho",male,0.217657,-0.470392,-0.479005,STON/O 2. 3101293,-0.319748,-0.535835,S,0.723801,mr
704,1.000133,0.812463,"Hansen, Mr. Henrik Juul",male,-0.235023,0.379656,-0.479005,350025,-0.424476,-0.535835,S,0.723801,mr
813,1.424701,0.812463,"Andersson, Miss. Ebba Iris Alfrida",female,-1.743955,2.9298,2.047302,347082,-0.41031,-0.535835,S,-1.379654,miss


**3.8 Categorical Encoding**

In [44]:
# preview the one-hot encoding of Embarked; drop_first omits the first level
pd.get_dummies(X_train['Embarked'], drop_first=True)

Unnamed: 0,Q,S
331,0,1
733,0,1
382,0,1
704,0,1
813,0,1
...,...,...
106,0,1
270,0,1
860,0,1
435,0,1


In [45]:
# append the Embarked dummy columns to X_train, aligned on the row index
embarked_dummies = pd.get_dummies(X_train['Embarked'], drop_first=True)
X_train = X_train.join(embarked_dummies)
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_category,Title,Q,S
331,-0.452748,-1.613002,"Partner, Mr. Austen",male,1.236186,-0.470392,-0.479005,113043,0.219228,1.863626,S,0.723801,mr,0,1
733,1.113091,-0.40027,"Berriman, Mr. William John",male,-0.461362,-0.470392,-0.479005,28425,-0.186805,-0.535835,S,0.723801,mr,0,1
382,-0.254096,0.812463,"Tikkanen, Mr. Juho",male,0.217657,-0.470392,-0.479005,STON/O 2. 3101293,-0.319748,-0.535835,S,0.723801,mr,0,1
704,1.000133,0.812463,"Hansen, Mr. Henrik Juul",male,-0.235023,0.379656,-0.479005,350025,-0.424476,-0.535835,S,0.723801,mr,0,1
813,1.424701,0.812463,"Andersson, Miss. Ebba Iris Alfrida",female,-1.743955,2.9298,2.047302,347082,-0.41031,-0.535835,S,-1.379654,miss,0,1
