In [76]:
import pandas as pd

In [77]:
# Import train and test set
train = pd.read_csv('titanic-dataset/train.csv')
test = pd.read_csv('titanic-dataset/test.csv')

In [78]:
display(train.shape)
display(test.shape)

(891, 12)

(418, 11)

### NaN values

In [79]:
# Check for NaN values
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [80]:
# Use mode() function to get the most frequent value
modes = train.mode().iloc[0]
modes

PassengerId                      1
Survived                       0.0
Pclass                         3.0
Name           Abbing, Mr. Anthony
Sex                           male
Age                           24.0
SibSp                          0.0
Parch                          0.0
Ticket                        1601
Fare                          8.05
Cabin                      B96 B98
Embarked                         S
Name: 0, dtype: object

In [81]:
# Use fillna with most frequent values
train.fillna(modes, inplace=True)

In [82]:
# Check for NaN values
train.isna().sum().sum()

0

Get rid of 'Name', 'Ticket' abd 'Cabin' variables
**NOTE**: We don't use those values because right now it is out of scope. However, information in those variables are crucial and very imortant! The best score in Kaggle on Titanic dataset is performed only on 'Name' variable! See this notebook for more information: https://www.kaggle.com/code/cdeotte/titanic-using-name-only-0-81818/notebook
**NOTE**: It's very common in tabular data to use *categorical embeddings*. This is on *TODO* list right after finishing this notebook.

In [83]:
train.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

### Label encoding

In [84]:
# Explore which variables are categorical
train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S
887,888,1,1,female,19.0,0,0,30.0000,S
888,889,0,3,female,24.0,1,2,23.4500,S
889,890,1,1,male,26.0,0,0,30.0000,C


In [85]:
# Categorical varaibles are Pclass, Sex, Embarked - use one-hot encoding for gender and label for embarked
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder()

In [86]:
# Use one_hot_encode for gender
gender = one_hot_encoder.fit_transform(train[['Sex']])

# Merge train with encoded gender
train = train.merge(pd.DataFrame(gender.toarray(), columns=['Male', 'Female']), left_index=True, right_index=True)

# Drop unused column
train.drop(columns=['Sex'], inplace=True)

# Encode 'Embarked' with label encoder
train['Embarked'] = label_encoder.fit_transform(train['Embarked'])

# Print
train

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Male,Female
0,1,0,3,22.0,1,0,7.2500,2,0.0,1.0
1,2,1,1,38.0,1,0,71.2833,0,1.0,0.0
2,3,1,3,26.0,0,0,7.9250,2,1.0,0.0
3,4,1,1,35.0,1,0,53.1000,2,1.0,0.0
4,5,0,3,35.0,0,0,8.0500,2,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.0,0,0,13.0000,2,0.0,1.0
887,888,1,1,19.0,0,0,30.0000,2,1.0,0.0
888,889,0,3,24.0,1,2,23.4500,2,1.0,0.0
889,890,1,1,26.0,0,0,30.0000,0,0.0,1.0
