### Read data and import relevant Python libraries


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

# Load the raw training data into a DataFrame and preview its first five rows.
titanic = pd.read_csv(filepath_or_buffer='../Data/train.csv')
titanic.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Clean continuous variables

First, check for missing values in the dataset. We will either fill in the missing data or drop the affected rows.

In [4]:
# Report how many values are missing in each column. The result is printed
# explicitly because a notebook only auto-displays a cell's last expression.
print(titanic.isnull().sum())

# Fill the missing ages with the average age. The filled column is assigned
# back to the frame instead of calling fillna(..., inplace=True) on the
# titanic['Age'] selection: that chained in-place form is deprecated by pandas
# and, with Copy-on-Write enabled, updates a temporary object rather than the
# frame itself.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())
titanic.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


### Drop unnecessary columns

We will drop PassengerId since it doesn't have any effect on the output.

In [6]:
# Remove the PassengerId column from the frame in place, then preview the result.
titanic.drop(columns=['PassengerId'], inplace=True)
titanic.head(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Clean categorical variables

First we'll check how many null values each column has.

In [7]:
# Per-column count of missing values.
titanic.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age           0
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

We see that there are quite a number of null values in the Cabin column. Let's see if the cabin being null has an effect on the output.


In [8]:
# Mean of Survived, grouped by whether the Cabin value is missing (True) or present (False).
titanic['Survived'].groupby(titanic['Cabin'].isna()).mean()

Cabin
False    0.666667
True     0.299854
Name: Survived, dtype: float64

It seems that there is a strong relationship between survival and having a cabin recorded: passengers with a cabin survived about 67% of the time, versus about 30% for passengers without one. Let's add a cabin indicator to show whether they had a cabin or not.

In [15]:
# Cabin_ind is 1 when a Cabin value is present and 0 when it is missing.
titanic['Cabin_ind'] = np.where(titanic['Cabin'].notnull(), 1, 0)
titanic.head(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_ind
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0


While we're at it, let's convert Sex to numeric as well.

In [11]:
# Numeric encoding for the two labels of the Sex column.
gender_num = {'male': 0, 'female': 1}

# Only encode while the column still holds the raw labels. Series.map turns
# every value that is not a key of gender_num into NaN, so re-running this
# cell on the already-encoded 0/1 column would otherwise wipe it out.
if not pd.api.types.is_numeric_dtype(titanic['Sex']):
    titanic['Sex'] = titanic['Sex'].map(gender_num)
titanic.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin ind
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0


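Let's have a look at Embarked: we check the survival rate per port and then replace the column with one 0/1 indicator column per port.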

In [19]:
# Survival rate per port of embarkation. Printed explicitly because only a
# cell's last expression is displayed automatically, so the bare expression
# would be computed and then discarded.
print(titanic.groupby(titanic['Embarked'])['Survived'].mean())

# One 0/1 indicator column per port (C, S, Q). Rows whose Embarked value is
# missing match none of the comparisons and therefore get 0 in all three.
titanic['Emb_C'] = np.where(titanic['Embarked'] == 'C', 1, 0)
titanic['Emb_S'] = np.where(titanic['Embarked'] == 'S', 1, 0)
titanic['Emb_Q'] = np.where(titanic['Embarked'] == 'Q', 1, 0)

# The indicator columns replace the original column.
titanic.drop(['Embarked'], axis=1, inplace=True)
titanic.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Cabin_ind,Emb_C,Emb_S,Emb_Q
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0,0,1,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,1,0,0
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0,0,1,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,1,0,1,0
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0,0,1,0


### Drop unnecessary variables

To finish off the cleaning we'll drop the remaining unnecessary variables: Cabin, Name, and Ticket (Embarked was already dropped above, once it had been encoded). We've already extracted the valuable information from Cabin and Embarked. The Name and Ticket columns will not have any effect on the output, so we'll drop those too.

In [21]:
# Remove the Cabin, Name and Ticket columns in place, then preview the result.
titanic.drop(columns=['Cabin', 'Name', 'Ticket'], inplace=True)
titanic.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_ind,Emb_C,Emb_S,Emb_Q
0,0,3,0,22.0,1,0,7.25,0,0,1,0
1,1,1,1,38.0,1,0,71.2833,1,1,0,0
2,1,3,1,26.0,0,0,7.925,0,0,1,0
3,1,1,1,35.0,1,0,53.1,1,0,1,0
4,0,3,0,35.0,0,0,8.05,0,0,1,0


Now to save the cleaned data

In [23]:
# Write the cleaned frame to disk without the row index.
titanic.to_csv(path_or_buf='../Data/trained_cleaned.csv', index=False)

### Splitting the data into train and validation sets

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the cleaned data back from disk and preview its first five rows.
titanic = pd.read_csv(filepath_or_buffer='../Data/trained_cleaned.csv')
titanic.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_ind,Emb_C,Emb_S,Emb_Q
0,0,3,0,22.0,1,0,7.25,0,0,1,0
1,1,1,1,38.0,1,0,71.2833,1,1,0,0
2,1,3,1,26.0,0,0,7.925,0,0,1,0
3,1,1,1,35.0,1,0,53.1,1,0,1,0
4,0,3,0,35.0,0,0,8.05,0,0,1,0


In [4]:
# Separate the target column (Survived) from the remaining feature columns.
labels = titanic['Survived']
features = titanic.drop(columns='Survived')

# Hold out 20% of the rows for validation; random_state is fixed so the split
# is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Print the fraction of all rows that landed in each split.
for dataset in (y_train, y_val):
    print(round(dataset.shape[0] / labels.shape[0], 2))

0.8
0.2


In [6]:
# Persist the training split (features and labels), without the row index.
X_train.to_csv(path_or_buf='../Data/train_features.csv', index=False)
y_train.to_csv(path_or_buf='../Data/train_labels.csv', index=False)

# Persist the validation split (features and labels), without the row index.
X_val.to_csv(path_or_buf='../Data/val_features.csv', index=False)
y_val.to_csv(path_or_buf='../Data/val_labels.csv', index=False)