In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the raw training set; the path is relative to the notebook's working directory.
train = pd.read_csv("../data/train.csv")

In [28]:
# Summarise the raw frame: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [29]:
# Name and Ticket are not meaningful for the model, and Cabin is mostly null
# (only 204 of 891 values are present), so all three columns are removed.
cols_to_drop = ['Name', 'Ticket', 'Cabin']
train = train.drop(columns=cols_to_drop)

In [30]:
# Re-check the schema after dropping Name, Ticket and Cabin.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 62.8+ KB


In [31]:
# The Age column has a lot of missing values as well (714 of 891 present),
# but since our analysis showed a strong correlation between age group and
# survival rate we still want to keep it for the model, so the gaps are filled
# by linear interpolation between neighbouring rows.
# limit_direction='both' also fills missing values at the very start of the
# column, which the default forward-only direction would leave as NaN.
# NOTE(review): the interpolated value depends on row order, which presumably
# carries no information about age -- consider median or group-based
# imputation instead; confirm against the earlier analysis.
train['Age'] = train['Age'].interpolate(method='linear', limit_direction='both')

In [32]:
# Discard the rows whose 'Embarked' value is missing.
# Only 2 such rows exist in the training data and 0 in the test data.
train = train.dropna(subset=['Embarked'])

In [33]:
# Preview the first rows after the Age and Embarked clean-up steps.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [34]:
# One-hot encode the two remaining categorical columns instead of mapping them
# to ordinal integers: Embarked becomes the indicator columns C / Q / S and
# Sex becomes female / male.
# The dtype is pinned to uint8 so the indicators are stored (and later written
# to CSV) as 0/1 on every pandas version; pandas >= 2.0 would otherwise
# produce True/False booleans.
embarkedDummy = pd.get_dummies(train['Embarked'], dtype=np.uint8)
sexDummy = pd.get_dummies(train['Sex'], dtype=np.uint8)

# Append the indicator columns, then remove the string columns they replace.
train = pd.concat((train, embarkedDummy, sexDummy), axis=1)
train = train.drop(['Sex', 'Embarked'], axis=1)

In [35]:
# Verify the encoded frame: indicator columns present, no nulls remaining.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Age          889 non-null    float64
 4   SibSp        889 non-null    int64  
 5   Parch        889 non-null    int64  
 6   Fare         889 non-null    float64
 7   C            889 non-null    uint8  
 8   Q            889 non-null    uint8  
 9   S            889 non-null    uint8  
 10  female       889 non-null    uint8  
 11  male         889 non-null    uint8  
dtypes: float64(2), int64(5), uint8(5)
memory usage: 59.9 KB


In [36]:
# Persist the cleaned frame. The DataFrame index is not written; PassengerId
# is already stored as a regular column.
cleaned_csv_path = '../data/train_cleaned_1.csv'
train.to_csv(cleaned_csv_path, index=False)

In [37]:
# Read the cleaned file back and use PassengerId as the row index, then
# preview the first rows as a sanity check of the round trip.
df = pd.read_csv('../data/train_cleaned_1.csv').set_index("PassengerId")
df.head()

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,C,Q,S,female,male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,22.0,1,0,7.25,0,0,1,0,1
2,1,1,38.0,1,0,71.2833,1,0,0,1,0
3,1,3,26.0,0,0,7.925,0,0,1,1,0
4,1,1,35.0,1,0,53.1,0,0,1,1,0
5,0,3,35.0,0,0,8.05,0,0,1,0,1
