In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
file_folder = '~/Data/Kaggle/Titanic/'

In [3]:
train_file = file_folder + 'train.csv'

In [4]:
df = pd.read_csv(train_file)

In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
df.shape

(891, 12)

### Check for Null values

In [7]:
df['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [8]:
df['Pclass'].isna().sum()

0

In [9]:
df['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [10]:
df['Sex'].isna().sum()

0

In [11]:
df['Age'].isna().sum()

177

In [12]:
mean_age_value = df['Age'].mean()
mean_age_value

29.69911764705882

In [13]:
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [14]:
df['SibSp'].isna().sum()

0

In [15]:
df['SibSp'].value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [16]:
df['Parch'].isna().sum()

0

In [17]:
df['Parch'].value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

In [18]:
df['FamilySize'] = df['SibSp'] + df['Parch']

In [19]:
df['Fare'].isna().sum()

0

In [20]:
df['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [21]:
mean_fare_value = 32.20
mean_fare_value

32.2

In [22]:
df['Cabin'].isna().sum()

687

In [23]:
df['Cabin'].value_counts()

B96 B98        4
C23 C25 C27    4
G6             4
F33            3
C22 C26        3
              ..
D49            1
E68            1
D56            1
B50            1
B80            1
Name: Cabin, Length: 147, dtype: int64

In [24]:
df['Embarked'].isna().sum()

2

In [25]:
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [26]:
df['Embarked'] = df['Embarked'].fillna('S')

### Encode categorical columns

In [27]:
cat_cols = ['Pclass', 'Sex', 'Embarked']

In [28]:
for col in cat_cols:
    tmp = pd.get_dummies(df[col], prefix=col)
    df = pd.concat([df, tmp], axis=1)

In [29]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,FamilySize,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,S,1,0,0,1,0,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,C,1,1,0,0,1,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,S,0,0,0,1,1,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,S,1,1,0,0,1,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,S,0,0,0,1,0,1,0,0,1


### Drop columns

In [30]:
drop_cols = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [31]:
df = df.drop(drop_cols, axis=1)
df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,FamilySize,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,1,0,7.25,1,0,0,1,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,1,0,0,1,0,1,0,0
2,1,26.0,0,0,7.925,0,0,0,1,1,0,0,0,1
3,1,35.0,1,0,53.1,1,1,0,0,1,0,0,0,1
4,0,35.0,0,0,8.05,0,0,0,1,0,1,0,0,1


In [32]:
df.columns

Index(['Survived', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'Embarked_C',
       'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [33]:
Column_order = ['Survived', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male',
       'Embarked_C', 'Embarked_Q', 'Embarked_S']

In [34]:
output_file = file_folder + 'train_processed.csv'
df.to_csv(output_file, index=False)