# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import plotly
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf
from IPython.display import Image

In [2]:
# Put Plotly into offline notebook mode and switch cufflinks to offline use,
# so the DataFrame `.iplot()` calls below can render inside the notebook.
init_notebook_mode(connected = True)
cf.go_offline()

# Cleaning Training Data

## Importing Data

In [3]:
# Load the raw training set; the path is relative to the notebook's working directory.
train = pd.read_csv('../RawData/train.csv')

In [4]:
# Preview the first rows to see which columns are available.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Let us first spot the missing data

In [5]:
# Heatmap of the missing-value mask: 1 marks a missing cell, 0 a present one.
missing_mask = train.isna().astype(int)
missing_mask.iplot(kind = 'heatmap')

In [6]:
# Bar chart of the number of missing values in each column.
train.isna().sum().to_frame().iplot(kind = 'bar')

The Cabin feature has a lot of missing data. We will analyze how the available Cabin information affects the result; if it doesn't affect it VERY significantly, we will drop this feature. The same goes for the Age feature, except that it has much less missing data.

## Let us visualize the importance of the given features

### 1) Pclass :

In [7]:
# List the distinct passenger classes present in the data.
train['Pclass'].unique()

array([3, 1, 2])

In [8]:
# Survival counts per passenger class, computed in one pass with crosstab.
# Rows follow the class labels actually present in the data (sorted ascending)
# instead of assuming the labels are exactly 1..n.
pclass_counts = pd.crosstab(train['Pclass'], train['Survived']).reindex(columns = [0, 1], fill_value = 0)
# Replace the raw labels with the display names used in the chart legend/axis.
pclass_counts.index = ["Pclass " + str(label) for label in pclass_counts.index]
pclass_counts.columns = ['Didn\'t survive', 'survived']
pclass_counts.iplot(kind = 'bar')

#### It is obvious that Pclass has a relation with survival, hence we keep this feature.

### 2) Name

Obviously the name of an individual cannot determine his/her fate, but it can still be used to find potential family members via surnames. That information could help predict an individual's survival from the survival of their family members, since families most probably stayed together. However, this analysis may well have low accuracy. I will try this variation later.

In [9]:
# Remove the Name column; surname-based family features are left for a later variation.
train = train.drop(columns = ['Name'])

### 3) Sex

In [10]:
# Survival counts per sex, with rows in order of first appearance in the data.
# The levels are computed once (missing values excluded) rather than re-deriving
# `unique()` for every cell of the table.
sex_levels = train['Sex'].dropna().unique()
sex_counts = pd.crosstab(train['Sex'], train['Survived']).reindex(index = sex_levels, columns = [0, 1], fill_value = 0)
sex_counts.index = list(sex_levels)
sex_counts.columns = ['didn\'t survive', 'survived']
sex_counts.iplot(kind = 'bar')

#### As seen above, it is obvious that sex of the person has a relation with his/her survival, hence we keep this feature.

### 4) Age

In [11]:
# Marker-only plot of the Age and Survived columns, with Age requested as the x axis.
# NOTE(review): `x` is passed as a one-element list; confirm cufflinks does not
# expect a plain column name here.
train[['Age','Survived']].iplot(mode = 'markers', x = ['Age'])

As seen above, there is no significant pattern in the relationship between Age and Survival, and some Age data is also missing, as seen in the heatmap at the top. This feature may affect the accuracy of the model negatively, so let us drop this feature in this variation.

In [12]:
# Remove the Age column for this variation of the cleaning.
train = train.drop(columns = ['Age'])

### 5) SibSp

In [13]:
# List the distinct SibSp values present in the data.
train['SibSp'].unique()

array([1, 0, 3, 4, 2, 5, 8])

In [14]:
# Survival counts per SibSp value, with rows in order of first appearance in the data.
# The levels are computed once (missing values excluded) rather than re-deriving
# `unique()` for every cell of the table.
sibsp_levels = train['SibSp'].dropna().unique()
sibsp_counts = pd.crosstab(train['SibSp'], train['Survived']).reindex(index = sibsp_levels, columns = [0, 1], fill_value = 0)
sibsp_counts.index = ["SibSp " + str(level) for level in sibsp_levels]
sibsp_counts.columns = ['didn\'t survive', 'survived']
sibsp_counts.iplot(kind = 'bar')

Although there is very little information when SibSp > 2, we still see some difference in pattern between SibSp = 1 and SibSp = 0. Also, this feature has no missing data, so let us keep it.

### 6) Parch

In [15]:
# List the distinct Parch values present in the data.
train['Parch'].unique()

array([0, 1, 2, 5, 3, 4, 6])

In [16]:
# Survival counts per Parch value, with rows in order of first appearance in the data.
# The levels are computed once (missing values excluded) rather than re-deriving
# `unique()` for every cell of the table.
parch_levels = train['Parch'].dropna().unique()
parch_counts = pd.crosstab(train['Parch'], train['Survived']).reindex(index = parch_levels, columns = [0, 1], fill_value = 0)
parch_counts.index = ["Parch " + str(level) for level in parch_levels]
parch_counts.columns = ['didn\'t survive', 'survived']
parch_counts.iplot(kind = 'bar')

The Parch feature shows similar behaviour to SibSp, so we keep it too.

### 7) Ticket

In [17]:
# Count the distinct ticket strings to judge whether the column is usable as a category.
train['Ticket'].nunique()

681

There are too many types of tickets, and they are obviously in string format. It will be difficult to find any pattern between this feature and survival. It could still be used to fill in the missing values of the Embarked feature, but since there are only 2 missing values, this isn't significant. Hence, we drop this feature.

In [18]:
# Remove the Ticket column, which has too many distinct values to be useful here.
train = train.drop(columns = ['Ticket'])

### 8) Fare

In [19]:
# Marker-only plot of the Fare and Survived columns, with Fare requested as the x axis.
# NOTE(review): `x` is passed as a one-element list; confirm cufflinks does not
# expect a plain column name here.
train[['Fare','Survived']].iplot(mode = 'markers', x = ['Fare'])

Again we don't see any pattern suitable to keep the Fare feature. We drop this too.

In [20]:
# Remove the Fare column for this variation of the cleaning.
train = train.drop(columns = ['Fare'])

### 9) Cabin

In [21]:
# Count the distinct cabin labels to judge whether the column is usable as a category.
train['Cabin'].nunique()

147

Since a huge amount of this feature's data is missing, as seen at the top, and since it also has too many unique entries, it won't help our model. Hence we drop this feature.

In [22]:
# Remove the Cabin column: mostly missing and too many distinct values.
train = train.drop(columns = ['Cabin'])

### 10) Embarked

In [23]:
# List the distinct embarkation codes, including the missing-value marker if present.
train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

Since there are only 2 missing values in Embarked against 889 others, we can safely drop these 2 rows; randomly selecting a value for them may confuse the model, as Embarked has only 3 possible values.

In [24]:
# Drop only the rows whose Embarked value is missing.
# Restricting the check to Embarked keeps this cell correct even if a column that
# still contains missing values (e.g. Age) is kept in a later variation; an
# unrestricted dropna() would silently discard every row with any missing cell.
train = train.dropna(subset = ['Embarked'])

In [25]:
# Renumber the rows 0..n-1 so the index has no gaps left by the dropped rows.
train.index = list(range(len(train)))

In [26]:
# Confirm that no missing Embarked value remains.
train['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [27]:
# Survival counts per embarkation port, with rows in order of first appearance in the data.
# The levels are computed once (missing values excluded) rather than re-deriving
# `unique()` for every cell of the table.
embarked_levels = train['Embarked'].dropna().unique()
embarked_counts = pd.crosstab(train['Embarked'], train['Survived']).reindex(index = embarked_levels, columns = [0, 1], fill_value = 0)
embarked_counts.index = ["Embarked " + str(level) for level in embarked_levels]
embarked_counts.columns = ['didn\'t survive', 'survived']
embarked_counts.iplot(kind = 'bar')

As seen above, there is some relation between where the passenger embarked and his/her survival. So we keep this feature

# Label Encoding

In [28]:
from sklearn.preprocessing import LabelEncoder

In [29]:
# Single encoder instance shared by the cells below; every fit_transform call refits it,
# so it only ever remembers the classes of the most recently encoded column.
le = LabelEncoder()

In [30]:
# Inspect the raw Sex labels before encoding.
train['Sex'].unique()

array(['male', 'female'], dtype=object)

In [31]:
# Replace the Sex labels with integer codes (in the recorded run: female -> 0, male -> 1).
train['Sex'] = le.fit_transform(train['Sex'])

In [32]:
# Check the integer codes now stored in Sex.
train['Sex'].unique()

array([1, 0])

In [33]:
# Inspect the raw Embarked labels before encoding.
train['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [34]:
# Replace the Embarked labels with integer codes (in the recorded run: C -> 0, Q -> 1, S -> 2).
# This refits the shared `le`, so the Sex classes it learned earlier are overwritten.
# NOTE(review): any later transform of another dataset's Sex column needs its own
# fitted encoder — confirm how the test data is encoded.
train['Embarked'] = le.fit_transform(train['Embarked'])

In [35]:
# Check the integer codes now stored in Embarked.
train['Embarked'].unique()

array([2, 0, 1])

# Encoding

As the features Embarked, Parch, SibSp & Pclass have more than 2 values, their integer codes may confuse the model into treating them as numeric magnitudes of those features. We need to encode these features as {0,1} by splitting them into multiple indicator features. This can be done with OneHotEncoding + Regularization (to remove the redundant features), OR with the get_dummies method, dropping the first level of each feature. Let us use the get_dummies method.

In [36]:
# One-hot encode the multi-valued categorical columns, dropping the first level of
# each so that no indicator is redundant with the others.
# The dtype is pinned because pandas >= 2.0 produces bool indicators by default,
# which would be displayed and exported as True/False instead of 0/1.
# NOTE(review): the indicator columns depend on the levels present in this frame;
# any other dataset fed to the same model must be aligned to these columns — confirm.
train = pd.get_dummies(train, columns = ['Pclass','Embarked','SibSp','Parch'], drop_first = True, dtype = 'uint8')

In [37]:
# Preview the encoded frame with its indicator columns.
train.head()

Unnamed: 0,PassengerId,Survived,Sex,Pclass_2,Pclass_3,Embarked_1,Embarked_2,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6
0,1,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0
1,2,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,3,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,4,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
4,5,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0


## Final Formatting and export

In [38]:
# Keep the target column aside before it is removed from the feature frame.
train_result = train['Survived']

In [39]:
# Remove the target column so that `train` holds only the input features.
train = train.drop(columns = ['Survived'])

In [40]:
# Export the cleaned feature frame as CSV into the current working directory.
# The row index is written as well (no `index` argument is passed), and the file
# name carries no extension.
train.to_csv('Cleaner1-train')

In [41]:
# Export the target column as CSV; header = True requests a header row for the column.
train_result.to_csv('Cleaner1-train_result',header = True)

# Cleaning Test Data