In [1]:
# Importing library 
import pandas as pd

In [3]:
# Read the Titanic training set from disk and print its first five rows
data = pd.read_csv(filepath_or_buffer='titanic_train.csv')
print(data.head(n=5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [4]:
# Count the null entries in each column
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
# Age (177 of 891 rows) and Cabin (687 of 891 rows) have many missing values.
# Embarked has only 2 missing values.

In [6]:
# Imputing Missing Values Using central tendency

In [8]:
# Average of the recorded ages (NaN entries are skipped)
mean_val = data['Age'].mean(skipna=True)
mean_val

29.69911764705882

In [9]:
# Work on a separate frame so that `data` keeps the raw values
data_cleaned = data.copy(deep=True)

# Replace every missing Age with `mean_val`, then confirm that no nulls remain
data_cleaned['Age'] = data['Age'].fillna(mean_val)
data_cleaned['Age'].isna().sum()

0

In [10]:
# Frequency of each embarkation port (missing values are not counted)
data['Embarked'].value_counts(dropna=True)

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [11]:
# Most frequent embarkation port; mode() returns a Series, so take the entry labelled 0
mode_val = data['Embarked'].mode().loc[0]
mode_val

'S'

In [12]:
# Fill the missing Embarked entries with the most frequent port (`mode_val`)
data_cleaned['Embarked'] = data['Embarked'].fillna(mode_val)

In [13]:
# Dealing with Categorical Variable 

In [14]:
# Column dtypes: the `object` columns (Name, Sex, Ticket, Cabin, Embarked)
# are the categorical variables in the data
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [15]:
# Names of the string-valued (categorical) columns
categorical_cols = [
    'Name',
    'Sex',
    'Ticket',
    'Cabin',
    'Embarked',
]

In [16]:
# Count the distinct values in each categorical column
data[categorical_cols].nunique(axis=0)

Name        891
Sex           2
Ticket      681
Cabin       147
Embarked      3
dtype: int64

In [17]:
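# Sex and Embarked have only a few unique values, so they can be one-hot encoded.
# Name, Ticket and Cabin have too many unique values: one-hot encoding them would
# produce very sparse columns (mostly zeros), so they need to be dealt with
# differently (e.g. by extracting features from them).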

In [18]:
# One-hot Encoding

In [19]:
# Demo: expand Embarked into one indicator column per port (preview only)
(
    pd.get_dummies(data['Embarked'])
    .head()
)

Unnamed: 0,C,Q,S
0,False,False,True
1,True,False,False
2,False,False,True
3,False,False,True
4,False,False,True


In [20]:
# Remove the high-cardinality text columns from the cleaned frame
data_cleaned = data_cleaned.drop(columns=['Name', 'Ticket', 'Cabin'])

In [21]:
# One-hot encode the remaining string columns (Sex and Embarked); the numeric
# columns pass through get_dummies unchanged.
data_cleaned = pd.get_dummies(data_cleaned)
data_cleaned.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,False,True,False,False,True
1,2,1,1,38.0,1,0,71.2833,True,False,True,False,False
2,3,1,3,26.0,0,0,7.925,True,False,False,False,True
3,4,1,1,35.0,1,0,53.1,True,False,False,False,True
4,5,0,3,35.0,0,0,8.05,False,True,False,False,True


In [22]:
# SibSp and Parch hold discrete values
# We can convert them into separate columns as well

In [23]:
# Label Encoding

In [24]:
# Show the first five rows of the original (un-encoded) frame
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [25]:
# Preview label encoding: Series.map swaps each port letter for its integer code.
# Values that are not keys of the mapping (including NaN) come back as NaN,
# which is why the result has a float dtype.
data['Embarked'].map({
    'Q': 0,
    'S': 1,
    'C': 2,
})

0      1.0
1      2.0
2      1.0
3      1.0
4      1.0
      ... 
886    1.0
887    1.0
888    1.0
889    2.0
890    0.0
Name: Embarked, Length: 891, dtype: float64

In [26]:
# Label-encode Embarked in place (Q -> 0, S -> 1, C -> 2).
# A plain `.map(dict)` turns every value that is not a key into NaN, so running
# this cell a second time (when the column already holds the codes) would wipe
# the whole column. Falling back to the existing value keeps the cell safe to
# re-run; rows that were missing stay NaN, and any unexpected value is left
# as-is instead of silently becoming NaN.
embarked_codes = {'Q': 0, 'S': 1, 'C': 2}
data['Embarked'] = data['Embarked'].map(lambda port: embarked_codes.get(port, port))
data['Embarked'].head()

0    1.0
1    2.0
2    1.0
3    1.0
4    1.0
Name: Embarked, dtype: float64