### AIM: PERFORMING FEATURE ENGINEERING ON TITANIC DATASET

In [363]:
# Importing Libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [365]:
# Display all the columns of the dataframe
pd.pandas.set_option('display.max_columns',None)
dataset=pd.read_csv('Titanic_Dataset.csv')

# Print shape of dataset with rows and columns
print(dataset.shape)

# Printing top15 records
dataset.head(15)


(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


#### NOTE : Observe the missing values in the dataset.
#### Age : Column 5,17,19,etc.
#### Cabin : Column 0,2,4,5,etc.
#### Embarked : Column 61,829,etc.

### 1.Handling of Missing Values 

In [332]:
#Splitting Dataset to prevent data from Leakage
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(dataset,dataset['Survived'],test_size=0.1,random_state=0)


In [333]:
X_train.shape, X_test.shape


((801, 12), (90, 12))

### Missing Features :  Age, Cabin and Embarked 
### 1.1 For Numerical Variables

In [334]:
numerical_features = [feature for feature in dataset.columns if dataset[feature].dtypes != 'O']
print("\n")
print('Number of numerical variables: {}\nThey are :'.format(len(numerical_features)))
for f in numerical_features:
    print("    ",f)
    



Number of numerical variables: 7
They are :
     PassengerId
     Survived
     Pclass
     Age
     SibSp
     Parch
     Fare


### Replacing the numerical Missing Values

In [335]:
for feature in numerical_with_nan:
    ## We will replace by using median since there are outliers
    median_value=dataset[feature].median()
    ## create a new feature to capture nan values
    dataset[feature+'nan']=np.where(dataset[feature].isnull(),1,0)
    dataset[feature].fillna(median_value,inplace=True)
dataset[numerical_with_nan].isnull().sum()


Age    0
dtype: int64

In [336]:
dataset.head(20)
#Observe value under "Age" which was missing earlier.


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Agenan
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0
5,6,0,3,"Moran, Mr. James",male,28.0,0,0,330877,8.4583,,Q,1
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,0
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,0


### 1.2 For Categorical Variables

In [337]:
categorical_features=[feature for feature in dataset.columns if dataset[feature].dtypes=='O']
print(dataset[categorical_features].head())


                                                Name     Sex  \
0                            Braund, Mr. Owen Harris    male   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
2                             Heikkinen, Miss. Laina  female   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   
4                           Allen, Mr. William Henry    male   

             Ticket Cabin Embarked  
0         A/5 21171   NaN        S  
1          PC 17599   C85        C  
2  STON/O2. 3101282   NaN        S  
3            113803  C123        S  
4            373450   NaN        S  


In [338]:
## Let us capture all the nan values
## First lets handle Categorical features which are missing
features_nan=[feature for feature in dataset.columns if dataset[feature].isnull().sum()>1 and dataset[feature].dtypes=='O']
for feature in features_nan:
    print("{}: {}% missing values".format(feature,np.round(dataset[feature].isnull().mean(),4)))
    

Cabin: 0.771% missing values
Embarked: 0.0022% missing values


In [339]:
# Replace missing value with a new label
def replace_missing_feature(dataset,features_nan):
    data=dataset.copy()
    data[features_nan]=data[features_nan].fillna('Missing')
    return data
dataset=replace_missing_feature(dataset,features_nan)
dataset[features_nan].isnull().sum()


Cabin       0
Embarked    0
dtype: int64

In [340]:
dataset.head(64)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Agenan
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,Missing,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,Missing,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,Missing,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,60,0,3,"Goodwin, Master. William Frederick",male,11.0,5,2,CA 2144,46.9000,Missing,S,0
60,61,0,3,"Sirayanian, Mr. Orsen",male,22.0,0,0,2669,7.2292,Missing,C,0
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0000,B28,Missing,0
62,63,0,1,"Harris, Mr. Henry Birkhardt",male,45.0,1,0,36973,83.4750,C83,S,0


### 2. Handling of Outliers 

In [342]:
discrete_feature=[feature for feature in numerical_features if len(dataset[feature].unique())<25]
print("Discrete Variables Count: {}".format(len(discrete_feature)))


Discrete Variables Count: 4


In [343]:
continuous_feature=[feature for feature in numerical_features if feature not in discrete_feature]
print("Continuous feature Count {}".format(len(continuous_feature)))
# visualise the Continuous variables
dataset[continuous_feature].head()


Continuous feature Count 3


Unnamed: 0,PassengerId,Age,Fare
0,1,22.0,7.25
1,2,38.0,71.2833
2,3,26.0,7.925
3,4,35.0,53.1
4,5,35.0,8.05


#### Detecting Outliers of Variable "Age" using Percentile

In [345]:
max_threshold= dataset['Age'].quantile(0.98)
max_threshold


62.0

#### Taking data samples above 98percentile => people above age of 62.0 lies above 98percentile and can be considered as outlier

In [346]:
min_threshold = dataset['Age'].quantile(0.05)
min_threshold


6.0

#### Taking data samples below 5 percentile => people below age of 6.0 lies below 5 percentile and can be considered as outlier

# 
#  
# 
# 
### Removing Outliers

In [347]:
df_no_outliers= dataset[(dataset['Age']>min_threshold) & (dataset['Age']<max_threshold)]
df_no_outliers


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Agenan
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,Missing,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,Missing,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,Missing,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,Missing,S,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,Missing,S,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,0


In [348]:
print("Total Outliers Removed :",dataset.shape[0]-df_no_outliers.shape[0])

Total Outliers Removed : 66


# 
### Detecting outliers of Variable "Fare" using IQR

In [349]:
Q1 = dataset.Fare.quantile(0.25)
Q3 = dataset.Fare.quantile(0.75)
Q1, Q3


(7.9104, 31.0)

In [350]:
IQR = Q3 - Q1
IQR


23.0896

In [351]:
lower_limit = Q1 - 1.5*IQR
upper_limit = Q3 + 1.5*IQR
lower_limit, upper_limit


(-26.724, 65.6344)

### Removing outliers

In [352]:
dataset[(dataset.Fare<lower_limit)&(dataset.Fare>upper_limit)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Agenan


In [353]:
df_no_outlier1 = dataset[(dataset.Fare>lower_limit)&(dataset.Fare<upper_limit)]
df_no_outlier1


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Agenan
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,Missing,S,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,Missing,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,Missing,S,0
5,6,0,3,"Moran, Mr. James",male,28.0,0,0,330877,8.4583,Missing,Q,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,Missing,S,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,Missing,S,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,0


In [354]:
print("Total Outliers Removed :",dataset.shape[0]-df_no_outlier1.shape[0])

Total Outliers Removed : 116


# 
# 
#  
#### Detecting Outliers of variable "Pclass" using Z-Score

In [367]:
dataset['zscore'] = ( dataset.Pclass - dataset.Pclass.mean() ) / dataset.Pclass.std()
dataset.head(5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,zscore
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0.826913
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,-1.565228
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0.826913
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,-1.565228
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0.826913


#### Observation : Z score is 0.827 (first record) of passenger 1. This means passenger1 is 0.827 standard deviation away from mean.

### So considering data points that has z score higher than 1 or lower than -1.

In [368]:
dataset[(dataset.zscore<-1) | (dataset.zscore>1)]


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,zscore
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,-1.565228
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,-1.565228
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,-1.565228
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S,-1.565228
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5000,A6,S,-1.565228
...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S,-1.565228
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S,-1.565228
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C,-1.565228
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,-1.565228


# 
#  
#### Removing outliers and producing new dataframe

In [369]:
df_no_outliers2 = dataset[(dataset.zscore>-1) & (dataset.zscore<1)]
df_no_outliers2.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,zscore
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0.826913
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0.826913
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0.826913
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,0.826913
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,0.826913


In [370]:
print("Total Outliers Removed :",dataset.shape[0]-df_no_outliers2.shape[0])


Total Outliers Removed : 216


### 3. Handling Rare Categorical Feature

### We will remove categorical variables that are present less than 5% of the observations

In [359]:
for feature in categorical_features:
    temp=dataset.groupby(feature)['Survived'].count()/len(dataset)
    temp_df=temp[temp>0.05].index
    dataset[feature]=np.where(dataset[feature].isin(temp_df),dataset[feature],'Rare_var')
    

In [362]:
dataset.head(20)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Agenan,zscore
0,1,0,3,Rare_var,male,22.0,1,0,Rare_var,7.25,Missing,S,0,0.826913
1,2,1,1,Rare_var,female,38.0,1,0,Rare_var,71.2833,Rare_var,C,0,-1.565228
2,3,1,3,Rare_var,female,26.0,0,0,Rare_var,7.925,Missing,S,0,0.826913
3,4,1,1,Rare_var,female,35.0,1,0,Rare_var,53.1,Rare_var,S,0,-1.565228
4,5,0,3,Rare_var,male,35.0,0,0,Rare_var,8.05,Missing,S,0,0.826913
5,6,0,3,Rare_var,male,28.0,0,0,Rare_var,8.4583,Missing,Q,1,0.826913
6,7,0,1,Rare_var,male,54.0,0,0,Rare_var,51.8625,Rare_var,S,0,-1.565228
7,8,0,3,Rare_var,male,2.0,3,1,Rare_var,21.075,Missing,S,0,0.826913
8,9,1,3,Rare_var,female,27.0,0,2,Rare_var,11.1333,Missing,S,0,0.826913
9,10,1,2,Rare_var,female,14.0,1,0,Rare_var,30.0708,Missing,C,0,-0.369158


### OBSERVATIONS :
#### 1. While Handling the Missing Value : 
####     1.1 Of Numerical Variable:  In the original dataset, the value for Passenger 8,17,20,27,... etc are missing. Here we have taken     the median using outliers and are replacing the particular age value with the most suitable float.
####     1.2 Of Categorical Variable: We have changed the missing value of Cabin & Embarked by a temporary word "Missing" (it can be seen under Cabin Row 0,2,4,etc Columns && under Embarked Row Column 61)
# 
# 
#### 2. While Handling the Outliers, I observed that we can't detect Outliers for Categorical variable using Graphical Methods such as Boxplot or Scatterplot but cannot remove it. Numerical Variable's outliers can be detected by calculating Statistical Values such as StdDev, Percentile, Z-Score.
#### 3. While Handling Rare Variables, it is observed that this is prefferably calculated for categorical values having sufficient options(Cabin)  since if its distinct all the values will be rare(eg-Name,Ticket).

# 
### CONCLUSION :
#### I understood the Titanic dataset by performing Feature Engineering such as handling missing values for numerical & categorical variables,handling the outliers, dealing with rare variables and concluded the following :
#### 1. "Age" is a numerical variable and its missing values can be handled by replacing the missing value with the median using outliers.
#### 2. "Cabin" & "Embarked" are categorical variables and its missing value can be simply replaced by a temporary keyword.
#### 3. Outliers can be further classified as numerical & categorical based on its variable type.For numerical variable, we can detect the outliers by calculating Statistical Values such as StdDev, Percentile, Z-Score.
#### 4.According to this dataset,rare variables are suitable for detecting the uncommon feature a data has. 
