In [120]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


**Loading the Dataset**

In [121]:
data = pd.read_csv('/content/titanic_dataset.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**Setting Index**

In [122]:
data.set_index('PassengerId')

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


**Checking for Null values**

In [123]:
data.isna().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

**Filling Null Values**

In [124]:
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Cabin'] = data['Cabin'].fillna(0)
data['Embarked'] = data['Embarked'].fillna(0)
data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

**Basic Details of the Dataset**

In [125]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [126]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


**Outliers**

Ouliers in Age

In [127]:
Q1_age = np.percentile(data['Age'],25, interpolation='midpoint')
Q2_age = np.percentile(data['Age'],50, interpolation='midpoint')
Q3_age = np.percentile(data['Age'],75, interpolation='midpoint')

In [128]:
IQR_age  = Q3_age - Q1_age
print(IQR_age)

13.0


In [129]:
low_lim_age = Q1_age-1.5*IQR_age
up_lim_age = Q3_age+1.5*IQR_age


In [130]:
print(low_lim_age)
print(up_lim_age)

2.5
54.5


In [131]:
outlier_age = []
for x in data['Age']:
  if((x>up_lim_age) or (x<low_lim_age)):
    outlier_age.append(x)

In [132]:
print('Ouliers in Age : ',outlier_age)


Ouliers in Age :  [2.0, 58.0, 55.0, 2.0, 66.0, 65.0, 0.83, 59.0, 71.0, 70.5, 2.0, 55.5, 1.0, 61.0, 1.0, 56.0, 1.0, 58.0, 2.0, 59.0, 62.0, 58.0, 63.0, 65.0, 2.0, 0.92, 61.0, 2.0, 60.0, 1.0, 1.0, 64.0, 65.0, 56.0, 0.75, 2.0, 63.0, 58.0, 55.0, 71.0, 2.0, 64.0, 62.0, 62.0, 60.0, 61.0, 57.0, 80.0, 2.0, 0.75, 56.0, 58.0, 70.0, 60.0, 60.0, 70.0, 0.67, 57.0, 1.0, 0.42, 2.0, 1.0, 62.0, 0.83, 74.0, 56.0]


In [133]:
#Removing Outliers
age_outliers = np.where(data["Age"] > up_lim_age, True,
                np.where(data["Age"] < low_lim_age, True, False))
age_without_outliers = data.loc[~(age_outliers)]

Outliers in Fare

In [134]:
Q1_fare = np.percentile(data['Fare'],25, interpolation='midpoint')
Q2_fare = np.percentile(data['Fare'],50, interpolation='midpoint')
Q3_fare = np.percentile(data['Fare'],75, interpolation='midpoint')

IQR_fare  = Q3_fare - Q1_fare
print('Inter-quartile Range')
print(IQR_fare)

low_lim_fare = Q1_fare-1.5*IQR_fare
up_lim_fare = Q3_fare+1.5*IQR_fare
print('Lower Limit: ',low_lim_fare)
print('Upper Limit: ',up_lim_fare)

print('Outliers in Fare:')
outlier_fare = []

for x in data['Fare']:
  if((x>up_lim_fare) or (x<low_lim_fare)):
    outlier_fare.append(x)

print(outlier_fare)


Inter-quartile Range
23.0896
Lower Limit:  -26.724
Upper Limit:  65.6344
Outliers in Fare:
[71.2833, 263.0, 146.5208, 82.1708, 76.7292, 80.0, 83.475, 73.5, 263.0, 77.2875, 247.5208, 73.5, 77.2875, 79.2, 66.6, 69.55, 69.55, 146.5208, 69.55, 113.275, 76.2917, 90.0, 83.475, 90.0, 79.2, 86.5, 512.3292, 79.65, 153.4625, 135.6333, 77.9583, 78.85, 91.0792, 151.55, 247.5208, 151.55, 110.8833, 108.9, 83.1583, 262.375, 164.8667, 134.5, 69.55, 135.6333, 153.4625, 133.65, 66.6, 134.5, 263.0, 75.25, 69.3, 135.6333, 82.1708, 211.5, 227.525, 73.5, 120.0, 113.275, 90.0, 120.0, 263.0, 81.8583, 89.1042, 91.0792, 90.0, 78.2667, 151.55, 86.5, 108.9, 93.5, 221.7792, 106.425, 71.0, 106.425, 110.8833, 227.525, 79.65, 110.8833, 79.65, 79.2, 78.2667, 153.4625, 77.9583, 69.3, 76.7292, 73.5, 113.275, 133.65, 73.5, 512.3292, 76.7292, 211.3375, 110.8833, 227.525, 151.55, 227.525, 211.3375, 512.3292, 78.85, 262.375, 71.0, 86.5, 120.0, 77.9583, 211.3375, 79.2, 69.55, 120.0, 93.5, 80.0, 83.1583, 69.55, 89.1042, 164.8

In [135]:
#Removing Outliers
fare_outliers = np.where(data["Fare"] > up_lim_fare, True,
                np.where(data["Fare"] < low_lim_fare, True, False))
fare_without_outliers = data.loc[~(fare_outliers)]

Outliers in SibSp

In [136]:
Q1_SibSp = np.percentile(data['SibSp'],25, interpolation='midpoint')
Q2_SibSp = np.percentile(data['SibSp'],50, interpolation='midpoint')
Q3_SibSp = np.percentile(data['SibSp'],75, interpolation='midpoint')

IQR_SibSp  = Q3_SibSp - Q1_SibSp
print('Inter-quartile Range')
print(IQR_SibSp)

low_lim_SibSp = Q1_SibSp-1.5*IQR_SibSp
up_lim_SibSp = Q3_SibSp+1.5*IQR_SibSp
print('Lower Limit: ',low_lim_SibSp)
print('Upper Limit: ',up_lim_SibSp)

print('Outliers in SibSp:')
outlier_SibSp = []

for x in data['SibSp']:
  if((x>up_lim_SibSp) or (x<low_lim_SibSp)):
    outlier_SibSp.append(x)

print(outlier_SibSp)

Inter-quartile Range
1.0
Lower Limit:  -1.5
Upper Limit:  2.5
Outliers in SibSp:
[3, 4, 3, 3, 4, 5, 3, 4, 5, 3, 3, 4, 8, 4, 4, 3, 8, 4, 8, 3, 4, 4, 4, 4, 8, 3, 3, 5, 3, 5, 3, 4, 4, 3, 3, 5, 4, 3, 4, 8, 4, 3, 4, 8, 4, 8]


In [137]:
#Removing Outliers
sibsp_outliers = np.where(data["SibSp"] > up_lim_SibSp, True,
                np.where(data["SibSp"] < low_lim_SibSp, True, False))
sibsp_without_outliers = data.loc[~(sibsp_outliers)]

**Min Max Scaling**

In [138]:
from sklearn.preprocessing import MinMaxScaler
#dropping categorical data
x = data.drop(['Survived', 'Sex', 'Embarked','Pclass','Ticket','Cabin','Name','PassengerId'],axis=1)
mms = MinMaxScaler(feature_range=(0,1))
x = mms.fit_transform(x)
x = pd.DataFrame(x,columns=['Age', 'SibSp', 'Parch', 'Fare'])
x

Unnamed: 0,Age,SibSp,Parch,Fare
0,0.271174,0.125,0.000000,0.014151
1,0.472229,0.125,0.000000,0.139136
2,0.321438,0.000,0.000000,0.015469
3,0.434531,0.125,0.000000,0.103644
4,0.434531,0.000,0.000000,0.015713
...,...,...,...,...
886,0.334004,0.000,0.000000,0.025374
887,0.233476,0.000,0.000000,0.058556
888,0.346569,0.125,0.333333,0.045771
889,0.321438,0.000,0.000000,0.058556


In [139]:
x1 = data.drop(['Age', 'SibSp', 'Parch', 'Fare'],axis=1)
x = pd.concat([x,x1],axis=1)
x

Unnamed: 0,Age,SibSp,Parch,Fare,PassengerId,Survived,Pclass,Name,Sex,Ticket,Cabin,Embarked
0,0.271174,0.125,0.000000,0.014151,1,0,3,"Braund, Mr. Owen Harris",male,A/5 21171,0,S
1,0.472229,0.125,0.000000,0.139136,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,0.321438,0.000,0.000000,0.015469,3,1,3,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,0,S
3,0.434531,0.125,0.000000,0.103644,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,0.434531,0.000,0.000000,0.015713,5,0,3,"Allen, Mr. William Henry",male,373450,0,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.334004,0.000,0.000000,0.025374,887,0,2,"Montvila, Rev. Juozas",male,211536,0,S
887,0.233476,0.000,0.000000,0.058556,888,1,1,"Graham, Miss. Margaret Edith",female,112053,B42,S
888,0.346569,0.125,0.333333,0.045771,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,W./C. 6607,0,S
889,0.321438,0.000,0.000000,0.058556,890,1,1,"Behr, Mr. Karl Howell",male,111369,C148,C
