In [1]:
#!pip install matplotlib 

In [2]:
#!pip install seaborn 

In [26]:
#!pip install scipy

In [21]:
import pandas as pd
import numpy as np

In [22]:
df = pd.read_csv('archive/titanic.csv')

##  Data Understanding

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [24]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Drop Irrelevant or Redundant Columns

In [25]:
df.drop(['PassengerId','Name','Cabin', 'Ticket', ], axis=1, inplace=True)

In [26]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Handle Missing Values

In [27]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [28]:
df['Age'].fillna(df['Age'].median(), inplace=True)   # Fill age with median
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)  # Fill embarked with mode
df.dropna(subset=['Fare'], inplace=True)  # Drop if Fare is missing

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)   # Fill age with median
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)  # Fill embarked with mode


# Convert Categorical Columns

In [29]:
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# One-hot encode Embarked
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

In [30]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,False,True
1,1,1,1,38.0,1,0,71.2833,False,False
2,1,3,1,26.0,0,0,7.925,False,True
3,1,1,1,35.0,1,0,53.1,False,True
4,0,3,0,35.0,0,0,8.05,False,True


# Feature Engineering 
Create a new feature: FamilySize

In [31]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

In [32]:
bins_age = [0, 12, 18, 35, 60, 100]
labels_age = ['Child', 'Teen', 'Young Adult', 'Adult', 'Senior']

df['Age_Bin'] = pd.cut(df['Age'], bins=bins_age, labels=labels_age)

bins_fare = [0, 7.91, 14.45, 31, 600]
labels_fare = ['Low', 'Medium', 'High', 'Very High']

df['Fare_Bin'] = pd.cut(df['Fare'], bins=bins_fare, labels=labels_fare)

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,FamilySize,Age_Bin,Fare_Bin
0,0,3,0,22.0,1,0,7.25,False,True,2,Young Adult,Low
1,1,1,1,38.0,1,0,71.2833,False,False,2,Adult,Very High
2,1,3,1,26.0,0,0,7.925,False,True,1,Young Adult,Medium
3,1,1,1,35.0,1,0,53.1,False,True,2,Young Adult,Very High
4,0,3,0,35.0,0,0,8.05,False,True,1,Young Adult,Medium


In [33]:
df['Age_Bin'] = df['Age_Bin'].map({
    'Child': 0,
    'Teen': 1,
    'Young Adult': 2,
    'Adult': 3,
    'Senior': 4
})

df['Fare_Bin'] = df['Fare_Bin'].map({
    'Low': 0,
    'Medium': 1,
    'High': 2,
    'Very High': 3
})

In [34]:
df.drop(['Age','Fare'], axis=1, inplace=True)

In [35]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked_Q,Embarked_S,FamilySize,Age_Bin,Fare_Bin
0,0,3,0,1,0,False,True,2,2,0
1,1,1,1,1,0,False,False,2,3,3
2,1,3,1,0,0,False,True,1,2,1
3,1,1,1,1,0,False,True,2,2,3
4,0,3,0,0,0,False,True,1,2,1


In [36]:
df['Embarked_Q'] = df['Embarked_Q'].astype(int)
df['Embarked_S'] = df['Embarked_S'].astype(int)

In [37]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked_Q,Embarked_S,FamilySize,Age_Bin,Fare_Bin
0,0,3,0,1,0,0,1,2,2,0
1,1,1,1,1,0,0,0,2,3,3
2,1,3,1,0,0,0,1,1,2,1
3,1,1,1,1,0,0,1,2,2,3
4,0,3,0,0,0,0,1,1,2,1
