
# Titanic Dataset Preprocessing 🧹🚢

This notebook demonstrates essential data preprocessing techniques on the Titanic dataset: handling missing values, removing duplicate rows, encoding categorical variables, scaling numerical features, and removing outliers with the IQR method.


## 🔹 Task 1: Load the Dataset

In [None]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Load the Titanic passenger table through seaborn's built-in dataset loader
df = sns.load_dataset(name='titanic')

# Preview the first five rows
df.head(n=5)


## 🔹 Task 2: Handle Missing Values

In [None]:

# Check missing values.
# Printed explicitly: a notebook only auto-displays the LAST expression of a
# cell, so a bare `df.isnull().sum()` in the middle of the cell shows nothing.
print(df.isnull().sum())

# Drop columns with more than 50% missing values.
# `thresh` is the minimum number of non-null values a column needs in order to
# be kept; pandas documents it as an int, so round the 50% cut-off up.
threshold = int(np.ceil(len(df) * 0.5))
df = df.dropna(thresh=threshold, axis=1)

# Fill numerical columns with median.
# Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
# the in-place call operates on a temporary column object, which triggers a
# chained-assignment warning and leaves `df` untouched under Copy-on-Write.
num_cols = df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill categorical columns with mode (first mode when there are ties)
cat_cols = df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Confirm nothing is left to impute
df.isnull().sum()


## 🔹 Task 3: Remove Duplicate Rows

In [None]:

# Flag every row that repeats an earlier row, count the flags,
# then keep only the unflagged rows.
dup_mask = df.duplicated()
duplicates = dup_mask.sum()
df = df[~dup_mask]
print(f"Removed {duplicates} duplicate rows.")


## 🔹 Task 4: Encode Categorical Variables

In [None]:

# Convert boolean columns to int
df['alone'] = df['alone'].astype(int)
df['adult_male'] = df['adult_male'].astype(int)

# One-hot encoding for categorical columns.
# Only encode the columns that are still present: the missing-value step drops
# any column that is more than 50% empty (which can remove a sparse column such
# as 'deck'), and pd.get_dummies raises KeyError when asked to encode a column
# that does not exist.
categorical_cols = ['sex', 'embarked', 'class', 'who', 'deck', 'embark_town']
cols_to_encode = [name for name in categorical_cols if name in df.columns]
if cols_to_encode:
    df = pd.get_dummies(df, columns=cols_to_encode, drop_first=True)
df.head()


## 🔹 Task 5: Feature Scaling

In [None]:

# Columns to rescale
num_cols = ['age', 'fare', 'parch', 'sibsp']

# One scaler per technique: min-max normalisation and z-score standardisation
scaler_minmax, scaler_std = MinMaxScaler(), StandardScaler()

# Each scaled variant lives in its own copy so `df` itself stays unscaled
df_minmax, df_std = df.copy(), df.copy()
df_minmax[num_cols] = scaler_minmax.fit_transform(df[num_cols])
df_std[num_cols] = scaler_std.fit_transform(df[num_cols])

# Display scaled data
df_minmax.head()


## 🔹 Task 6: Outlier Detection Using IQR

In [None]:

# Remove outliers with the 1.5 * IQR rule, one column after another.
# Each column's fences are computed on the rows that survived the previous
# column's filter.
for col in ['age', 'fare']:
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    # between() is inclusive on both ends, i.e. lower <= value <= upper
    inside_fences = df[col].between(lower, upper)
    df = df[inside_fences]

df.shape


## ✅ Final Cleaned Dataset

In [None]:

# Write the cleaned frame to disk without the index column, then preview it
df.to_csv(path_or_buf="cleaned_titanic.csv", index=False)
df.head(n=5)
