# AI/ML Internship Task 1: Data Cleaning & Preprocessing

**Objective:** Clean and prepare the Titanic dataset for machine learning.

**Tools:** Python, Pandas, NumPy, Matplotlib, Seaborn, scikit-learn

In [None]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Step 2: Load the Titanic dataset
# The CSV is fetched over HTTP from the URL below; the trailing head() call
# previews the first rows as the cell output.
TITANIC_CSV_URL = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(TITANIC_CSV_URL)
df.head()

In [None]:
# Step 3: Explore the dataset
# Print the shape, column info, summary statistics and per-column
# missing-value counts.
print("Shape:", df.shape)
print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nSummary:")
print(df.describe(include='all'))
print("\nMissing values:")
print(df.isnull().sum())

In [None]:
# Step 4: Handle missing values
# Fill Age with its median and Embarked with its most frequent value, then
# drop the Cabin column entirely.
#
# The filled Series are assigned back to df rather than calling
# fillna(inplace=True) on df['col']: that form operates on an intermediate
# object, raises a chained-assignment FutureWarning in pandas 2.x and leaves
# df untouched once Copy-on-Write is in effect.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df.drop(columns=['Cabin'], inplace=True)

In [None]:
# Step 5: Encode categorical variables
# One-hot encode Sex and Embarked (dropping the first level of each), then
# remove the Name and Ticket columns from the encoded frame.
df = (
    pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)
    .drop(columns=['Name', 'Ticket'])
)

In [None]:
# Step 6: Normalize numerical features
from sklearn.preprocessing import MinMaxScaler

# Columns rescaled here; the same list is reused by the later plotting and
# outlier-removal cells.
num_features = ['Age', 'Fare', 'SibSp', 'Parch']

# Learn each column's min/max first, then overwrite the columns with the
# rescaled values.
# NOTE(review): the scaler is fitted on every row, including rows the later
# IQR step removes, so the saved columns may not span the full scaled range.
scaler = MinMaxScaler().fit(df[num_features])
df[num_features] = scaler.transform(df[num_features])

In [None]:
# Step 7: Visualize outliers
# Draw one boxplot per numerical feature, side by side in a single row.
plt.figure(figsize=(10, 5))
# Size the grid from the feature list instead of a hard-coded 4, so the plot
# keeps working when num_features gains or loses a column.
n_panels = len(num_features)
for position, col in enumerate(num_features, start=1):
    plt.subplot(1, n_panels, position)
    sns.boxplot(y=df[col])
    plt.title(col)
plt.tight_layout()
plt.show()

In [None]:
# Remove outliers using IQR
def remove_outliers(df: pd.DataFrame, columns, factor: float = 1.5) -> pd.DataFrame:
    """Drop rows whose values fall outside the IQR fences of the given columns.

    For each column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` (both inclusive), where ``IQR = Q3 - Q1``.

    Columns are processed one after another, so the quartiles of a later
    column are computed on the rows that survived the earlier columns.

    Args:
        df: Frame to filter. The caller's object is not modified.
        columns: Iterable of column labels to check, in processing order.
        factor: Multiplier applied to the IQR to position the fences.
            Defaults to 1.5.

    Returns:
        The filtered frame with its original index labels. If ``columns`` is
        empty, ``df`` itself is returned.

    Notes:
        * Rows holding NaN in a checked column are dropped, because NaN never
          satisfies the range comparison.
        * When a column's Q1 equals Q3 the IQR is 0 and both fences collapse
          onto that value, so every row with a different value is removed.
    """
    for col in columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        spread = factor * (q3 - q1)
        # between() is inclusive on both ends, i.e. lower <= value <= upper.
        df = df[df[col].between(q1 - spread, q3 + spread)]
    return df

# Apply the IQR filter to each column in num_features, one after another.
# NOTE(review): a column whose Q1 equals Q3 has an IQR of 0, so every row
# that differs from that value is dropped — worth checking the row count for
# count-like columns such as SibSp/Parch.
df = remove_outliers(df, num_features)

In [None]:
# Step 8: Final dataset overview
# Print the shape of the cleaned frame, then preview its first rows as the
# cell output.
print("Final dataset shape:", df.shape)
df.head()

In [None]:
# Save cleaned dataset
# Written to the current working directory; index=False keeps the DataFrame
# index out of the CSV.
df.to_csv("titanic_cleaned.csv", index=False)