# Titanic Dataset - Data Preprocessing & EDA
This notebook covers data cleaning, preprocessing, and exploratory data analysis (EDA) for the Titanic dataset as part of the GUVI-HCL Hackathon.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Read the Titanic passenger records from the CSV file (path is relative to
# the notebook's working directory)
df = pd.read_csv(filepath_or_buffer="titanic.csv")
# Preview the first five rows to confirm the file parsed as expected
df.head(n=5)

## 1. Basic Info & Summary Statistics

In [None]:
# Column dtypes, non-null counts and memory usage.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would emit a stray "None" line.
df.info()
# Summary statistics for every column (numeric and non-numeric alike);
# left as the last expression so the notebook renders it as a rich table.
df.describe(include='all')

## 2. Handling Missing Values

In [None]:
# Fill missing Age with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['Age']: that chained in-place form operates on an
# intermediate Series, triggers a FutureWarning in pandas 2.x and no longer
# updates df once Copy-on-Write is the default.
df['Age'] = df['Age'].fillna(df['Age'].median())
# Fill missing Embarked with the mode (most frequent value)
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
# Drop Cabin (too many missing values).
# errors='ignore' keeps the cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Cabin'], errors='ignore')
# Drop rows with any remaining missing values
df = df.dropna()
# Per-column count of remaining nulls; every entry should now be 0
df.isnull().sum()

## 3. Feature Encoding & Transformation

In [None]:
# Convert categorical features to numeric.
# Each column gets its own LabelEncoder so that both fitted mappings
# (encoder.classes_) stay available afterwards, e.g. for inverse_transform or
# for labelling plots. Refitting a single shared encoder on Embarked would
# discard the mapping learned for Sex.
sex_encoder = LabelEncoder()
embarked_encoder = LabelEncoder()
df['Sex'] = sex_encoder.fit_transform(df['Sex'])
df['Embarked'] = embarked_encoder.fit_transform(df['Embarked'])
# Keep the original name `le` bound to the last-fitted (Embarked) encoder so
# any later cell that still refers to `le` behaves as before.
le = embarked_encoder

# Normalize numerical features: MinMaxScaler with its default settings rescales
# each column independently to the [0, 1] range.
scaler = MinMaxScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])
df.head()

## 4. Data Visualization & Insights

In [None]:
# Correlation Heatmap
# Correlation is only defined for numeric data. Any text columns still in the
# frame (the standard Titanic CSV has Name and Ticket -- confirm for this file)
# make DataFrame.corr() raise ValueError in pandas >= 2.0, so restrict the
# computation to numeric columns first.
plt.figure(figsize=(10, 6))
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title("Feature Correlation Heatmap")
plt.show()

# Survival by Gender
# NOTE: if the encoding cell has run, Sex holds integer codes here;
# LabelEncoder assigns codes in sorted class order, so the legend shows
# numbers rather than the original category names.
sns.countplot(x='Survived', hue='Sex', data=df)
plt.title("Survival Count by Gender")
plt.show()

# Age distribution
# NOTE: if the scaling cell has run, Age is min-max scaled, so the x-axis is
# in scaled units rather than years.
sns.histplot(df['Age'], bins=20, kde=True)
plt.title("Age Distribution")
plt.show()

## 5. Summary
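- Missing values handled: Age filled with the median, Embarked filled with the mode, and any rows with remaining missing values dropped.
- Cabin column dropped due to excessive missing data.
- Categorical features Sex and Embarked label-encoded as integers.
- Age and Fare normalized to the [0, 1] range with min-max scaling.
- Visualizations generated for key insights: a feature correlation heatmap, survival counts by gender, and the age distribution.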

This completes the data preprocessing and exploratory analysis phase.