# Exploratory Data Analysis (EDA)
Objective: Explore the dataset to uncover patterns, identify data quality issues, and form hypotheses that will guide feature engineering.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style
sns.set(style="whitegrid")

# Load the dataset
# Every later cell reads `df`, so a missing file must stop the notebook here:
# print the hint, then re-raise instead of continuing with `df` undefined
# (which would otherwise surface later as an unrelated NameError).
try:
    df = pd.read_csv('../data/raw/insurance.csv')
except FileNotFoundError:
    print("File not found. Please check the path.")
    raise
else:
    print("Dataset loaded successfully.")

In [None]:
# Overview of the Data
# Preview the leading rows, then print the frame's info summary and its shape.
head_rows = df.head()
print("First 5 rows of the dataset:")
display(head_rows)

print("\nDataset Info:")
df.info()

dataset_shape = df.shape
print(f"\nShape of the dataset: {dataset_shape}")

In [None]:
# Summary Statistics
print("Summary Statistics for Numerical Features:")
display(df.describe())

print("\nSummary Statistics for Categorical Features:")
# describe(include='object') raises ValueError when no column has object
# dtype, so check first and report the absence instead of failing the cell.
if df.select_dtypes(include='object').columns.empty:
    print("No categorical columns found.")
else:
    display(df.describe(include='object'))

In [None]:
# Visualize Distribution of Numerical Features
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

if numerical_cols:
    # Keep the 3x3 grid for up to nine columns and add rows beyond that, so
    # plt.subplot is never asked for a position outside the grid.
    grid_cols = 3
    grid_rows = max(3, (len(numerical_cols) + grid_cols - 1) // grid_cols)

    # Scale the height with the row count; this is 15x10 for the 3-row case.
    plt.figure(figsize=(15, 10 * grid_rows / 3))
    for i, col in enumerate(numerical_cols):
        plt.subplot(grid_rows, grid_cols, i + 1)
        sns.histplot(df[col], kde=True, bins=30)
        plt.title(f'Distribution of {col}')
    # Lay the figure out once, after every subplot has been drawn.
    plt.tight_layout()
    plt.show()
else:
    print("No numerical columns found.")

In [None]:
# Distribution of Categorical Features
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

if categorical_cols:
    # Keep the 3x3 grid for up to nine columns and add rows beyond that, so
    # plt.subplot is never asked for a position outside the grid.
    grid_cols = 3
    grid_rows = max(3, (len(categorical_cols) + grid_cols - 1) // grid_cols)

    # Scale the height with the row count; this is 15x10 for the 3-row case.
    plt.figure(figsize=(15, 10 * grid_rows / 3))
    for i, col in enumerate(categorical_cols):
        plt.subplot(grid_rows, grid_cols, i + 1)
        sns.countplot(x=col, data=df)
        plt.title(f'Distribution of {col}')
        plt.xticks(rotation=45)
    # Lay the figure out once, after every subplot has been drawn.
    plt.tight_layout()
    plt.show()
else:
    print("No categorical columns found.")

In [None]:
# Correlation Analysis
# Pairwise correlations between the numerical features, drawn as an
# annotated heatmap.
corr_matrix = df[numerical_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(data=corr_matrix, cmap='coolwarm', annot=True, fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Identifying Missing Values
# Count nulls per column and list only the columns that have at least one.
missing_values = df.isna().sum()
has_missing = missing_values > 0

print("Missing Values per Column:")
print(missing_values.loc[has_missing])

In [None]:
# Outlier Detection
if numerical_cols:
    # Keep the 3x3 grid for up to nine columns and add rows beyond that, so
    # plt.subplot is never asked for a position outside the grid.
    grid_cols = 3
    grid_rows = max(3, (len(numerical_cols) + grid_cols - 1) // grid_cols)

    # Scale the height with the row count; this is 15x10 for the 3-row case.
    plt.figure(figsize=(15, 10 * grid_rows / 3))
    for i, col in enumerate(numerical_cols):
        plt.subplot(grid_rows, grid_cols, i + 1)
        sns.boxplot(x=df[col])
        plt.title(f'Box Plot of {col}')
    # Lay the figure out once, after every subplot has been drawn.
    plt.tight_layout()
    plt.show()
else:
    print("No numerical columns found.")

## Top 3-5 Insights
1. **Insight 1**: [Replace with finding]
2. **Insight 2**: [Replace with finding]
3. **Insight 3**: [Replace with finding]
4. **Insight 4**: [Replace with finding]
5. **Insight 5**: [Replace with finding]