# Exploratory Data Analysis (EDA)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import os

# Dynamic Path Setup
# Put the parent directory (presumably the project root -- confirm against the
# repo layout) on sys.path so the `from src import ...` lines below resolve.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path again.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

from src import config
from src.data_loader import load_raw_data

# Plot and table display defaults used by the cells below.
sns.set_style("whitegrid")
# Default size for figures that do not pass their own figsize.
plt.rcParams.update({"figure.figsize": (12, 6)})
# None lifts the column cap so wide frames are displayed without truncation.
pd.options.display.max_columns = None

In [None]:
# Load the raw dataset through the project loader (src.data_loader.load_raw_data).
# Assumes the loader returns a pandas DataFrame -- the cells below call
# DataFrame methods (info, head, duplicated, select_dtypes) on `df`.
df = load_raw_data()

### "Bird's Eye" View

In [None]:
# Quick structural overview: schema/memory summary, a sample of rows, and the
# number of fully duplicated rows.
print("--- Data Info ---")
df.info()

head_rows = df.head()
print("\n--- First 5 Rows ---")
display(head_rows)  # `display` is supplied by the notebook environment

duplicate_mask = df.duplicated()
print("\n--- Duplicates ---")
print(f"Duplicate Rows: {duplicate_mask.sum()}")

### Target Variable Analysis

In [None]:
# Name of the target column comes from the project config.
target = config.TARGET_COLUMN

if target not in df.columns:
    print(f"⚠️ Target column '{target}' not found in dataset. Check config.py.")
else:
    print(f"Analyzing Target: {target}")
    target_values = df[target]

    # Heuristic: object dtype or fewer than 20 distinct values is treated as a
    # classification target; anything else is plotted as a continuous one.
    # `or` short-circuits, so nunique() is only computed for non-object columns.
    looks_categorical = target_values.dtype == 'object' or target_values.nunique() < 20

    if looks_categorical:
        # NOTE(review): seaborn 0.13+ reportedly warns when `palette` is passed
        # without `hue` -- confirm the pinned seaborn version before changing this.
        sns.countplot(data=df, x=target, palette='viridis')
        plt.title(f"Class Distribution: {target}")

        # Relative class frequencies, to judge class balance.
        print(target_values.value_counts(normalize=True))
    else:
        sns.histplot(target_values, kde=True)
        plt.title(f"Distribution of {target}")
plt.show()

### Numeric Feature Distribution

In [None]:
# Collect every numeric column. 'number' matches all integer and float widths
# (int32, float32, unsigned, ...), which a hard-coded ['float64', 'int64'] list
# silently leaves out. Timedeltas are excluded explicitly so duration columns
# are never treated as numeric features.
num_cols = df.select_dtypes(include='number', exclude='timedelta').columns.tolist()

# Keep features only: drop the target if it is numeric.
# Note: ID-like columns are NOT filtered here; remove them manually if needed.
if target in num_cols:
    num_cols.remove(target)

print(f"Numeric Columns: {len(num_cols)}")

# Plot only the first 6 columns as a sanity check; each gets its own figure.
for col in num_cols[:6]:
    plt.figure(figsize=(10, 4))
    sns.histplot(df[col], kde=True, bins=30)
    plt.title(f"Distribution: {col}")
    plt.show()

### Correlation Matrix

In [None]:
# Pairwise correlation heatmap over the numeric feature columns collected in
# `num_cols`; needs at least two columns to be meaningful.
if len(num_cols) <= 1:
    print("Not enough numeric columns for correlation matrix.")
else:
    plt.figure(figsize=(12, 8))
    numeric_frame = df[num_cols]
    corr = numeric_frame.corr()
    # Cell annotations are off; colour alone encodes the coefficient.
    sns.heatmap(corr, cmap='coolwarm', annot=False, linewidths=0.5)
    plt.title("Correlation Matrix")
    plt.show()