# Exploratory Data Analysis (EDA)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import os

from sklearn.model_selection import train_test_split
from scipy.stats import skew, kurtosis
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Dynamic Path Setup
# Put the parent of the current working directory on sys.path so the
# project-local `src` package can be imported from this notebook.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:  # keeps the cell idempotent when it is re-run
    sys.path.append(PROJECT_ROOT)

from src import config

# Visual Settings
sns.set_style("whitegrid")

# One statement per line: default figure size for every plot, and no column
# truncation when DataFrames are displayed.
plt.rcParams["figure.figsize"] = (12, 6)
pd.set_option('display.max_columns', None)

In [None]:
# Location of the raw CSV, given relative to the notebook's working directory.
RAW_DATA_PATH = '../data/raw/dataset.csv' # CHANGE THIS

# Read the complete, unsplit dataset into memory.
df_raw = pd.read_csv(filepath_or_buffer=RAW_DATA_PATH)

# Quick sanity check: (rows, columns) of what was just loaded.
print(f"Raw data shape: {df_raw.shape}")

### Train-Test Split

In [None]:
TARGET_COL = 'target' # Define the target variable here

# Fail early with an explicit message instead of a bare KeyError further down.
if TARGET_COL not in df_raw.columns:
    raise KeyError(
        f"TARGET_COL '{TARGET_COL}' is not a column of the raw data. "
        f"Available columns: {list(df_raw.columns)}"
    )

# Stratify whenever the target holds class labels. Checking only for
# dtype == 'object' would miss category, dedicated string and boolean dtypes
# and silently fall back to an unstratified split for those targets.
# Numeric targets are left unstratified, as before.
target_raw = df_raw[TARGET_COL]
stratify_on_target = (
    pd.api.types.is_object_dtype(target_raw)
    or pd.api.types.is_string_dtype(target_raw)
    or isinstance(target_raw.dtype, pd.CategoricalDtype)
    or pd.api.types.is_bool_dtype(target_raw)
)

# Defaulting to a standard 80/20 split with a fixed seed for reproducibility.
df_train, df_test = train_test_split(
    df_raw,
    test_size=0.2,
    random_state=42,
    stratify=target_raw if stratify_on_target else None
)

print(f"Training data shape: {df_train.shape}")
print(f"Test data shape: {df_test.shape} (Set aside until final evaluation)")

# For the rest of the notebook, we strictly use df_train.
# Work on a copy so later cells cannot modify df_train itself.
df = df_train.copy()

### "Bird's Eye" View

In [None]:
# First rows of the training frame
display(df.head())

# Per-column overview: dtype, null count, null share (in percent) and number
# of distinct values. The null counts are computed once and reused.
missing_counts = df.isnull().sum()
info_df = pd.DataFrame({
    'Data Type': df.dtypes,
    'Missing Values': missing_counts,
    'Missing %': (missing_counts / len(df)) * 100,
    'Unique Values': df.nunique()
})

# Columns with the largest share of missing values come first
info_sorted = info_df.sort_values(by='Missing %', ascending=False)
display(info_sorted)

# Summary statistics for every column, numeric or not
display(df.describe(include='all'))

### Target Variable Analysis

In [None]:
# Separate features by type for programmatic plotting.
# Classify by dtype family rather than by the exact names 'int64'/'float64',
# so int32, float32, unsigned and nullable numeric columns are not silently
# left out of every later numeric analysis. Booleans are not treated as
# numeric features.
num_cols = [
    col for col, dtype in df.dtypes.items()
    if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)
]
# Text-like and categorical columns (object, string and category dtypes).
cat_cols = [
    col for col, dtype in df.dtypes.items()
    if pd.api.types.is_object_dtype(dtype)
    or pd.api.types.is_string_dtype(dtype)
    or isinstance(dtype, pd.CategoricalDtype)
]

# The target is analysed on its own and must not appear among the features.
if TARGET_COL in num_cols: num_cols.remove(TARGET_COL)
if TARGET_COL in cat_cols: cat_cols.remove(TARGET_COL)

# Target Distribution: histogram + KDE for a numeric target, one bar per
# class otherwise.
target_dtype = df[TARGET_COL].dtype
target_is_numeric = (
    pd.api.types.is_numeric_dtype(target_dtype)
    and not pd.api.types.is_bool_dtype(target_dtype)
)

plt.figure(figsize=(8, 4))
if target_is_numeric:
    sns.histplot(df[TARGET_COL], kde=True)
else:
    sns.countplot(x=df[TARGET_COL])
plt.title(f"Target Variable Distribution: {TARGET_COL}")
plt.show()

### Numeric Feature Distribution

In [None]:
if num_cols:
    # Grid geometry: four box plots per row, with as many rows as needed
    # (ceiling division) to fit every numeric feature.
    cols_per_row = 4
    n_rows = -(-len(num_cols) // cols_per_row)

    fig, axes = plt.subplots(nrows=n_rows, ncols=cols_per_row, figsize=(18, 4 * n_rows))
    axes = axes.flatten() # always an array here because cols_per_row > 1

    # One box plot per numeric feature, drawn on the non-null values only
    for ax, col in zip(axes, num_cols):
        ax.boxplot(
            df[col].dropna(),
            vert=True,
            patch_artist=True,
            boxprops={'facecolor': 'lightblue'},
        )
        ax.set_title(f'{col}', fontsize=12, fontweight='bold')
        ax.set_ylabel('Value')
        ax.grid(True, alpha=0.3)

    # Grid cells beyond the last feature stay blank
    for unused_ax in axes[len(num_cols):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Skewness and Kurtosis for numeric features
if not num_cols:
    print("No numeric columns for distribution analysis.")
else:
    # One record per feature; both statistics are computed on non-null values
    dist_stats = [
        {
            'Feature': col,
            'Skewness': round(skew(df[col].dropna()), 3),
            'Kurtosis': round(kurtosis(df[col].dropna()), 3)
        }
        for col in num_cols
    ]

    # Most skewed features (by absolute skewness) are listed first
    dist_df = pd.DataFrame(dist_stats)
    display(dist_df.sort_values(by='Skewness', key=abs, ascending=False))

    # Histogram + KDE plots for numeric features, four per row
    cols_per_row = 4
    n_rows = -(-len(num_cols) // cols_per_row)  # ceiling division

    fig, axes = plt.subplots(nrows=n_rows, ncols=cols_per_row, figsize=(18, 4 * n_rows))
    axes = axes.flatten()

    for ax, col in zip(axes, num_cols):
        sns.histplot(df[col].dropna(), kde=True, ax=ax, color='skyblue')
        ax.set_title(f'{col}', fontsize=12, fontweight='bold')
        ax.set_ylabel('Frequency')
        ax.grid(True, alpha=0.3)

    # Grid cells beyond the last feature stay blank
    for unused_ax in axes[len(num_cols):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

### Categorical Feature Distribution

In [None]:
if cat_cols:
    MAX_CATEGORIES = 15 # Safety valve for high-cardinality columns

    # Three bar charts per row; ceiling division gives the number of rows
    cols_per_row = 3
    n_rows = -(-len(cat_cols) // cols_per_row)

    fig, axes = plt.subplots(nrows=n_rows, ncols=cols_per_row, figsize=(18, 4 * n_rows))
    axes = axes.flatten()

    for ax, col in zip(axes, cat_cols):
        value_counts = df[col].value_counts()

        # Keep the MAX_CATEGORIES most frequent values and fold the remainder
        # into a single '...Others' bar
        if len(value_counts) > MAX_CATEGORIES:
            head = value_counts.iloc[:MAX_CATEGORIES]
            tail_total = value_counts.iloc[MAX_CATEGORIES:].sum()
            value_counts = pd.concat([head, pd.Series({'...Others': tail_total})])

        positions = range(len(value_counts))
        ax.bar(positions, value_counts.values, color='steelblue')
        ax.set_xticks(positions)
        ax.set_xticklabels(value_counts.index, rotation=45, ha='right')
        ax.set_title(f'{col}', fontsize=12, fontweight='bold')
        ax.set_ylabel('Count')
        ax.grid(True, alpha=0.3)

    # Grid cells beyond the last feature stay blank
    for unused_ax in axes[len(cat_cols):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

### Correlation Matrix

In [None]:
# Numeric Correlation Matrix
if len(num_cols) > 1:
    # Correlate the numeric features, plus the target when it is int64/float64
    corr_cols = list(num_cols)
    if df[TARGET_COL].dtype in ['int64', 'float64']:
        corr_cols.append(TARGET_COL)
    corr_matrix = df[corr_cols].corr()

    # The matrix is symmetric: hide the diagonal and everything above it so
    # each pair is drawn once
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    plt.figure(figsize=(12, 8))
    sns.heatmap(corr_matrix, mask=mask, annot=False, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title("Numeric Feature Correlation Matrix")
    plt.show()

### Missing Data Patterns

In [None]:
# Missing value heatmap
# Boolean frame with the same shape as df: True where a value is missing.
missing_data = df.isnull()

if missing_data.any().any():
    plt.figure(figsize=(12, 6))
    sns.heatmap(missing_data, cbar=True, cmap='viridis', yticklabels=False)
    plt.title("Missing Data Heatmap (Yellow = Missing)")
    plt.xlabel("Features")
    plt.tight_layout()
    plt.show()

    # Missing data correlation
    # A correlation is only defined for columns whose missing-indicator
    # varies (some rows missing, some not). Columns that are never or always
    # missing would produce all-NaN rows, so they are left out.
    partially_missing = missing_data.loc[:, missing_data.nunique() > 1]

    if partially_missing.shape[1] > 1:
        missing_corr = partially_missing.corr()

        # Only show if at least one pair of *different* columns has a
        # meaningful correlation; the diagonal (always 1) is ignored.
        off_diagonal = ~np.eye(len(missing_corr), dtype=bool)
        strong_pairs = (missing_corr.abs() > 0.5).to_numpy() & off_diagonal

        if strong_pairs.any():
            plt.figure(figsize=(10, 8))
            sns.heatmap(missing_corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
            plt.title("Missing Data Correlation (Do missing values occur together?)")
            plt.tight_layout()
            plt.show()
else:
    print("✓ No missing data detected in the training set.")

### Target Relationship Analysis

In [None]:
# Numeric Features vs Target
# Numeric target  -> regression view: scatter plot of each feature vs target.
# Any other target -> classification view: box plot of each feature per class
# (object, category, string and boolean targets alike).
target_is_numeric = (
    pd.api.types.is_numeric_dtype(df[TARGET_COL])
    and not pd.api.types.is_bool_dtype(df[TARGET_COL])
)

if num_cols:
    # Three plots per row; ceiling division gives the number of rows.
    cols_per_row = 3
    n_rows = (len(num_cols) + cols_per_row - 1) // cols_per_row

    fig, axes = plt.subplots(n_rows, cols_per_row, figsize=(18, n_rows * 4))
    axes = axes.flatten()  # always an array because cols_per_row > 1

    for idx, col in enumerate(num_cols):
        ax = axes[idx]
        if target_is_numeric:
            ax.scatter(df[col], df[TARGET_COL], alpha=0.5)
            ax.set_xlabel(col)
            ax.set_ylabel(TARGET_COL)
            ax.set_title(f'{col} vs {TARGET_COL}')
            ax.grid(True, alpha=0.3)
        else:
            df.boxplot(column=col, by=TARGET_COL, ax=ax)
            ax.set_title(f'{col} by {TARGET_COL}')
            ax.set_xlabel(TARGET_COL)
            ax.set_ylabel(col)

    # Hide grid cells beyond the last feature
    for idx in range(len(num_cols), len(axes)):
        axes[idx].set_visible(False)

    if not target_is_numeric:
        # pandas' grouped boxplot adds a figure-level "Boxplot grouped by ..."
        # title on every call; clear it so it does not overlap the subplot titles.
        fig.suptitle('')

    plt.tight_layout()
    plt.show()

In [None]:
# Categorical Features vs Target
if cat_cols:
    # A numeric target with many distinct values is treated as continuous.
    # A crosstab would then create one column per distinct target value, so
    # the mean target per category is shown instead. The cut-off of 20 matches
    # the one used by the class-imbalance check in this notebook.
    MAX_TARGET_CLASSES = 20
    target_is_continuous = (
        pd.api.types.is_numeric_dtype(df[TARGET_COL])
        and not pd.api.types.is_bool_dtype(df[TARGET_COL])
        and df[TARGET_COL].nunique() >= MAX_TARGET_CLASSES
    )

    # Two plots per row; ceiling division gives the number of rows.
    cols_per_row = 2
    n_rows = (len(cat_cols) + cols_per_row - 1) // cols_per_row

    fig, axes = plt.subplots(n_rows, cols_per_row, figsize=(16, n_rows * 5))
    axes = axes.flatten()  # always an array because cols_per_row > 1

    for idx, col in enumerate(cat_cols):
        ax = axes[idx]
        if target_is_continuous:
            # Regression view: average target value within each category
            mean_target = df.groupby(col, observed=True)[TARGET_COL].mean()
            mean_target.plot(kind='bar', ax=ax, color='steelblue')
            ax.set_ylabel(f'Mean {TARGET_COL}')
        else:
            # Classification view: share of each target class within a
            # category. normalize='index' makes every row sum to 100, so the
            # axis is always a percentage, never a count.
            ct = pd.crosstab(df[col], df[TARGET_COL], normalize='index') * 100
            ct.plot(kind='bar', ax=ax, stacked=False)
            ax.set_ylabel('Percentage')
            ax.legend(title=TARGET_COL)
        ax.set_title(f'{col} vs {TARGET_COL}')
        ax.set_xlabel(col)
        ax.grid(True, alpha=0.3)

    # Hide grid cells beyond the last feature
    for idx in range(len(cat_cols), len(axes)):
        axes[idx].set_visible(False)

    plt.tight_layout()
    plt.show()

### Outlier Detection (IQR Method)

In [None]:
# Identify outliers using IQR method
if not num_cols:
    print("No numeric columns to check for outliers.")
else:
    outlier_summary = []

    for col in num_cols:
        values = df[col]

        # Fences sit 1.5 * IQR below the first and above the third quartile
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Rows outside either fence; the share is taken over all rows of df
        is_outlier = (values < lower_bound) | (values > upper_bound)
        outlier_count = int(is_outlier.sum())
        outlier_pct = (outlier_count / len(df)) * 100

        outlier_summary.append({
            'Feature': col,
            'Outlier Count': outlier_count,
            'Outlier %': round(outlier_pct, 2),
            'Lower Bound': round(lower_bound, 2),
            'Upper Bound': round(upper_bound, 2)
        })

    # Features with the largest outlier share first
    outlier_df = pd.DataFrame(outlier_summary).sort_values(by='Outlier %', ascending=False)
    display(outlier_df)

    # Visualize outlier percentage (skipped when no feature has any outlier)
    if outlier_df['Outlier %'].sum() > 0:
        plt.figure(figsize=(10, 6))
        plt.barh(outlier_df['Feature'], outlier_df['Outlier %'], color='coral')
        plt.xlabel('Outlier Percentage (%)')
        plt.title('Outlier Detection by Feature (IQR Method)')
        plt.gca().invert_yaxis()  # highest percentage at the top
        plt.tight_layout()
        plt.show()

### Temporal Analysis (If Applicable)

In [None]:
# Detect and analyze time-based features
# An object column counts as a date column when all of its non-null values
# parse with pd.to_datetime. Columns with no values at all are skipped: they
# would "parse" trivially and leave nothing to plot.
date_cols = []
for col in df.columns:
    if df[col].dtype != 'object':
        continue
    non_null = df[col].dropna()
    if non_null.empty:
        continue
    try:
        pd.to_datetime(non_null, errors='raise')
    except (ValueError, TypeError, OverflowError):
        # Not parseable as dates; treat as an ordinary column
        continue
    date_cols.append(col)

if date_cols:
    print(f"Detected potential date columns: {date_cols}")

    target_is_numeric = (
        pd.api.types.is_numeric_dtype(df[TARGET_COL])
        and not pd.api.types.is_bool_dtype(df[TARGET_COL])
    )

    for col in date_cols:
        # Parse into a temporary frame rather than adding `<col>_datetime`
        # columns to df, so this cell does not change df for later cells.
        ts_col = f'{col}_datetime'
        time_frame = df[[TARGET_COL]].assign(**{ts_col: pd.to_datetime(df[col])})

        # Draw on an explicit Axes: DataFrame.plot without `ax` opens its own
        # figure and would leave an empty one behind.
        fig, ax = plt.subplots(figsize=(14, 5))

        if target_is_numeric:
            # For regression: line plot of target over time
            df_time = time_frame.sort_values(ts_col)
            ax.plot(df_time[ts_col], df_time[TARGET_COL], alpha=0.7)
            ax.set_title(f'{TARGET_COL} over Time ({col})')
            ax.set_xlabel(col)
            ax.set_ylabel(TARGET_COL)
        else:
            # For classification: stacked area chart of target classes per month.
            # Months are derived through periods because the 'M' offset alias
            # for pd.Grouper is deprecated in recent pandas releases; the
            # 'M' period alias is not.
            dated = time_frame.dropna(subset=[ts_col])
            month_start = dated[ts_col].dt.to_period('M').dt.to_timestamp()
            df_time = dated.groupby([month_start, TARGET_COL]).size().unstack(fill_value=0)
            df_time.plot(kind='area', stacked=True, alpha=0.7, ax=ax)
            ax.set_title(f'{TARGET_COL} Distribution over Time ({col})')
            ax.set_xlabel(col)
            ax.set_ylabel('Count')

        ax.tick_params(axis='x', rotation=45)
        plt.tight_layout()
        plt.show()
else:
    print("No temporal features detected. If you have date columns, convert them manually.")

### Cardinality Check

In [None]:
# Identify high-cardinality categorical features
if cat_cols:
    # More than HIGH unique values -> 'High', more than MEDIUM -> 'Medium',
    # otherwise 'Low'.
    HIGH_CARDINALITY_THRESHOLD = 50
    MEDIUM_CARDINALITY_THRESHOLD = 10

    cardinality_info = []

    for col in cat_cols:
        unique_count = df[col].nunique()
        # Distinct values as a share of the number of rows
        cardinality_pct = (unique_count / len(df)) * 100

        if unique_count > HIGH_CARDINALITY_THRESHOLD:
            cardinality_type = 'High'
        elif unique_count > MEDIUM_CARDINALITY_THRESHOLD:
            cardinality_type = 'Medium'
        else:
            cardinality_type = 'Low'

        cardinality_info.append({
            'Feature': col,
            'Unique Values': unique_count,
            'Cardinality %': round(cardinality_pct, 2),
            'Type': cardinality_type
        })

    cardinality_df = pd.DataFrame(cardinality_info).sort_values(by='Unique Values', ascending=False)
    display(cardinality_df)

    # Flag high-cardinality features that may need special encoding
    high_card_features = cardinality_df[cardinality_df['Type'] == 'High']['Feature'].tolist()
    if high_card_features:
        print(f"\n⚠️ High-cardinality features detected: {high_card_features}")
        print("   Consider: Target encoding, frequency encoding, or feature hashing.")
else:
    print("No categorical columns to check for cardinality.")

### Class Imbalance Analysis

In [None]:
# Check for class imbalance (for classification tasks)
# The target is treated as a class label when it is an object column or has
# fewer than 20 distinct values.
if df[TARGET_COL].dtype == 'object' or df[TARGET_COL].nunique() < 20:
    value_counts = df[TARGET_COL].value_counts()
    # Share of each class in percent, derived from the counts already computed
    value_pcts = value_counts / value_counts.sum() * 100

    imbalance_df = pd.DataFrame({
        'Class': value_counts.index,
        'Count': value_counts.values,
        'Percentage': [f"{pct:.2f}%" for pct in value_pcts.values]
    })

    print("Class Distribution:")
    display(imbalance_df)

    # Calculate imbalance ratio: largest class over smallest class. This works
    # for binary and multi-class targets alike. The min() > 0 guard avoids a
    # division by zero for categorical targets with unused categories.
    if len(value_counts) >= 2 and value_counts.min() > 0:
        imbalance_ratio = value_counts.max() / value_counts.min()
        print(f"\nImbalance Ratio: {imbalance_ratio:.2f}:1")

        if imbalance_ratio > 3:
            print("⚠️ Significant class imbalance detected!")
            print("   Consider: SMOTE, class weights, or stratified sampling.")

    # Visualization
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Count plot
    axes[0].bar(range(len(value_counts)), value_counts.values, color='steelblue')
    axes[0].set_xticks(range(len(value_counts)))
    axes[0].set_xticklabels(value_counts.index, rotation=45)
    axes[0].set_title('Class Distribution (Count)')
    axes[0].set_ylabel('Count')
    axes[0].grid(True, alpha=0.3)

    # Pie chart
    axes[1].pie(value_counts.values, labels=value_counts.index, autopct='%1.1f%%', startangle=90)
    axes[1].set_title('Class Distribution (Percentage)')

    plt.tight_layout()
    plt.show()
else:
    print("Target variable is continuous (regression task). Class imbalance analysis not applicable.")

### Multicollinearity Check

In [None]:
# VIF (Variance Inflation Factor) Analysis
if len(num_cols) > 1:
    # Prepare data for VIF (drop rows with a NaN in any numeric feature)
    vif_data = df[num_cols].dropna()

    if len(vif_data) > 0 and vif_data.shape[1] > 1:
        # variance_inflation_factor regresses one column on all the others
        # exactly as given. Without an intercept column those regressions are
        # forced through the origin and the resulting VIFs are misleading, so
        # a column of ones is prepended. Feature i sits at index i + 1.
        design_matrix = np.column_stack([
            np.ones(len(vif_data)),
            vif_data.to_numpy(dtype=float),
        ])

        vif_values = []

        for i, col in enumerate(vif_data.columns, start=1):
            try:
                vif = round(float(variance_inflation_factor(design_matrix, i)), 2)
            except (ValueError, FloatingPointError, np.linalg.LinAlgError):
                # Keep the column numeric: NaN marks a VIF that could not be computed
                vif = np.nan
            vif_values.append({
                'Feature': col,
                'VIF': vif
            })

        # Highest VIF first; NaN entries are sorted to the end
        vif_df = pd.DataFrame(vif_values).sort_values(by='VIF', ascending=False)
        display(vif_df)

        print("\n💡 VIF Interpretation:")
        print("   VIF < 5: Low multicollinearity")
        print("   VIF 5-10: Moderate multicollinearity")
        print("   VIF > 10: High multicollinearity (consider removing)")

    # High correlation pairs
    print("\n" + "="*50)
    print("High Correlation Pairs (|r| > 0.8):")
    print("="*50)

    corr_matrix = df[num_cols].corr()
    high_corr_pairs = []

    # Walk the upper triangle only so each pair is reported once
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            corr_val = corr_matrix.iloc[i, j]
            if abs(corr_val) > 0.8:
                high_corr_pairs.append({
                    'Feature 1': corr_matrix.columns[i],
                    'Feature 2': corr_matrix.columns[j],
                    'Correlation': round(corr_val, 3)
                })

    if high_corr_pairs:
        high_corr_df = pd.DataFrame(high_corr_pairs).sort_values(by='Correlation', key=abs, ascending=False)
        display(high_corr_df)
        print("\n⚠️ Consider removing one feature from each highly correlated pair.")
    else:
        print("✓ No highly correlated feature pairs detected (|r| > 0.8).")
else:
    print("Not enough numeric columns for multicollinearity analysis.")