# Detailed Exploratory Data Analysis - ACIS

This notebook covers the detailed EDA requirements, including data cleaning, outlier detection, trend analysis, and creative visualizations.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Make the parent of the current working directory importable so that the
# project-local `src` package can be resolved from inside the notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), '../'))
sys.path.append(project_root)

# Project helpers (must come after the sys.path tweak above).
from src.loader import load_data
from src.eda import check_missing_values, get_descriptive_stats

# Apply a white-grid seaborn theme to every figure drawn below.
sns.set_theme(style="whitegrid")

# Load the pipe-delimited raw file through the project loader.
filepath = '../data/raw/MachineLearningRating_v3.txt'
df = load_data(filepath, delimiter='|')

# Fail fast with a clear message instead of letting later cells crash with an
# opaque "'NoneType' object is not subscriptable" error.
# NOTE(review): this assumes load_data signals failure by returning None, as
# the original `if df is not None` guard implied - confirm against src.loader.
if df is None:
    raise RuntimeError(
        f"load_data returned None for {filepath!r}; "
        "check that the file exists and is readable."
    )

# Data Cleaning: coerce column types; values that cannot be parsed become
# NaT / NaN (errors='coerce') rather than raising.
df['TransactionMonth'] = pd.to_datetime(df['TransactionMonth'], errors='coerce')
for numeric_col in ('TotalPremium', 'TotalClaims'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

print("Data Loaded & Cleaned. Shape:", df.shape)

## 1. Data Summarization & Structure

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
df.info()

# Standard deviation of the two monetary columns as a quick variability check.
print("\nVariability (Std Dev):\n", df[['TotalPremium', 'TotalClaims']].std())

## 2. Outlier Detection (Box Plots)

In [None]:
# Side-by-side box plots of premiums and claims, one panel per column.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# (column, box colour, panel title) for each panel, left to right.
panels = [
    ('TotalPremium', 'skyblue', 'Total Premium Box Plot'),
    ('TotalClaims', 'salmon', 'Total Claims Box Plot'),
]
for ax, (column, colour, heading) in zip(axes, panels):
    sns.boxplot(y=df[column], color=colour, ax=ax)
    ax.set_title(heading)

fig.tight_layout()
plt.show()

## 3. Creative Plots & Trends

In [None]:
# Monthly Trends: sum premiums and claims per TransactionMonth, then draw
# both series on one set of axes.
value_columns = ['TotalPremium', 'TotalClaims']
monthly_trends = df.groupby('TransactionMonth')[value_columns].sum().reset_index()

fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(
    data=monthly_trends,
    x='TransactionMonth',
    y='TotalPremium',
    label='Total Premium',
    ax=ax,
)
sns.lineplot(
    data=monthly_trends,
    x='TransactionMonth',
    y='TotalClaims',
    label='Total Claims',
    color='red',
    ax=ax,
)
ax.set_title('Monthly Trends: Premium vs Claims')
plt.show()

In [None]:
# Violin Plot by Province, restricted to the five provinces with the most rows.
province_counts = df['Province'].value_counts()
top_provinces = province_counts.nlargest(5).index
in_top_provinces = df['Province'].isin(top_provinces)
df_top = df[in_top_provinces]

fig, ax = plt.subplots(figsize=(14, 8))
sns.violinplot(
    data=df_top,
    x='Province',
    y='TotalClaims',
    palette='viridis',
    ax=ax,
)
# Clip the visible y-range; values outside (-1000, 50000) are not shown.
ax.set_ylim(-1000, 50000)
ax.set_title('Claims Distribution (Violin) by Top 5 Provinces')
plt.show()

In [None]:
# Correlation Matrix over the float64 / int64 columns only.
numeric_df = df.select_dtypes(include=['float64', 'int64'])
correlations = numeric_df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations, cmap='coolwarm', square=True, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()