## Data Ingestion

In [None]:
import sys
import os
# Put the ../data and ../src directories on the import path. Both paths are
# resolved against the current working directory.
# NOTE(review): this only works when the notebook is run from the folder these
# relative paths expect -- confirm the intended launch directory.
sys.path.append(os.path.abspath('../data'))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))
from get_data import load_claim_data # type: ignore

import matplotlib.pyplot as plt # type: ignore
import seaborn as sns
import numpy as np
import pandas as pd
from pipeline.preprocessing import clean_claim_data, add_claim_features # type: ignore


import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting
# and data libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# New York State open-data endpoint the claims are fetched from (JSON resource).
url = "https://data.ny.gov/resource/jshw-gkgu.json"
# Query parameters handed to the loader.
# NOTE(review): "$limit" presumably caps the number of rows requested at
# 50,000 -- confirm how load_claim_data forwards it.
params = {"$limit": "50000"}

claim_data = load_claim_data(url, params)
claim_data.head()

In [None]:
# Flip the preview so that each column of the raw data is shown as a row
claim_data.head().transpose()

In [None]:
# Keep only the claim attributes used in the rest of the notebook.
# The explicit .copy() makes the subset an independent DataFrame, so columns
# derived from it later can never be confused with writes into `claim_data`
# (the situation pandas reports as SettingWithCopyWarning).
LLM_data = claim_data[[
    'claim_identifier',
    'claim_type',
    'claim_injury_type',
    'accident_date',
    'accident_ind',
    'oiics_event_exposure_desc',
    'oiics_nature_injury_desc',
    'oiics_injury_source_desc',
    'wcio_cause_of_injury_desc',
    'wcio_nature_of_injury_desc',
    'current_claim_status',
    'gender',
    'age_at_injury',
    'district_name',
    'zip_code',
]].copy()


In [None]:
# Preview the first rows of the selected columns
LLM_data.head()

In [None]:
# (number of rows, number of columns) of the selected subset
LLM_data.shape

In [None]:
# Print a per-column summary (dtype and non-null count) of the subset
LLM_data.info()

In [None]:
# Number of missing values in each column
LLM_data.isna().sum()

In [None]:

# Visualize the columns that contain missing values, smallest count first.
missing = LLM_data.isna().sum()
missing = missing[missing > 0].sort_values(ascending=True)

if missing.empty:
    # Nothing to draw: do not ask pandas to plot an empty Series (it either
    # fails or yields a blank chart); report the result instead.
    print("No missing values found in the selected columns.")
else:
    plt.figure(figsize=(10, 6))
    missing.plot(kind='barh', color='skyblue')
    plt.title("Missing Values per Column")
    plt.xlabel("Number of Missing Values")
    plt.ylabel("Column")
    plt.tight_layout()
    plt.show()

In [None]:
# Clean the selected columns with the project's preprocessing helper.
# NOTE(review): null_threshold=0.5 presumably controls how much missing data is
# tolerated before something is dropped -- confirm against
# pipeline.preprocessing.clean_claim_data.

Cleaned_LLM_data = clean_claim_data(LLM_data, null_threshold=0.5)


In [None]:
# Derive the engineered columns from the cleaned data.
# NOTE(review): later cells read an `age_group` column, which is presumably
# created here -- confirm against pipeline.preprocessing.add_claim_features.
LLM_claim_data = add_claim_features(Cleaned_LLM_data)

In [None]:
# Preview the first rows of the feature-enriched data
LLM_claim_data.head()

In [None]:
# Grouped bar chart: number of claims per age group, split by claim type
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=LLM_claim_data, x='age_group', hue='claim_type', palette='Set2', ax=ax)

# Title and axis labels
ax.set_title("Distribution of Claim Types by Age Group", fontsize=14)
ax.set_xlabel("Age Group", fontsize=12)
ax.set_ylabel("Number of Claims", fontsize=12)
plt.xticks(rotation=45)

# Anchor the legend's upper-left corner just outside the right edge of the axes
ax.legend(title="Claim Type", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:

# Tally the claims recorded under each injury type (most frequent first)
injury_counts = LLM_claim_data["claim_injury_type"].value_counts()

# Draw the tallies as a vertical bar chart
fig, ax = plt.subplots(figsize=(12, 6))
injury_counts.plot.bar(ax=ax, color='steelblue')

# Title, axis labels and slanted tick labels
ax.set_title("Frequency of Claim Injury Types", fontsize=14)
ax.set_xlabel("Injury Type", fontsize=12)
ax.set_ylabel("Number of Claims", fontsize=12)
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()



In [None]:
# Claims per injury type split by gender; the injury types on the x-axis are
# ordered from most to least frequent.
plt.figure(figsize=(12, 6))
sns.countplot(
    data=LLM_claim_data,
    x="claim_injury_type",
    hue="gender",
    order=LLM_claim_data["claim_injury_type"].value_counts().index,
)
plt.title("Claim Injury Type by Gender")
plt.xlabel("Injury Type")
plt.ylabel("Number of claims")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Count the claims for every (district_name, claim_type) combination
heatmap_data = pd.crosstab(LLM_claim_data['district_name'], LLM_claim_data['claim_type'])

# Order the districts by their total number of claims (largest first) and
# keep at most the top 15 of them
heatmap_data = heatmap_data.loc[
    heatmap_data.sum(axis=1).sort_values(ascending=False).head(15).index
]

# Annotated heatmap of the counts
fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(heatmap_data, annot=True, fmt='d', cmap='YlGnBu', linewidths=0.5, ax=ax)

ax.set_title("Heatmap of Claim Types by District Name", fontsize=14)
ax.set_xlabel("Claim Type")
ax.set_ylabel("District Name")
fig.tight_layout()
plt.show()

In [None]:
# Parse accident_date in place; values that cannot be parsed become NaT
LLM_claim_data['accident_date'] = pd.to_datetime(LLM_claim_data['accident_date'], errors='coerce')

# Rows without a usable accident date cannot be placed on the time axis
cumulative_df = LLM_claim_data.dropna(subset=['accident_date'])

# Claims per accident date, accumulated into a running total. groupby sorts
# its keys, so the cumulative sum runs in chronological order.
cumulative_df = (
    cumulative_df
    .groupby('accident_date')
    .size()
    .cumsum()
    .reset_index(name='cumulative_claims')
)

# Area chart: shaded region plus an outline of the same curve
plt.figure(figsize=(12, 6))
plt.fill_between(cumulative_df['accident_date'], cumulative_df['cumulative_claims'], alpha=0.4)
plt.plot(cumulative_df['accident_date'], cumulative_df['cumulative_claims'], linewidth=2)

# The title is kept as plain text: an emoji is typically not covered by
# Matplotlib's default font and would be drawn as a missing-glyph box.
plt.title("Cumulative Growth of Insurance Claims Over Time", fontsize=14)
plt.xlabel("Accident Date")
plt.ylabel("Cumulative Number of Claims")
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Parse accident_date in place (unparseable values become NaT)
LLM_claim_data['accident_date'] = pd.to_datetime(
    LLM_claim_data['accident_date'],
    errors='coerce',
)

# Stamp every row with the time at which this cell is executed
LLM_claim_data['current_date'] = pd.to_datetime("today")

# Whole days between the accident and that execution time.
# NOTE(review): "delay" therefore means "days elapsed since the accident as of
# the run", so the values change whenever the cell is re-run -- confirm this is
# the intended metric.
LLM_claim_data['delay_days'] = (
    LLM_claim_data['current_date'] - LLM_claim_data['accident_date']
).dt.days

# Calendar coordinates of each accident: month number and day of the month
LLM_claim_data['accident_month'] = LLM_claim_data['accident_date'].dt.month
LLM_claim_data['accident_day'] = LLM_claim_data['accident_date'].dt.day

# Month-by-day grid holding the mean delay of the claims on each calendar day
calendar_data = (
    LLM_claim_data.groupby(['accident_month', 'accident_day'])['delay_days'].mean().unstack()
)

# Calendar-style heatmap: one row per month, one column per day of the month
fig, ax = plt.subplots(figsize=(16, 6))
sns.heatmap(
    calendar_data,
    cmap='YlOrRd',
    linewidths=0.5,
    linecolor='gray',
    annot=False,
    ax=ax,
)

ax.set_title("Average Claim Delay (in Days) by Accident Date", fontsize=14)
ax.set_xlabel("Day of Month")
ax.set_ylabel("Month")
fig.tight_layout()
plt.show()

In [None]:
# Age brackets in ascending order; they fix the left-to-right order of the bars.
# NOTE(review): the strings (en dashes included) must match the values stored
# in `age_group` exactly -- confirm against the feature-engineering helper.
labels = ['<18', '18–29', '30–44', '45–59', '60–74', '75+']

# One bar per age bracket showing how many claims fall into it
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=LLM_claim_data, x='age_group', order=labels, palette='viridis', ax=ax)
ax.set_title("Number of Claims per Age Group")
ax.set_xlabel("Age Group")
ax.set_ylabel("Number of Claims")
fig.tight_layout()
plt.show()

In [None]:
# Count the claims per accident month, ordered by month number.
# Requires the `accident_month` column to already exist on LLM_claim_data.
monthly_claims = LLM_claim_data['accident_month'].value_counts().sort_index()

# Missing accident dates turn the month column into floats (1.0, 2.0, ...).
# value_counts() has already dropped the NaN entries, so casting the index
# back to int is safe and keeps the tick labels clean.
monthly_claims.index = monthly_claims.index.astype(int)

# Bar chart of the monthly totals
plt.figure(figsize=(12, 6))
sns.barplot(x=monthly_claims.index, y=monthly_claims.values, palette='crest')
plt.title('Number of Claims by Month')
plt.xlabel('Month')
plt.ylabel('Number of Claims')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
