## Data Ingestion

In [None]:
import sys

# Extend the module search path BEFORE importing from it: `get_data` is
# presumably located in '../data' (TODO confirm), so the path entry has to be
# in place when the import below runs on a fresh kernel.
sys.path.append('../data')  # Go up one level and into 'data'

import matplotlib.pyplot as plt # type: ignore
# `pd` must be pandas: the previous `from turtle import pd` bound turtle's
# pen-down function, which has no `to_numeric` / `cut` / `to_datetime`.
import pandas as pd
import seaborn as sns
from get_data import load_claim_data # type: ignore

In [None]:
# Source endpoint on data.ny.gov and the query-string options sent with it.
# NOTE(review): "$limit" presumably caps the number of returned rows - confirm
# against the API documentation.
url = "https://data.ny.gov/resource/jshw-gkgu.json"
params = {"$limit": "50000"}

# Fetch the records through the project helper, then preview the first rows.
claim_data = load_claim_data(url, params)
claim_data.head()

In [None]:
# Transpose the preview so every column name appears as a row and none are
# truncated by the notebook's horizontal display limit.
claim_data.head().transpose()

# LLM data EDA

In [None]:
# Filtering and segmentation of raw data: the columns kept for the
# LLM-oriented subset.
llm_columns = [
    'claim_identifier',
    'claim_type',
    'claim_injury_type',
    'accident_date',
    'accident_ind',
    'oiics_event_exposure_desc',
    'oiics_nature_injury_desc',
    'oiics_injury_source_desc',
    'wcio_cause_of_injury_desc',
    'wcio_nature_of_injury_desc',
    'current_claim_status',
    'gender',
    'age_at_injury',
    'district_name',
    'zip_code',
]

# Take an explicit copy: later cells add and overwrite columns on LLM_data,
# and doing that on a selection of `claim_data` triggers pandas'
# chained-assignment (SettingWithCopy) warnings.
LLM_data = claim_data[llm_columns].copy()


In [None]:
# Preview the first five rows of the subset.
LLM_data.head(n=5)

In [None]:
# Dimensions of the subset.
LLM_data.shape

In [None]:
# Print the summary that `info()` reports for the subset.
LLM_data.info()

In [None]:
# Number of missing values in each column.
LLM_data.isnull().sum()

In [None]:

# Per-column null counts, restricted to columns that actually have gaps and
# ordered from fewest to most missing.
null_counts = LLM_data.isna().sum()
missing = null_counts[null_counts > 0].sort_values(ascending=True)

# Horizontal bar chart of the remaining columns.
fig, ax = plt.subplots(figsize=(10, 6))
missing.plot(kind='barh', color='skyblue', ax=ax)
ax.set_title("Missing Values per Column")
ax.set_xlabel("Number of Missing Values")
ax.set_ylabel("Column")
fig.tight_layout()
plt.show()

In [None]:
# Columns singled out for a closer look (presumably the ones with missing
# values - verify against the null counts).
null_focus_columns = [
    "accident_date",
    "oiics_event_exposure_desc",
    "oiics_nature_injury_desc",
    "oiics_injury_source_desc",
    "wcio_cause_of_injury_desc",
    "wcio_nature_of_injury_desc",
]
LLM_data_null = LLM_data[null_focus_columns]

In [None]:
# Preview the first five rows of the selected columns.
LLM_data_null.head(n=5)

In [None]:
# Order rows by accident date, largest value first. The column has not been
# parsed to datetime at this point, so the sort uses the values as loaded.
LLM_data_null.sort_values('accident_date', ascending=False)

In [None]:

# How often each claim injury type occurs (most frequent first).
injury_counts = LLM_data["claim_injury_type"].value_counts()

# Vertical bar chart of those frequencies.
fig, ax = plt.subplots(figsize=(12, 6))
injury_counts.plot(kind='bar', color='steelblue', ax=ax)

# Title, axis labels and slanted tick labels so long category names fit.
ax.set_title("Frequency of Claim Injury Types", fontsize=14)
ax.set_xlabel("Injury Type", fontsize=12)
ax.set_ylabel("Number of Claims", fontsize=12)
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()



In [None]:
# Show injury types from most to least frequent on the x axis.
injury_order = LLM_data["claim_injury_type"].value_counts().index

# Grouped count plot: one bar per gender within each injury type.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=LLM_data,
    x="claim_injury_type",
    hue="gender",
    order=injury_order,
    ax=ax,
)
ax.set_title("Claim Injury Type by Gender")
plt.xticks(rotation=45, ha='right')
ax.set_xlabel("Injury Type")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()


In [None]:
# Cross-tabulate districts (rows) against injury types (columns). Each cell
# counts the non-null claim identifiers; combinations that never occur are 0.
pivot_table = LLM_data.pivot_table(
    values="claim_identifier",
    index="district_name",
    columns="claim_injury_type",
    aggfunc='count',
    fill_value=0,
)

# Render the counts as a heatmap.
fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(pivot_table, cmap="Blues", linewidths=0.5, ax=ax)
ax.set_title("Heatmap: Claim Injury Type by District")
ax.set_xlabel("Injury Type")
ax.set_ylabel("District")
fig.tight_layout()
plt.show()


In [None]:
# Step 1: Convert 'age_at_injury' to numeric; unparseable values become NaN.
age_numeric = pd.to_numeric(LLM_data['age_at_injury'], errors='coerce')

# Step 2: Drop rows where age is missing (alternative: fill with the median).
# `assign` returns a new frame, so no column is written onto a slice of the
# raw data (avoids pandas' chained-assignment warning).
# NOTE: LLM_data is rebound here, so every later cell only sees rows that
# have a usable age.
LLM_data = LLM_data.assign(age_at_injury=age_numeric).dropna(subset=['age_at_injury'])

# Step 3: Define age bins and labels. Bins are left-closed (right=False
# below). The last edge is open-ended so ages of 100 and above land in '75+'
# instead of silently getting no group.
bins = [0, 18, 30, 45, 60, 75, float('inf')]
labels = ['<18', '18–29', '30–44', '45–59', '60–74', '75+']

# Step 4: Create age group column
LLM_data = LLM_data.assign(
    age_group=pd.cut(LLM_data['age_at_injury'], bins=bins, labels=labels, right=False)
)

# Step 5: Check result
print(LLM_data['age_group'].value_counts().sort_index())

# Step 6 (Optional): Visualize age group distribution
plt.figure(figsize=(10, 5))
sns.countplot(data=LLM_data, x='age_group', order=labels, palette='viridis')
plt.title("Number of Claims per Age Group")
plt.xlabel("Age Group")
plt.ylabel("Number of Claims")
plt.tight_layout()
plt.show()




In [None]:
# Calendar order for the month axis (a plain alphabetical sort would scramble it).
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

# Ensure date is in datetime format; unparseable values become NaT.
accident_dates = pd.to_datetime(LLM_data['accident_date'], errors='coerce')

# Store the parsed date and an ordered-categorical month name. `assign`
# builds a new frame instead of writing columns onto a possibly sliced one,
# which avoids pandas' chained-assignment warning.
LLM_data = LLM_data.assign(
    accident_date=accident_dates,
    accident_month=pd.Categorical(
        accident_dates.dt.month_name(),
        categories=month_order,
        ordered=True,
    ),
)

# Count number of claims per month, in calendar order
monthly_claims = LLM_data['accident_month'].value_counts().sort_index()

# Plot
plt.figure(figsize=(12, 6))
sns.barplot(x=monthly_claims.index, y=monthly_claims.values, palette='crest')
plt.title('Number of Claims by Month')
plt.xlabel('Month')
plt.ylabel('Number of Claims')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Calendar order for the heatmap rows.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

# Step 1: Ensure accident_date is in datetime format (unparseable -> NaT).
# This is repeated from the monthly bar chart so the cell can run on its own.
accident_dates = pd.to_datetime(LLM_data['accident_date'], errors='coerce')

# Step 2: Extract and categorize month names. `assign` builds a new frame
# rather than writing columns onto a possibly sliced one.
LLM_data = LLM_data.assign(
    accident_date=accident_dates,
    accident_month=pd.Categorical(
        accident_dates.dt.month_name(),
        categories=month_order,
        ordered=True,
    ),
)

# Step 3: Create pivot table (rows: months, columns: injury types, values: counts)
pivot = pd.pivot_table(
    LLM_data,
    index='accident_month',
    columns='claim_injury_type',
    values='claim_identifier',  # could use any consistent non-null column
    aggfunc='count',
    fill_value=0,
    # The index is categorical: state explicitly that months without any
    # claims stay as rows, instead of relying on the deprecated implicit
    # default for `observed`.
    observed=False,
)

# Step 4: Plot the heatmap
plt.figure(figsize=(14, 8))
sns.heatmap(pivot, cmap='YlGnBu', linewidths=0.5, annot=False)
plt.title('Heatmap: Claim Injury Type by Month')
plt.xlabel('Claim Injury Type')
plt.ylabel('Accident Month')
plt.tight_layout()
plt.show()
