In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid theme for every figure drawn in this notebook.
sns.set(style="whitegrid")

# Location of the raw insurance CSV. The literal is a raw string (r'...')
# because in a normal string the Windows separators are read as escape
# sequences: "\U" starts a 32-bit Unicode escape, which makes this cell fail
# with a SyntaxError, and "\a" / "\r" would silently become control characters.
# NOTE(review): this is a machine-specific absolute path; consider a path
# relative to the repository so the notebook can run on other machines.
DATA_PATH = r'C:\Users\YEADONAY\acis-insurance-analytics\data\raw\insurance_data.csv'

# Load the data
df = pd.read_csv(DATA_PATH)

# Show first few rows
df.head()


In [None]:
# Summary statistics of the numerical features, using describe() with its
# default arguments. As the cell's final bare expression, the resulting
# table is what the notebook renders as output.
df.describe()


In [None]:
# Check the dtype pandas holds for each column of df, to spot columns whose
# type needs converting before analysis.
df.dtypes


In [None]:
# Dataset dimensions, printed as a (rows, columns) tuple
print("Shape:", (len(df.index), len(df.columns)))

# Column labels as a plain Python list (rendered as the cell output)
list(df.columns)


In [None]:
# Missing-value report: count and percentage of nulls per column, with the
# most incomplete columns first.
# Both figures are combined into one table because a notebook cell renders
# only its final expression; written as two separate bare expressions, the
# counts were computed and then discarded without ever being displayed.
missing_mask = df.isnull()
missing_report = pd.DataFrame({
    'missing_count': missing_mask.sum(),        # number of null cells per column
    'missing_pct': missing_mask.mean() * 100,   # share of null cells, in percent
}).sort_values('missing_count', ascending=False)
missing_report


In [None]:
# Histograms (50 bins, with a KDE overlay) of three numeric columns, drawn
# side by side. Each entry pairs a column with the title of its panel.
hist_panels = [
    ('TotalPremium', 'Total Premium Distribution'),
    ('TotalClaims', 'Total Claims Distribution'),
    ('CustomValueEstimate', 'Custom Value Estimate'),
]

fig, axs = plt.subplots(1, 3, figsize=(18, 5))
for panel, (column, heading) in zip(axs, hist_panels):
    sns.histplot(df[column], bins=50, ax=panel, kde=True)
    panel.set_title(heading)
plt.tight_layout()


In [None]:
# Row counts per category for three categorical columns, one panel each.
# Every panel gets its own title so the figure can be read on its own, and
# the x tick labels are rotated so long or numerous category names do not
# overlap (matching the 45-degree rotation used by the other per-Province
# plots in this notebook).
count_panels = [
    ('Gender', 'Count by Gender'),
    ('Province', 'Count by Province'),
    ('VehicleType', 'Count by Vehicle Type'),
]

fig, axs = plt.subplots(1, 3, figsize=(18, 5))
for panel, (column, heading) in zip(axs, count_panels):
    sns.countplot(x=column, data=df, ax=panel)
    panel.set_title(heading)
    panel.tick_params(axis='x', labelrotation=45)
plt.tight_layout()


In [None]:
# Loss ratio per row = TotalClaims / TotalPremium.
# Rows whose premium is exactly 0 are masked to NaN before dividing: pandas
# does not raise on division by zero but returns +/-inf (or NaN for 0/0),
# and infinite ratios cannot be placed on the finite axis of the boxplots
# below. NaN ratios are simply treated as missing.
nonzero_premium = df['TotalPremium'].where(df['TotalPremium'] != 0)
df['LossRatio'] = df['TotalClaims'] / nonzero_premium

# Loss Ratio by Gender
fig, ax = plt.subplots()
sns.boxplot(x='Gender', y='LossRatio', data=df, ax=ax)
ax.set_title('Loss Ratio by Gender')
plt.show()

# Loss Ratio by Province (wider figure, rotated labels for the longer names)
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(x='Province', y='LossRatio', data=df, ax=ax)
ax.set_title('Loss Ratio by Province')
ax.tick_params(axis='x', labelrotation=45)
plt.show()


In [None]:
# Pairwise correlation (DataFrame.corr with default settings) between the
# three monetary columns and the derived LossRatio column, shown as an
# annotated heatmap.
heatmap_columns = ['TotalPremium', 'TotalClaims', 'CustomValueEstimate', 'LossRatio']
corr = df[heatmap_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# One vertical boxplot per numeric column, side by side, to expose outliers.
# Maps each column to the title of its panel (dict order = panel order).
outlier_panels = {
    'TotalPremium': 'Total Premium Outliers',
    'TotalClaims': 'Total Claims Outliers',
    'CustomValueEstimate': 'Custom Value Estimate Outliers',
}

fig, axs = plt.subplots(1, 3, figsize=(18, 5))
for panel, (column, heading) in zip(axs, outlier_panels.items()):
    sns.boxplot(y=df[column], ax=panel)
    panel.set_title(heading)

plt.tight_layout()
plt.show()


In [None]:
# Flag rows whose claim amount is positive; the mean of that boolean flag
# within a province is the fraction of its rows that carry a claim.
df['HasClaim'] = df['TotalClaims'].gt(0)
province_claim_rate = df.groupby('Province')['HasClaim'].mean()
claim_freq = province_claim_rate.sort_values()  # ascending: lowest rate first

# Bar chart of the per-province claim frequency
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(x=claim_freq.index, y=claim_freq.values, ax=ax)
ax.set_title('Claim Frequency by Province')
ax.set_ylabel('Frequency')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:
# Mean claim amount per vehicle make, ranked from highest to lowest; only
# the first ten makes of that ranking are kept for the chart.
avg_claims_by_make = df.groupby('Make')['TotalClaims'].mean()
ranked_makes = avg_claims_by_make.sort_values(ascending=False)
make_claims = ranked_makes.iloc[:10]

fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(x=make_claims.index, y=make_claims.values, ax=ax)
ax.set_title('Top 10 Vehicle Makes by Average Claims')
ax.set_ylabel('Avg Claims')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:
# Total premium and total claims per TransactionMonth value.
# The month column is parsed to datetime before grouping: the CSV is read
# without date parsing, so the column would otherwise be grouped as raw text,
# which sorts lexicographically and plots on a categorical axis instead of a
# real time axis. If the column is already datetime this is a no-op.
# NOTE(review): pd.to_datetime raises on values it cannot parse; confirm the
# column holds a consistent date format.
month_key = pd.to_datetime(df['TransactionMonth'])
monthly = df.groupby(month_key)[['TotalPremium', 'TotalClaims']].sum()

# Aggregate loss ratio per month. A month whose premium sums to 0 gets NaN
# (shown as a gap in the line) rather than an infinite ratio.
monthly_premium = monthly['TotalPremium'].where(monthly['TotalPremium'] != 0)
monthly['LossRatio'] = monthly['TotalClaims'] / monthly_premium

fig, ax = plt.subplots(figsize=(14, 5))
monthly['LossRatio'].plot(marker='o', ax=ax)
ax.set_title('Monthly Loss Ratio Over Time')
ax.set_xlabel('TransactionMonth')
ax.set_ylabel('Loss Ratio')
ax.grid(True)
plt.show()
