# EDA_Statistical_Thinking

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide presentation settings: seaborn grid theme for plots and a
# wide console layout so printed DataFrames are not truncated or wrapped early.
sns.set_style('whitegrid')
for _opt, _val in (('display.max_columns', 100), ('display.width', 1000)):
    pd.set_option(_opt, _val)

# Load the raw insurance claims dataset into `data`.
# The path is bound before the `try` so the error message can always refer to
# it, and the guarded body contains only the call that can actually fail.
file_path = '../insurance_claims_data.csv'
try:
    data = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check your file path.")
    # Nothing below can run without `data`; re-raise so execution stops here
    # with the real cause instead of a confusing NameError on `data.head()`.
    raise
else:
    print("Data loaded successfully.")

print("\n--- Initial Data Snapshot ---")
print(data.head())


## Section 2: Data Structure and Quality Assessment (Task 1.2)

### 2.1 Data Structure (KPI: Data Structure)
print("\n--- Data Information (Data Types and Non-Null Counts) ---")
data.info()

# Parse the raw transaction-date strings into datetimes, store them under the
# cleaner name 'TransactionDate' (appended as the last column), and discard
# the original string column.
# NOTE(review): assumes the CSV header is exactly 'The transaction date' — confirm.
raw_date_col = 'The transaction date'
data = (
    data
    .assign(TransactionDate=pd.to_datetime(data[raw_date_col]))
    .drop(columns=[raw_date_col])
)
print("\n'The transaction date' converted to datetime and renamed to 'TransactionDate'")

### 2.2 Data Quality Assessment (KPI: Missing Values)
print("\n--- Missing Value Check ---")
# Per-column null counts, restricted to columns that actually have gaps and
# ordered from most to fewest missing entries.
null_counts = data.isna().sum()
missing_data = null_counts.loc[null_counts > 0].sort_values(ascending=False)
print(missing_data)

### 2.3 Descriptive Statistics (KPI: Variability)
print("\n--- Descriptive Statistics for Financial/Numerical Features ---")
# Key financial and numerical columns summarised with describe().
numerical_cols = [
    'TotalPremium',
    'TotalClaims',
    'SumInsured',
    'CustomValueEstimate',
    'Cubiccapacity',
    'Kilowatts',
]
print(data.loc[:, numerical_cols].describe())

# Variability check: standard deviation of the two headline financial columns.
spread_by_col = {col: data[col].std() for col in ('TotalPremium', 'TotalClaims')}
premium_std = spread_by_col['TotalPremium']
claims_std = spread_by_col['TotalClaims']
print("\nVariability (Standard Deviation):")
for col_name, col_std in spread_by_col.items():
    print(f"{col_name}: {col_std:,.2f}")


## Section 3: Exploratory Data Analysis (EDA) - Initial Insights

### 3.1 Overall Loss Ratio Calculation
# Portfolio-level loss ratio: the sum of all claims divided by the sum of all
# premiums (computed on the totals, not as an average of per-row ratios).
total_premium, total_claims = (
    data[col].sum() for col in ('TotalPremium', 'TotalClaims')
)
overall_loss_ratio = total_claims / total_premium
loss_ratio_pct = overall_loss_ratio * 100
print("\n--- Overall Portfolio Loss Ratio ---")
print(f"Overall Loss Ratio: {overall_loss_ratio:.4f} (or {loss_ratio_pct:.2f}%)")

# --- Continue your EDA here with the required Univariate, Bivariate, and Multivariate analysis ---
# E.g., Calculating Loss Ratio by Province, plotting histograms, etc.

# You can start calculating loss ratio by Province here:
print("\n--- Loss Ratio by Province ---")
# Sum premium and claims within each province, derive the per-province loss
# ratio from those totals, and rank provinces from highest to lowest ratio.
province_totals = (
    data.groupby('Province', as_index=False)[['TotalPremium', 'TotalClaims']].sum()
)
province_risk = (
    province_totals
    .assign(LossRatio=lambda t: t['TotalClaims'] / t['TotalPremium'])
    .sort_values(by='LossRatio', ascending=False)
)
print(province_risk[['Province', 'LossRatio']])

# You would then visualize this as one of your required plots.

#