In [3]:
import pandas as pd
from io import StringIO, BytesIO
from datetime import datetime, timedelta
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import base64

# --- ML Model Imports ---
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# --- Statistical Test Imports ---
from scipy import stats


# --- Section 1: Data Loading and Merging ---
# This section loads the two provided CSV files and merges them into a single DataFrame.

# Fetch the content of the CSV files from the environment
# Note: In a local Jupyter notebook, you would use pd.read_csv('claims.csv')
# and pd.read_csv('cust_demographics.csv') directly.
# Here, we simulate fetching file content as if from an external source.


# Load the datasets into pandas DataFrames
claims_df = pd.read_csv(r"C:\Users\DELL\OneDrive\Desktop\assignment\Case Study 3 - Insurance Claims Case Study\claims.csv")
cust_demographics_df = pd.read_csv(r"C:\Users\DELL\OneDrive\Desktop\assignment\Case Study 3 - Insurance Claims Case Study\cust_demographics.csv")

# Combine the two datasets appropriately using 'customer_id' as the common column.
# An inner merge ensures that only customers present in both datasets are included,
# creating a comprehensive 360-degree view of the data.
combined_df = pd.merge(claims_df, cust_demographics_df, on='customer_id', how='inner')

print("--- Data Loading and Merging Complete ---")
print(f"Combined DataFrame shape: {combined_df.shape}\n")


# --- Section 2: Data Audit ---
# This section performs a data audit to understand the structure, data types,
# and presence of missing values and unique values in the combined dataset.

data_audit = pd.DataFrame({
    'Column': combined_df.columns,
    'Dtype_Before': combined_df.dtypes,
    'Non_Null_Count': combined_df.notnull().sum(),
    'Null_Count': combined_df.isnull().sum(),
    'Unique_Values': combined_df.nunique()
})

print("--- Data Audit (Sample) ---")
print(data_audit.to_markdown(index=False))
print("\nObservations from Data Audit:")
print("- `claim_amount` and `total_policy_claims` are `object` type, likely due to non-numeric characters or 'NA' values.")
print("- `claim_date` and `DateOfBirth` are `object` type and need conversion to `datetime` for date calculations.")
print("- Missing values are present in `claim_amount` and `total_policy_claims`.\n")


# --- Section 3: Data Cleaning and Transformation ---
# This section addresses data quality issues and transforms columns as required
# for subsequent analysis.

# 3. Convert the column 'claim_amount' to numeric.
# Remove the '$' sign and then convert to float. 'errors=coerce' will turn
# any values that cannot be converted into NaN.
combined_df['claim_amount'] = combined_df['claim_amount'].astype(str).str.replace('$', '', regex=False)
combined_df['claim_amount'] = pd.to_numeric(combined_df['claim_amount'], errors='coerce')

# 4. Of all the injury claims, some of them have gone unreported with the police.
# Create an alert flag (1,0) for all such claims.
# First, standardize 'claim_category' based on 'claim_type' if 'injury' is present.
combined_df['claim_category'] = combined_df['claim_type'].apply(lambda x: 'Injury' if 'injury' in str(x).lower() else 'Material')

# Initialize the flag column with 0 (not unreported)
combined_df['unreported_injury_flag'] = 0
# Set the flag to 1 where claim_category is 'Injury' AND police_report is 'No'
combined_df.loc[(combined_df['claim_category'] == 'Injury') & (combined_df['police_report'] == 'No'), 'unreported_injury_flag'] = 1
print("--- Unreported Injury Flag Created ---")
print(f"Number of unreported injury claims: {combined_df['unreported_injury_flag'].sum()}\n")

# 5. Retain the most recent observation and delete any duplicated records in the data
# based on the customer ID column.
# Convert 'claim_date' to datetime for proper sorting.
# Using format='%m/%d/%Y' to handle the specific date format in the CSV.
combined_df['claim_date'] = pd.to_datetime(combined_df['claim_date'], errors='coerce', format='%m/%d/%Y')

# Sort by customer_id and then by claim_date in descending order to ensure
# the most recent claim comes first for each customer.
combined_df = combined_df.sort_values(by=['customer_id', 'claim_date'], ascending=[True, False])

# Drop duplicates based on 'customer_id', keeping the first occurrence (which is the most recent).
initial_rows = combined_df.shape[0]
combined_df = combined_df.drop_duplicates(subset='customer_id', keep='first')
print("--- Duplicate Customer Records Handled ---")
print(f"Removed {initial_rows - combined_df.shape[0]} duplicate customer records, keeping the most recent claim.\n")

# 6. Check for missing values and impute the missing values with an appropriate value.
# (mean for continuous and mode for categorical)
print("--- Imputing Missing Values ---")
for column in combined_df.columns:
    if combined_df[column].isnull().sum() > 0:
        if pd.api.types.is_numeric_dtype(combined_df[column]):
            # Impute with mean for numeric columns
            mean_val = combined_df[column].mean()
            combined_df[column].fillna(mean_val, inplace=True)
            print(f"  - Imputed numeric column '{column}' with mean: {mean_val:.2f}")
        else:
            # Impute with mode for categorical columns
            mode_val = combined_df[column].mode()[0]
            combined_df[column].fillna(mode_val, inplace=True)
            print(f"  - Imputed categorical column '{column}' with mode: '{mode_val}'")
print("Missing value imputation complete.\n")

# 7. Calculate the age of customers in years. Based on the age, categorize the customers.
# Convert 'DateOfBirth' to datetime.
# Using format='%d-%b-%y' to handle the specific date format in the CSV.
combined_df['DateOfBirth'] = pd.to_datetime(combined_df['DateOfBirth'], errors='coerce', format='%d-%b-%y')

# Use a fixed reference date for age calculation, as specified in question 9.
reference_date = pd.to_datetime('2018-10-01')

# Calculate age in years. Using 365.25 for leap years.
combined_df['age'] = (reference_date - combined_df['DateOfBirth']).dt.days / 365.25

# Define a function to categorize age based on the given criteria.
def categorize_age(age):
    if age < 18:
        return 'Children'
    elif 18 <= age < 30:
        return 'Youth'
    elif 30 <= age < 60:
        return 'Adult'
    else:
        return 'Senior'

# Apply the categorization function to create the 'age_category' column.
combined_df['age_category'] = combined_df['age'].apply(categorize_age)
print("--- Age Calculated and Categorized ---")
print("Age categories created: Children (<18), Youth (18-30), Adult (30-60), Senior (>60).\n")


# --- Section 4: Data Analysis Questions ---
# This section answers specific analytical questions using the cleaned and transformed data.

# 8. What is the average amount claimed by the customers from various segments?
avg_claim_by_segment = combined_df.groupby('age_category')['claim_amount'].mean().reset_index()
# Define a specific order for age categories for consistent display
age_order = ['Children', 'Youth', 'Adult', 'Senior']
avg_claim_by_segment['age_category'] = pd.Categorical(avg_claim_by_segment['age_category'], categories=age_order, ordered=True)
avg_claim_by_segment = avg_claim_by_segment.sort_values('age_category')

print("--- Average Claim Amount by Age Segment ---")
print(avg_claim_by_segment.to_markdown(index=False))
print("\n")

# 9. What is the total claim amount based on incident cause for all the claims that have been
# done at least 20 days prior to 1st of October, 2018.
filter_date = pd.to_datetime('2018-10-01') - pd.Timedelta(days=20)
filtered_claims = combined_df[combined_df['claim_date'] <= filter_date]
total_claim_by_incident_cause = filtered_claims.groupby('incident_cause')['claim_amount'].sum().reset_index()

print("--- Total Claim Amount by Incident Cause (Filtered by Date) ---")
print(total_claim_by_incident_cause.to_markdown(index=False))
print("\n")

# 10. How many adults from TX, DE and AK claimed insurance for driver related issues and causes?
# Based on the data audit and common incident causes, 'Driver error' and 'Other driver error'
# are identified as driver-related issues.
driver_related_causes = ['Driver error', 'Other driver error']

adults_filtered = combined_df[
    (combined_df['age_category'] == 'Adult') &
    (combined_df['State'].isin(['TX', 'DE', 'AK'])) &
    (combined_df['incident_cause'].isin(driver_related_causes))
]
# Count unique customer IDs to get the number of distinct adults.
num_adults_driver_issues = adults_filtered['customer_id'].nunique()

print(f"--- Number of Adults from TX, DE, AK with Driver-Related Issues: {num_adults_driver_issues} ---\n")


# --- Section 5: Data Visualization ---
# This section generates the requested charts using Matplotlib and Seaborn.
# Each chart is saved as a base64 encoded PNG image for display.

# Dictionary to store base64 encoded images
images = {}

# 11. Draw a pie chart between the aggregated value of claim amount based on gender and segment.
# Represent the claim amount as a percentage on the pie chart.
claim_by_gender_segment = combined_df.groupby(['gender', 'age_category'])['claim_amount'].sum().reset_index()

plt.figure(figsize=(10, 8))
# Calculate percentage for each slice for direct display on labels
total_claim_amount_pie = claim_by_gender_segment['claim_amount'].sum()
percentages_pie = (claim_by_gender_segment['claim_amount'] / total_claim_amount_pie) * 100
# Create labels including gender, age category, and percentage
labels_pie = [f"{g} - {s} ({p:.1f}%)" for g, s, p in zip(claim_by_gender_segment['gender'], claim_by_gender_segment['age_category'], percentages_pie)]

plt.pie(claim_by_gender_segment['claim_amount'], labels=labels_pie, autopct='%1.1f%%', startangle=90, pctdistance=0.85)
plt.title('Total Claim Amount by Gender and Age Segment')
plt.axis('equal') # Ensures the pie chart is circular.
plt.tight_layout() # Adjusts plot to ensure everything fits without overlapping.
pie_chart_img = BytesIO()
plt.savefig(pie_chart_img, format='png')
pie_chart_img.seek(0)
images['pie_chart'] = base64.b64encode(pie_chart_img.read()).decode('utf-8')
plt.close() # Close the plot to free up memory

print("--- Pie Chart Generated (Base64 Encoded) ---")


# 12. Among males and females, which gender had claimed the most for any type of driver related issues?
# This metric can be compared using a bar chart.
driver_claims_by_gender = combined_df[combined_df['incident_cause'].isin(driver_related_causes)]
total_driver_claims_gender = driver_claims_by_gender.groupby('gender')['claim_amount'].sum().reset_index()

plt.figure(figsize=(8, 6))
sns.barplot(x='gender', y='claim_amount', data=total_driver_claims_gender, palette='viridis')
plt.title('Total Claim Amount for Driver-Related Issues by Gender')
plt.xlabel('Gender')
plt.ylabel('Total Claim Amount')
plt.tight_layout()
bar_gender_driver_img = BytesIO()
plt.savefig(bar_gender_driver_img, format='png')
bar_gender_driver_img.seek(0)
images['bar_gender_driver'] = base64.b64encode(bar_gender_driver_img.read()).decode('utf-8')
plt.close()

print("--- Bar Chart (Driver Claims by Gender) Generated (Base64 Encoded) ---")


# 13. Which age group had the maximum fraudulent policy claims? Visualize it on a bar chart.
# Assuming 'fraudulent' is the column indicating fraudulent claims ('Yes'/'No').
# Convert 'Yes' to 1 and 'No' to 0 for numerical analysis.
combined_df['fraudulent_policy'] = combined_df['fraudulent'].map({'Yes': 1, 'No': 0})

# Filter for fraudulent claims only
fraudulent_claims_by_age = combined_df[combined_df['fraudulent_policy'] == 1]
# Count the number of fraudulent claims per age category
fraudulent_claims_count_by_age = fraudulent_claims_by_age.groupby('age_category').size().reset_index(name='fraud_claim_count')

# Ensure age categories are in a sensible order for plotting
fraudulent_claims_count_by_age['age_category'] = pd.Categorical(fraudulent_claims_count_by_age['age_category'], categories=age_order, ordered=True)
fraudulent_claims_count_by_age = fraudulent_claims_count_by_age.sort_values('age_category')

plt.figure(figsize=(8, 6))
sns.barplot(x='age_category', y='fraud_claim_count', data=fraudulent_claims_count_by_age, palette='plasma')
plt.title('Number of Fraudulent Claims by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Number of Fraudulent Claims')
plt.tight_layout()
bar_fraud_age_img = BytesIO()
plt.savefig(bar_fraud_age_img, format='png')
bar_fraud_age_img.seek(0)
images['bar_fraud_age'] = base64.b64encode(bar_fraud_age_img.read()).decode('utf-8')
plt.close()

print("--- Bar Chart (Fraudulent Claims by Age) Generated (Base64 Encoded) ---")


# 14. Visualize the monthly trend of the total amount that has been claimed by the customers.
# Ensure that on the “month” axis, the month is in a chronological order not alphabetical order.
# Extract month and year as a Period object for chronological sorting
combined_df['claim_month'] = combined_df['claim_date'].dt.to_period('M')
# Group by month and sum claim amounts
monthly_claim_trend = combined_df.groupby('claim_month')['claim_amount'].sum().reset_index()
# Convert Period to string for plotting, preserving chronological order
monthly_claim_trend['claim_month_str'] = monthly_claim_trend['claim_month'].astype(str)

# Sort chronologically by the Period object, then use the string for plotting
monthly_claim_trend = monthly_claim_trend.sort_values('claim_month')

plt.figure(figsize=(12, 6))
sns.lineplot(x='claim_month_str', y='claim_amount', data=monthly_claim_trend, marker='o')
plt.title('Monthly Trend of Total Claim Amount')
plt.xlabel('Month')
plt.ylabel('Total Claim Amount')
plt.xticks(rotation=45) # Rotate x-axis labels for better readability
plt.grid(True, linestyle='--', alpha=0.7) # Add a grid for better readability
plt.tight_layout()
line_monthly_trend_img = BytesIO()
plt.savefig(line_monthly_trend_img, format='png')
line_monthly_trend_img.seek(0)
images['line_monthly_trend'] = base64.b64encode(line_monthly_trend_img.read()).decode('utf-8')
plt.close()

print("--- Line Chart (Monthly Trend) Generated (Base64 Encoded) ---")


# 15. What is the average claim amount for gender and age categories and suitably represent the above
# using a facetted bar chart, one facet that represents fraudulent claims and the other for non-fraudulent claims.
avg_claim_gender_age_fraud = combined_df.groupby(['gender', 'age_category', 'fraudulent_policy'])['claim_amount'].mean().reset_index()

# Map the binary 'fraudulent_policy' (0/1) to descriptive labels for the facets.
avg_claim_gender_age_fraud['fraud_status'] = avg_claim_gender_age_fraud['fraudulent_policy'].map({1: 'Fraudulent Claims', 0: 'Non-Fraudulent Claims'})

# Ensure age categories are ordered correctly for plotting in the facets.
avg_claim_gender_age_fraud['age_category'] = pd.Categorical(avg_claim_gender_age_fraud['age_category'], categories=age_order, ordered=True)
# Sort for consistent plotting order across facets.
avg_claim_gender_age_fraud = avg_claim_gender_age_fraud.sort_values(['fraud_status', 'gender', 'age_category'])

# Create the facetted bar chart using Seaborn's catplot.
g = sns.catplot(x='age_category', y='claim_amount', hue='gender', col='fraud_status',
                data=avg_claim_gender_age_fraud, kind='bar', height=6, aspect=1.2, palette='muted',
                order=age_order) # Use the predefined order for x-axis categories.
g.set_axis_labels("Age Category", "Average Claim Amount") # Set axis labels.
g.set_titles(col_template="Fraud Status: {col_name}") # Set titles for each facet.
g.fig.suptitle('Average Claim Amount by Gender, Age Category, and Fraud Status', y=1.02) # Main title for the entire figure.
plt.tight_layout()
facet_bar_img = BytesIO()
plt.savefig(facet_bar_img, format='png')
facet_bar_img.seek(0)
images['facet_bar'] = base64.b64encode(facet_bar_img.read()).decode('utf-8')
plt.close()

print("--- Facetted Bar Chart Generated (Base64 Encoded) ---")


# --- Section 6: Output Results ---
# This section prints the base64 encoded images, which can be embedded in HTML
# or Markdown to display the plots.

# import json
# print("\n--- Plot Images (Base64) ---")
# # Print the dictionary of base64 encoded images as a JSON string.
# # This allows for easy parsing and embedding of the images in the output.
# print(json.dumps(images))


# --- Section 7: Machine Learning Model for Fraud Prediction ---
# This section builds and evaluates machine learning models to predict 'fraudulent_policy'.

print("\n--- Section 7: Machine Learning Model for Fraud Prediction ---")

# Define features (X) and target (y)
# Drop columns that are IDs, dates, or directly related to the target in a way that would cause data leakage.
# Also drop columns that are not useful for prediction or are redundant after feature engineering.
X = combined_df.drop(columns=[
    'customer_id', 'claim_id', 'claim_date', 'DateOfBirth', 'fraudulent',
    'fraudulent_policy', 'Contact', 'claim_type' # 'claim_type' is used to derive 'claim_category'
])
y = combined_df['fraudulent_policy']

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}\n")

# Identify categorical and numerical features for preprocessing
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numerical_features = X.select_dtypes(include=np.number).columns

print(f"Categorical features: {list(categorical_features)}")
print(f"Numerical features: {list(numerical_features)}\n")

# Create a preprocessing pipeline
# Numerical features will be scaled.
# Categorical features will be one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}\n")

# --- Model 1: Logistic Regression ---
print("--- Training Logistic Regression Model ---")
lr_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', LogisticRegression(solver='liblinear', random_state=42))])
lr_pipeline.fit(X_train, y_train)
y_pred_lr = lr_pipeline.predict(X_test)
y_prob_lr = lr_pipeline.predict_proba(X_test)[:, 1]

print("\nLogistic Regression Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_lr):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_lr):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred_lr):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_prob_lr):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))

# --- Model 2: Random Forest Classifier ---
print("\n--- Training Random Forest Classifier Model ---")
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', RandomForestClassifier(random_state=42))])
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)
y_prob_rf = rf_pipeline.predict_proba(X_test)[:, 1]

print("\nRandom Forest Classifier Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_rf):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_rf):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred_rf):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_prob_rf):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


# --- Hyperparameter Tuning (Random Forest Example) ---
print("\n--- Hyperparameter Tuning (Random Forest Example using GridSearchCV) ---")

# Define parameter grid for Random Forest
# Reduced number of parameters for quicker demonstration
param_grid_rf = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [None, 10],
    'classifier__min_samples_leaf': [1, 5]
}

# Use GridSearchCV for exhaustive search
grid_search_rf = GridSearchCV(rf_pipeline, param_grid_rf, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)
grid_search_rf.fit(X_train, y_train)

print(f"\nBest parameters found by GridSearchCV: {grid_search_rf.best_params_}")
print(f"Best ROC AUC score on training data: {grid_search_rf.best_score_:.4f}")

# Evaluate the best model on the test set
best_rf_model = grid_search_rf.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)
y_prob_best_rf = best_rf_model.predict_proba(X_test)[:, 1]

print("\nBest Random Forest Model Evaluation (after tuning):")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best_rf):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_best_rf):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_best_rf):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred_best_rf):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_prob_best_rf):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_best_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best_rf))

print("\n--- Machine Learning Section Complete ---")


# --- Section 8: Hypothesis Testing ---
# This section performs various hypothesis tests as requested.

print("\n--- Section 8: Hypothesis Testing ---")
hypothesis_results = {}
alpha = 0.05 # Significance level

# 16. Is there any similarity in the amount claimed by males and females?
# Parameters: claim_amount (numerical), gender (categorical, 2 groups)
# Test: Independent Two-Sample t-test
males_claims = combined_df[combined_df['gender'] == 'Male']['claim_amount']
females_claims = combined_df[combined_df['gender'] == 'Female']['claim_amount']

# Perform Levene's test for equality of variances
levene_stat, levene_p = stats.levene(males_claims, females_claims)
equal_var = True if levene_p > alpha else False # If p-value > alpha, assume equal variances

t_stat_gender, p_val_gender = stats.ttest_ind(males_claims, females_claims, equal_var=equal_var)

hypothesis_results['gender_claim_amount'] = {
    'test': 'Independent Two-Sample t-test',
    'H0': 'The average claim amount for males is equal to the average claim amount for females.',
    'H1': 'The average claim amount for males is not equal to the average claim amount for females.',
    'alpha': alpha,
    't_statistic': t_stat_gender,
    'p_value': p_val_gender,
    'equal_variance_assumed': equal_var
}
print(f"\n16. Similarity in amount claimed by males and females:")
print(f"   - T-statistic: {t_stat_gender:.4f}, P-value: {p_val_gender:.4f}")

# 17. Is there any relationship between age category and segment?
# Parameters: age_category (categorical), Segment (categorical)
# Test: Chi-squared test of independence
contingency_table_age_segment = pd.crosstab(combined_df['age_category'], combined_df['Segment'])
chi2_stat_age_segment, p_val_age_segment, dof_age_segment, expected_age_segment = stats.chi2_contingency(contingency_table_age_segment)

hypothesis_results['age_category_segment_relationship'] = {
    'test': 'Chi-squared test of independence',
    'H0': 'There is no relationship (independence) between age category and segment.',
    'H1': 'There is a relationship (dependence) between age category and segment.',
    'alpha': alpha,
    'chi2_statistic': chi2_stat_age_segment,
    'p_value': p_val_age_segment,
    'degrees_of_freedom': dof_age_segment
}
print(f"\n17. Relationship between age category and segment:")
print(f"   - Chi2-statistic: {chi2_stat_age_segment:.4f}, P-value: {p_val_age_segment:.4f}")

# 18. The current year has shown a significant rise in claim amounts as compared to 2016-17 fiscal average which was $10,000.
# Parameters: Average claim amount in 2018 (numerical), known population mean ($10,000)
# Test: One-Sample t-test (one-tailed)
# Assuming "current year" refers to claims in 2018.
claims_2018 = combined_df[combined_df['claim_date'].dt.year == 2018]['claim_amount']
pop_mean = 10000

# Perform one-sample t-test. 'alternative='greater'' for a one-tailed test (significant rise).
t_stat_2018, p_val_2018 = stats.ttest_1samp(claims_2018, pop_mean, alternative='greater')

hypothesis_results['2018_claim_rise'] = {
    'test': 'One-Sample t-test (one-tailed)',
    'H0': f'The average claim amount in 2018 is equal to or less than ${pop_mean}.',
    'H1': f'The average claim amount in 2018 is significantly greater than ${pop_mean}.',
    'alpha': alpha,
    't_statistic': t_stat_2018,
    'p_value': p_val_2018,
    'sample_mean_2018': claims_2018.mean()
}
print(f"\n18. Significant rise in claim amounts in 2018 vs. $10,000:")
print(f"   - Sample Mean (2018): {claims_2018.mean():.2f}")
print(f"   - T-statistic: {t_stat_2018:.4f}, P-value: {p_val_2018:.4f}")

# 19. Is there any difference between age groups and insurance claims?
# Parameters: claim_amount (numerical), age_category (categorical, >2 groups)
# Test: ANOVA (Analysis of Variance)
# Prepare data for ANOVA: list of claim amounts for each age category
age_groups_claims = [combined_df[combined_df['age_category'] == category]['claim_amount'] for category in age_order]

f_stat_anova, p_val_anova = stats.f_oneway(*age_groups_claims)

hypothesis_results['age_group_claim_difference'] = {
    'test': 'ANOVA (Analysis of Variance)',
    'H0': 'The average claim amounts are equal across all age groups.',
    'H1': 'At least one age group has a different average claim amount.',
    'alpha': alpha,
    'f_statistic': f_stat_anova,
    'p_value': p_val_anova
}
print(f"\n19. Difference between age groups and insurance claims:")
print(f"   - F-statistic: {f_stat_anova:.4f}, P-value: {p_val_anova:.4f}")

# 20. Is there any relationship between total number of policy claims and the claimed amount?
# Parameters: total_policy_claims (numerical), claim_amount (numerical)
# Test: Pearson Correlation
# Ensure 'total_policy_claims' is numeric. It was already imputed and converted in Section 3.
correlation_coeff, p_val_corr = stats.pearsonr(combined_df['total_policy_claims'], combined_df['claim_amount'])

hypothesis_results['policy_claims_claim_amount_relationship'] = {
    'test': 'Pearson Correlation Test',
    'H0': 'There is no linear relationship between total number of policy claims and the claimed amount.',
    'H1': 'There is a linear relationship between total number of policy claims and the claimed amount.',
    'alpha': alpha,
    'correlation_coefficient': correlation_coeff,
    'p_value': p_val_corr
}
print(f"\n20. Relationship between total number of policy claims and claimed amount:")
print(f"   - Pearson Correlation Coefficient: {correlation_coeff:.4f}, P-value: {p_val_corr:.4f}")

# Store hypothesis results for the markdown write-up
import json
print("\n--- Hypothesis Test Results (JSON) ---")
print(json.dumps(hypothesis_results, indent=2))


KeyError: 'customer_id'