In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: apply the 'seaborn-v0_8-darkgrid' matplotlib style first,
# then set seaborn's "husl" palette (both touch the colour cycle, so the order
# is kept as-is).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# LOAD DATA
# Read the insurance CSV (relative path) into `df`, which every later cell uses.

print("Loading data...")
df = pd.read_csv('insurance_data.csv')

# Quick sanity check: dimensions plus a preview of the first rows.
print("Dataset shape:", df.shape)
print("\nFirst few rows:", df.head(), sep="\n")


Loading data...
Dataset shape: (5000, 18)

First few rows:
   Age  Gender  Annual Income Marital Status  Number of Dependents  \
0   56    Male          59263        Married                     1   
1   69  Female         128491         Single                     0   
2   46  Female          75087       Divorced                     3   
3   32  Female         193353        Married                     1   
4   60  Female          71482        Married                     4   

  Education Level     Occupation  Health Score  Location    Policy Type  \
0             PhD       Employed            93  Suburban          Basic   
1      Bachelor's       Employed            43     Rural          Basic   
2             PhD     Unemployed            52  Suburban  Comprehensive   
3             PhD       Employed            77     Rural  Comprehensive   
4        Master's  Self-Employed            57  Suburban        Premium   

   Previous Claims  Vehicle Age  Credit Score  Insurance Duration  \


In [3]:
# 1. UNIVARIATE ANALYSIS - NUMERICAL FEATURES
# Draws one 30-bin histogram per numeric column and saves the grid as a PNG.

print("\n" + "="*60)
print("1. UNIVARIATE ANALYSIS - NUMERICAL FEATURES")
print("="*60)

numerical_cols = df.select_dtypes(include=[np.number]).columns

# Size the grid from the number of numeric columns rather than a fixed 5x4:
# a fixed grid leaves blank panels when there are fewer columns than cells and
# raises IndexError when there are more.
n_grid_cols = 4
n_hists = len(numerical_cols)
n_grid_rows = max(1, -(-n_hists // n_grid_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(16, 3.5 * n_grid_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    ax.hist(df[col].dropna(), bins=30, color='steelblue', edgecolor='black')
    ax.set_title(f'Distribution of {col}', fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.grid(alpha=0.3)

# Hide the trailing panels that have no column to show.
for ax in axes[n_hists:]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('01_numerical_distributions.png', dpi=300, bbox_inches='tight')
print("Saved: 01_numerical_distributions.png")
plt.close()


1. UNIVARIATE ANALYSIS - NUMERICAL FEATURES
Saved: 01_numerical_distributions.png


In [4]:
# 2. TARGET VARIABLE ANALYSIS
# Four-panel figure for 'Premium Amount': histogram with mean marker, box plot,
# histogram of log1p-transformed values, and a text panel of summary statistics.

print("\n" + "="*60)
print("2. TARGET VARIABLE ANALYSIS")
print("="*60)

premium = df['Premium Amount']
premium_valid = premium.dropna()
premium_mean = premium.mean()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
hist_ax, box_ax = axes[0]
log_ax, stats_ax = axes[1]

# Distribution of Premium Amount, with the mean drawn as a dashed red line
hist_ax.hist(premium_valid, bins=50, color='green', alpha=0.7, edgecolor='black')
hist_ax.set_title('Distribution of Premium Amount', fontweight='bold', fontsize=12)
hist_ax.set(xlabel='Premium Amount ($)', ylabel='Frequency')
hist_ax.axvline(premium_mean, color='red', linestyle='--', label=f'Mean: ${premium_mean:.2f}')
hist_ax.legend()

# Box plot of Premium Amount
box_ax.boxplot(premium_valid)
box_ax.set_title('Box Plot of Premium Amount', fontweight='bold', fontsize=12)
box_ax.set_ylabel('Premium Amount ($)')
box_ax.grid(alpha=0.3)

# Log transformation (np.log1p, i.e. log(1 + x))
log_ax.hist(np.log1p(premium_valid), bins=50, color='purple', alpha=0.7, edgecolor='black')
log_ax.set_title('Log-Transformed Premium Amount', fontweight='bold', fontsize=12)
log_ax.set(xlabel='Log(Premium Amount)', ylabel='Frequency')

# Statistics panel: text only, axis frame switched off
stats_text = "\n".join([
    "",
    "Premium Amount Statistics:",
    "─────────────────────────",
    f"Mean: ${premium.mean():,.2f}",
    f"Median: ${premium.median():,.2f}",
    f"Std Dev: ${premium.std():,.2f}",
    f"Min: ${premium.min():,.2f}",
    f"Max: ${premium.max():,.2f}",
    f"Skewness: {premium.skew():.3f}",
    "",
])
stats_ax.text(0.1, 0.5, stats_text, fontsize=11, family='monospace',
              verticalalignment='center',
              bbox={'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.5})
stats_ax.axis('off')

plt.tight_layout()
plt.savefig('02_target_analysis.png', dpi=300, bbox_inches='tight')
print("Saved: 02_target_analysis.png")
plt.close()


2. TARGET VARIABLE ANALYSIS
Saved: 02_target_analysis.png


In [5]:
# 3. CATEGORICAL FEATURES ANALYSIS
# Bar chart of category frequencies for up to nine object-dtype columns,
# laid out on a 3x3 grid and saved as a PNG.

print("\n" + "="*60)
print("3. CATEGORICAL FEATURES ANALYSIS")
print("="*60)

categorical_cols = df.select_dtypes(include=['object']).columns

fig, axes = plt.subplots(3, 3, figsize=(16, 12))
axes = axes.ravel()

# Pair each panel with one column; only the first nine columns are plotted.
for ax, col in zip(axes, categorical_cols[:9]):
    counts = df[col].value_counts()
    positions = range(len(counts))
    ax.bar(positions, counts.values, color='coral', edgecolor='black')
    ax.set_xticks(positions)
    ax.set_xticklabels(counts.index, rotation=45, ha='right')
    ax.set_title(f'Distribution of {col}', fontweight='bold')
    ax.set_ylabel('Count')
    ax.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('03_categorical_distributions.png', dpi=300, bbox_inches='tight')
print("Saved: 03_categorical_distributions.png")
plt.close()



3. CATEGORICAL FEATURES ANALYSIS
Saved: 03_categorical_distributions.png


In [6]:
# 4. CORRELATION ANALYSIS
# Computes pairwise correlations between numeric columns, prints each feature's
# correlation with 'Premium Amount', and saves a heatmap plus a bar chart.

print("\n" + "="*60)
print("4. CORRELATION ANALYSIS")
print("="*60)

# Select only numerical columns
numerical_df = df.select_dtypes(include=[np.number])
correlation_matrix = numerical_df.corr()

# Correlation with target, sorted in descending order
target_corr = correlation_matrix['Premium Amount'].sort_values(ascending=False)
print("\nCorrelation with Premium Amount:")
print(target_corr)

# Correlation heatmap
plt.figure(figsize=(14, 12))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Correlation Matrix - Insurance Dataset', fontweight='bold', fontsize=14)
plt.tight_layout()
plt.savefig('04_correlation_heatmap.png', dpi=300, bbox_inches='tight')
print("Saved: 04_correlation_heatmap.png")
plt.close()

# Feature correlations with target
fig, ax = plt.subplots(figsize=(10, 6))
# drop() by label is all that is needed to exclude the target's
# self-correlation. An additional [:-1] slice here would also discard the last
# entry of the descending-sorted series (the most negative correlation), so the
# strongest negative driver would be missing from the chart.
top_corr = target_corr.drop('Premium Amount')
# Green bars for positive correlations, red for zero or negative ones
colors = ['green' if x > 0 else 'red' for x in top_corr.values]
ax.barh(range(len(top_corr)), top_corr.values, color=colors, alpha=0.7, edgecolor='black')
ax.set_yticks(range(len(top_corr)))
ax.set_yticklabels(top_corr.index)
ax.set_xlabel('Correlation Coefficient')
ax.set_title('Feature Correlation with Premium Amount', fontweight='bold', fontsize=12)
ax.grid(alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig('05_target_correlation_top.png', dpi=300, bbox_inches='tight')
print("Saved: 05_target_correlation_top.png")
plt.close()



4. CORRELATION ANALYSIS

Correlation with Premium Amount:
Premium Amount          1.000000
Age                     0.383229
Previous Claims         0.313222
Annual Income           0.118725
Vehicle Age             0.003084
Number of Dependents    0.001864
Insurance Duration     -0.008021
Credit Score           -0.121157
Health Score           -0.273080
Name: Premium Amount, dtype: float64
Saved: 04_correlation_heatmap.png
Saved: 05_target_correlation_top.png


In [7]:
# 5. RELATIONSHIP BETWEEN KEY FEATURES AND TARGET
# 2x3 figure: the top row scatters three numeric features against
# 'Premium Amount'; the bottom row shows the mean premium per category for
# three categorical features.

print("\n" + "="*60)
print("5. KEY FEATURE vs TARGET RELATIONSHIPS")
print("="*60)

fig, axes = plt.subplots(2, 3, figsize=(16, 10))

# Top row: (column, x-axis label, panel title, marker colour)
scatter_specs = [
    ('Age', 'Age', 'Age vs Premium Amount', 'blue'),
    ('Annual Income', 'Annual Income ($)', 'Income vs Premium Amount', 'green'),
    ('Health Score', 'Health Score', 'Health Score vs Premium Amount', 'red'),
]
for ax, (feature, x_label, panel_title, colour) in zip(axes[0], scatter_specs):
    ax.scatter(df[feature], df['Premium Amount'], alpha=0.5, s=20, color=colour)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Premium Amount ($)')
    ax.set_title(panel_title, fontweight='bold')
    ax.grid(alpha=0.3)

# Mean premium per category, sorted ascending so bars grow left to right
policy_premium = df.groupby('Policy Type')['Premium Amount'].mean().sort_values()
location_premium = df.groupby('Location')['Premium Amount'].mean().sort_values()
smoking_premium = df.groupby('Smoking Status')['Premium Amount'].mean().sort_values()

# Bottom row: (mean-premium series, panel title, bar colour)
bar_specs = [
    (policy_premium, 'Average Premium by Policy Type', 'orange'),
    (location_premium, 'Average Premium by Location', 'purple'),
    (smoking_premium, 'Average Premium by Smoking Status', 'brown'),
]
for ax, (avg_premium, panel_title, colour) in zip(axes[1], bar_specs):
    ax.bar(avg_premium.index, avg_premium.values, color=colour, alpha=0.7, edgecolor='black')
    ax.set_ylabel('Average Premium Amount ($)')
    ax.set_title(panel_title, fontweight='bold')
    ax.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('06_feature_target_relationships.png', dpi=300, bbox_inches='tight')
print("Saved: 06_feature_target_relationships.png")
plt.close()



5. KEY FEATURE vs TARGET RELATIONSHIPS
Saved: 06_feature_target_relationships.png


In [8]:
# 6. MISSING VALUES ANALYSIS
# Tabulates null counts and percentages per column, prints the table, and saves
# a bar chart of the columns that have any nulls (or a placeholder message).

print("\n" + "="*60)
print("6. MISSING VALUES ANALYSIS")
print("="*60)

missing_data = df.isnull().sum()
missing_percent = missing_data / len(df) * 100
missing_df = pd.DataFrame({'Missing Count': missing_data, 'Percentage': missing_percent})
missing_df = missing_df.sort_values('Percentage', ascending=False)

print("\nMissing Values Summary:")
print(missing_df)

# Visualize missing values: keep only columns with a non-zero percentage
fig, ax = plt.subplots(figsize=(12, 6))
missing_percent_plot = missing_percent.sort_values(ascending=False)
missing_percent_plot = missing_percent_plot[missing_percent_plot > 0]

if missing_percent_plot.empty:
    # Nothing to chart: show a centred message on a blank canvas instead
    ax.text(0.5, 0.5, 'No Missing Values Found!', horizontalalignment='center',
            verticalalignment='center', fontsize=14, fontweight='bold')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')
else:
    bar_positions = range(len(missing_percent_plot))
    ax.barh(bar_positions, missing_percent_plot.values, color='salmon', alpha=0.7, edgecolor='black')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(missing_percent_plot.index)
    ax.set_xlabel('Percentage of Missing Values (%)')
    ax.set_title('Missing Values in Dataset', fontweight='bold', fontsize=12)
    ax.grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.savefig('07_missing_values.png', dpi=300, bbox_inches='tight')
print("Saved: 07_missing_values.png")
plt.close()


6. MISSING VALUES ANALYSIS

Missing Values Summary:
                      Missing Count  Percentage
Age                               0         0.0
Gender                            0         0.0
Annual Income                     0         0.0
Marital Status                    0         0.0
Number of Dependents              0         0.0
Education Level                   0         0.0
Occupation                        0         0.0
Health Score                      0         0.0
Location                          0         0.0
Policy Type                       0         0.0
Previous Claims                   0         0.0
Vehicle Age                       0         0.0
Credit Score                      0         0.0
Insurance Duration                0         0.0
Smoking Status                    0         0.0
Exercise Frequency                0         0.0
Property Type                     0         0.0
Premium Amount                    0         0.0
Saved: 07_missing_values.png


In [10]:
# 7. DATA QUALITY SUMMARY
# Builds a plain-text report (dimensions, dtype split, missing data, target
# statistics, IQR outlier count), prints it, and writes it to a text file.

print("\n" + "="*60)
print("7. DATA QUALITY SUMMARY")
print("="*60)

# Dataset dimensions and dtype split
n_records = len(df)
n_features = len(df.columns)
total_cells = n_records * n_features
n_numerical = len(df.select_dtypes(include=[np.number]).columns)
n_categorical = len(df.select_dtypes(include=['object']).columns)

# Missing-value totals
missing_per_feature = df.isnull().sum()
total_missing = missing_per_feature.sum()

# IQR outlier rule on the target: values beyond 1.5 * IQR outside the quartiles
premium_amount = df['Premium Amount']
q1 = premium_amount.quantile(0.25)
q3 = premium_amount.quantile(0.75)
iqr = q3 - q1
n_outliers = ((premium_amount > q3 + 1.5*iqr) | (premium_amount < q1 - 1.5*iqr)).sum()

quality_report = f"""
DATA QUALITY REPORT

Dataset Dimensions:
  • Total Records: {n_records:,}
  • Total Features: {n_features}
  • Numerical Features: {n_numerical}
  • Categorical Features: {n_categorical}

Data Type Distribution:
  • Numerical: {n_numerical} columns
  • Categorical: {n_categorical} columns

Missing Data:
  • Total Missing Values: {total_missing:,}
  • Percentage: {(total_missing / total_cells * 100):.2f}%
  • Features with Missing Values: {(missing_per_feature > 0).sum()}

Target Variable (Premium Amount):
  • Mean: ${premium_amount.mean():,.2f}
  • Median: ${premium_amount.median():,.2f}
  • Std Dev: ${premium_amount.std():,.2f}
  • Min: ${premium_amount.min():,.2f}
  • Max: ${premium_amount.max():,.2f}
  • Skewness: {premium_amount.skew():.3f}

Outliers (Using IQR method):
  • Premium Amount Outliers: {n_outliers:,}

Data Completeness: {((1 - (total_missing / total_cells)) * 100):.2f}%
"""

print(quality_report)

# Save report. The text contains the non-ASCII bullet character, so the
# encoding is pinned to UTF-8 rather than left to the platform default, which
# may be unable to encode it or may produce locale-dependent bytes.
with open('data_quality_report.txt', 'w', encoding='utf-8') as f:
    f.write(quality_report)
print("Saved: data_quality_report.txt")



7. DATA QUALITY SUMMARY

DATA QUALITY REPORT

Dataset Dimensions:
  • Total Records: 5,000
  • Total Features: 18
  • Numerical Features: 9
  • Categorical Features: 9

Data Type Distribution:
  • Numerical: 9 columns
  • Categorical: 9 columns

Missing Data:
  • Total Missing Values: 0
  • Percentage: 0.00%
  • Features with Missing Values: 0

Target Variable (Premium Amount):
  • Mean: $8,260.22
  • Median: $7,907.80
  • Std Dev: $3,050.95
  • Min: $2,186.95
  • Max: $20,139.02
  • Skewness: 0.635

Outliers (Using IQR method):
  • Premium Amount Outliers: 44

Data Completeness: 100.00%

Saved: data_quality_report.txt


In [11]:
# 8. FEATURE STATISTICS
# Prints describe() for the numeric columns and value counts for each
# categorical column. Relies on `numerical_cols` and `categorical_cols`
# having been defined by earlier cells.

banner = "=" * 60
print(f"\n{banner}")
print("8. DETAILED FEATURE STATISTICS")
print(banner)

print("\nNumerical Features Summary:")
numeric_summary = df[numerical_cols].describe()
print(numeric_summary.to_string())

print("\n\nCategorical Features Summary:")
for cat_col in categorical_cols:
    # Heading line followed by the frequency table for this column
    print(f"\n{cat_col}:", df[cat_col].value_counts(), sep="\n")



8. DETAILED FEATURE STATISTICS

Numerical Features Summary:
               Age  Annual Income  Number of Dependents  Health Score  Previous Claims  Vehicle Age  Credit Score  Insurance Duration  Premium Amount
count  5000.000000    5000.000000           5000.000000   5000.000000      5000.000000  5000.000000   5000.000000         5000.000000     5000.000000
mean     48.805600  109104.693000              2.483200     64.483600         2.452400     9.997000    567.783400           10.410600     8260.217333
std      17.906991   51560.773676              1.726822     20.066855         1.697619     6.079505    157.666657            5.725536     3050.953691
min      18.000000   20028.000000              0.000000     30.000000         0.000000     0.000000    300.000000            1.000000     2186.946000
25%      34.000000   64424.250000              1.000000     47.000000         1.000000     5.000000    430.750000            5.000000     5897.805405
50%      49.000000  108140.500000      

In [12]:
# SUMMARY
# Closing banner followed by a list of the generated artefacts and next steps.
banner = "=" * 60
print("\n" + banner)
print("EDA COMPLETE!")
print(banner)

# The message is assembled line by line; the leading and trailing empty strings
# give the blank first line and final newline, and the "  " entry is a line
# containing only two spaces.
summary_lines = [
    "",
    "Generated Visualizations:",
    "  1. 01_numerical_distributions.png",
    "  2. 02_target_analysis.png",
    "  3. 03_categorical_distributions.png",
    "  4. 04_correlation_heatmap.png",
    "  5. 05_target_correlation_top.png",
    "  6. 06_feature_target_relationships.png",
    "  7. 07_missing_values.png",
    "  ",
    "Reports:",
    "  • data_quality_report.txt",
    "",
    "Next Steps:",
    "  1. Review visualizations to understand data patterns",
    "  2. Identify features with highest correlation to target",
    "  3. Plan data preprocessing steps",
    "  4. Train ML models",
    "  5. Evaluate and deploy model",
    "",
]
print("\n".join(summary_lines))


EDA COMPLETE!

Generated Visualizations:
  1. 01_numerical_distributions.png
  2. 02_target_analysis.png
  3. 03_categorical_distributions.png
  4. 04_correlation_heatmap.png
  5. 05_target_correlation_top.png
  6. 06_feature_target_relationships.png
  7. 07_missing_values.png
  
Reports:
  • data_quality_report.txt

Next Steps:
  1. Review visualizations to understand data patterns
  2. Identify features with highest correlation to target
  3. Plan data preprocessing steps
  4. Train ML models
  5. Evaluate and deploy model

