In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

print("="*60)
print("GENERATING SAMPLE INSURANCE DATA")
print("="*60)

# Set random seed for reproducibility
np.random.seed(42)

# Number of records
n_records = 1000  # Small sample (use 100,000+ for real training)

print(f"\nGenerating {n_records:,} sample records...")

# Create sample data
data = {
    # Personal Information
    'Age': np.random.randint(18, 80, n_records),
    'Gender': np.random.choice(['Male', 'Female'], n_records),
    'Marital Status': np.random.choice(['Single', 'Married', 'Divorced'], n_records),
    'Number of Dependents': np.random.randint(0, 6, n_records),
    'Education Level': np.random.choice(
        ["High School", "Bachelor's", "Master's", "PhD"], 
        n_records
    ),
    'Occupation': np.random.choice(
        ['Employed', 'Self-Employed', 'Unemployed'], 
        n_records
    ),
    
    # Financial Information
    'Annual Income': np.random.randint(20000, 200000, n_records),
    'Credit Score': np.random.randint(300, 850, n_records),
    
    # Health & Lifestyle
    'Health Score': np.random.randint(30, 100, n_records),
    'Smoking Status': np.random.choice(['Yes', 'No'], n_records),
    'Exercise Frequency': np.random.choice(
        ['Daily', 'Weekly', 'Monthly', 'Rarely'], 
        n_records
    ),
    
    # Policy Information
    'Policy Type': np.random.choice(['Basic', 'Comprehensive', 'Premium'], n_records),
    'Location': np.random.choice(['Urban', 'Suburban', 'Rural'], n_records),
    'Insurance Duration': np.random.randint(1, 21, n_records),
    'Vehicle Age': np.random.randint(0, 21, n_records),
    'Property Type': np.random.choice(['House', 'Apartment', 'Condo'], n_records),
    
    # Risk Factors
    'Previous Claims': np.random.randint(0, 6, n_records),
}

# Create DataFrame
df = pd.DataFrame(data)

base_premium = 2000

# Premium adjustments based on features
age_factor = (df['Age'] - 25) * 50  # Older = higher premium
income_factor = df['Annual Income'] / 100000 * 500  # Higher income = slightly higher
health_factor = (100 - df['Health Score']) * 30  # Lower health score = higher premium
credit_factor = (750 - df['Credit Score']) * 2  # Lower credit = higher premium
claims_factor = df['Previous Claims'] * 400  # More claims = higher premium

# Policy type multiplier
policy_multiplier = {
    'Basic': 0.8,
    'Comprehensive': 1.2,
    'Premium': 1.5
}
df['Policy_Multiplier'] = df['Policy Type'].map(policy_multiplier)

# Location multiplier
location_multiplier = {
    'Urban': 1.1,
    'Suburban': 1.0,
    'Rural': 0.9
}
df['Location_Multiplier'] = df['Location'].map(location_multiplier)

# Smoking multiplier
smoking_multiplier = {'Yes': 1.3, 'No': 1.0}
df['Smoking_Multiplier'] = df['Smoking Status'].map(smoking_multiplier)

# Calculate premium
df['Premium Amount'] = (
    base_premium + 
    age_factor + 
    income_factor + 
    health_factor + 
    credit_factor + 
    claims_factor
) * df['Policy_Multiplier'] * df['Location_Multiplier'] * df['Smoking_Multiplier']

# Add some randomness
df['Premium Amount'] += np.random.normal(0, 200, n_records)

# Ensure premium is positive
df['Premium Amount'] = df['Premium Amount'].clip(lower=500)

# Drop helper columns
df = df.drop(['Policy_Multiplier', 'Location_Multiplier', 'Smoking_Multiplier'], axis=1)

# Add some missing values to simulate real data (about 5%)
missing_percentage = 0.05
for col in df.select_dtypes(include=[np.number]).columns:
    if col != 'Premium Amount':  # Don't add missing values to target
        missing_indices = np.random.choice(
            df.index, 
            size=int(len(df) * missing_percentage),
            replace=False
        )
        df.loc[missing_indices, col] = np.nan

print("\n Sample data generated successfully!")
print(f"  Shape: {df.shape}")
print(f"  Rows: {len(df):,}")
print(f"  Columns: {len(df.columns)}")

# Save to CSV
csv_filename = 'insurance_data.csv'
df.to_csv(csv_filename, index=False)
print(f"\n Saved as: {csv_filename}")

# Display info
print("\n" + "="*60)
print("DATA PREVIEW")
print("="*60)
print(df.head(10).to_string())

print("\n" + "="*60)
print("DATA STATISTICS")
print("="*60)
print(f"\nDataset Shape: {df.shape}")
print(f"\nNumeric Columns Summary:")
print(df.describe())

print(f"\nCategorical Columns Summary:")
for col in df.select_dtypes(include=['object']).columns:
    print(f"\n{col}:")
    print(f"  Unique values: {df[col].nunique()}")
    print(f"  Values: {df[col].unique()}")

print(f"\nMissing Values:")
print(df.isnull().sum())

print(f"\nPremium Amount Range: ${df['Premium Amount'].min():.2f} - ${df['Premium Amount'].max():.2f}")
print(f"Average Premium: ${df['Premium Amount'].mean():.2f}")
print(f"Median Premium: ${df['Premium Amount'].median():.2f}")



GENERATING SAMPLE INSURANCE DATA

Generating 1,000 sample records...

 Sample data generated successfully!
  Shape: (1000, 18)
  Rows: 1,000
  Columns: 18

 Saved as: insurance_data.csv

DATA PREVIEW
    Age  Gender Marital Status  Number of Dependents Education Level     Occupation  Annual Income  Credit Score  Health Score Smoking Status Exercise Frequency    Policy Type  Location  Insurance Duration  Vehicle Age Property Type  Previous Claims  Premium Amount
0  56.0    Male       Divorced                   0.0        Master's  Self-Employed       193860.0         680.0          82.0             No            Monthly  Comprehensive     Rural                20.0          7.0         House              0.0     5725.562603
1  69.0    Male         Single                   1.0     High School       Employed       135945.0         417.0          37.0             No              Daily          Basic     Rural                 9.0         16.0     Apartment              3.0     6278.257379
2 

In [2]:
# TESTING CODE

print("\n" + "="*60)
print("TESTING SAMPLE DATA")
print("="*60)

# Test 1: Check all required features
required_features = [
    'Age', 'Gender', 'Annual Income', 'Marital Status', 'Number of Dependents',
    'Education Level', 'Occupation', 'Health Score', 'Location', 'Policy Type',
    'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration',
    'Premium Amount', 'Smoking Status', 'Exercise Frequency', 'Property Type'
]

missing_features = [f for f in required_features if f not in df.columns]
if missing_features:
    print(f" Missing features: {missing_features}")
else:
    print(f" All {len(required_features)} required features present")

# Test 2: Check data types
print(f"\n Data types:")
for col, dtype in df.dtypes.items():
    print(f"  {col}: {dtype}")

# Test 3: Check for duplicates
duplicates = df.duplicated().sum()
print(f"\n Duplicate rows: {duplicates}")

# Test 4: Premium distribution
print(f"\nPremium Amount Distribution:")
print(f"  Min: ${df['Premium Amount'].min():.2f}")
print(f"  Q1: ${df['Premium Amount'].quantile(0.25):.2f}")
print(f"  Median: ${df['Premium Amount'].median():.2f}")
print(f"  Q3: ${df['Premium Amount'].quantile(0.75):.2f}")
print(f"  Max: ${df['Premium Amount'].max():.2f}")

# Test 5: Categorical value counts
print(f"\nCategorical Feature Distributions:")
for col in df.select_dtypes(include=['object']).columns:
    print(f"\n{col}:")
    print(df[col].value_counts().to_string())

print("\n" + "="*60)
print("SAMPLE DATA READY FOR TRAINING!")
print("="*60)
print(f"\nYou can now use '{csv_filename}' with the model training script")
print("Usage:")
print("  1. Place this CSV in data/ folder")
print("  2. Run: python notebooks/02_model_training.py")
print("  3. Run: streamlit run app.py")




TESTING SAMPLE DATA
 All 18 required features present

 Data types:
  Age: float64
  Gender: object
  Marital Status: object
  Number of Dependents: float64
  Education Level: object
  Occupation: object
  Annual Income: float64
  Credit Score: float64
  Health Score: float64
  Smoking Status: object
  Exercise Frequency: object
  Policy Type: object
  Location: object
  Insurance Duration: float64
  Vehicle Age: float64
  Property Type: object
  Previous Claims: float64
  Premium Amount: float64

 Duplicate rows: 0

Premium Amount Distribution:
  Min: $2283.24
  Q1: $5704.34
  Median: $7624.91
  Q3: $9991.77
  Max: $19399.10

Categorical Feature Distributions:

Gender:
Gender
Male      523
Female    477

Marital Status:
Marital Status
Married     349
Single      337
Divorced    314

Education Level:
Education Level
High School    259
Master's       252
PhD            250
Bachelor's     239

Occupation:
Occupation
Employed         340
Unemployed       340
Self-Employed    320

Smoking

In [3]:
# BATCH TEST PREDICTIONS

print("\n" + "="*60)
print("QUICK MODEL TEST")
print("="*60)

# Sample test cases to verify model will work
test_cases = [
    {
        'Age': 25, 'Gender': 'Male', 'Annual Income': 40000,
        'Marital Status': 'Single', 'Number of Dependents': 0,
        'Education Level': "Bachelor's", 'Occupation': 'Employed',
        'Health Score': 85, 'Location': 'Urban', 'Policy Type': 'Basic',
        'Previous Claims': 0, 'Vehicle Age': 3, 'Credit Score': 750,
        'Insurance Duration': 2, 'Smoking Status': 'No',
        'Exercise Frequency': 'Weekly', 'Property Type': 'Apartment'
    },
    {
        'Age': 45, 'Gender': 'Female', 'Annual Income': 80000,
        'Marital Status': 'Married', 'Number of Dependents': 2,
        'Education Level': "Master's", 'Occupation': 'Employed',
        'Health Score': 65, 'Location': 'Suburban', 'Policy Type': 'Comprehensive',
        'Previous Claims': 1, 'Vehicle Age': 7, 'Credit Score': 680,
        'Insurance Duration': 5, 'Smoking Status': 'Yes',
        'Exercise Frequency': 'Monthly', 'Property Type': 'House'
    }
]

print("\nSample predictions (to verify model works):")
for i, test_case in enumerate(test_cases, 1):
    print(f"\nTest Case {i}:")
    print(f"  Age: {test_case['Age']}, Gender: {test_case['Gender']}")
    print(f"  Income: ${test_case['Annual Income']:,}")
    print(f"  Policy Type: {test_case['Policy Type']}")
    print(f"  Expected Premium Range: $2,500 - $7,500 (varies by model)")

print("\n" + "="*60)
print("TESTING COMPLETE!")
print("="*60)


QUICK MODEL TEST

Sample predictions (to verify model works):

Test Case 1:
  Age: 25, Gender: Male
  Income: $40,000
  Policy Type: Basic
  Expected Premium Range: $2,500 - $7,500 (varies by model)

Test Case 2:
  Age: 45, Gender: Female
  Income: $80,000
  Policy Type: Comprehensive
  Expected Premium Range: $2,500 - $7,500 (varies by model)

TESTING COMPLETE!
