In [4]:
import pandas as pd
import numpy as np
import random
from datetime import datetime

# Pin both pseudo-random sources to the same fixed seed so that re-running
# this cell yields the same numbers: the stdlib `random` module and NumPy's
# legacy global generator.
random.seed(42)
np.random.seed(42)

def generate_insurance_leads(num_rows=10000, seed=None):
    """Generate synthetic insurance lead data with realistic correlations.

    Every lead is produced independently. House age, occupation, income,
    house value, prior claims and credit score are sampled first; a
    conversion probability is then assembled from those attributes and a
    0/1 outcome is drawn against it.

    Args:
        num_rows: Number of leads to generate. Zero or a negative value
            yields an empty DataFrame that still carries all eight columns.
        seed: Optional seed accepted by ``np.random.RandomState``. When
            given, all draws come from a private ``RandomState(seed)``, so
            the result is reproducible and NumPy's global random state is
            not touched. When ``None`` (the default) the module-level
            ``np.random`` functions are used, so the result is governed by
            whatever ``np.random.seed`` call the caller made beforehand.

    Returns:
        pandas.DataFrame with the columns ``lead_id``, ``house_age``,
        ``house_value``, ``occupation``, ``income``, ``prior_claims``,
        ``credit_score`` and ``converted``. ``lead_id`` counts up from 1001.
    """
    # Private generator when seeded; otherwise the module-level functions,
    # which share NumPy's global legacy state (original behaviour).
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Occupation table: relative sampling weight, income range and the
    # additive shift each occupation applies to the conversion probability.
    # Keeping the weight next to the occupation avoids a positional list that
    # could silently drift out of step with the dict order.
    occupations = {
        'Engineer': {'weight': 12, 'income_range': (70000, 120000), 'conversion_boost': 0.1},
        'Doctor': {'weight': 8, 'income_range': (120000, 200000), 'conversion_boost': 0.15},
        'Lawyer': {'weight': 6, 'income_range': (80000, 150000), 'conversion_boost': 0.12},
        'Teacher': {'weight': 15, 'income_range': (45000, 75000), 'conversion_boost': 0.05},
        'Nurse': {'weight': 10, 'income_range': (55000, 85000), 'conversion_boost': 0.08},
        'Manager': {'weight': 12, 'income_range': (65000, 110000), 'conversion_boost': 0.1},
        'Clerk': {'weight': 8, 'income_range': (35000, 60000), 'conversion_boost': 0.02},
        'Technician': {'weight': 8, 'income_range': (45000, 75000), 'conversion_boost': 0.06},
        'Sales': {'weight': 10, 'income_range': (40000, 90000), 'conversion_boost': 0.07},
        'Consultant': {'weight': 5, 'income_range': (70000, 130000), 'conversion_boost': 0.11},
        'Accountant': {'weight': 8, 'income_range': (55000, 95000), 'conversion_boost': 0.09},
        'Analyst': {'weight': 6, 'income_range': (60000, 100000), 'conversion_boost': 0.08},
        'Retired': {'weight': 12, 'income_range': (25000, 60000), 'conversion_boost': -0.05},
        'Unemployed': {'weight': 4, 'income_range': (15000, 40000), 'conversion_boost': -0.15},
        'Student': {'weight': 2, 'income_range': (10000, 30000), 'conversion_boost': -0.1},
    }

    # Sampling tables never change between leads, so build them once instead
    # of on every loop iteration.
    occupation_list = list(occupations)
    occupation_weights = [occupations[name]['weight'] for name in occupation_list]
    total_weight = sum(occupation_weights)  # 126; dividing makes the probabilities sum to 1
    occupation_probs = [w / total_weight for w in occupation_weights]

    # Prior claims (0-5), weighted toward fewer claims.
    claim_counts = [0, 1, 2, 3, 4, 5]
    claim_probs = [0.5, 0.25, 0.15, 0.07, 0.02, 0.01]

    base_conversion_prob = 0.25  # Base 25% conversion rate

    data = []

    # NOTE: the order of the rng calls below is part of the output contract;
    # reordering them changes every value produced for a given seed.
    for i in range(num_rows):
        lead_id = 1001 + i

        # House age: gamma(shape=2, scale=8) leans toward newer homes; cap at 50 years.
        house_age = min(int(rng.gamma(2, 8)), 50)

        # Occupation, drawn according to the weights above.
        occupation = rng.choice(occupation_list, p=occupation_probs)
        profile = occupations[occupation]

        # Income: normal centred on the midpoint of the occupation's range,
        # with a standard deviation of one sixth of the range width, then
        # clamped to [15000, 250000].
        income_low, income_high = profile['income_range']
        income = int(rng.normal((income_low + income_high) / 2,
                                (income_high - income_low) / 6))
        income = min(max(income, 15000), 250000)

        # House value: 2.5x-4.5x income, reduced by 0.8% per year of age
        # (never below 70% of the base), clamped to [80000, 800000] and
        # rounded to the nearest 5000.
        base_value = income * rng.uniform(2.5, 4.5)
        age_depreciation = max(0.7, 1 - (house_age * 0.008))
        house_value = int(base_value * age_depreciation)
        house_value = min(max(house_value, 80000), 800000)
        house_value = round(house_value / 5000) * 5000

        prior_claims = rng.choice(claim_counts, p=claim_probs)

        # Credit score: rises with income, plus normal noise (sd 30), clamped
        # to [300, 850] and rounded to the nearest 10.
        base_score = 600 + (income - 15000) * 0.001
        score_noise = rng.normal(0, 30)
        credit_score = int(base_score + score_noise)
        credit_score = min(max(credit_score, 300), 850)
        credit_score = round(credit_score / 10) * 10

        # Additive contributions to the conversion probability.
        occupation_boost = profile['conversion_boost']
        income_boost = (income - 50000) / 200000 * 0.1           # higher income -> higher
        credit_boost = (credit_score - 600) / 250 * 0.1          # higher score -> higher
        claims_penalty = prior_claims * 0.08                     # more claims -> lower
        house_value_boost = (house_value - 150000) / 500000 * 0.05
        age_penalty = house_age * 0.003                          # older home -> lower

        # Summation order is kept fixed so floating-point results stay stable.
        conversion_prob = (base_conversion_prob + occupation_boost + income_boost +
                           credit_boost - claims_penalty + house_value_boost - age_penalty)
        conversion_prob = max(0.02, min(0.85, conversion_prob))  # Keep between 2% and 85%

        # Bernoulli draw against the final probability.
        converted = 1 if rng.random_sample() < conversion_prob else 0

        data.append([
            lead_id, house_age, house_value, occupation, income,
            prior_claims, credit_score, converted,
        ])

    return pd.DataFrame(data, columns=[
        'lead_id', 'house_age', 'house_value', 'occupation', 'income',
        'prior_claims', 'credit_score', 'converted',
    ])

# Build the dataset with the generator defined above.
print("Generating 10,000 rows of insurance lead data...")
df = generate_insurance_leads(10000)

# Headline numbers for a quick sanity check of the generated data.
print(
    "\nData Generation Complete!",
    f"Total rows: {len(df)}",
    f"Conversion rate: {df['converted'].mean():.2%}",
    f"Average income: ${df['income'].mean():,.0f}",
    f"Average house value: ${df['house_value'].mean():,.0f}",
    sep="\n",
)

# Preview of the first records.
print("\nFirst 10 rows:", df.head(10), sep="\n")

# Ten most frequent occupations.
print("\nOccupation distribution:", df['occupation'].value_counts().head(10), sep="\n")

# Lead count and mean conversion per occupation, best converters first.
print("\nConversion rate by occupation:")
conversion_by_occupation = (
    df.groupby('occupation')['converted']
    .agg(['count', 'mean'])
    .round(3)
    .rename(columns={'mean': 'conversion_rate'})
)
print(conversion_by_occupation.sort_values('conversion_rate', ascending=False))

# Persist to a CSV whose name carries the current local timestamp, so
# repeated runs do not overwrite each other.
filename = f'insurance_leads_data_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
df.to_csv(filename, index=False)
print(f"\nData saved to: {filename}")

# Basic integrity report: missing values, id uniqueness and observed ranges.
print(
    "\nData Quality Checks:",
    f"No missing values: {df.isnull().sum().sum() == 0}",
    f"All lead_ids unique: {df['lead_id'].nunique() == len(df)}",
    f"House age range: {df['house_age'].min()}-{df['house_age'].max()}",
    f"Income range: ${df['income'].min():,}-${df['income'].max():,}",
    f"Credit score range: {df['credit_score'].min()}-{df['credit_score'].max()}",
    f"Prior claims range: {df['prior_claims'].min()}-{df['prior_claims'].max()}",
    sep="\n",
)

Generating 10,000 rows of insurance lead data...

Data Generation Complete!
Total rows: 10000
Conversion rate: 24.67%
Average income: $75,540
Average house value: $232,965

First 10 rows:
   lead_id  house_age  house_value  occupation  income  prior_claims  \
0     1001         19       140000  Technician   59308             0   
1     1002         22       180000     Analyst   76127             0   
2     1003          2       465000      Doctor  152502             0   
3     1004         11       155000  Technician   60611             0   
4     1005          8       220000     Manager   85312             0   
5     1006          4       205000       Clerk   48410             0   
6     1007         12       220000     Teacher   58494             0   
7     1008          2       390000    Engineer   99851             2   
8     1009          6       175000  Technician   58453             0   
9     1010          4       140000     Retired   36445             2   

   credit_score  co