<a href="https://colab.research.google.com/github/rickycircelli/ai-credit-risk-model/blob/main/data_simulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Setup
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so every run of the notebook draws the same data
np.random.seed(42)

# Size of the simulated population; user IDs are the integers 1..num_users
num_users = 500
user_ids = np.arange(num_users) + 1

In [None]:
# Step 2: Simulate employment alt-data
def simulate_employment_data(n):
    """Simulate employment alt-data for ``n`` users.

    Each user gets a tenure in whole years (0-9) and an employer tier drawn
    with probabilities Startup 0.5 / Mid-Tier 0.3 / FAANG 0.2. The
    ``job_stability_score`` is the tenure multiplied by a per-tier weight.

    Args:
        n: Number of users to simulate.

    Returns:
        DataFrame with columns ``user_id`` (1..n), ``years_at_job``,
        ``employer_tier`` and ``job_stability_score``.
    """
    years_at_job = np.random.randint(0, 10, n)
    employer_tier = np.random.choice(['Startup', 'Mid-Tier', 'FAANG'], size=n, p=[0.5, 0.3, 0.2])

    # Define scoring weights
    tier_score_map = {'Startup': 1.0, 'Mid-Tier': 1.5, 'FAANG': 2.0}
    # Look up each user's weight once, then multiply the arrays element-wise
    tier_weight = np.array([tier_score_map[tier] for tier in employer_tier], dtype=float)
    job_stability_score = years_at_job * tier_weight

    return pd.DataFrame({
        # IDs are derived from n so the function works for any population
        # size, not only when n matches a module-level ID array.
        'user_id': np.arange(1, n + 1),
        'years_at_job': years_at_job,
        'employer_tier': employer_tier,
        'job_stability_score': job_stability_score
    })

# Generate and preview
# Build the employment table for all `num_users` simulated users; the trailing
# `.head()` expression shows the first rows as the notebook cell's output.
employment_df = simulate_employment_data(num_users)
employment_df.head()


Unnamed: 0,user_id,years_at_job,employer_tier,job_stability_score
0,1,6,Startup,6.0
1,2,3,FAANG,6.0
2,3,7,Mid-Tier,10.5
3,4,4,FAANG,8.0
4,5,6,Startup,6.0


In [None]:
# Step 3: Simulate Rent + Utility Alt-Data
def simulate_rent_data(n):
    """Simulate rent and utility alt-data for ``n`` users.

    Args:
        n: Number of users to simulate.

    Returns:
        DataFrame with columns ``user_id`` (1..n), ``rent_on_time_rate``
        (clipped to [0.5, 1.0]), ``missed_rent_payments`` (out of 12),
        ``avg_utility_bill`` (clipped to [60, 400]) and ``late_payment_flag``
        (1 when more than two payments were missed, else 0).
    """
    # On-time rate: normal around 0.93, rounded to 2 decimals, then bounded
    rent_on_time_rate = np.round(np.random.normal(loc=0.93, scale=0.05, size=n), 2)
    rent_on_time_rate = np.clip(rent_on_time_rate, 0.5, 1.0)

    # Missed payments out of 12; astype(int) truncates the fractional part
    missed_rent_payments = (12 * (1 - rent_on_time_rate)).astype(int)

    # Utility bill: normal around 180, rounded to 2 decimals, then bounded
    avg_utility_bill = np.round(np.random.normal(loc=180, scale=30, size=n), 2)
    avg_utility_bill = np.clip(avg_utility_bill, 60, 400)

    late_payment_flag = (missed_rent_payments > 2).astype(int)

    return pd.DataFrame({
        # IDs are derived from n so the function works for any population
        # size, not only when n matches a module-level ID array.
        'user_id': np.arange(1, n + 1),
        'rent_on_time_rate': rent_on_time_rate,
        'missed_rent_payments': missed_rent_payments,
        'avg_utility_bill': avg_utility_bill,
        'late_payment_flag': late_payment_flag
    })

# Generate and preview
# Build the rent/utility table for all `num_users` simulated users; the
# trailing `.head()` expression shows the first rows as the cell's output.
rent_df = simulate_rent_data(num_users)
rent_df.head()


Unnamed: 0,user_id,rent_on_time_rate,missed_rent_payments,avg_utility_bill,late_payment_flag
0,1,0.95,0,179.41,0
1,2,0.87,1,196.57,0
2,3,0.98,0,186.72,0
3,4,0.92,0,220.92,0
4,5,0.9,1,183.76,0


In [None]:
# Step 4: Simulate Social Media Alt-Data
def simulate_social_data(n):
    """Simulate social-media alt-data for ``n`` users.

    Args:
        n: Number of users to simulate.

    Returns:
        DataFrame with columns ``user_id`` (1..n), ``sentiment_score``
        (clipped to [-1.0, 1.0]), ``emoji_usage_rate`` (a Beta(2, 5) draw
        scaled by 10) and ``posts_per_week`` (clipped to [0, 15]).
    """
    # Sentiment score: normal with a slightly positive mean (0.1), rounded to
    # 2 decimals and bounded to [-1, 1]
    sentiment_score = np.round(np.random.normal(loc=0.1, scale=0.4, size=n), 2)
    sentiment_score = np.clip(sentiment_score, -1.0, 1.0)

    # Emoji usage: Beta(2, 5) draw scaled onto a 0-10 range
    emoji_usage_rate = np.round(np.random.beta(a=2, b=5, size=n) * 10, 2)

    # Posting activity: Poisson with mean 5, capped at 15 posts
    posts_per_week = np.random.poisson(lam=5, size=n)
    posts_per_week = np.clip(posts_per_week, 0, 15)

    return pd.DataFrame({
        # IDs are derived from n so the function works for any population
        # size, not only when n matches a module-level ID array.
        'user_id': np.arange(1, n + 1),
        'sentiment_score': sentiment_score,
        'emoji_usage_rate': emoji_usage_rate,
        'posts_per_week': posts_per_week
    })

# Generate and preview
# Build the social-media table for all `num_users` simulated users; the
# trailing `.head()` expression shows the first rows as the cell's output.
social_df = simulate_social_data(num_users)
social_df.head()


Unnamed: 0,user_id,sentiment_score,emoji_usage_rate,posts_per_week
0,1,-0.3,3.26,2
1,2,-0.17,5.52,5
2,3,0.31,1.1,8
3,4,0.17,2.17,7
4,5,0.24,0.78,7


In [None]:
# Step 5: Simulate user-level data sharing consent
def simulate_user_consent(n, p_employment=0.95, p_rent=0.9, p_social=0.8):
    """Simulate per-user data-sharing consent flags for ``n`` users.

    Each flag is drawn independently: 1 (consent given) with the stated
    probability, otherwise 0.

    Args:
        n: Number of users to simulate.
        p_employment: Probability a user consents to sharing employment data.
        p_rent: Probability a user consents to sharing rent data.
        p_social: Probability a user consents to sharing social-media data.

    Returns:
        DataFrame with columns ``user_id`` (1..n), ``consent_employment``,
        ``consent_rent`` and ``consent_social``, each flag being 0 or 1.
    """
    consent_employment = np.random.choice([1, 0], size=n, p=[p_employment, 1 - p_employment])
    consent_rent = np.random.choice([1, 0], size=n, p=[p_rent, 1 - p_rent])
    consent_social = np.random.choice([1, 0], size=n, p=[p_social, 1 - p_social])

    return pd.DataFrame({
        # IDs are derived from n so the function works for any population
        # size, not only when n matches a module-level ID array.
        'user_id': np.arange(1, n + 1),
        'consent_employment': consent_employment,
        'consent_rent': consent_rent,
        'consent_social': consent_social
    })

# Generate and preview
# Build the consent table for all `num_users` simulated users using the
# function's default consent probabilities; the trailing `.head()` expression
# shows the first rows as the cell's output.
consent_df = simulate_user_consent(num_users)
consent_df.head()


Unnamed: 0,user_id,consent_employment,consent_rent,consent_social
0,1,0,1,1
1,2,1,1,1
2,3,1,1,1
3,4,1,1,0
4,5,1,1,0


In [None]:
# Step 6: Combine every simulated feature table with the consent flags.
# All tables carry a `user_id` column, so each join is keyed on it.

# 6a: join the three feature sets (employment, then rent, then social)
features_df = pd.merge(employment_df, rent_df, on='user_id')
features_df = pd.merge(features_df, social_df, on='user_id')

# 6b: attach the per-user consent flags
final_df = pd.merge(features_df, consent_df, on='user_id')

# Preview final output
final_df.head()


Unnamed: 0,user_id,years_at_job,employer_tier,job_stability_score,rent_on_time_rate,missed_rent_payments,avg_utility_bill,late_payment_flag,sentiment_score,emoji_usage_rate,posts_per_week,consent_employment,consent_rent,consent_social
0,1,6,Startup,6.0,0.95,0,179.41,0,-0.3,3.26,2,0,1,1
1,2,3,FAANG,6.0,0.87,1,196.57,0,-0.17,5.52,5,1,1,1
2,3,7,Mid-Tier,10.5,0.98,0,186.72,0,0.31,1.1,8,1,1,1
3,4,4,FAANG,8.0,0.92,0,220.92,0,0.17,2.17,7,1,1,0
4,5,6,Startup,6.0,0.9,1,183.76,0,0.24,0.78,7,1,1,0


In [None]:
# Save to CSV
# Writes the merged table to 'simulated_alt_data.csv' (a relative path, so it
# lands in the current working directory); index=False leaves the DataFrame's
# row index out of the file.
final_df.to_csv('simulated_alt_data.csv', index=False)