In [1]:
#Data Generation Logic

import pandas as pd
import numpy as np
from datetime import datetime, timedelta


# How many synthetic partner records the generation loop produces.
NUM_RECORDS = 50_000

# Path of the CSV file the generated DataFrame is written to.
FILENAME = "nova_custom_biased_dataset.csv"

# Seed NumPy's global RNG so every np.random draw below is repeatable.
np.random.seed(42)

def calculate_loyalty_score(partner_since_date, trip_frequency, reference_date=None):
    """Return a loyalty score in [0, 1] built from tenure and trip frequency.

    Both inputs are passed through a saturating curve ``1 - exp(-x / scale)``
    (scale 1000 days for tenure, 200 trips for frequency) and blended with
    weights 0.6 (tenure) and 0.4 (trips).

    Args:
        partner_since_date: ``datetime`` at which the partner joined.
        trip_frequency: Trip count; negative values are treated as 0.
        reference_date: ``datetime`` that tenure is measured against. When
            omitted, ``datetime.now()`` is used (the original behavior).

    Returns:
        The blended score rounded to three decimal places.
    """
    if reference_date is None:
        reference_date = datetime.now()

    # A join date later than the reference would give a negative tenure and
    # push the score below 0, so clamp it to keep the documented range.
    days_as_partner = max((reference_date - partner_since_date).days, 0)
    trips = np.maximum(trip_frequency, 0)

    tenure_score = 1 - np.exp(-days_as_partner / 1000)
    trips_score = 1 - np.exp(-trips / 200)
    return round(0.6 * tenure_score + 0.4 * trips_score, 3)

# Accumulates one dict per generated partner; turned into a DataFrame below.
data = []
partner_types = ['Driver', 'Merchant']
driver_vehicle_types = ['Cab Driver', 'Food Delivery Rider']
merchant_categories = ['Small', 'Medium', 'Large']

# One reference timestamp for the whole run: every partner_since_date is
# measured back from the same instant instead of a fresh clock read per row.
generation_time = datetime.now()

for _ in range(NUM_RECORDS):
    gender = np.random.choice(['Male', 'Female'], p=[0.85, 0.15])
    partner_type = np.random.choice(partner_types, p=[0.7, 0.3])

    record = {'gender': gender, 'partner_type': partner_type}

    if partner_type == 'Driver':
        record['vehicle_type'] = np.random.choice(driver_vehicle_types, p=[0.6, 0.4])
        days_ago = np.random.randint(30, 365 * 4)
        record['partner_since_date'] = generation_time - timedelta(days=days_ago)
        record['geographic_location'] = np.random.choice(['Urban', 'Rural'], p=[0.75, 0.25])

        # Shift mix depends on location: urban skews to nights, rural to days.
        if record['geographic_location'] == 'Urban':
            record['shift'] = np.random.choice(['Day', 'Night'], p=[0.4, 0.6])
        else:
            record['shift'] = np.random.choice(['Day', 'Night'], p=[0.8, 0.2])

        # Hourly-rate band (low, high) by vehicle type ...
        if record['vehicle_type'] == 'Cab Driver':
            base_rate_range = (18, 25)
        else:
            base_rate_range = (12, 18)

        # ... then scaled by the multipliers that inject the dataset's biases
        # (rural -20%, female -10%) and the night-shift premium (+20%).
        if record['geographic_location'] == 'Rural':
            base_rate_range = tuple(x * 0.8 for x in base_rate_range)

        if gender == 'Female':
            base_rate_range = tuple(x * 0.9 for x in base_rate_range)

        if record['shift'] == 'Night':
            base_rate_range = tuple(x * 1.2 for x in base_rate_range)

        hours_online = np.random.uniform(100, 250)
        earnings_per_hour = round(np.random.uniform(*base_rate_range), 2)
        # Uses the unrounded hours, so it can differ slightly from
        # hours_online * earnings_per_hour recomputed from the stored columns.
        total_earnings = hours_online * earnings_per_hour

        record['earnings_per_hour'] = earnings_per_hour
        record['total_earnings'] = round(total_earnings, 2)
        record['hours_online'] = int(round(hours_online, 0))

        trip_freq = np.random.uniform(80, 200)
        if record['geographic_location'] == 'Rural':
            trip_freq *= 0.9
        record['trip_frequency'] = int(round(trip_freq, 0))

        record['num_insurance_claims'] = np.random.choice(
            [0, 1, 2, 3, 4, 5],
            p=[0.7, 0.15, 0.05, 0.05, 0.025, 0.025],
        )

        # More insurance claims -> lower customer-rating band.
        if record['num_insurance_claims'] <= 1:
            rating = np.random.uniform(4.5, 5.0)
        elif record['num_insurance_claims'] <= 3:
            rating = np.random.uniform(4.0, 4.7)
        else:
            rating = np.random.uniform(3.5, 4.2)
        record['customer_rating'] = round(rating, 1)

        # Twelve simulated weekly earnings centred on a quarter of the total;
        # the index is their standard deviation (larger = more variable).
        weekly_earnings = np.random.normal(
            loc=total_earnings / 4,
            scale=total_earnings / 20,
            size=12,
        )
        record['consistency_index'] = round(np.std(weekly_earnings), 2)

        # Scored from the unrounded trip frequency, not the stored integer.
        record['loyalty_score'] = calculate_loyalty_score(record['partner_since_date'], trip_freq)

    # Merchants
    else:
        record['merchant_category'] = np.random.choice(merchant_categories, p=[0.6, 0.3, 0.1])
        record['geographic_location'] = np.random.choice(['Urban', 'Rural'], p=[0.8, 0.2])

        # Gross merchandise volume band by merchant size.
        if record['merchant_category'] == 'Small':
            base_gmv = np.random.uniform(50_000, 150_000)
        elif record['merchant_category'] == 'Medium':
            base_gmv = np.random.uniform(200_000, 500_000)
        else:
            base_gmv = np.random.uniform(600_000, 2_000_000)

        # Injected biases: female -10%, rural -30%.
        gmv = base_gmv
        if gender == 'Female':
            gmv *= 0.9
        if record['geographic_location'] == 'Rural':
            gmv *= 0.7

        record['gross_merchandise_volume'] = round(gmv, 2)
        # GMV divided by a random order count in [500, 5000).
        record['average_order_value'] = round(gmv / np.random.randint(500, 5000), 2)
        record['sales_growth'] = round(np.random.uniform(-0.05, 0.20), 2)

        # Twelve monthly revenues, each within +/-20% of an even monthly split.
        monthly_revenues = [
            gmv / 12 * np.random.uniform(0.8, 1.2) for _ in range(12)
        ]
        # Coefficient of variation mapped onto [0, 1]: a CV of 0.6 or more
        # scores 0, a CV of 0 scores 1.
        coef_var = np.std(monthly_revenues) / np.mean(monthly_revenues)
        record['revenue_stability_score'] = round(1 - min(coef_var / 0.6, 1), 2)

        record['store_availability'] = round(np.random.uniform(0.85, 1.0), 2)
        record['preparation_speed'] = round(np.random.uniform(10, 25), 1)
        record['order_accuracy'] = round(np.random.uniform(0.95, 1.0), 3)
        record['customer_retention'] = round(np.random.uniform(0.4, 0.8), 2)

    data.append(record)

# Driver-only and merchant-only columns are left as NaN/NaT for the other type.
df = pd.DataFrame(data)
df.to_csv(FILENAME, index=False)

print(f"Successfully generated biased dataset with {len(df)} records.")
print(f"File saved as '{FILENAME}'")

print("\n--- Verifying Biases ---")
print("Avg Earnings Per Hour by Gender (Drivers):")
print(df[df['partner_type'] == 'Driver'].groupby('gender')['earnings_per_hour'].mean().round(2))
print("\nAvg Earnings Per Hour by Location (Drivers):")
print(df[df['partner_type'] == 'Driver'].groupby('geographic_location')['earnings_per_hour'].mean().round(2))
print("\nAvg GMV by Location (Merchants):")
print(df[df['partner_type'] == 'Merchant'].groupby('geographic_location')['gross_merchandise_volume'].mean().round(2))

print("\nSample:")
# `display` is injected by IPython/Jupyter; it is not a Python builtin.
display(df.head())

Successfully generated biased dataset with 50000 records.
File saved as 'nova_custom_biased_dataset.csv'

--- Verifying Biases ---
Avg Earnings Per Hour by Gender (Drivers):
gender
Female    17.88
Male      19.88
Name: earnings_per_hour, dtype: float64

Avg Earnings Per Hour by Location (Drivers):
geographic_location
Rural    15.56
Urban    20.91
Name: earnings_per_hour, dtype: float64

Avg GMV by Location (Merchants):
geographic_location
Rural    202403.53
Urban    287205.88
Name: gross_merchandise_volume, dtype: float64

Sample:


Unnamed: 0,gender,partner_type,merchant_category,geographic_location,gross_merchandise_volume,average_order_value,sales_growth,revenue_stability_score,store_availability,preparation_speed,...,partner_since_date,shift,earnings_per_hour,total_earnings,hours_online,trip_frequency,num_insurance_claims,customer_rating,consistency_index,loyalty_score
0,Male,Merchant,Medium,Urban,246805.59,255.49,-0.03,0.77,0.85,17.9,...,NaT,,,,,,,,,
1,Female,Driver,,Urban,,,,,,,...,2022-12-17 12:07:05.657118,Night,19.79,3737.58,189.0,153.0,0.0,4.5,130.46,0.59
2,Male,Driver,,Rural,,,,,,,...,2023-08-29 12:07:05.657118,Day,19.59,3120.15,159.0,151.0,0.0,4.8,175.39,0.523
3,Male,Driver,,Urban,,,,,,,...,2024-07-05 12:07:05.657118,Night,24.61,2734.34,111.0,94.0,2.0,4.4,119.83,0.356
4,Male,Driver,,Urban,,,,,,,...,2022-09-27 12:07:05.657118,Night,29.12,3793.11,130.0,137.0,0.0,4.8,153.07,0.592
