# In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so repeated runs yield identical data.
np.random.seed(42)

# Entity counts used by the generators that follow.
n_vendors, n_pharmacies, n_districts = 100, 600, 4000

# District labels are 1-based: 'D1', 'D2', ..., up to n_districts.
districts_list = ['D' + str(idx) for idx in range(1, n_districts + 1)]

# --- 1. Medicines DataFrame (This would come from your Kaggle Dataset) ---
# Hand-written stand-in catalogue. With the real A-Z dataset you would load
# the file and then *assign* a category to each medicine yourself.
_med_fields = ('medicine_id', 'name', 'category', 'base_price', 'preference')
_med_catalogue = (
    # Fever
    ('MED_101', 'Calpol 500mg', 'Fever', 30, 2),
    ('MED_102', 'Ibuprofen 400mg', 'Fever', 45, 1),
    ('MED_103', 'Meftal 250mg', 'Fever', 25, 3),
    # Stomach
    ('MED_201', 'Ridol 50mg', 'Stomach', 50, 1),
    ('MED_202', 'Unienzyme', 'Stomach', 35, 2),
    ('MED_203', 'Pudin Hara', 'Stomach', 20, 3),
    # Toothache
    ('MED_301', 'Ketorol DT', 'Toothache', 60, 1),
    ('MED_302', 'Clove Oil', 'Toothache', 40, 2),
)

# One dict per medicine, keyed by field name, in catalogue order.
meds_df_rows = [dict(zip(_med_fields, entry)) for entry in _med_catalogue]
meds_df = pd.DataFrame(meds_df_rows)

# Distinct medicine ids, in order of first appearance.
all_meds = meds_df['medicine_id'].unique()

# --- 2. Vendors DataFrame (Simpler: Who and Where) ---
# The random draws are made in a fixed order (lat, lon, delivery time) so the
# seeded output stays reproducible.
_vendor_ids = range(1, n_vendors + 1)
_vendor_lat = 19.0 + np.random.normal(scale=0.1, size=n_vendors)
_vendor_lon = 72.8 + np.random.normal(scale=0.1, size=n_vendors)
# Delivery time belongs to the vendor, not to an individual medicine;
# drawn as an integer in [12, 72).
_vendor_delivery_hr = np.random.randint(12, 72, n_vendors)

vendors = pd.DataFrame({
    'vendor_id': _vendor_ids,
    'name': [f'Vendor_{vid}' for vid in _vendor_ids],
    'lat': _vendor_lat,
    'lon': _vendor_lon,
    'base_delivery_time_hr': _vendor_delivery_hr,
})

# --- 3. Vendor Inventory (The NEW Long-Format File) ---
# This is the "bridge" file: one row per (vendor, medicine) pair that the
# vendor stocks, carrying that vendor's stock level and unit cost.
_inventory_columns = ['vendor_id', 'medicine_id', 'stock', 'unit_cost']

inventory_rows = []
for vendor_id in vendors['vendor_id']:
    # Walk the two columns in lockstep. Going through `.values` on the
    # mixed str/int sub-frame would upcast everything to an object array.
    for med_id, base_price in zip(meds_df['medicine_id'], meds_df['base_price']):
        # 70% chance a vendor stocks a given med
        if np.random.rand() <= 0.3:
            continue
        inventory_rows.append({
            'vendor_id': vendor_id,
            'medicine_id': med_id,
            # Integer stock level in [50, 200).
            'stock': np.random.randint(50, 200),
            # Unit cost is drawn uniformly between 20% below and 10% above
            # base_price, so it is not always a discount.
            'unit_cost': np.random.uniform(base_price * 0.8, base_price * 1.1),
        })

# Name the columns explicitly so the frame (and the CSV header written from
# it) keeps its schema even when no vendor ends up stocking anything.
vendor_inventory = pd.DataFrame(inventory_rows, columns=_inventory_columns)

# --- 4. Pharmacies (Unchanged) ---
# Draws are made in column order (lat, lon, district) so the seeded output
# stays reproducible.
_pharmacy_lat = 19.0 + np.random.normal(scale=0.15, size=n_pharmacies)
_pharmacy_lon = 72.8 + np.random.normal(scale=0.15, size=n_pharmacies)
# Each pharmacy gets a district sampled (with replacement) from districts_list.
_pharmacy_district = np.random.choice(districts_list, n_pharmacies)

pharmacies = pd.DataFrame({
    'pharmacy_id': range(1, n_pharmacies + 1),
    'lat': _pharmacy_lat,
    'lon': _pharmacy_lon,
    'district_id': _pharmacy_district,
})

# --- 5. Demand History (Unchanged) ---
# Twelve month-end dates starting January 2025. They are built from monthly
# periods because the 'M' *offset* alias for date_range is deprecated in
# recent pandas (renamed 'ME'), while 'ME' is rejected by older releases.
# The 'M' *period* alias is accepted by both.
dates = (
    pd.period_range(start='2025-01', periods=12, freq='M')
    .to_timestamp(how='end')
    .normalize()  # drop the 23:59:59.999... time-of-day -> midnight month-end
)

# The seasonal multiplier depends only on the calendar month, so compute it
# once per date instead of once per (district, medicine, date) row.
_seasonal_by_date = [
    (dt, 1.0 + 0.2 * np.sin((dt.month-1)/12.0 * 2*np.pi))
    for dt in dates
]

demand_hist = []
for d in districts_list:
    for med in all_meds:
        for dt, seasonal in _seasonal_by_date:
            # Random base demand, an integer in [5, 25), scaled by the
            # seasonal factor and truncated to a whole number.
            base = np.random.randint(5, 25)
            demand_hist.append({'district_id': d, 'medicine_id': med, 'period': dt, 'demand': int(base * seasonal)})

# Explicit columns keep the schema intact even if there is nothing to record.
demand_df = pd.DataFrame(demand_hist, columns=['district_id', 'medicine_id', 'period', 'demand'])

# --- Save Files ---
# Each table goes to its own CSV in the current working directory, written
# without the DataFrame index, in the order listed here.
_csv_outputs = (
    ('meds_real.csv', meds_df),
    ('vendors_real.csv', vendors),
    ('vendor_inventory_real.csv', vendor_inventory),
    ('pharmacies_real.csv', pharmacies),
    ('demand_real.csv', demand_df),
)
for _csv_name, _table in _csv_outputs:
    _table.to_csv(_csv_name, index=False)

print("Synthetic data (v3) with REALISTIC long-format inventory generated.")
