In [1]:
import os
import random

import numpy as np
import pandas as pd
# Seed NumPy's global RNG and the stdlib RNG so the synthetic data set is
# reproducible from a fresh kernel.
np.random.seed(42)
random.seed(42)

# --- Data set size and categorical vocabularies ---
n_records = 1000
weeks = pd.date_range("2024-01-01", periods=20, freq="W")  # 20 weekly timestamps
warehouses = ["A", "B", "C", "D"]
stores = [f"S{i:02d}" for i in range(1, 13)]  # S01 .. S12
carriers = ["C1", "C2", "C3", "C4"]
categories = ["Grocery", "Electronics", "Clothing", "Frozen"]

# Vocabulary for the free-text Notes column, including a genuinely missing value.
# It must be an object-dtype array: handing np.random.choice a plain list that
# mixes str and np.nan makes NumPy coerce everything to a string array, which
# turns np.nan into the literal text "nan" instead of a real missing value.
note_options = np.array(
    ["", "Damaged carton", "Partial delivery", "Driver delay", np.nan],
    dtype=object,
)

# Build the synthetic shipments table. The columns are drawn in this exact order
# from the global NumPy RNG, so reordering them changes the generated values.
df = pd.DataFrame({
    "Shipment_ID": [f"SH{i:05d}" for i in range(n_records)],
    "Week": np.random.choice(weeks, n_records),
    "Warehouse_ID": np.random.choice(warehouses, n_records),
    "Store_ID": np.random.choice(stores, n_records),
    "Carrier": np.random.choice(carriers, n_records, p=[0.3, 0.25, 0.25, 0.2]),
    "Product_Category": np.random.choice(categories, n_records, p=[0.4, 0.25, 0.2, 0.15]),
    "Weight_kg": np.random.normal(15, 6, n_records).round(2),
    "Delivery_Time_hrs": np.random.normal(28, 5, n_records).round(1),
    "Fuel_Price_EUR_per_litre": np.random.normal(1.7, 0.1, n_records).round(2),
    "Temperature_C": np.random.normal(5, 2, n_records).round(1),
    "Promo_Flag": np.random.choice([0, 1], n_records, p=[0.8, 0.2]),
    # NOTE(review): as written, value 1 is drawn with probability 0.15, i.e. only
    # ~15% of rows get On_Time_Flag == 1 -- confirm the weights are not inverted.
    "On_Time_Flag": np.random.choice([0, 1], n_records, p=[0.85, 0.15]),
    "Weekly_Sales_EUR": np.random.normal(52000, 15000, n_records).round(2),
    "Notes": np.random.choice(note_options, n_records, p=[0.4, 0.15, 0.15, 0.1, 0.2]),
})
# MCAR: blank the fuel price on a uniformly random 5% of all rows.
n_mcar = int(0.05 * n_records)
mcar_idx = np.random.choice(df.index, size=n_mcar, replace=False)
df.loc[mcar_idx, "Fuel_Price_EUR_per_litre"] = np.nan

# MAR: among carriers C2/C3 with weight below 10, drop the weight on 40% of rows.
light_c2_c3 = df["Carrier"].isin(["C2", "C3"]) & df["Weight_kg"].lt(10)
mar_idx = df.loc[light_c2_c3].sample(frac=0.4, random_state=42).index
df.loc[mar_idx, "Weight_kg"] = np.nan

# MNAR: among carrier C4 rows with delivery time above 32, drop the delivery
# time on 50% of them (missingness depends on the value being removed).
late_c4 = df["Carrier"].eq("C4") & df["Delivery_Time_hrs"].gt(32)
mnar_idx = df.loc[late_c4].sample(frac=0.5, random_state=42).index
df.loc[mnar_idx, "Delivery_Time_hrs"] = np.nan

# Sensor gaps: for every warehouse, pick two distinct weeks at random and wipe
# all of that warehouse's temperature readings in those weeks.
for wh in warehouses:
    gap = np.random.choice(weeks, 2, replace=False)
    mask = df["Warehouse_ID"].eq(wh) & df["Week"].isin(gap)
    df.loc[mask, "Temperature_C"] = np.nan

# Clamp the simulated measurements to fixed (lower, upper) bounds per column.
clip_bounds = {
    "Weight_kg": (0.5, 40),
    "Delivery_Time_hrs": (10, 60),
    "Temperature_C": (-10, 15),
}
for column, (lower, upper) in clip_bounds.items():
    df[column] = df[column].clip(lower, upper)

# Destination of the generated CSV. The default is the original author-specific
# absolute path, so existing behaviour is unchanged; set the GLOBAL_SHIPMENTS_CSV
# environment variable to write somewhere else when running on another machine.
output_csv = os.environ.get(
    "GLOBAL_SHIPMENTS_CSV",
    "/Users/gollapsi/Documents/17_Hof_Lecture_Code_Pingo/Supply_Chain_Analytics/data/Global_Shipments.csv",
)
df.to_csv(output_csv, index=False)  # index=False: do not write the row index as a column
print("Created Global_Shipments.csv", df.shape)

# Last expression of the cell: show the first three rows as the cell output.
df.head(3)


Created Global_Shipments.csv (1000, 14)


Unnamed: 0,Shipment_ID,Week,Warehouse_ID,Store_ID,Carrier,Product_Category,Weight_kg,Delivery_Time_hrs,Fuel_Price_EUR_per_litre,Temperature_C,Promo_Flag,On_Time_Flag,Weekly_Sales_EUR,Notes
0,SH00000,2024-02-18,C,S07,C3,Frozen,19.06,29.3,1.67,2.7,0,1,64724.47,
1,SH00001,2024-05-19,B,S05,C3,Grocery,10.35,30.4,1.61,2.5,0,1,61723.79,
2,SH00002,2024-04-14,C,S07,C1,Clothing,14.93,34.7,1.63,,1,1,54098.4,Partial delivery
