In [1]:
import pandas as pd, numpy as np, random
np.random.seed(42); random.seed(42)

# ----------------------------
# 1. Basic setup
# ----------------------------
n_records = 1000
weeks = pd.date_range("2024-01-01", periods=20, freq="W")
warehouses = ["A", "B", "C", "D"]
stores = [f"S{i:02d}" for i in range(1, 13)]
carriers = ["C1", "C2", "C3", "C4"]
categories = ["Grocery", "Electronics", "Clothing", "Frozen"]

# Map carriers to transport mode
carrier_mode = {
    "C1": "Road",
    "C2": "Road",
    "C3": "Air",
    "C4": "Rail"
}

# Store "regions" to create a distance structure
regions = ["North", "South", "East", "West"]
store_region_map = {s: random.choice(regions) for s in stores}

# Base distance per region (roughly)
region_base_distance = {
    "North": 200,
    "South": 450,
    "East": 300,
    "West": 350
}

# Warehouse distance offsets (e.g., A is central, D is more remote)
wh_offset = {"A": 0, "B": 40, "C": -30, "D": 80}

# ----------------------------
# 2. Core categorical fields
# ----------------------------
df = pd.DataFrame({
    "Shipment_ID": [f"SH{i:05d}" for i in range(n_records)],
    "Week": np.random.choice(weeks, n_records),
    "Warehouse_ID": np.random.choice(warehouses, n_records),
    "Store_ID": np.random.choice(stores, n_records),
    "Carrier": np.random.choice(carriers, n_records, p=[0.3, 0.25, 0.25, 0.2]),
    "Product_Category": np.random.choice(categories, n_records, p=[0.4, 0.25, 0.2, 0.15]),
    "Promo_Flag": np.random.choice([0, 1], n_records, p=[0.8, 0.2])
})

# Add store region
df["Region"] = df["Store_ID"].map(store_region_map)

# Mode derived from carrier
df["Mode"] = df["Carrier"].map(carrier_mode)

# ----------------------------
# 3. Distance & physical measures
# ----------------------------
# Base distance = region distance + warehouse offset + noise
base_dist = df["Region"].map(region_base_distance)
offset = df["Warehouse_ID"].map(wh_offset)
df["Distance_km"] = (base_dist + offset + np.random.normal(0, 30, n_records)).clip(20, 800).round(1)

# Weight: depends slightly on category
cat_weight_mean = {
    "Grocery": 18,
    "Electronics": 10,
    "Clothing": 8,
    "Frozen": 15
}
weight_mean = df["Product_Category"].map(cat_weight_mean)
df["Weight_kg"] = (np.random.normal(weight_mean, 4).clip(0.5, 40)).round(2)

# ----------------------------
# 4. Delivery time derived from mode & distance
# ----------------------------
# Simple "speed" assumptions
speed_kmh = {
    "Road": 55,
    "Air": 600,   # flying speed, plus handling
    "Rail": 40
}
handling_hours = {"A": 6, "B": 5, "C": 7, "D": 5}

base_travel = df["Distance_km"] / df["Mode"].map(speed_kmh)
handling = df["Warehouse_ID"].map(handling_hours)

# Add randomness and clip
df["Delivery_Time_hrs"] = (
    base_travel + handling + np.random.normal(0, 4, n_records)
).clip(8, 72).round(1)

# ----------------------------
# 5. Fuel price per week + small noise
# ----------------------------
weekly_fuel_price = {w: round(np.random.normal(1.7, 0.05), 2) for w in weeks}
df["Fuel_Price_EUR_per_litre"] = (
    df["Week"].map(weekly_fuel_price) + np.random.normal(0, 0.03, n_records)
).round(2)

# ----------------------------
# 6. Temperature (sensor data)
# ----------------------------
# Roughly colder in early weeks, warmer later, plus WH noise
week_index = df["Week"].rank(method="dense").astype(int)
df["Temperature_C"] = (
    np.random.normal(5, 2, n_records) + (week_index - week_index.mean()) * 0.15
).round(1)
df["Temperature_C"] = df["Temperature_C"].clip(-10, 20)

# ----------------------------
# 7. Weekly sales with promo & category patterns
# ----------------------------
cat_base_sales = {
    "Grocery": 50000,
    "Electronics": 80000,
    "Clothing": 40000,
    "Frozen": 30000
}
base_sales = df["Product_Category"].map(cat_base_sales)
promo_multiplier = np.where(df["Promo_Flag"] == 1, 1.3, 1.0)

df["Weekly_Sales_EUR"] = (
    base_sales * promo_multiplier * np.random.normal(1.0, 0.15, n_records)
).round(2)

# ----------------------------
# 8. Service KPI: On-Time flag
# ----------------------------
# SLA = base_travel + handling + 10% buffer
sla_hours = (base_travel + handling) * 1.1
df["On_Time_Flag"] = (df["Delivery_Time_hrs"] <= sla_hours).astype(int)

# Make one carrier systematically worse (C4 often late)
mask_c4 = df["Carrier"] == "C4"
flip_idx = df[mask_c4].sample(frac=0.25, random_state=42).index
df.loc[flip_idx, "On_Time_Flag"] = 0

# ----------------------------
# 9. Unstructured notes
# ----------------------------
df["Notes"] = np.random.choice(
    ["", "Damaged carton", "Partial delivery", "Driver delay", np.nan],
    n_records,
    p=[0.4, 0.15, 0.15, 0.1, 0.2]
)

# ----------------------------
# 10. Missingness patterns (MCAR / MAR / MNAR / hidden)
# ----------------------------

# MCAR: random fuel price missing
mcar_idx = np.random.choice(df.index, size=int(0.05 * n_records), replace=False)
df.loc[mcar_idx, "Fuel_Price_EUR_per_litre"] = np.nan

# MAR: weights missing for light parcels in C2/C3
mar_idx = df[
    (df["Carrier"].isin(["C2", "C3"])) & (df["Weight_kg"] < 10)
].sample(frac=0.4, random_state=42).index
df.loc[mar_idx, "Weight_kg"] = np.nan

# MNAR: delivery time missing when very late in C4
mnar_idx = df[
    (df["Carrier"] == "C4") & (df["Delivery_Time_hrs"] > 32)
].sample(frac=0.5, random_state=42).index
df.loc[mnar_idx, "Delivery_Time_hrs"] = np.nan

# Sensor weekend gaps per WH: temperature missing for some weeks
for wh in warehouses:
    gap_weeks = np.random.choice(weeks, 2, replace=False)
    mask = (df["Warehouse_ID"] == wh) & (df["Week"].isin(gap_weeks))
    df.loc[mask, "Temperature_C"] = np.nan

# Hidden missing values: -1 as "no sales reported" for a few store-weeks
hidden_idx = df.sample(frac=0.03, random_state=99).index
df.loc[hidden_idx, "Weekly_Sales_EUR"] = -1  # later in class: treat as missing

# ----------------------------
# 11. Duplicates for teaching
# ----------------------------
# Intentionally duplicate a few rows to show duplicate detection
dup_idx = df.sample(5, random_state=123)
df = pd.concat([df, dup_idx], ignore_index=True)

# ----------------------------
# 12. Save
# ----------------------------
out_path = "/Users/gollapsi/Documents/17_Hof_Lecture_Code_Pingo/Supply_Chain_Analytics/data/Global_Shipments_v2.csv"
df.to_csv(out_path, index=False)

print("Created Global_Shipments_v2.csv", df.shape)
print(df.head(3))


Created Global_Shipments_v2.csv (1005, 17)
  Shipment_ID       Week Warehouse_ID Store_ID Carrier Product_Category  \
0     SH00000 2024-02-18            C      S07      C3           Frozen   
1     SH00001 2024-05-19            B      S05      C3          Grocery   
2     SH00002 2024-04-14            C      S07      C1         Clothing   

   Promo_Flag Region  Mode  Distance_km  Weight_kg  Delivery_Time_hrs  \
0           1  North   Air        114.2      10.45                8.2   
1           0  South   Air        515.9      19.46                8.0   
2           0  North  Road        152.9      10.61                8.0   

   Fuel_Price_EUR_per_litre  Temperature_C  Weekly_Sales_EUR  On_Time_Flag  \
0                      1.62            3.2          31186.92             0   
1                      1.78            6.3          44982.05             0   
2                      1.65            NaN          36623.22             1   

            Notes  
0  Damaged carton  
1         

In [2]:
df

Unnamed: 0,Shipment_ID,Week,Warehouse_ID,Store_ID,Carrier,Product_Category,Promo_Flag,Region,Mode,Distance_km,Weight_kg,Delivery_Time_hrs,Fuel_Price_EUR_per_litre,Temperature_C,Weekly_Sales_EUR,On_Time_Flag,Notes
0,SH00000,2024-02-18,C,S07,C3,Frozen,1,North,Air,114.2,10.45,8.2,1.62,3.2,31186.92,0,Damaged carton
1,SH00001,2024-05-19,B,S05,C3,Grocery,0,South,Air,515.9,19.46,8.0,1.78,6.3,44982.05,0,
2,SH00002,2024-04-14,C,S07,C1,Clothing,0,North,Road,152.9,10.61,8.0,1.65,,36623.22,1,
3,SH00003,2024-03-17,B,S03,C3,Electronics,0,East,Air,345.9,13.53,9.0,1.74,5.8,99254.34,0,Driver delay
4,SH00004,2024-02-25,C,S12,C1,Grocery,0,North,Road,200.2,17.70,13.6,1.73,8.6,55523.10,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,SH00131,2024-02-18,C,S04,C4,Grocery,0,South,Rail,445.2,15.92,18.8,1.61,3.7,44425.46,1,Damaged carton
1001,SH00203,2024-02-18,A,S01,C3,Frozen,0,North,Air,121.4,14.95,9.5,1.64,5.6,35258.63,0,
1002,SH00050,2024-02-25,C,S03,C1,Electronics,0,East,Road,227.6,14.14,8.0,1.68,3.7,-1.00,1,Driver delay
1003,SH00585,2024-04-28,D,S05,C1,Frozen,0,South,Road,492.2,11.86,8.0,1.79,6.4,34130.74,1,


In [3]:
import pandas as pd, numpy as np, random
np.random.seed(42); random.seed(42)

# ----------------------------
# 1. Basic setup
# ----------------------------
n_records = 1000
weeks = pd.date_range("2024-01-01", periods=20, freq="W")
warehouses = ["A", "B", "C", "D"]
stores = [f"S{i:02d}" for i in range(1, 13)]
carriers = ["C1", "C2", "C3", "C4"]
categories = ["Grocery", "Electronics", "Clothing", "Frozen"]

# Carrier → Mode mapping
carrier_mode = {
    "C1": "Road",
    "C2": "Road",
    "C3": "Air",
    "C4": "Rail"
}

# ----------------------------
# 2. Add synthetic coordinates
# (Germany bounding box approx)
# ----------------------------
# Warehouse coordinates (synthetic but plausible)
wh_coords = {
    "A": (49.4, 11.0),   # Nuremberg area
    "B": (50.1, 8.7),    # Frankfurt area
    "C": (48.8, 9.2),    # Stuttgart area
    "D": (52.5, 13.4),   # Berlin area
}

# Store coordinates: random around WH regions
def random_near(lat, lon):
    return (
        lat + np.random.normal(0, 0.5),
        lon + np.random.normal(0, 0.5)
    )

store_coords = {}
for s in stores:
    wh = random.choice(list(wh_coords.keys()))
    store_coords[s] = random_near(*wh_coords[wh])

# Region assignment (for distance logic)
regions = ["North", "South", "East", "West"]
store_region_map = {s: random.choice(regions) for s in stores}

region_base_distance = {
    "North": 200,
    "South": 450,
    "East": 300,
    "West": 350
}

wh_offset = {"A": 0, "B": 40, "C": -30, "D": 80}

# ----------------------------
# 3. Build base DataFrame
# ----------------------------
df = pd.DataFrame({
    "Shipment_ID": [f"SH{i:05d}" for i in range(n_records)],
    "Week": np.random.choice(weeks, n_records),
    "Warehouse_ID": np.random.choice(warehouses, n_records),
    "Store_ID": np.random.choice(stores, n_records),
    "Carrier": np.random.choice(carriers, n_records, p=[0.3,0.25,0.25,0.2]),
    "Product_Category": np.random.choice(categories, n_records, p=[0.4,0.25,0.2,0.15]),
    "Promo_Flag": np.random.choice([0,1], n_records, p=[0.8,0.2])
})

df["Region"] = df["Store_ID"].map(store_region_map)
df["Mode"] = df["Carrier"].map(carrier_mode)

# Add coordinates
df["WH_Lat"] = df["Warehouse_ID"].map(lambda x: wh_coords[x][0])
df["WH_Lon"] = df["Warehouse_ID"].map(lambda x: wh_coords[x][1])
df["Store_Lat"] = df["Store_ID"].map(lambda x: store_coords[x][0])
df["Store_Lon"] = df["Store_ID"].map(lambda x: store_coords[x][1])

# ----------------------------
# 4. Distance generation
# ----------------------------
base_dist = df["Region"].map(region_base_distance)
offset = df["Warehouse_ID"].map(wh_offset)
df["Distance_km"] = (
    base_dist + offset + np.random.normal(0, 30, n_records)
).clip(20, 800).round(1)

# ----------------------------
# 5. Weight per category
# ----------------------------
cat_weight_mean = {
    "Grocery": 18,
    "Electronics": 10,
    "Clothing": 8,
    "Frozen": 15
}
df["Weight_kg"] = (
    np.random.normal(df["Product_Category"].map(cat_weight_mean), 4)
).clip(0.5, 40).round(2)

# ----------------------------
# 6. Delivery time logic
# ----------------------------
speed_kmh = {"Road": 55, "Air": 600, "Rail": 40}
handling_hours = {"A":6, "B":5, "C":7, "D":5}

base_travel = df["Distance_km"] / df["Mode"].map(speed_kmh)
handling = df["Warehouse_ID"].map(handling_hours)

df["Delivery_Time_hrs"] = (
    base_travel + handling + np.random.normal(0,4,n_records)
).clip(8,72).round(1)

# ----------------------------
# 7. Fuel price
# ----------------------------
weekly_fuel_price = {w: round(np.random.normal(1.7,0.05),2) for w in weeks}
df["Fuel_Price_EUR_per_litre"] = (
    df["Week"].map(weekly_fuel_price) + np.random.normal(0, 0.02, n_records)
).round(2)

# ----------------------------
# 8. Sales logic
# ----------------------------
cat_sales = {"Grocery":50000,"Electronics":80000,"Clothing":40000,"Frozen":30000}
promo_mult = np.where(df["Promo_Flag"]==1, 1.3, 1.0)
df["Weekly_Sales_EUR"] = (
    df["Product_Category"].map(cat_sales) *
    promo_mult *
    np.random.normal(1.0,0.15,n_records)
).round(2)

# ----------------------------
# 9. SLA, Promised Time, LateBy
# ----------------------------
sla_hours = (base_travel + handling) * 1.1
df["Promised_Time_hrs"] = np.ceil(sla_hours / 4) * 4  # round to next 4h block

df["LateBy_hours"] = (df["Delivery_Time_hrs"] - df["Promised_Time_hrs"]).round(1)
df["On_Time_Flag"] = (df["LateBy_hours"] <= 0).astype(int)

# C4 systematically worse
c4_idx = df[df["Carrier"]=="C4"].sample(frac=0.25,random_state=42).index
df.loc[c4_idx, "On_Time_Flag"] = 0

# ----------------------------
# 10. Notes (unstructured)
# ----------------------------
df["Notes"] = np.random.choice(
    ["","Damaged carton","Partial delivery","Driver delay",np.nan],
    n_records,
    p=[0.4,0.15,0.15,0.1,0.2]
)

# ----------------------------
# 11. Missingness: MCAR, MAR, MNAR, Hidden
# ----------------------------
# MCAR
mcar = np.random.choice(df.index, size=int(0.05*n_records), replace=False)
df.loc[mcar, "Fuel_Price_EUR_per_litre"] = np.nan

# MAR
mar = df[(df["Carrier"].isin(["C2","C3"]))&(df["Weight_kg"]<10)].sample(frac=0.4,random_state=42).index
df.loc[mar, "Weight_kg"] = np.nan

# MNAR
mnar = df[(df["Carrier"]=="C4")&(df["Delivery_Time_hrs"]>32)].sample(frac=0.5,random_state=42).index
df.loc[mnar, "Delivery_Time_hrs"] = np.nan

# Hidden missing (-1)
hidden = df.sample(frac=0.03,random_state=99).index
df.loc[hidden, "Weekly_Sales_EUR"] = -1

# Sensor gaps per WH
for wh in warehouses:
    gap_weeks = np.random.choice(weeks, 2, replace=False)
    mask = (df["Warehouse_ID"]==wh)&(df["Week"].isin(gap_weeks))
    df.loc[mask, "Temperature_C"] = np.nan

# ----------------------------
# 12. Duplicates
# ----------------------------
dup = df.sample(5,random_state=123)
df = pd.concat([df, dup], ignore_index=True)

# ----------------------------
# 13. Save
# ----------------------------
path = "/Users/gollapsi/Documents/17_Hof_Lecture_Code_Pingo/Supply_Chain_Analytics/data/Global_Shipments_v3.csv"
df.to_csv(path, index=False)

print("Created Global_Shipments_v3.csv", df.shape)
df.head(3)


Created Global_Shipments_v3.csv (1005, 23)


Unnamed: 0,Shipment_ID,Week,Warehouse_ID,Store_ID,Carrier,Product_Category,Promo_Flag,Region,Mode,WH_Lat,...,Distance_km,Weight_kg,Delivery_Time_hrs,Fuel_Price_EUR_per_litre,Weekly_Sales_EUR,Promised_Time_hrs,LateBy_hours,On_Time_Flag,Notes,Temperature_C
0,SH00000,2024-01-21,C,S04,C2,Electronics,0,South,Road,48.8,...,407.0,14.93,16.0,1.68,76117.25,16.0,0.0,1,,
1,SH00001,2024-02-04,D,S10,C2,Frozen,1,South,Road,52.5,...,508.2,,8.3,1.7,46021.6,16.0,-7.7,1,Driver delay,
2,SH00002,2024-05-12,C,S05,C3,Electronics,0,West,Air,48.8,...,370.0,17.05,8.0,1.59,73707.24,12.0,-4.0,1,,


In [5]:
df.columns

Index(['Shipment_ID', 'Week', 'Warehouse_ID', 'Store_ID', 'Carrier',
       'Product_Category', 'Promo_Flag', 'Region', 'Mode', 'WH_Lat', 'WH_Lon',
       'Store_Lat', 'Store_Lon', 'Distance_km', 'Weight_kg',
       'Delivery_Time_hrs', 'Fuel_Price_EUR_per_litre', 'Weekly_Sales_EUR',
       'Promised_Time_hrs', 'LateBy_hours', 'On_Time_Flag', 'Notes',
       'Temperature_C'],
      dtype='object')