In [3]:
import pandas as pd
import numpy as np
import random

# ---------------------------------------------------------------------------
# Synthetic "messy" procurement dataset generator.
#
# Builds a clean table of chemical orders, then deliberately corrupts it
# (unit suffixes, missing values, invalid costs, mixed date formats,
# duplicated rows) and writes the result to a CSV file.
#
# NOTE: the order of the random draws below matters. Both the NumPy global
# generator and the stdlib `random` module are seeded, so reordering any
# draw changes the generated data.
# ---------------------------------------------------------------------------

# Seed both random sources so repeated runs produce the same file.
np.random.seed(42)
random.seed(42)

# Vocabulary and size of the clean table.
suppliers = ["Supplier X", "Supplier Y", "Supplier Z", "Supplier A", "Supplier B"]
chemicals = ["Solvent A", "Solvent B", "Catalyst C", "Reactant D", "Buffer E"]
n_rows = 20000

# Draw the random columns one at a time, in the same order the frame lists them.
chemical_col = np.random.choice(chemicals, size=n_rows)
supplier_col = np.random.choice(suppliers, size=n_rows)
quantity_col = np.random.uniform(100, 5000, size=n_rows).round(2)
cost_col = np.random.uniform(50, 5000, size=n_rows).round(2)
hourly_stamps = pd.date_range(start="2023-01-01", periods=n_rows, freq="h").astype(str)

procurement_data = pd.DataFrame(
    {
        "order_id": range(1, n_rows + 1),
        "chemical_ordered": chemical_col,
        "supplier": supplier_col,
        "quantity_ordered": quantity_col,
        "cost_usd": cost_col,
        "order_date": hourly_stamps,
    }
)

# The two numeric columns become object dtype so strings and NaNs can be
# written into them below.
procurement_data = procurement_data.astype(
    {"quantity_ordered": "object", "cost_usd": "object"}
)

# Row labels of the clean table; the frame is not re-indexed until the
# duplicate rows are appended at the end.
row_labels = procurement_data.index

# 1. Unit inconsistencies: 2000 distinct rows get their quantity divided by
#    1000, rounded to two decimals and rewritten as text ending in " kg".
unit_indices = np.random.choice(row_labels, size=2000, replace=False)
numeric_quantities = pd.to_numeric(procurement_data.loc[unit_indices, "quantity_ordered"])
kg_strings = (numeric_quantities / 1000).round(2).astype(str) + " kg"
procurement_data.loc[unit_indices, "quantity_ordered"] = kg_strings

# 2. Missing values: 300 distinct rows per column are blanked out.
for col in ("quantity_ordered", "cost_usd"):
    blank_rows = np.random.choice(row_labels, 300, replace=False)
    procurement_data.loc[blank_rows, col] = np.nan

# 3. Invalid costs: (row count, replacement value) pairs applied in order.
#    A later pair may overwrite rows hit by an earlier one.
label_pool = list(row_labels)
invalid_cost_plan = [
    (100, -50),
    (50, "unknown"),
    (30, 999999),
]
for row_count, bad_cost in invalid_cost_plan:
    hit_rows = random.sample(label_pool, row_count)
    procurement_data.loc[hit_rows, "cost_usd"] = bad_cost


# 4. Inconsistent order dates: each row is passed through one of four
#    formatters picked at random.
def _keep_original(d):
    """Return the date string unchanged."""
    return d


def _day_month_year(d):
    """Re-render the date as dd/mm/YYYY."""
    return pd.to_datetime(d).strftime("%d/%m/%Y")


def _month_name_style(d):
    """Re-render the date as 'Mon dd, YYYY'."""
    return pd.to_datetime(d).strftime("%b %d, %Y")


def _drop_date(d):
    """Discard the date entirely."""
    return None


date_formats = [_keep_original, _day_month_year, _month_name_style, _drop_date]


def _scramble_date(d):
    """Apply one randomly chosen formatter from `date_formats` to *d*."""
    chosen_formatter = random.choice(date_formats)
    return chosen_formatter(d)


procurement_data["order_date"] = procurement_data["order_date"].apply(_scramble_date)

# 5. Duplicate rows: 300 sampled rows are appended and the index is rebuilt.
duplicates = procurement_data.sample(n=300)
procurement_data = pd.concat([procurement_data, duplicates], ignore_index=True)

# Save
output_csv = "procurement_cost_reports_messy.csv"
procurement_data.to_csv(output_csv, index=False)
print("✅ Messy procurement dataset created and saved.")

✅ Messy procurement dataset created and saved.
