In [2]:
import numpy as np
import pandas as pd
from faker import Faker
from datetime import datetime

# Initialize Faker and seed BOTH random sources for reproducibility.
# Faker keeps its own random generator, separate from NumPy's global state,
# so seeding NumPy alone leaves every fake.* value (dates, cities) different
# on each run.
fake = Faker()
Faker.seed(42)
np.random.seed(42)

# Number of records
num_rows = 100_000

# Define synthetic values.
# NOTE(review): hs_codes and products appear to be index-aligned (position i
# of each list describes the same commodity) -- keep them the same length and
# in the same order when editing.
hs_codes = ['8542.31', '8703.24', '8517.62', '8471.30', '8523.49',
            '9403.20', '6403.99', '6110.30', '6204.63', '7321.11']
products = ['Integrated Circuits', 'Motor Vehicles', 'Telephone Sets',
            'Laptop Computers', 'Storage Units', 'Wooden Furniture',
            'Leather Footwear', 'Cotton T-shirts', 'Woolen Coats',
            'Stainless Steel Cookware']
countries = ['China', 'Germany', 'USA', 'Japan', 'South Korea',
             'India', 'Vietnam', 'Italy', 'France', 'Brazil']
quantity_units = ['kg', 'liters', 'units', 'pairs', 'boxes', 'crates']

# Pre-parse the inclusive date window once so it is not re-parsed per row
start_date = datetime.strptime("2020-01-01", "%Y-%m-%d").date()
end_date = datetime.strptime("2024-12-31", "%Y-%m-%d").date()

def generate_synthetic_row() -> dict:
    """Build one synthetic trade/customs record.

    Randomness comes from NumPy's global random state and from the
    module-level ``fake`` Faker instance, so output is repeatable only when
    both have been seeded by the caller.

    Returns:
        A dict mapping column name to value for a single row: commodity and
        origin fields, quantity and mass figures, FOB/freight/insurance/CIF
        values in USD, CIF and unit prices converted with a random exchange
        rate, a 0/1 ``Under_Over_Valued`` flag, a tax rate and a valuation
        label.
    """
    # Draw ONE index so the HS code and the description always refer to the
    # same commodity; drawing them independently paired unrelated entries.
    commodity_idx = np.random.randint(len(hs_codes))
    hs_code = hs_codes[commodity_idx]
    product = products[commodity_idx]
    country = np.random.choice(countries)

    # Quantity is a divisor below; floor it at the smallest value that
    # survives 2-decimal rounding so a draw that rounds to 0.0 cannot yield
    # infinite unit prices.
    quantity = max(np.round(np.random.lognormal(mean=5, sigma=1.5), 2), 0.01)
    quantity_unit = np.random.choice(quantity_units)

    # Net and gross mass are fixed fractions (97% / 105%) of a base weight
    # that scales with quantity.
    weight = np.round(quantity * np.random.uniform(0.2, 5.0), 2)
    net_mass = np.round(weight * 0.97, 2)
    gross_mass = np.round(weight * 1.05, 2)

    # CIF = FOB + freight + insurance, with freight and insurance drawn as a
    # random fraction of FOB.
    fob = np.round(np.random.weibull(2.5) * 5000, 2)
    freight = np.round(fob * np.random.uniform(0.005, 0.15), 2)
    insurance = np.round(fob * np.random.uniform(0.002, 0.02), 2)
    cif_value = fob + freight + insurance

    # Convert to local currency and derive per-unit prices; the "predicted"
    # price is the actual price perturbed by multiplicative Gaussian noise.
    exchange_rate = np.random.uniform(3500, 3900)
    cif_local = cif_value * exchange_rate
    unit_price_actual = cif_local / quantity
    unit_price_predicted = unit_price_actual * np.random.normal(1.0, 0.15)
    # 1 when the actual unit price exceeds the predicted one, else 0.
    under_over = int(unit_price_actual > unit_price_predicted)

    return {
        "Date": fake.date_between(start_date=start_date, end_date=end_date),
        "HS_Code": hs_code,
        # NOTE(review): only the first word of the product name is kept
        # (e.g. "Integrated") -- confirm this truncation is intended.
        "Item_Description": product.split()[0],
        "Country_of_Origin": country,
        "Country_of_Sale": np.random.choice(["China", "Germany", "Japan", "USA", "UK"]),
        "Port_of_Shipment": fake.city(),
        "Quantity": quantity,
        "Quantity_Unit": quantity_unit,
        "Net_Mass_kg": net_mass,
        "Gross_Mass_kg": gross_mass,
        "FOB_Value_USD": fob,
        "Freight_USD": freight,
        "Insurance_USD": insurance,
        "CIF_Value_USD": np.round(cif_value, 2),
        "CIF_Value_UGX": np.round(cif_local, 2),
        "Unit_Price_Actual_UGX": np.round(unit_price_actual, 2),
        "Unit_Price_Predicted_UGX": np.round(unit_price_predicted, 2),
        "Under_Over_Valued": under_over,
        "Tax_Rate": np.random.choice([0.05, 0.1, 0.15, 0.18, 0.2, 0.25]),
        "Currency_Code": 'USD',
        "Valuation_Label": np.random.choice(["Manual", "Automated", "Audit"], p=[0.2, 0.7, 0.1])
    }

# Destination file for the generated table
csv_filename = "synthetic_trade_data.csv"

# Materialize every row as a dict, then let pandas assemble the columns
print("Generating synthetic data...")
synthetic_data = [generate_synthetic_row() for _row in range(num_rows)]
df_synthetic = pd.DataFrame(data=synthetic_data)

# Write the table without the positional index column and report the result
df_synthetic.to_csv(path_or_buf=csv_filename, index=False)
print(f"Dataset with {num_rows:,} records saved to {csv_filename}")


Generating synthetic data...
Dataset with 100,000 records saved to synthetic_trade_data.csv
