In [1]:
! pip install faker

Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------- ----------------------------- 0.5/1.9 MB 1.9 MB/s eta 0:00:01
   -------------------------------------- - 1.8/1.9 MB 4.2 MB/s eta 0:00:01
   ---------------------------------------- 1.9/1.9 MB 4.2 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-37.1.0


In [3]:
import pandas as pd
import numpy as np
from faker import Faker

# Define constants
SEED = 42  # Fixed seed so a fresh "Restart & Run All" regenerates the same random draws
NUM_ROWS = 1000  # Adjust this for desired dataset size
EXCHANGE_RATE = 3700  # 1 USD = 3,700 UGX
COUNTRIES = ["China", "India", "Germany", "USA", "Japan", "South Korea", "Kenya", "South Africa"]
# Per product category: the HS codes and tax rates a generated row may be assigned.
PRODUCTS = {
    "Electronics": {"HS_Codes": ["8542", "8517", "8523"], "Tax_Rate": [0.1, 0.15]},
    "Machinery": {"HS_Codes": ["8479", "8481", "8430"], "Tax_Rate": [0.15, 0.2]},
    "Textiles": {"HS_Codes": ["5407", "5408", "6110"], "Tax_Rate": [0.05, 0.1]},
    "Food Products": {"HS_Codes": ["0403", "0702", "1701"], "Tax_Rate": [0.05, 0.1]},
    "Pharmaceuticals": {"HS_Codes": ["3004", "2937", "2941"], "Tax_Rate": [0.05, 0.15]},
}

# Seed both random sources used below: NumPy's global state (numeric draws)
# and the Faker instance (dates). Dates are requested relative to "today", so
# they can still shift with the day the notebook is run.
np.random.seed(SEED)
fake = Faker()
fake.seed_instance(SEED)

def generate_dataset(num_rows, seed=None, noise=0.2, threshold=0.2):
    """Generate a synthetic import-declaration dataset.

    Each row draws a random product category (plus one of its HS codes and
    tax rates from ``PRODUCTS``), a country from ``COUNTRIES``, a quantity,
    net/gross mass and a CIF value in USD. From these it derives the CIF
    value in UGX (via ``EXCHANGE_RATE``), the actual unit price, a noisy
    "predicted" unit price and a valuation label.

    Parameters
    ----------
    num_rows : int
        Number of rows to generate. Zero or a negative value yields an empty
        frame that still carries the full column schema.
    seed : int, optional
        When given, numeric draws come from a private
        ``np.random.RandomState(seed)`` and the module-level ``fake``
        instance is re-seeded with the same value, so repeated calls return
        the same numeric columns. When ``None`` (default) NumPy's global
        random state is used, exactly as before.
    noise : float, default 0.2
        Half-width of the multiplicative noise: the predicted unit price is
        the actual price times a uniform factor in ``[1 - noise, 1 + noise)``.
    threshold : float, default 0.2
        Relative-deviation cutoff used for ``Valuation_Label``.

    Returns
    -------
    pandas.DataFrame
        One row per generated record with the columns listed in ``columns``
        below.

    Raises
    ------
    ValueError
        If ``noise`` is outside ``[0, 1)`` (the predicted price must stay
        positive because it is used as a divisor).

    Notes
    -----
    ``deviation = (actual - predicted) / predicted`` equals ``1 / u - 1``
    where ``u`` is the noise factor. With the defaults (``noise=0.2``,
    ``threshold=0.2``) the deviation therefore lies in roughly
    ``[-0.167, 0.25]``: label ``0`` (deviation < -threshold) can never occur
    and most rows get ``NaN``.
    NOTE(review): the defaults are kept for backward compatibility; pass a
    larger ``noise`` or a smaller ``threshold`` if both labels are wanted.
    """
    if not 0 <= noise < 1:
        raise ValueError(f"noise must be in [0, 1), got {noise!r}")

    columns = [
        "Date",
        "Item_Description",
        "HS_Code",
        "Country_of_Origin",
        "Quantity",
        "Net_Mass_kg",
        "Gross_Mass_kg",
        "CIF_Value_USD",
        "CIF_Value_UGX",
        "Unit_Price_Actual_UGX",
        "Unit_Price_Predicted_UGX",
        "Tax_Rate",
        "Currency_Code",
        "Valuation_Label",
    ]

    # Keep drawing from NumPy's global state when no seed is given so that a
    # caller-side np.random.seed(...) still applies; use a private generator
    # only when a seed is explicitly requested.
    rng = np.random if seed is None else np.random.RandomState(seed)
    if seed is not None:
        fake.seed_instance(seed)

    categories = list(PRODUCTS)  # loop-invariant, built once
    data = []
    for _ in range(num_rows):
        # Randomly select product category and attributes
        product = rng.choice(categories)
        spec = PRODUCTS[product]
        hs_code = rng.choice(spec["HS_Codes"])
        tax_rate = rng.choice(spec["Tax_Rate"])
        country = rng.choice(COUNTRIES)
        quantity = rng.randint(10, 5000)
        net_mass = rng.randint(50, 10000)
        gross_mass = net_mass + rng.randint(10, 500)  # gross always exceeds net
        cif_usd = rng.uniform(1000, 500000)
        cif_ugx = cif_usd * EXCHANGE_RATE
        unit_price_actual = cif_ugx / quantity  # quantity >= 10, never zero
        unit_price_predicted = unit_price_actual * rng.uniform(1 - noise, 1 + noise)
        deviation = (unit_price_actual - unit_price_predicted) / unit_price_predicted

        # 1 = actual price above prediction by more than the threshold,
        # 0 = below by more than the threshold, NaN = within tolerance.
        if deviation > threshold:
            valuation_label = 1
        elif deviation < -threshold:
            valuation_label = 0
        else:
            valuation_label = np.nan

        data.append({
            "Date": fake.date_between(start_date="-3y", end_date="today"),
            "Item_Description": product,
            "HS_Code": hs_code,
            "Country_of_Origin": country,
            "Quantity": quantity,
            "Net_Mass_kg": net_mass,
            "Gross_Mass_kg": gross_mass,
            "CIF_Value_USD": round(cif_usd, 2),
            "CIF_Value_UGX": round(cif_ugx, 2),
            "Unit_Price_Actual_UGX": round(unit_price_actual, 2),
            "Unit_Price_Predicted_UGX": round(unit_price_predicted, 2),
            "Tax_Rate": tax_rate,
            "Currency_Code": "USD",
            "Valuation_Label": valuation_label,
        })

    # Passing the column list keeps the schema intact even when no rows exist.
    return pd.DataFrame(data, columns=columns)

# Build the synthetic dataset, then write it to disk as CSV without the row index.
df = generate_dataset(num_rows=NUM_ROWS)
df.to_csv(path_or_buf="uganda_import_valuation_large.csv", index=False)
