In [2]:
# generate_retail_data.py
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from faker import Faker

# Set seeds for reproducibility.
# Every random source used by this script needs its own seed: NumPy, the
# stdlib `random` module, and Faker. Without the Faker seed the product
# descriptions and customer names would change on every run even though
# the numeric columns stay fixed.
np.random.seed(42)
random.seed(42)
Faker.seed(42)

print("=== Generating Synthetic Retail Data (African Markets) ===")

# Initialize Faker (default locale); created after seeding so its output is repeatable
fake = Faker()

# Parameters: number of invoice rows and the (inclusive) invoice date window
n_rows = 1000
start_date, end_date = pd.Timestamp('2023-01-01'), pd.Timestamp('2025-08-12')

# Ten African markets, each paired with its sampling weight.
# Larger markets (Nigeria, South Africa, Kenya) get a higher share of rows;
# the weights add up to 1.0.
_market_shares = [
    ('Nigeria', 0.2),
    ('South Africa', 0.15),
    ('Kenya', 0.15),
    ('Ghana', 0.1),
    ('Uganda', 0.08),
    ('Tanzania', 0.08),
    ('Ethiopia', 0.07),
    ('Morocco', 0.07),
    ('Egypt', 0.07),
    ('Rwanda', 0.03),
]

# Parallel lists (same order) consumed by the weighted country sampling
african_countries = [market for market, _ in _market_shares]
country_weights = [share for _, share in _market_shares]

# Generate data
# Sequential invoice numbers: INV1000, INV1001, ...
invoice_nos = [f'INV{1000 + i}' for i in range(n_rows)]

# Stock codes: 'P' followed by 100 + randint(100, 999), i.e. P200..P1098.
# Codes are drawn independently per row, so the same code recurs across rows.
stock_codes = [f'P{100 + random.randint(100, 999)}' for _ in range(n_rows)]

# Product descriptions using Faker, e.g. "Synergistic solution".
# One description is generated per distinct stock code and then looked up per
# row, so a given StockCode always carries the same Description.
# dict.fromkeys keeps first-seen order, which keeps the Faker call order fixed.
product_descriptions = {
    code: fake.catch_phrase() for code in dict.fromkeys(stock_codes)
}
descriptions = [product_descriptions[code] for code in stock_codes]

# Quantity and price
quantities = np.random.randint(1, 51, size=n_rows)  # 1 to 50 (upper bound exclusive)
unit_prices = np.round(np.random.uniform(1.0, 100.0, size=n_rows), 2)  # 1.00 to 100.00, 2 decimals

# Customers: 100 unique IDs (C101..C200) and one Faker name per ID
unique_customer_ids = [f'C{100 + i}' for i in range(1, 101)]
customer_names = [fake.name() for _ in range(100)]  # One name per customer ID

# Map customer ID to name
customer_id_to_name = dict(zip(unique_customer_ids, customer_names))

# Assign customer IDs to rows (uniform sampling with replacement)
customer_ids = random.choices(unique_customer_ids, k=n_rows)

# Assign countries with weights (sampling with replacement)
countries_selected = random.choices(african_countries, weights=country_weights, k=n_rows)

# Dates: a random whole-day offset from start_date; randint is inclusive on
# both ends, so end_date itself can be drawn
date_range = (end_date - start_date).days
invoice_dates = [start_date + timedelta(days=random.randint(0, date_range)) for _ in range(n_rows)]

# Create DataFrame: one row per invoice line, columns taken from the lists above
df = pd.DataFrame({
    'InvoiceNo': invoice_nos,
    'StockCode': stock_codes,
    'Description': descriptions,
    'Quantity': quantities,
    'InvoiceDate': invoice_dates,
    'UnitPrice': unit_prices,
    'CustomerID': customer_ids,
    'Country': countries_selected
})

# Add TotalSales (line revenue). Rounded to 2 decimals because the raw float
# product leaves binary artefacts such as 59.970000000000006 in the CSV.
df['TotalSales'] = (df['Quantity'] * df['UnitPrice']).round(2)

# Attach the name generated for each customer ID. Appended as the last column
# so the positions of the existing columns are unchanged.
df['CustomerName'] = df['CustomerID'].map(customer_id_to_name)

# Sort by date for realism; a stable sort keeps rows that share a date in
# their generation order, so the output row order is fully determined.
df = df.sort_values('InvoiceDate', kind='stable').reset_index(drop=True)

# Save to CSV (written to the current working directory, without the index)
df.to_csv('synthetic_retail_africa.csv', index=False)
print(f"✅ Generated {len(df)} rows of synthetic retail data from African countries.")
print("📁 Saved as 'synthetic_retail_africa.csv'")

=== Generating Synthetic Retail Data (African Markets) ===
✅ Generated 1000 rows of synthetic retail data from African countries.
📁 Saved as 'synthetic_retail_africa.csv'
