In [None]:
import pandas as pd
import numpy as np

# Base dataset: downstream code only reads its Customer_ID, Gender and Age
# columns when sampling customers.
csv_path = "E-Commerce_Dataset/retail_sales_dataset.csv"
sales_df = pd.read_csv(filepath_or_buffer=csv_path)

# --- Step 1: Define categories & gender weights ---
# Each row of shares below lines up position-by-position with _CATEGORIES,
# giving the probability that a transaction by that gender falls in the
# category.
_CATEGORIES = ("Electronics", "Clothing", "Groceries", "Beauty", "Sports")

category_weights = {
    gender: dict(zip(_CATEGORIES, shares))
    for gender, shares in (
        ("Male", (0.4, 0.25, 0.15, 0.1, 0.1)),
        ("Female", (0.2, 0.3, 0.15, 0.25, 0.1)),
        ("Other", (0.25, 0.25, 0.25, 0.15, 0.1)),
    )
}

def generate_synthetic_data(num_customers=50, total_transactions=1000, seed=42,
                            customers_df=None, weights=None):
    """Generate synthetic retail transactions and a per-customer summary.

    Customers are drawn from the unique (Customer_ID, Gender, Age) rows of
    ``customers_df``. When more customers are requested than exist, every
    real customer is kept and the shortfall is filled with generated
    ``SYN<n>`` customers. Transactions are spread over the customers with a
    uniform multinomial draw; each transaction gets a gender-weighted product
    category, a date within 900 days of 2022-01-01, a quantity in 1..5 and a
    unit price in 50..549.

    Args:
        num_customers: Number of customers to generate for; must be >= 1.
        total_transactions: Total number of transactions; must be >= 0.
        seed: Seed for both the customer sampling and the random draws.
        customers_df: Frame with ``Customer_ID``, ``Gender`` and ``Age``
            columns. Defaults to the module-level ``sales_df``.
        weights: Mapping ``gender -> {category: probability}``. It must have
            an ``"Other"`` entry, which is used for any gender not listed.
            Defaults to the module-level ``category_weights``.

    Returns:
        A ``(synthetic_df, synthetic_customers)`` tuple: one row per
        transaction, and one row per customer that received at least one
        transaction (``Transaction_Count`` and ``Total_Spent``).

    Raises:
        ValueError: If ``num_customers < 1`` or ``total_transactions < 0``.
    """
    if num_customers < 1:
        raise ValueError("num_customers must be at least 1")
    if total_transactions < 0:
        raise ValueError("total_transactions must not be negative")

    if customers_df is None:
        customers_df = sales_df
    if weights is None:
        weights = category_weights

    # A private legacy generator keeps the draws reproducible for a given
    # seed without reseeding NumPy's process-wide random state.
    rng = np.random.RandomState(seed)

    # --- Step 2: Get unique customers from dataset ---
    customer_cols = ["Customer_ID", "Gender", "Age"]
    unique_customers = customers_df[customer_cols].drop_duplicates()

    if num_customers <= len(unique_customers):
        # Enough unique customers -> sample without replacement
        sampled_customers = unique_customers.sample(
            n=num_customers, replace=False, random_state=seed
        ).reset_index(drop=True)
    else:
        # Not enough -> take all and generate synthetic extras
        needed = num_customers - len(unique_customers)
        extra_customers = pd.DataFrame({
            "Customer_ID": [f"SYN{1000+i}" for i in range(needed)],
            "Gender": rng.choice(
                ["Male", "Female", "Other"], size=needed, p=[0.45, 0.45, 0.1]
            ),
            # Upper bound is exclusive: ages fall in 18..64.
            "Age": rng.randint(18, 65, size=needed),
        })
        sampled_customers = pd.concat(
            [unique_customers, extra_customers], ignore_index=True
        )

    # --- Step 3: Assign transactions across customers ---
    trans_counts = rng.multinomial(
        total_transactions, [1 / num_customers] * num_customers
    )

    # --- Step 4: Generate transactions ---
    txn_columns = [
        "Transaction_ID", "Date", "Customer_ID", "Gender", "Age",
        "Product_Category", "Quantity", "Price_per_Unit", "Total_Amount",
    ]
    start_date = pd.to_datetime("2022-01-01")
    fallback_weights = weights["Other"]
    transactions = []
    txn_id = 1
    # Pair customers with their counts by position so the result does not
    # depend on the frame's index labels.
    for customer_id, gender, age, n_txns in zip(
        sampled_customers["Customer_ID"],
        sampled_customers["Gender"],
        sampled_customers["Age"],
        trans_counts,
    ):
        gender_weights = weights.get(gender, fallback_weights)
        categories = list(gender_weights)
        probs = list(gender_weights.values())

        for _ in range(n_txns):
            # Draw order (category, date, quantity, price) is part of the
            # seeded output; do not reorder.
            category = rng.choice(categories, p=probs)
            day_offset = rng.randint(0, 900)
            quantity = rng.randint(1, 6)
            unit_price = rng.randint(50, 550)

            transactions.append({
                "Transaction_ID": f"TXN{txn_id:06d}",
                "Date": start_date + pd.to_timedelta(day_offset, unit="D"),
                "Customer_ID": customer_id,
                "Gender": gender,
                "Age": age,
                "Product_Category": category,
                "Quantity": quantity,
                "Price_per_Unit": unit_price,
                "Total_Amount": quantity * unit_price,
            })
            txn_id += 1

    # Passing the columns explicitly keeps the schema when there are no rows.
    synthetic_df = pd.DataFrame(transactions, columns=txn_columns)

    # --- Step 5: Customer summary ---
    if synthetic_df.empty:
        synthetic_customers = pd.DataFrame(
            columns=customer_cols + ["Transaction_Count", "Total_Spent"]
        )
    else:
        # dropna=False keeps customers whose Gender or Age is missing; they
        # have transactions above and would otherwise vanish from the summary.
        synthetic_customers = (
            synthetic_df.groupby(customer_cols, dropna=False)
            .agg(Transaction_Count=("Transaction_ID", "count"),
                 Total_Spent=("Total_Amount", "sum"))
            .reset_index()
        )

    return synthetic_df, synthetic_customers


# Example usage: 120 customers sharing 10,000 transactions (default seed).
synthetic_df, synthetic_customers = generate_synthetic_data(
    num_customers=120,
    total_transactions=10000,
)

# Save both frames as CSV without the index column.
synthetic_df.to_csv(path_or_buf="synthetic_dataset.csv", index=False)
synthetic_customers.to_csv(path_or_buf="synthetic_customers.csv", index=False)

print(
    "✅ synthetic_dataset.csv created with transactions",
    "✅ synthetic_customers.csv created with customer summaries",
    sep="\n",
)


✅ synthetic_dataset.csv created with transactions
✅ synthetic_customers.csv created with customer summaries


: 