In [None]:
import pandas as pd
import numpy as np

# Base retail dataset. The generator below only reads its Customer_ID,
# Gender and Age columns, as the pool of customers to sample from.
csv_path = "E-Commerce_Dataset/retail_sales_dataset.csv"
sales_df = pd.read_csv(filepath_or_buffer=csv_path)

# --- Step 1: Define categories & gender weights ---
# Purchase probability of each product category, keyed by gender. Each row of
# shares lines up with the category tuple below and sums to 1.0, so a row can
# be used directly as the `p` argument of a weighted random choice.
# "Other" doubles as the fallback profile for genders not listed here.
category_weights = {
    gender: dict(
        zip(("Electronics", "Clothing", "Groceries", "Beauty", "Sports"), shares)
    )
    for gender, shares in {
        "Male": (0.4, 0.25, 0.15, 0.1, 0.1),
        "Female": (0.2, 0.3, 0.15, 0.25, 0.1),
        "Other": (0.25, 0.25, 0.25, 0.15, 0.1),
    }.items()
}

def generate_synthetic_data(num_customers=50, total_transactions=1000, seed=42,
                            *, customers=None, weights=None):
    """Generate synthetic retail transactions and a per-customer summary.

    Customers are drawn (with replacement) from the distinct
    ``Customer_ID`` / ``Gender`` / ``Age`` rows of a base frame, the requested
    number of transactions is split between them with a uniform multinomial
    draw, and each transaction gets a random date, product category,
    quantity and unit price.

    Args:
        num_customers: Number of customer draws; must be at least 1. Because
            sampling is with replacement, the same base customer may be drawn
            more than once, so the number of distinct customers can be lower.
        total_transactions: Total number of transactions to generate across
            all drawn customers; must be non-negative.
        seed: Seed for both the customer sampling and the transaction draws.
            The same seed always yields the same output.
        customers: Optional base DataFrame with ``Customer_ID``, ``Gender``
            and ``Age`` columns. Defaults to the module-level ``sales_df``.
        weights: Optional mapping ``gender -> {category: probability}``. It
            must contain an ``"Other"`` entry, which is used for any gender
            without its own entry. Defaults to the module-level
            ``category_weights``.

    Returns:
        A ``(synthetic_df, synthetic_customers)`` tuple. ``synthetic_df`` has
        one row per transaction with columns ``Transaction_ID``, ``Date``,
        ``Customer_ID``, ``Gender``, ``Age``, ``Product_Category``,
        ``Quantity``, ``Price_per_Unit`` and ``Total_Amount``.
        ``synthetic_customers`` has one row per (Customer_ID, Gender, Age)
        that received at least one transaction, with ``Transaction_Count``
        and ``Total_Spent``. Both frames are empty (but keep their columns)
        when ``total_transactions`` is 0.

    Raises:
        ValueError: If ``num_customers`` is less than 1 or
            ``total_transactions`` is negative.
    """
    if num_customers < 1:
        raise ValueError("num_customers must be at least 1")
    if total_transactions < 0:
        raise ValueError("total_transactions must be non-negative")

    if customers is None:
        customers = sales_df
    if weights is None:
        weights = category_weights

    # A private generator yields the same stream as np.random.seed(seed)
    # followed by the np.random.* functions, without clobbering the global
    # NumPy random state for the rest of the program.
    rng = np.random.RandomState(seed)

    # --- Step 2: Sample customers ---
    unique_customers = customers[["Customer_ID", "Gender", "Age"]].drop_duplicates()
    sampled_customers = unique_customers.sample(
        n=num_customers, replace=True, random_state=seed
    ).reset_index(drop=True)

    # --- Step 3: Assign transactions to customers ---
    # Uniform multinomial: every draw is equally likely to receive each
    # transaction, and the counts always add up to total_transactions.
    trans_counts = rng.multinomial(
        total_transactions, [1 / num_customers] * num_customers
    )

    # --- Step 4: Generate transactions ---
    txn_columns = [
        "Transaction_ID", "Date", "Customer_ID", "Gender", "Age",
        "Product_Category", "Quantity", "Price_per_Unit", "Total_Amount",
    ]
    start_date = pd.to_datetime("2022-01-01")
    transactions = []
    for customer_id, gender, age, n_txns in zip(
        sampled_customers["Customer_ID"],
        sampled_customers["Gender"],
        sampled_customers["Age"],
        trans_counts,
    ):
        # Use gender-based weights, fallback to "Other"
        gender_weights = weights.get(gender, weights["Other"])
        categories = list(gender_weights)
        probs = list(gender_weights.values())

        for _ in range(n_txns):
            # The order of these four draws is part of the reproducibility
            # contract: changing it changes the output for a given seed.
            category = rng.choice(categories, p=probs)
            day_offset = rng.randint(0, 900)   # 0..899 days after start_date
            quantity = rng.randint(1, 6)       # 1..5 units
            unit_price = rng.randint(50, 550)  # 50..549 per unit

            transactions.append({
                "Transaction_ID": f"TXN{len(transactions) + 1:04d}",
                "Date": start_date + pd.to_timedelta(day_offset, unit="D"),
                "Customer_ID": customer_id,
                "Gender": gender,
                "Age": age,
                "Product_Category": category,
                "Quantity": quantity,
                "Price_per_Unit": unit_price,
                "Total_Amount": quantity * unit_price,
            })

    # Passing the columns explicitly keeps the schema even with zero rows.
    synthetic_df = pd.DataFrame(transactions, columns=txn_columns)

    # --- Step 5: Build customer summary ---
    if synthetic_df.empty:
        # Nothing to aggregate; return the summary schema with no rows.
        synthetic_customers = pd.DataFrame(
            columns=["Customer_ID", "Gender", "Age",
                     "Transaction_Count", "Total_Spent"]
        )
    else:
        # Repeated draws of the same customer share a key and collapse into
        # a single summary row here.
        synthetic_customers = (
            synthetic_df.groupby(["Customer_ID", "Gender", "Age"])
            .agg(Transaction_Count=("Transaction_ID", "count"),
                 Total_Spent=("Total_Amount", "sum"))
            .reset_index()
        )

    return synthetic_df, synthetic_customers


# Example usage: 83 customer draws sharing 1000 transactions (default seed)
synthetic_df, synthetic_customers = generate_synthetic_data(
    num_customers=83,
    total_transactions=1000,
)

# --- Save to CSV ---
# Both files are written to the current working directory without the index.
synthetic_df.to_csv("synthetic_dataset.csv", index=False)
synthetic_customers.to_csv("synthetic_customers.csv", index=False)

# Confirmation messages, one per line, only after both writes succeeded.
print(
    "✅ Synthetic dataset saved as 'synthetic_dataset.csv'",
    "✅ Customer summary saved as 'synthetic_customers.csv'",
    sep="\n",
)


✅ Synthetic dataset saved as 'synthetic_dataset.csv'
✅ Customer summary saved as 'synthetic_customers.csv'


: 