In [11]:
import pandas as pd
import numpy as np
from pathlib import Path

# ---------------------------------------------------------------
# 1) Reference data: the retail sales CSV that seeds the generator
# ---------------------------------------------------------------
# The path is relative, so it resolves against the current working directory.
csv_path = Path("E-Commerce_Dataset", "retail_sales_dataset.csv")
sales_df = pd.read_csv(csv_path)

# ---------------------------------------------------------------
# 2) Per-gender purchase probabilities for each product category
# ---------------------------------------------------------------
# Each inner mapping is used as the `p=` vector of a random category draw,
# so the weights of every gender must add up to 1.
category_weights = dict(
    Male=dict(
        Electronics=0.4,
        Clothing=0.25,
        Groceries=0.15,
        Beauty=0.1,
        Sports=0.1,
    ),
    Female=dict(
        Electronics=0.2,
        Clothing=0.3,
        Groceries=0.15,
        Beauty=0.25,
        Sports=0.1,
    ),
    Other=dict(
        Electronics=0.25,
        Clothing=0.25,
        Groceries=0.25,
        Beauty=0.15,
        Sports=0.1,
    ),
)

# -------------------------------
# 3) Synthetic Data Generator
# -------------------------------
def generate_synthetic_data(num_customers=50, total_transactions=1000, alpha_val=0.3, seed=42):
    """Generate a synthetic transaction dataset and a per-customer RFM table.

    Customers are taken from the module-level ``sales_df`` (columns
    ``Customer_ID``, ``Gender``, ``Age``) and padded with random ``CUSTnnnn``
    customers until ``num_customers`` is reached. ``total_transactions`` rows
    are then split across the customers with a symmetric Dirichlet draw, and
    each row gets a random quantity, price, category (weighted by the
    module-level ``category_weights``) and date.

    Side effects: seeds NumPy's global RNG, sets the global pandas option
    ``display.float_format``, prints an inspection report, and writes
    ``synthetic_dataset.csv`` and ``synthetic_customers.csv`` to the current
    working directory.

    Args:
        num_customers: Number of distinct customers to generate (>= 1).
        total_transactions: Exact number of transaction rows to generate.
            Must be >= ``num_customers`` because every customer receives at
            least one transaction.
        alpha_val: Concentration of the symmetric Dirichlet distribution
            (> 0). Smaller values give a more uneven split across customers.
        seed: Seed passed to ``np.random.seed`` for reproducibility.

    Returns:
        Tuple ``(df, rfm)``: the transaction DataFrame (lower-case column
        names, including the ``_computed_total`` check column) and the RFM
        DataFrame with columns ``customer_id``, ``Recency``, ``Frequency``
        and ``Monetary``.

    Raises:
        ValueError: If the arguments are out of range, or if more synthetic
            customer IDs are required than the ``CUST1000``-``CUST9998`` pool
            can provide.
    """
    if num_customers < 1:
        raise ValueError("num_customers must be at least 1")
    if total_transactions < num_customers:
        raise ValueError(
            "total_transactions must be >= num_customers so that every "
            "customer can receive at least one transaction"
        )
    if alpha_val <= 0:
        raise ValueError("alpha_val must be positive")

    # `display` only exists when running under IPython/Jupyter; fall back to
    # print so the function also works in a plain Python process.
    try:
        show = display
    except NameError:
        show = print

    np.random.seed(seed)

    customers = _build_customers(num_customers)
    trans_counts = _allocate_transaction_counts(num_customers, total_transactions, alpha_val)
    df = pd.DataFrame(_build_transactions(customers, trans_counts))

    # -------------------------------
    # Disable scientific notation
    # -------------------------------
    # NOTE: this is a process-wide pandas setting and stays in effect after
    # the function returns.
    pd.set_option('display.float_format', '{:,.0f}'.format)

    # -------------------------------
    # Data inspection
    # -------------------------------
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

    print("\nFirst 5 rows:")
    show(df.head())

    print("\nData types and non-null counts:")
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print().
    df.info()

    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    print("\nDate parse failures (na after to_datetime):", df['date'].isna().sum())
    print("Date range:", df['date'].min(), "to", df['date'].max())

    numeric_cols = ['quantity', 'price_per_unit', 'total_amount', 'age']
    print("\nNumeric column summary:")
    show(df[numeric_cols].describe())

    print("\nMissing values by column:")
    print(df.isnull().sum().sort_values(ascending=False))

    print("\nFull-row duplicates:", df.duplicated().sum())
    print("Duplicate transaction_id count:", df['transaction_id'].duplicated().sum())

    print("\nUnique customers:", df['customer_id'].nunique())
    print("Total transactions:", len(df))
    print("Transactions per customer (top 5):")
    show(df.groupby('customer_id').size().sort_values(ascending=False).head())

    # NOTE(review): this check column stays on `df`, so it is also written to
    # the CSV and returned to the caller; kept to preserve the existing output.
    df['_computed_total'] = df['quantity'] * df['price_per_unit']
    mismatch_mask = (df['total_amount'] - df['_computed_total']).abs() > 1e-6
    print("\nRows where total_amount != quantity * price_per_unit (count):", mismatch_mask.sum())

    # Recency is measured from the day after the latest transaction, so the
    # most recent customer has a recency of 1 rather than 0.
    snapshot_date = df['date'].max() + pd.Timedelta(days=1)
    print("\nSnapshot Date:", snapshot_date)

    # -------------------------------
    # RFM computation
    # -------------------------------
    rfm = df.groupby('customer_id').agg({
        'date': lambda x: (snapshot_date - x.max()).days,
        'transaction_id': 'count',
        'total_amount': 'sum'
    }).reset_index()
    rfm.rename(columns={'date': 'Recency', 'transaction_id': 'Frequency', 'total_amount': 'Monetary'}, inplace=True)

    print("\nRFM Table (first 5 rows):")
    show(rfm.head())

    print("\nRFM Summary:")
    show(rfm.describe())

    # -------------------------------
    # Save to CSV
    # -------------------------------
    df.to_csv("synthetic_dataset.csv", index=False)
    rfm.to_csv("synthetic_customers.csv", index=False)
    print("\n✅ Synthetic dataset saved as 'synthetic_dataset.csv'")
    print("✅ Customer summary saved as 'synthetic_customers.csv'")

    return df, rfm


def _build_customers(num_customers):
    """Return a DataFrame of exactly ``num_customers`` customers (ID, gender, age)."""
    known = sales_df[["Customer_ID", "Gender", "Age"]].drop_duplicates()
    ids = known['Customer_ID'].tolist()
    genders = known['Gender'].tolist()
    ages = known['Age'].tolist()

    # A set gives O(1) uniqueness checks while the lists keep insertion order.
    taken = set(ids)

    missing = num_customers - len(ids)
    if missing > 0:
        # np.random.randint(1000, 9999) yields 1000..9998; make sure enough
        # unused IDs exist, otherwise the rejection loop would never finish.
        free_ids = sum(f"CUST{n}" not in taken for n in range(1000, 9999))
        if missing > free_ids:
            raise ValueError(
                f"cannot create {missing} synthetic customers: only "
                f"{free_ids} unused CUSTnnnn IDs are available"
            )

    while len(ids) < num_customers:
        new_id = f"CUST{np.random.randint(1000, 9999)}"
        if new_id in taken:
            continue
        taken.add(new_id)
        ids.append(new_id)
        genders.append(np.random.choice(["Male", "Female", "Other"]))
        ages.append(np.random.randint(18, 65))

    # Keep only the requested number of customers when the reference data
    # already holds more than that.
    return pd.DataFrame({
        "Customer_ID": ids[:num_customers],
        "Gender": genders[:num_customers],
        "Age": ages[:num_customers],
    })


def _allocate_transaction_counts(num_customers, total_transactions, alpha_val):
    """Split ``total_transactions`` into per-customer counts, each at least 1."""
    proportions = np.random.dirichlet(np.ones(num_customers) * alpha_val)
    counts = np.maximum((proportions * total_transactions).astype(int), 1)

    diff = int(total_transactions - counts.sum())
    if diff > 0:
        # Truncation lost some transactions: hand them out at random.
        for i in np.random.choice(num_customers, diff, replace=True):
            counts[i] += 1
    elif diff < 0:
        # The "at least 1" floor overshot the total. Remove the surplus only
        # from customers that hold more than one transaction, so nobody drops
        # to zero. The caller guarantees total_transactions >= num_customers,
        # hence an overshooting allocation always has such a customer.
        surplus = -diff
        while surplus > 0:
            eligible = np.flatnonzero(counts > 1)
            take = min(surplus, eligible.size)
            picks = np.random.choice(eligible, size=take, replace=False)
            counts[picks] -= 1
            surplus -= take

    return counts


def _build_transactions(customers, trans_counts):
    """Return one dict per transaction, ``trans_counts[k]`` rows for the k-th customer."""
    base_date = pd.to_datetime("2022-01-01")
    rows = []
    txn_id = 1
    customer_rows = customers.itertuples(index=False, name=None)
    for (customer_id, gender, age), n_txn in zip(customer_rows, trans_counts):
        # Genders missing from the weight table fall back to the "Other" mix.
        weights = category_weights.get(gender, category_weights["Other"])
        categories, probs = list(weights.keys()), list(weights.values())

        for _ in range(int(n_txn)):
            quantity = np.random.randint(1, 6)
            price = np.random.randint(10, 1000)
            category = np.random.choice(categories, p=probs)
            day_offset = np.random.randint(0, 900)
            rows.append({
                "Transaction_ID": f"TXN{txn_id:05d}",
                "Date": base_date + pd.to_timedelta(day_offset, unit="D"),
                "Customer_ID": customer_id,
                "Gender": gender,
                "Age": age,
                "Product_Category": category,
                "Quantity": quantity,
                "Price_per_Unit": price,
                "Total_Amount": quantity * price
            })
            txn_id += 1

    return rows

# ---------------------------------------------------------------
# 4) Interactive parameters (read from the notebook prompt)
# ---------------------------------------------------------------
num_customers = int(input("Number of customers: "))
total_transactions = int(input("Total transactions: "))
alpha_val = float(input("Dirichlet alpha (0.1-0.5 recommended for diversity): "))

# ---------------------------------------------------------------
# 5) Build the synthetic dataset and its RFM summary
# ---------------------------------------------------------------
# Arguments are passed in the generator's parameter order; the seed keeps
# its default value.
df, rfm = generate_synthetic_data(num_customers, total_transactions, alpha_val)


Number of customers:  100
Total transactions:  10000
Dirichlet alpha (0.1-0.5 recommended for diversity):  0.3



First 5 rows:


Unnamed: 0,transaction_id,date,customer_id,gender,age,product_category,quantity,price_per_unit,total_amount
0,TXN00001,2023-03-29,CUST001,Male,34,Clothing,4,888,3552
1,TXN00002,2023-05-31,CUST001,Male,34,Clothing,4,838,3352
2,TXN00003,2022-01-17,CUST001,Male,34,Clothing,3,201,603
3,TXN00004,2022-02-15,CUST001,Male,34,Clothing,4,229,916
4,TXN00005,2024-01-15,CUST001,Male,34,Electronics,5,527,2635



Data types and non-null counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   transaction_id    10000 non-null  object        
 1   date              10000 non-null  datetime64[ns]
 2   customer_id       10000 non-null  object        
 3   gender            10000 non-null  object        
 4   age               10000 non-null  int64         
 5   product_category  10000 non-null  object        
 6   quantity          10000 non-null  int64         
 7   price_per_unit    10000 non-null  int64         
 8   total_amount      10000 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 703.3+ KB
None

Date parse failures (na after to_datetime): 0
Date range: 2022-01-01 00:00:00 to 2024-06-18 00:00:00

Numeric column summary:


Unnamed: 0,quantity,price_per_unit,total_amount,age
count,10000,10000,10000,10000
mean,3,506,1523,41
std,1,287,1193,11
min,1,10,10,18
25%,2,258,561,34
50%,3,505,1202,42
75%,4,757,2265,49
max,5,999,4995,64



Missing values by column:
transaction_id      0
date                0
customer_id         0
gender              0
age                 0
product_category    0
quantity            0
price_per_unit      0
total_amount        0
dtype: int64

Full-row duplicates: 0
Duplicate transaction_id count: 0

Unique customers: 100
Total transactions: 10000
Transactions per customer (top 5):


customer_id
CUST023    943
CUST015    863
CUST064    579
CUST024    551
CUST082    530
dtype: int64


Rows where total_amount != quantity * price_per_unit (count): 0

Snapshot Date: 2024-06-19 00:00:00

RFM Table (first 5 rows):


Unnamed: 0,customer_id,Recency,Frequency,Monetary
0,CUST001,109,12,23498
1,CUST002,4,113,162206
2,CUST003,149,1,675
3,CUST004,238,1,2676
4,CUST005,1,58,77527



RFM Summary:


Unnamed: 0,Recency,Frequency,Monetary
count,100,100,100
mean,124,100,152314
std,197,178,270052
min,1,1,96
25%,4,2,2746
50%,20,12,20042
75%,149,110,158550
max,843,943,1390798



✅ Synthetic dataset saved as 'synthetic_dataset.csv'
✅ Customer summary saved as 'synthetic_customers.csv'
