
### Notebook Summary

- **Generated synthetic customer data** with realistic attributes and saved as CSV files.
- **Created a diverse product catalog** with multiple categories, brands, and variants, and exported product data.
- **Simulated live sales events** including customer interactions (view, add to cart, purchase) and multi-product orders, saving results as streaming sales data.
- **Loaded and displayed data** using Spark for further analysis and visualization.

In [0]:

%pip install faker
from faker import Faker
import pandas as pd
import random, os, glob
from datetime import datetime

# Indian-locale Faker so generated customer names look local.
fake = Faker("en_IN")

customer_path = "/Volumes/kusha_solutions/products_recommendation_online_ml/streaming_sales_data/customers/"
os.makedirs(customer_path, exist_ok=True)

# Continue the CustomerID sequence from whatever earlier runs already wrote,
# so IDs stay unique across every CSV in the folder.
files = glob.glob(os.path.join(customer_path, "*.csv"))
if files:
    df_old = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
    max_id = df_old["CustomerID"].max()
    # max() is NaN when the existing files contain no rows; the int() cast
    # keeps the new IDs integral (same approach as the product generator).
    start_id = 1 if pd.isna(max_id) else int(max_id) + 1
else:
    start_id = 1

locations = [
    "Bengaluru","Mumbai","Delhi","Hyderabad","Chennai","Pune","Kolkata",
    "Ahmedabad","Jaipur","Noida","Gurgaon","Kochi","Mysuru"
]

payment_methods = ["UPI","Credit Card","Debit Card","Wallet"]
# Written to the FrequencyOfPurchases column, which the sales generator
# passes to user_type() to choose interaction weights.
frequency = ["Low","Medium","High"]
seasons = ["Winter","Summer","Monsoon"]

# Number of customers appended by each run of this cell.
new_customers = 5000

# One dict per synthetic customer; the keys become the CSV header.
rows = [
    {
        "CustomerID": start_id + i,
        "CustomerName": fake.name(),
        # 10-digit number whose leading digit is 6-9.
        "ContactNumber": str(random.randint(6000000000, 9999999999)),
        "Age": random.randint(18, 65),
        "Gender": random.choice(["Male","Female"]),
        "Location": random.choice(locations),
        "SubscriptionStatus": random.choice(["Prime","Regular"]),
        "PaymentMethod": random.choice(payment_methods),
        "PreviousPurchases": random.randint(0, 50),
        "FrequencyOfPurchases": random.choice(frequency),
        "PreferredSeason": random.choice(seasons),
        "AvgReviewRating": round(random.uniform(2.5, 5.0), 1)
    }
    for i in range(new_customers)
]

# Each run lands in its own timestamped file; earlier files are never rewritten.
df = pd.DataFrame(rows)
file_name = f"customers_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
df.to_csv(os.path.join(customer_path, file_name), index=False)

print("‚úÖ Customers generated:", len(df))


In [0]:
# Read every customer CSV in the folder through Spark (first row is the
# header, column types are inferred) and render the result in the notebook.
df = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load("/Volumes/kusha_solutions/products_recommendation_online_ml/streaming_sales_data/customers")
)
display(df)

In [0]:
# ============================================================
# PRODUCT DATA GENERATOR (ONLINE-ML SAFE, FINAL STABLE VERSION)
# ============================================================

from faker import Faker
import pandas as pd
import random, os, glob
from datetime import datetime

fake = Faker()

# ------------------------------------------------------------
# PATH
# ------------------------------------------------------------
product_path = "/Volumes/kusha_solutions/products_recommendation_online_ml/streaming_sales_data/products/"
os.makedirs(product_path, exist_ok=True)

# ------------------------------------------------------------
# LOAD EXISTING PRODUCTS
# ------------------------------------------------------------
# First run: nothing on disk yet, so start numbering at 1 with an empty
# frame. Later runs: stack every earlier file and continue after the
# highest ProductID seen.
files = glob.glob(f"{product_path}*.csv")
if not files:
    existing = pd.DataFrame()
    start_id = 1
else:
    existing = pd.concat(map(pd.read_csv, files), ignore_index=True)
    start_id = int(existing["ProductID"].max()) + 1

# ------------------------------------------------------------
# EXTENDED PRODUCT CATALOG
# ------------------------------------------------------------
# Nested lookup: category -> brand -> list of model names.
# The generator picks a category, then a brand inside it, then one model.
product_catalog = {
    # --- Electronics ---
    "Smartphone": {
        "Apple": ["iPhone 13", "iPhone 14", "iPhone 15 Pro"],
        "Samsung": ["Galaxy S21", "Galaxy S22 Ultra", "Galaxy A54"],
        "OnePlus": ["OnePlus 10R", "OnePlus 11", "Nord CE 3"],
        "Mi": ["Redmi Note 12 Pro", "Mi 11X"],
        "Realme": ["GT Neo 3", "Narzo 60"],
    },
    "Laptop": {
        "Dell": ["Inspiron 15", "XPS 13", "Latitude 7420"],
        "HP": ["Pavilion 14", "Victus Gaming", "Envy x360"],
        "Lenovo": ["ThinkPad E14", "IdeaPad Slim 5", "Legion 5"],
        "Apple": ["MacBook Air M1", "MacBook Pro M2"],
        "Asus": ["Vivobook 15", "ROG Strix G15"],
    },
    "Tablet": {
        "Apple": ["iPad 9th Gen", "iPad Air"],
        "Samsung": ["Galaxy Tab S8", "Galaxy Tab A8"],
        "Lenovo": ["Tab P11", "Yoga Tab"],
    },
    "Smartwatch": {
        "Apple": ["Watch Series 8", "Watch SE"],
        "Samsung": ["Galaxy Watch 5"],
        "Noise": ["ColorFit Pro", "Evolve 3"],
        "Boat": ["Xtend", "Wave Pro"],
    },
    "Earbuds": {
        "Boat": ["Airdopes 141", "Airdopes 441 Pro"],
        "Sony": ["WF-1000XM4"],
        "Samsung": ["Galaxy Buds 2 Pro"],
        "OnePlus": ["Nord Buds 2"],
    },
    # --- Fashion ---
    "Footwear": {
        "Nike": ["Air Max 90", "Revolution 6", "Downshifter 12"],
        "Adidas": ["Ultraboost 22", "Run Falcon 3"],
        "Puma": ["RS-X", "Smash V2"],
        "Reebok": ["Club C 85", "Nano X2"],
    },
    "Clothing": {
        "Zara": ["Slim Fit Shirt", "Casual Jacket", "Formal Blazer"],
        "H&M": ["Cotton T-Shirt", "Denim Jeans", "Hoodie"],
        "Levis": ["511 Slim Jeans", "512 Tapered", "Denim Jacket"],
        "Roadster": ["Checked Shirt", "Graphic Tee", "Sweatshirt"],
    },
    # --- Home ---
    "Home & Kitchen": {
        "Prestige": ["Induction Cooktop", "Pressure Cooker", "Gas Stove"],
        "Philips": ["Air Fryer", "Mixer Grinder", "Coffee Maker"],
        "Bajaj": ["Rex Mixer", "Electric Kettle"],
        "LG": ["Microwave Oven", "Refrigerator"],
    },
    "Furniture": {
        "Ikea": ["Study Table", "Office Chair", "Bookshelf"],
        "Durian": ["Sofa Set", "Recliner"],
        "HomeTown": ["Queen Bed", "Wardrobe"],
    },
    # --- Personal care ---
    "Beauty": {
        "Lakme": ["9to5 Foundation", "Cushion Matte Lipstick"],
        "Loreal": ["Revitalift Cream", "Total Repair Shampoo"],
        "Nivea": ["Body Milk Lotion", "Men Face Wash"],
    },
    # --- Sports and fitness ---
    "Sports": {
        "Adidas": ["Football Shoes", "Training Jersey"],
        "SG": ["Cricket Bat", "Batting Gloves"],
        "Yonex": ["Astrox Racket", "Mavis Shuttlecock"],
        "Decathlon": ["Yoga Mat", "Dumbbell Set"],
    },
    "Fitness Equipment": {
        "Decathlon": ["Treadmill", "Exercise Cycle"],
        "Cockatoo": ["Resistance Bands", "Pull-Up Bar"],
    },
}

# ------------------------------------------------------------
# VARIANT MAPS
# ------------------------------------------------------------
# Colour choices per category. A category that is absent from this map
# gets no colour variants when products are generated.
COLOR_MAP = {
    "Smartphone": ["Black", "White", "Blue", "Green", "Gold"],
    "Laptop": ["Silver", "Gray", "Black"],
    "Tablet": ["Silver", "Gray"],
    "Smartwatch": ["Black", "Silver", "Gold"],
    "Earbuds": ["Black", "White", "Blue"],
    "Footwear": ["Black", "White", "Red", "Blue"],
    "Clothing": ["Black", "White", "Blue", "Green"],
    "Sports": ["Black", "Red", "Blue"],
}

# Size choices; only the categories listed here are sized.
SIZE_MAP = {
    "Footwear": [str(shoe_size) for shoe_size in range(6, 12)],
    "Clothing": ["XS", "S", "M", "L", "XL", "XXL"],
}

# Storage capacities; only smartphones carry a storage variant.
STORAGE_MAP = {
    "Smartphone": [f"{gigabytes}GB" for gigabytes in (64, 128, 256, 512)],
}

# ------------------------------------------------------------
# CONFIG
# ------------------------------------------------------------
PRODUCTS_PER_RUN = 600      # brand-new products created on every run
UPDATE_OLD_PRODUCTS = 150   # upper bound on existing products refreshed per run

rows = []


def _sample_variants(variant_map, category, min_count, max_count):
    """Return a comma-joined random subset of a category's variants.

    Args:
        variant_map: dict mapping category name -> list of variant labels.
        category: category to look up.
        min_count: smallest number of variants to pick.
        max_count: largest number of variants to pick.

    Returns:
        A string such as "Black,Blue", or None when the category has no
        entry (or an empty entry) in ``variant_map``.
    """
    options = variant_map.get(category)
    if not options:
        return None
    # Never ask for more variants than exist, and keep the lower bound
    # valid for short lists so randint cannot be handed an empty range.
    upper = min(max_count, len(options))
    lower = min(min_count, upper)
    return ",".join(random.sample(options, random.randint(lower, upper)))


# ------------------------------------------------------------
# NEW PRODUCTS
# ------------------------------------------------------------
category_names = list(product_catalog)

for i in range(PRODUCTS_PER_RUN):

    # Pick category -> brand -> model from the catalog.
    category = random.choice(category_names)
    brand = random.choice(list(product_catalog[category]))
    product_name = random.choice(product_catalog[category][brand])

    # Selling price sits between 60% and 100% of MRP.
    mrp = round(random.uniform(800, 120000), 2)
    price = round(random.uniform(mrp * 0.6, mrp), 2)

    # Every variant column is driven by its map, so supporting a new
    # category only needs a new map entry (storage no longer hard-codes
    # "Smartphone").
    colors = _sample_variants(COLOR_MAP, category, 1, 3)
    sizes = _sample_variants(SIZE_MAP, category, 2, 4)
    storage = _sample_variants(STORAGE_MAP, category, 1, 2)

    rows.append({
        "ProductID": start_id + i,
        "ProductName": product_name,
        "Category": category,
        "Brand": brand,
        "AvailableColors": colors,
        "AvailableSizes": sizes,
        "AvailableStorage": storage,
        "MRP": mrp,
        "Price": price,
        "DiscountPercent": round((mrp - price) / mrp * 100, 2),
        "Stock": random.randint(20, 500),
        "Rating": round(random.uniform(3.0, 5.0), 1),
        "ReviewsCount": random.randint(10, 10000),
        "LastUpdated": datetime.now()
    })

# ------------------------------------------------------------
# UPDATE EXISTING PRODUCTS (SCD-LIKE)
# ------------------------------------------------------------
if not existing.empty:
    # Every earlier run appended a refreshed copy of each product it
    # updated, so one ProductID can appear several times in `existing`.
    # Keep only the newest version of each product so an update never
    # starts from a stale row and no product is refreshed twice in a run.
    latest = existing
    if "LastUpdated" in latest.columns:
        # Stable sort: rows with equal timestamps keep their load order.
        latest = latest.sort_values(
            "LastUpdated", kind="stable", na_position="first"
        )
    latest = latest.drop_duplicates(subset="ProductID", keep="last")

    updates = latest.sample(min(UPDATE_OLD_PRODUCTS, len(latest)))

    for record in updates.to_dict("records"):
        # Drift the price by up to +/-10% but never above MRP; otherwise
        # DiscountPercent would turn negative.
        drifted_price = round(record["Price"] * random.uniform(0.9, 1.1), 2)
        record["Price"] = min(drifted_price, record["MRP"])
        record["DiscountPercent"] = round(
            (record["MRP"] - record["Price"]) / record["MRP"] * 100, 2
        )
        # Stock may shrink (sales) or grow (restock) but never goes below zero.
        record["Stock"] = max(0, record["Stock"] + random.randint(-50, 200))
        record["LastUpdated"] = datetime.now()

        rows.append(record)

# ------------------------------------------------------------
# WRITE FILE
# ------------------------------------------------------------
# New products and refreshed rows go together into one timestamped CSV.
df = pd.DataFrame(rows)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
file_name = f"products_{timestamp}.csv"
df.to_csv(os.path.join(product_path, file_name), index=False)

print(f"‚úÖ Products written: {len(df)}")
print(f"üìÅ File saved as: {file_name}")


In [0]:
# Read every product CSV in the folder through Spark (first row is the
# header, column types are inferred) and render the result in the notebook.
df_products = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load("/Volumes/kusha_solutions/products_recommendation_online_ml/streaming_sales_data/products")
)
display(df_products)

In [0]:
# ============================================================
# SALES STREAMING DATA GENERATOR (ONLINE ML + FBT SAFE)
# EventTime = current_timestamp()
# ============================================================

%pip install faker
import pandas as pd
import random, os, time, glob
from datetime import datetime
from faker import Faker

fake = Faker("en_IN")

# ------------------------------------------------------------
# PATHS (UNCHANGED)
# ------------------------------------------------------------
customer_path = "/Volumes/kusha_solutions/products_recommendation_online_ml/streaming_sales_data/customers/"
product_path  = "/Volumes/kusha_solutions/products_recommendation_online_ml/streaming_sales_data/products/"
sales_path    = "/Volumes/kusha_solutions/products_recommendation_online_ml/streaming_sales_data/sales/"

os.makedirs(sales_path, exist_ok=True)


def _load_csv_folder(folder, label):
    """Concatenate every CSV file in ``folder`` into a single DataFrame.

    Args:
        folder: directory that holds the CSV files.
        label: short name used in the error message (e.g. "customer").

    Returns:
        One DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: if the folder contains no CSV files. pandas would raise
            the same type with the less helpful "No objects to concatenate".
    """
    # Sorted so the concatenation order is the same on every run.
    paths = sorted(glob.glob(os.path.join(folder, "*.csv")))
    if not paths:
        raise ValueError(
            f"No {label} CSV files found in {folder}; "
            f"run the {label} generator cell first."
        )
    return pd.concat([pd.read_csv(p) for p in paths], ignore_index=True)


# ------------------------------------------------------------
# LOAD MASTER DATA
# ------------------------------------------------------------
customers = _load_csv_folder(customer_path, "customer")
products = _load_csv_folder(product_path, "product")

# The product generator appends a refreshed row whenever it updates an
# existing product, so the folder can hold several versions of the same
# ProductID. Keep only the newest one; otherwise sales events could carry
# an out-of-date price and updated products would be sampled more often.
if "LastUpdated" in products.columns:
    products = products.sort_values(
        "LastUpdated", kind="stable", na_position="first"
    )
products = products.drop_duplicates(
    subset="ProductID", keep="last"
).reset_index(drop=True)

# ------------------------------------------------------------
# USER TYPE LOGIC (UNCHANGED)
# ------------------------------------------------------------
def user_type(freq):
    """Map a customer's purchase-frequency label to an engagement tier.

    The customer generator writes "Low" / "Medium" / "High" into
    FrequencyOfPurchases. The older "Weekly" / "Monthly" labels are still
    accepted so existing data keeps its tier. Matching ignores case and
    surrounding whitespace.

    Args:
        freq: frequency label; any non-string value (None, NaN, ...) is
            treated as unknown.

    Returns:
        "power" for High/Weekly, "active" for Medium/Monthly, otherwise
        "casual".
    """
    label = freq.strip().lower() if isinstance(freq, str) else None
    if label in ("high", "weekly"):
        return "power"
    if label in ("medium", "monthly"):
        return "active"
    return "casual"

# ------------------------------------------------------------
# SEASON FROM CURRENT TIME
# ------------------------------------------------------------
def current_season(month=None):
    """Return the season label for a calendar month.

    Args:
        month: month number 1-12. Defaults to the current month taken from
            ``datetime.now()``, which is what existing no-argument calls get.

    Returns:
        "Winter" (Nov-Jan), "Summer" (Mar-May), "Monsoon" (Jun-Aug) or
        "Festive" for every remaining month (Feb, Sep, Oct).

    Raises:
        ValueError: if ``month`` is outside 1-12.
    """
    if month is None:
        month = datetime.now().month
    if not 1 <= month <= 12:
        raise ValueError(f"month must be between 1 and 12, got {month!r}")

    if month in (11, 12, 1):
        return "Winter"
    if month in (3, 4, 5):
        return "Summer"
    if month in (6, 7, 8):
        return "Monsoon"
    # NOTE(review): February falls through to "Festive" together with
    # Sep/Oct -- confirm that is intended and not a gap in the Winter list.
    return "Festive"

# ------------------------------------------------------------
# STREAM CONFIG
# ------------------------------------------------------------
BATCH_SIZE = 3000   # events written per CSV file
SLEEP_SEC = 5       # pause between consecutive files

# Relative weights for view / add_to_cart / purchase, keyed by user tier.
# Any tier without an entry falls back to CASUAL_WEIGHTS.
INTERACTION_TYPES = ["view", "add_to_cart", "purchase"]
TIER_WEIGHTS = {
    "power": [30, 30, 40],
    "active": [50, 30, 20],
}
CASUAL_WEIGHTS = [75, 20, 5]


def _sales_event(order_id, shopper, item, interaction, quantity, season, event_time):
    """Build one sales-event row; key order defines the CSV column order."""
    return {
        "OrderID": order_id,
        "CustomerID": shopper["CustomerID"],
        "ProductID": item["ProductID"],
        "ProductName": item["ProductName"],
        "Category": item["Category"],
        "Brand": item["Brand"],
        "InteractionType": interaction,
        "Quantity": quantity,
        "PriceAtPurchase": item["Price"],
        "Season": season,
        "EventTime": event_time
    }


# ------------------------------------------------------------
# STREAM LOOP
# ------------------------------------------------------------
# Runs until the cell is interrupted: each pass writes one CSV holding
# exactly BATCH_SIZE events, then sleeps for SLEEP_SEC seconds.
while True:

    # Season and timestamp are taken once, so every event in the batch
    # shares the same "live" EventTime.
    season = current_season()
    event_time = datetime.now()
    rows = []

    while len(rows) < BATCH_SIZE:

        shopper = customers.sample(1).iloc[0]
        tier = user_type(shopper["FrequencyOfPurchases"])

        interaction = random.choices(
            INTERACTION_TYPES,
            TIER_WEIGHTS.get(tier, CASUAL_WEIGHTS)
        )[0]

        if interaction == "purchase":
            # ------------------------------------------------
            # PURCHASE: one order spanning 1-4 products that all share
            # an OrderID (the "frequently bought together" signal)
            # ------------------------------------------------
            order_id = fake.uuid4()[:8]
            basket_size = random.choices(
                [1, 2, 3, 4],
                weights=[45, 30, 15, 10]
            )[0]
            basket = products.sample(basket_size)

            # Cut the basket short if it would overflow the batch.
            remaining = BATCH_SIZE - len(rows)
            for _, item in basket.head(remaining).iterrows():
                rows.append(
                    _sales_event(
                        order_id, shopper, item, "purchase",
                        random.randint(1, 3), season, event_time
                    )
                )
        else:
            # ------------------------------------------------
            # VIEW / ADD TO CART: a single product and no order id
            # ------------------------------------------------
            item = products.sample(1).iloc[0]
            rows.append(
                _sales_event(None, shopper, item, interaction, 1, season, event_time)
            )

    # --------------------------------------------------------
    # WRITE FILE
    # --------------------------------------------------------
    df = pd.DataFrame(rows)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    fname = f"sales_live_{timestamp}.csv"
    df.to_csv(os.path.join(sales_path, fname), index=False)

    print(f"üì¶ Generated {len(df)} LIVE sales events ‚Üí {fname}")
    time.sleep(SLEEP_SEC)


In [0]:
# Read the generated sales CSVs through Spark (first row is the header,
# column types are inferred), keep at most 1000 rows and render them.
df_sales = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load("/Volumes/kusha_solutions/products_recommendation_online_ml/streaming_sales_data/sales")
    .limit(1000)
)
display(df_sales)