In [1]:
import pandas as pd
# Load the cleaned dataset from the shared Data directory; later cells read its
# `product_id` and `skintype_list` columns.
df = pd.read_csv(r"../Data/cleaned_data.csv")


In [2]:
import pandas as pd
import numpy as np
import random

# Builds a synthetic user/product interaction dataset from the product frame `df`
# (loaded in an earlier cell) and writes two CSV files:
#   ../Data/merged_user_interaction_data.csv  - one row per (user, product) interaction
#   ../Data/synthetic_user_profiles.csv       - one row per synthetic user

# 0. Reproducibility: draw every random value from one seeded generator so that
#    Restart & Run All regenerates exactly the same dataset. A private Random
#    instance is used so the global `random` module state is left untouched.
RANDOM_SEED = 42
rng = random.Random(RANDOM_SEED)

# 1. Generate synthetic user profiles
num_users = 1000
skin_types = ["Oily", "Dry", "Combination", "Sensitive", "Normal"]
user_profiles = [
    {
        "user_id": f"user_{uid:03}",
        "age": rng.randint(18, 40),
        "skin_type": rng.choice(skin_types),
    }
    for uid in range(num_users)
]
user_df = pd.DataFrame(user_profiles)

# 2. For each skin type, pick up to `top_n` "best" products and precompute the
#    candidate pools every user of that skin type draws from. The pools depend
#    only on the skin type, so they are built once here instead of once per user.
top_n = 12  # Number of "best" products per skin type
# Unique product ids in frame order (ordered lists keep sampling deterministic;
# iterating a set of strings would not be stable across kernel restarts).
all_product_ids = df["product_id"].drop_duplicates().tolist()
skintype_best_products = {}   # skin type -> list of "best" product ids
other_matching_products = {}  # skin type -> matching product ids that are not "best"
non_matching_products = {}    # skin type -> product ids that do not match the skin type
for st in skin_types:
    # NOTE(review): `df` comes from read_csv, so each `skintype_list` cell is
    # presumably the string form of a list and `in` is a substring test - confirm.
    is_match = df["skintype_list"].apply(lambda lst: st in lst)
    possible = df.loc[is_match, "product_id"].drop_duplicates().tolist()
    if len(possible) < top_n:
        sampled = possible  # fallback: fewer candidates than top_n, keep them all
    else:
        sampled = rng.sample(possible, top_n)
    skintype_best_products[st] = sampled

    best_ids = set(sampled)
    matching_ids = set(possible)
    other_matching_products[st] = [pid for pid in possible if pid not in best_ids]
    non_matching_products[st] = [pid for pid in all_product_ids if pid not in matching_ids]

# 3. Generate interactions: most users will interact/rate their "top" products,
#    and some random "other" products
rows = []
for profile in user_profiles:
    user_id, user_skin = profile["user_id"], profile["skin_type"]
    good_products = skintype_best_products[user_skin]
    other_products = other_matching_products[user_skin]
    unrelated_products = non_matching_products[user_skin]

    # Each user interacts with most of the top-N for their type. The sample size
    # is capped at the pool size: random.sample raises ValueError when k exceeds
    # the population, which happened for skin types with fewer than 7 candidates.
    n_primary = min(len(good_products), max(7, len(good_products) - 2))
    primary_rated = rng.sample(good_products, k=n_primary)
    # ...and up to 5 from the rest of the matching products
    secondary = rng.sample(other_products, k=min(5, len(other_products)))
    # ...and up to 2 "exploration" products that do not match the skin type
    exploration = rng.sample(unrelated_products, k=min(2, len(unrelated_products)))

    interaction_groups = (
        (primary_rated, "liked", 5.0),   # strong positive; high rating ensures overlap
        (secondary, "viewed", 2.5),      # medium score for secondary matches
        (exploration, "viewed", 1.0),    # low score for exploration
    )
    for product_ids, interaction_type, score in interaction_groups:
        rows.extend(
            {
                "user_id": user_id,
                "product_id": pid,
                "interaction_type": interaction_type,
                "score": score,
            }
            for pid in product_ids
        )

# 4. Build DataFrames (explicit columns keep the merge key present even if no
#    interactions were generated)
synthetic_interactions = pd.DataFrame(
    rows, columns=["user_id", "product_id", "interaction_type", "score"]
)

# 5. Merge with user info (optional)
merged_df = synthetic_interactions.merge(user_df, on="user_id", how="left")

print("Synthetic data generated! Example:")
print(merged_df.head())


# --- 6. Save Files ---
merged_df.to_csv("../Data/merged_user_interaction_data.csv", index=False)
user_df.to_csv("../Data/synthetic_user_profiles.csv", index=False)

print("✅ Done: Merged interaction data created and saved.")







Synthetic data generated! Example:
    user_id product_id interaction_type  score  age skin_type
0  user_000   prod_725            liked    5.0   23      Oily
1  user_000    prod_59            liked    5.0   23      Oily
2  user_000    prod_45            liked    5.0   23      Oily
3  user_000   prod_920            liked    5.0   23      Oily
4  user_000    prod_46            liked    5.0   23      Oily
✅ Done: Merged interaction data created and saved.


In [4]:
# Preview the first 20 rows of the merged interaction/user table
merged_df.head(n=20)

Unnamed: 0,user_id,product_id,interaction_type,score,age,skin_type
0,user_000,prod_725,liked,5.0,23,Oily
1,user_000,prod_59,liked,5.0,23,Oily
2,user_000,prod_45,liked,5.0,23,Oily
3,user_000,prod_920,liked,5.0,23,Oily
4,user_000,prod_46,liked,5.0,23,Oily
5,user_000,prod_35,liked,5.0,23,Oily
6,user_000,prod_339,liked,5.0,23,Oily
7,user_000,prod_915,liked,5.0,23,Oily
8,user_000,prod_220,liked,5.0,23,Oily
9,user_000,prod_393,liked,5.0,23,Oily
