In [1]:
import pandas as pd
import uuid
import random
from datetime import datetime
from tqdm import tqdm

In [2]:
OUTPUT_FILE_CLEAN = '../dataset/master_data_cleaned.csv'

In [3]:
df = pd.read_csv(OUTPUT_FILE_CLEAN)
df.columns = [c.strip().lower().replace(" ", "_").replace("url", "_url") for c in df.columns]


In [4]:
brand_groups = {
    "luggage": [
        "samsonite", "tumi", "american_tourister", "delsey", "rimowa", "travelpro",
        "it luggage", "rockland", "victorinox", "briggs & riley", "antler", "away", "calpak",
        "hartmann", "kensington", "eastpak", "kipling", "anello", "targus", "osprey", "deuter"
    ],
    "apparel": [
        "woven", "uniqlo", "zara", "h&m", "bershka", "mango", "superdry", "jack & jones", "everlane",
        "cotton on", "gap", "dockers", "levi's", "lee", "wrangler", "muji", "patagonia", "north face",
        "columbia", "timberland", "reebok", "nike", "puma", "adidas", "under armour", "fila"
    ],
    "electronics": [
        "xiaomi", "baseus", "anker", "ugreen", "ravpower", "spigen", "orico",
        "lenovo", "dell", "hp", "asus", "acer", "msi", "logitech", "razer",
        "steelseries", "samsung", "sony", "lg", "philips", "jbl", "bose"
    ],
    "accessories": [
        "herschel", "jansport", "bellroy", "topo designs", "porter", "chrome", "incase",
        "nomatic", "peak design", "crumpler", "hobo", "matador", "gregory", "poler", "aer"
    ],
    "home": [
        "ikea", "muji", "oxo", "rubbermaid", "dyson", "philips", "morphy richards",
        "panasonic", "electrolux", "toshiba", "sharp", "black+decker", "bosch", "smeg"
    ]
}

In [5]:
prefixes = ["Nord", "Tech", "Vero", "Luma", "Aero", "Wov", "Trek", "Neo", "Zen", "Omni", "Lug", "Sol", "Vent"]
suffixes = ["ora", "max", "vox", "line", "tone", "tek", "lux", "form", "nova", "core", "sync", "blend", "craft"]

def generate_fake_brand():
    return random.choice(prefixes) + random.choice(suffixes)

def select_brand(category_text: str) -> str:
    cat = category_text.lower()
    if "luggage" in cat or "travel" in cat or "bag" in cat:
        pool = brand_groups["luggage"]
    elif any(word in cat for word in ["shirt", "clothing", "apparel", "pants", "jacket", "flannel", "fashion"]):
        pool = brand_groups["apparel"]
    elif any(word in cat for word in ["electronic", "gadget", "tech", "laptop", "headphone", "charger", "device"]):
        pool = brand_groups["electronics"]
    elif any(word in cat for word in ["accessory", "wallet", "watch", "backpack", "belt", "case"]):
        pool = brand_groups["accessories"]
    elif any(word in cat for word in ["home", "kitchen", "household", "living", "cook"]):
        pool = brand_groups["home"]
    else:
        return generate_fake_brand()
    return random.choice(pool)

In [6]:
def extract_category(title: str) -> str:
    """Ambil 2 kata pertama dari title sebagai kategori sementara."""
    words = title.split()
    return " ".join(words[:2]).lower() if words else "unknown"

def generate_description(title: str) -> str:
    """Buat deskripsi sederhana dari title."""
    return f"{title.capitalize()} — produk berkualitas tinggi dengan desain modern dan fungsional."

def generate_tags(title: str) -> list:
    """Ambil hingga 5 kata unik dari title sebagai tag."""
    tags = list(dict.fromkeys(title.lower().split()))[:5]
    return tags

In [7]:
augmented_rows = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    title_text = str(row.get("title", row.get("name", ""))).strip()
    category_text = extract_category(title_text)
    augmented_rows.append({
        "product_id": str(uuid.uuid4()),
        "name": title_text.capitalize(),
        "description": generate_description(title_text),
        "stock": random.randint(20, 100),
        "brand": select_brand(category_text),
        "category_ml": category_text,
        "creation_date": datetime(2024, 10, 10, 9, 0, 0).isoformat() + "Z",
        "last_updated": datetime(2025, 10, 12, 9, 0, 0).isoformat() + "Z",
        "search_tags": generate_tags(title_text)
    })

aug_df = pd.DataFrame(augmented_rows)

100%|██████████| 3561722/3561722 [16:22<00:00, 3624.02it/s] 


In [8]:
merged_df = pd.concat([df, aug_df], axis=1)

if "title" in merged_df.columns:
    merged_df.drop(columns=["title"], inplace=True)

cols_order = [
    "asin", "product_id", "name", "img_url", "product_url",
    "stars", "reviews", "price", "list_price",
    "is_best_seller", "bought_in_last_month",
    "description", "stock", "brand",
    "category_ml", "creation_date", "last_updated", "search_tags"
]

for c in cols_order:
    if c not in merged_df.columns:
        merged_df[c] = None

merged_df = merged_df[cols_order]


In [9]:
OUTPUT_JSON = "merged_augmented_products_snakecase.json"
merged_df.to_json(OUTPUT_JSON, orient="records", indent=2, force_ascii=False)

In [11]:
OUTPUT_CSV = "merged_augmented_products_snakecase.csv"
merged_df.to_csv(OUTPUT_CSV, index=False)

In [12]:
merged_df.head()

Unnamed: 0,asin,product_id,name,img_url,product_url,stars,reviews,price,list_price,is_best_seller,bought_in_last_month,description,stock,brand,category_ml,creation_date,last_updated,search_tags
0,B014TMV5YE,35afa875-c006-4d20-a36a-0287b1178f3a,"Sion softside expandable roller luggage, black...",https://m.media-amazon.com/images/I/815dLQKYIY...,https://www.amazon.com/dp/B014TMV5YE,4.5,0,139.99,,,,"Sion softside expandable roller luggage, black...",73,Nordcore,sion softside,2024-10-10T09:00:00Z,2025-10-12T09:00:00Z,"[sion, softside, expandable, roller, luggage,]"
1,B07GDLCQXV,e2a4dc14-6c44-47aa-8b99-c8336f572239,Luggage sets expandable pc+abs durable suitcas...,https://m.media-amazon.com/images/I/81bQlm7vf6...,https://www.amazon.com/dp/B07GDLCQXV,4.5,0,169.99,,,,Luggage sets expandable pc+abs durable suitcas...,38,osprey,luggage sets,2024-10-10T09:00:00Z,2025-10-12T09:00:00Z,"[luggage, sets, expandable, pc+abs, durable]"
2,B07XSCCZYG,061ee432-c172-4097-b368-1b8c8396d95e,Platinum elite softside expandable checked lug...,https://m.media-amazon.com/images/I/71EA35zvJB...,https://www.amazon.com/dp/B07XSCCZYG,4.6,0,365.49,,,,Platinum elite softside expandable checked lug...,23,Ventsync,platinum elite,2024-10-10T09:00:00Z,2025-10-12T09:00:00Z,"[platinum, elite, softside, expandable, checked]"
3,B08MVFKGJM,df37b0fc-b410-4a6d-ba2a-98ae5014e573,Freeform hardside expandable with double spinn...,https://m.media-amazon.com/images/I/91k6NYLQyI...,https://www.amazon.com/dp/B08MVFKGJM,4.6,0,291.59,,,,Freeform hardside expandable with double spinn...,37,Solmax,freeform hardside,2024-10-10T09:00:00Z,2025-10-12T09:00:00Z,"[freeform, hardside, expandable, with, double]"
4,B01DJLKZBA,0e736f57-ebc1-4d4d-9054-d2d8718a0e81,Winfield 2 hardside expandable luggage with sp...,https://m.media-amazon.com/images/I/61NJoaZcP9...,https://www.amazon.com/dp/B01DJLKZBA,4.5,0,174.99,,,,Winfield 2 hardside expandable luggage with sp...,68,Aerotone,winfield 2,2024-10-10T09:00:00Z,2025-10-12T09:00:00Z,"[winfield, 2, hardside, expandable, luggage]"
