In [1]:
# Install and verify libraries
import sys
import subprocess
import os
import importlib.metadata
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable GPU
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"  # Suppress TensorFlow warnings

# Install required libraries
libraries = [
    "transformers==4.46.0",
    "torch",
    "pandas",
    "polars==1.7.1",
    "scipy",
    "azureml-core",
    "mltable==1.6.1",
    "marshmallow==3.21.3",  # Compatible with azure-ai-ml
    "azure-ai-ml==1.22.0",
    "ipywidgets>=7.0.0"
]

for lib in libraries:
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-cache-dir", lib])
        print(f"Successfully installed {lib}")
    except subprocess.CalledProcessError as e:
        print(f"Error installing {lib}: {e}")  # Fixed f-string
        raise

# Verify installations
try:
    import transformers
    import torch
    import pandas
    import polars
    import scipy
    import azureml.core
    import mltable
    import marshmallow
    from azure.ai.ml import MLClient
    import ipywidgets
    print("Transformers:", transformers.__version__)
    print("Polars:", polars.__version__)
    print("PyTorch:", torch.__version__)
    print("Azure ML SDK:", azureml.core.__version__)
    print("Marshmallow:", importlib.metadata.version("marshmallow"))
    print("IPyWidgets:", ipywidgets.__version__)
    print("Python executable:", sys.executable)
except ImportError as e:
    print(f"Import error: {e}")
    raise

Collecting transformers==4.46.0
  Downloading transformers-4.46.0-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers==4.46.0)
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers==4.46.0)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting safetensors>=0.4.1 (from transformers==4.46.0)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m157.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.32.4-py3-none-any.whl (512 kB)
Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (781 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m781.7/781.7 kB[0m [31m267.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading safetensors

In [3]:
from azureml.core import Workspace, Datastore
import pandas as pd
import os

def download_and_load_csv(datastore, datastore_path, local_path=None):
    if local_path is None:
        local_path = datastore_path
    # Download the file if it doesn't exist locally
    if not os.path.exists(local_path):
        print(f"Downloading {datastore_path} to {local_path} ...")
        datastore.download(target_path=".", prefix=datastore_path, overwrite=True)
    else:
        print(f"{local_path} already exists locally.")
    # Load into DataFrame
    df = pd.read_csv(local_path)
    print(f"Loaded {local_path}: shape {df.shape}")
    return df

# Connect to workspace
ws = Workspace.from_config()
datastore = ws.get_default_datastore()

# Example usage for events.csv
events_df = download_and_load_csv(datastore, "UI/2025-05-31_184443_UTC/events.csv")

# Example usage for train.csv (Amazon product data)
amazon_df = download_and_load_csv(datastore, "UI/2025-06-05_174812_UTC/train.csv")


UI/2025-05-31_184443_UTC/events.csv already exists locally.
Loaded UI/2025-05-31_184443_UTC/events.csv: shape (2756101, 5)
UI/2025-06-05_174812_UTC/train.csv already exists locally.
Loaded UI/2025-06-05_174812_UTC/train.csv: shape (2249698, 6)


In [4]:
# List of (datastore_path, local_filename)
datasets = [
    ("UI/2025-05-31_184443_UTC/events.csv", "events_df"),
    ("UI/2025-06-05_174812_UTC/train.csv", "amazon_df"),
    ("UI/2025-06-05_185930_UTC/item_properties_part1.csv", "item_properties1_df"),
    ("UI/2025-06-05_190018_UTC/item_properties_part2.csv", "item_properties2_df"),
    ("UI/2025-06-05_190107_UTC/category_tree.csv", "category_tree_df")
]

# Dictionary to store each DataFrame
dataframes = {}

for datastore_path, df_name in datasets:
    df = download_and_load_csv(datastore, datastore_path)
    dataframes[df_name] = df

# Unpack for easy access
events_df = dataframes["events_df"]
amazon_df = dataframes["amazon_df"]
item_properties1_df = dataframes["item_properties1_df"]
item_properties2_df = dataframes["item_properties2_df"]
category_tree_df = dataframes["category_tree_df"]

# Quick check of shapes
print("events_df:", events_df.shape)
print("amazon_df:", amazon_df.shape)
print("item_properties1_df:", item_properties1_df.shape)
print("item_properties2_df:", item_properties2_df.shape)
print("category_tree_df:", category_tree_df.shape)


UI/2025-05-31_184443_UTC/events.csv already exists locally.
Loaded UI/2025-05-31_184443_UTC/events.csv: shape (2756101, 5)
UI/2025-06-05_174812_UTC/train.csv already exists locally.
Loaded UI/2025-06-05_174812_UTC/train.csv: shape (2249698, 6)
Downloading UI/2025-06-05_185930_UTC/item_properties_part1.csv to UI/2025-06-05_185930_UTC/item_properties_part1.csv ...
Downloading UI/2025-06-05_185930_UTC/item_properties_part1.csv
Downloaded UI/2025-06-05_185930_UTC/item_properties_part1.csv, 1 files out of an estimated total of 1
Loaded UI/2025-06-05_185930_UTC/item_properties_part1.csv: shape (10999999, 4)
Downloading UI/2025-06-05_190018_UTC/item_properties_part2.csv to UI/2025-06-05_190018_UTC/item_properties_part2.csv ...
Downloading UI/2025-06-05_190018_UTC/item_properties_part2.csv
Downloaded UI/2025-06-05_190018_UTC/item_properties_part2.csv, 1 files out of an estimated total of 1
Loaded UI/2025-06-05_190018_UTC/item_properties_part2.csv: shape (9275903, 4)
Downloading UI/2025-06-05_1

In [5]:
# Filter for "view" events and save a sample of 50,000
filtered_events_df = events_df[events_df['event'] == 'view'].head(50000)
filtered_events_df.to_csv("filtered_events.csv", index=False)
print("filtered_events.csv:", filtered_events_df.shape)
print(filtered_events_df.head())


filtered_events.csv: (50000, 5)
       timestamp  visitorid event  itemid  transactionid
0  1433221332117     257597  view  355908            NaN
1  1433224214164     992329  view  248676            NaN
2  1433221999827     111016  view  318965            NaN
3  1433221955914     483717  view  253185            NaN
4  1433221337106     951259  view  367447            NaN


In [6]:
print("amazon_df columns:", amazon_df.columns.tolist())
print(amazon_df.head())


amazon_df columns: ['PRODUCT_ID', 'TITLE', 'BULLET_POINTS', 'DESCRIPTION', 'PRODUCT_TYPE_ID', 'PRODUCT_LENGTH']
   PRODUCT_ID                                              TITLE  \
0     1925202  ArtzFolio Tulip Flowers Blackout Curtain for D...   
1     2673191  Marks & Spencer Girls' Pyjama Sets T86_2561C_N...   
2     2765088  PRIKNIK Horn Red Electric Air Horn Compressor ...   
3     1594019  ALISHAH Women's Cotton Ankle Length Leggings C...   
4      283658  The United Empire Loyalists: A Chronicle of th...   

                                       BULLET_POINTS  \
0  [LUXURIOUS & APPEALING: Beautiful custom-made ...   
1  [Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...   
2  [Loud Dual Tone Trumpet Horn, Compatible With ...   
3  [Made By 95%cotton and 5% Lycra which gives yo...   
4                                                NaN   

                                         DESCRIPTION  PRODUCT_TYPE_ID  \
0                                                NaN             1650

In [7]:
# Filter for products with both title and description
filtered_amazon_df = amazon_df[
    amazon_df['TITLE'].notnull() & amazon_df['DESCRIPTION'].notnull()
].head(1000)

filtered_amazon_df.to_csv("filtered_amazon_products.csv", index=False)
print("filtered_amazon_products.csv:", filtered_amazon_df.shape)
print(filtered_amazon_df[['PRODUCT_ID', 'TITLE', 'DESCRIPTION']].head())
print("Unique PRODUCT_TYPE_IDs:", filtered_amazon_df['PRODUCT_TYPE_ID'].unique())


filtered_amazon_products.csv: (1000, 6)
   PRODUCT_ID                                              TITLE  \
2     2765088  PRIKNIK Horn Red Electric Air Horn Compressor ...   
3     1594019  ALISHAH Women's Cotton Ankle Length Leggings C...   
5     2152929  HINS Metal Bucket Shape Plant Pot for Indoor &...   
7     2026580  Delavala Self Adhesive Kitchen Backsplash Wall...   
9     2998633  Hexwell Essential oil for Home Fragrance Oil A...   

                                         DESCRIPTION  
2  Specifications: Color: Red, Material: Aluminiu...  
3  AISHAH Women's Lycra Cotton Ankel Leggings. Br...  
5  HINS Brings you the most Elegant Looking Pot w...  
7  <p><strong>Aluminum Foil Stickers-good kitchen...  
9  Transform your home, workplace or hotel room i...  
Unique PRODUCT_TYPE_IDs: [ 7537  2996  5725  6030  8201 10359  1273   716 12556  1582  3357  2879
   976  5565  2136  1626  2986  2992  5091  2201  3079  1419  2788 11192
  1557  2640   646  1725  1252 11395  2213 12680  

In [8]:
# Get the top 1,000 most viewed itemids from filtered_events_df
top_items = (
    filtered_events_df['itemid']
    .value_counts()
    .head(1000)
    .index
    .tolist()
)

# Map each top RetailRocket itemid to an Amazon PRODUCT_ID
amazon_product_ids = filtered_amazon_df['PRODUCT_ID'].tolist()
synthetic_mapping = dict(zip(top_items, amazon_product_ids))

print(f"Synthetic mapping created for {len(synthetic_mapping)} items.")
# Example: print first 5 mappings
for i, (retail_id, amazon_id) in enumerate(synthetic_mapping.items()):
    print(f"RetailRocket itemid {retail_id} -> Amazon PRODUCT_ID {amazon_id}")
    if i >= 4:
        break


Synthetic mapping created for 1000 items.
RetailRocket itemid 370653 -> Amazon PRODUCT_ID 2765088
RetailRocket itemid 298009 -> Amazon PRODUCT_ID 1594019
RetailRocket itemid 335975 -> Amazon PRODUCT_ID 2152929
RetailRocket itemid 315543 -> Amazon PRODUCT_ID 2026580
RetailRocket itemid 355994 -> Amazon PRODUCT_ID 2998633


In [9]:
# Build a DataFrame for ad generation
import pandas as pd

ad_input_df = pd.DataFrame({
    'itemid': list(synthetic_mapping.keys()),
    'PRODUCT_ID': list(synthetic_mapping.values())
})

# Merge to get TITLE and DESCRIPTION for each mapped item
ad_input_df = ad_input_df.merge(
    filtered_amazon_df[['PRODUCT_ID', 'TITLE', 'DESCRIPTION']],
    on='PRODUCT_ID',
    how='left'
)

print(ad_input_df.head())


   itemid  PRODUCT_ID                                              TITLE  \
0  370653     2765088  PRIKNIK Horn Red Electric Air Horn Compressor ...   
1  298009     1594019  ALISHAH Women's Cotton Ankle Length Leggings C...   
2  335975     2152929  HINS Metal Bucket Shape Plant Pot for Indoor &...   
3  315543     2026580  Delavala Self Adhesive Kitchen Backsplash Wall...   
4  355994     2998633  Hexwell Essential oil for Home Fragrance Oil A...   

                                         DESCRIPTION  
0  Specifications: Color: Red, Material: Aluminiu...  
1  AISHAH Women's Lycra Cotton Ankel Leggings. Br...  
2  HINS Brings you the most Elegant Looking Pot w...  
3  <p><strong>Aluminum Foil Stickers-good kitchen...  
4  Transform your home, workplace or hotel room i...  


In [11]:
# If you haven't already in your environment:
!pip install torch transformers




In [12]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, set_seed

# Set seed for reproducibility
set_seed(42)

# Load pre-trained GPT-2 tokenizer and model (PyTorch backend)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()  # Set model to evaluation mode

if torch.cuda.is_available():
    model = model.to('cuda')

def generate_ad(title, description, max_length=50):
    prompt = f"Product: {title}\nDescription: {description}\nAd: "
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    if torch.cuda.is_available():
        inputs = inputs.to('cuda')
    outputs = model.generate(
        inputs,
        max_length=len(inputs[0]) + max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
        pad_token_id=tokenizer.eos_token_id
    )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract only the generated ad text after the prompt
    ad_text = generated[len(prompt):].strip()
    return ad_text

# Example: Generate ads for first 5 items
sample = ad_input_df.head(5).copy()
sample['ad_text'] = sample.apply(
    lambda row: generate_ad(row['TITLE'], row['DESCRIPTION']), axis=1
)
print(sample[['itemid', 'TITLE', 'ad_text']])


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


   itemid                                              TITLE  \
0  370653  PRIKNIK Horn Red Electric Air Horn Compressor ...   
1  298009  ALISHAH Women's Cotton Ankle Length Leggings C...   
2  335975  HINS Metal Bucket Shape Plant Pot for Indoor &...   
3  315543  Delavala Self Adhesive Kitchen Backsplash Wall...   
4  355994  Hexwell Essential oil for Home Fragrance Oil A...   

                                             ad_text  
0  ___________\n(Note: This product is a complete...  
1  © 2014 B&H. All Rights Reserved.\nCopyright © ...  
2  www.hins.co.uk/ad/products/colours/Plant-Stand...  
3  ______________\nBrand: delavallive\n\nProduct ...  
4  Charmo, Naturals, Pomegranate, Chamomile, Viol...  
