**Install Required Libraries**

In [1]:
# Install required libraries
import sys
import subprocess
import os

# Disable GPU to suppress CUDA errors
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# List of libraries to install
libraries = [
    "transformers==4.46.0",  # DistilBERT
    "torch",                # PyTorch
    "pandas",               # Data manipulation
    "scipy",                # A/B testing
    "azureml-core"          # Azure ML workspace
]

# Install libraries
for lib in libraries:
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", lib])
        print(f"Successfully installed {lib}")
    except subprocess.CalledProcessError as e:
        print(f"Error installing {lib}: {e}")
        raise

# Verify installations
try:
    import transformers
    import torch
    import pandas
    import scipy
    import azureml.core
    print("Transformers version:", transformers.__version__)
    print("PyTorch version:", torch.__version__)
    print("Pandas version:", pandas.__version__)
    print("Scipy version:", scipy.__version__)
    print("AzureML Core version:", azureml.core.__version__)
    print("Python executable:", sys.executable)
except ImportError as e:
    print(f"Import error: {e}")
    raise

Successfully installed transformers==4.46.0
Successfully installed torch
Successfully installed pandas
Successfully installed scipy
Transformers version: 4.46.0
PyTorch version: 2.6.0+cu124
Pandas version: 2.2.3
Scipy version: 1.15.2
AzureML Core version: 1.60.0
Python executable: /anaconda/envs/azureml_py38_PT_TF/bin/python


  from .autonotebook import tqdm as notebook_tqdm


**Load Dataset from Datastore**

In [2]:
# Load dataset from datastore
from azureml.core import Workspace, Datastore
import pandas as pd

# Set random seed for reproducibility
import numpy as np
np.random.seed(42)

# Connect to workspace
try:
    ws = Workspace.from_config()
    print("Connected to workspace:", ws.name)
except Exception as e:
    print(f"Error connecting to workspace: {e}")
    raise

# Access default datastore
datastore = ws.get_default_datastore()
datastore_path = "UI/2025-05-31_184443_UTC/events.csv"

# Download dataset
try:
    datastore.download(target_path=".", prefix=datastore_path, overwrite=True)
    print("Downloaded events.csv")
except Exception as e:
    print(f"Error downloading dataset: {e}")
    raise

# Load into DataFrame
file_path = "UI/2025-05-31_184443_UTC/events.csv"
try:
    retailrocket_data = pd.read_csv(file_path)
    print("Dataset loaded successfully")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Verify dataset
print("Dataset shape:", retailrocket_data.shape)
print("Columns:", retailrocket_data.columns.tolist())
print("First 5 rows:\n", retailrocket_data.head())

Connected to workspace: pbhavani-mlworkspace
Downloading UI/2025-05-31_184443_UTC/events.csv
Downloaded UI/2025-05-31_184443_UTC/events.csv, 1 files out of an estimated total of 1
Downloaded events.csv
Dataset loaded successfully
Dataset shape: (2756101, 5)
Columns: ['timestamp', 'visitorid', 'event', 'itemid', 'transactionid']
First 5 rows:
        timestamp  visitorid event  itemid  transactionid
0  1433221332117     257597  view  355908            NaN
1  1433224214164     992329  view  248676            NaN
2  1433221999827     111016  view  318965            NaN
3  1433221955914     483717  view  253185            NaN
4  1433221337106     951259  view  367447            NaN


**Preprocess the Dataset**

In [3]:
# Preprocess dataset
import numpy as np
np.random.seed(42)  # Ensure reproducibility

# Filter view events and select columns
filtered_data = retailrocket_data[retailrocket_data["event"] == "view"][["visitorid", "itemid", "timestamp"]]

# Remove missing values
filtered_data = filtered_data.dropna()

# Create subset of 50,000 records
subset_data = filtered_data.head(50000).copy()

# Verify preprocessing
print("Unique event types in original data:", retailrocket_data["event"].unique())
print("Subset shape:", subset_data.shape)
print("Subset columns:", subset_data.columns.tolist())
print("First 5 rows of subset:\n", subset_data.head())

Unique event types in original data: ['view' 'addtocart' 'transaction']
Subset shape: (50000, 3)
Subset columns: ['visitorid', 'itemid', 'timestamp']
First 5 rows of subset:
    visitorid  itemid      timestamp
0     257597  355908  1433221332117
1     992329  248676  1433224214164
2     111016  318965  1433221999827
3     483717  253185  1433221955914
4     951259  367447  1433221337106


**Create Synthetic Ad Text**

In [4]:
# Create synthetic ad text
import numpy as np
np.random.seed(42)  # Ensure reproducibility

# Add ad_text column
subset_data["ad_text"] = subset_data["itemid"].apply(lambda x: f"Discover item {x} now!")

# Select 5,000 records for training
train_data = subset_data[["ad_text"]].head(5000)

# Save to CSV
train_data.to_csv("train_ad_text.csv", index=False)

# Verify training data
print("Training data shape:", train_data.shape)
print("Training columns:", train_data.columns.tolist())
print("First 5 rows of training data:\n", train_data.head())
print("\nSaved as train_ad_text.csv")

Training data shape: (5000, 1)
Training columns: ['ad_text']
First 5 rows of training data:
                      ad_text
0  Discover item 355908 now!
1  Discover item 248676 now!
2  Discover item 318965 now!
3  Discover item 253185 now!
4  Discover item 367447 now!

Saved as train_ad_text.csv


**Run DistilBERT Test Forward Pass**

In [5]:
# Run DistilBERT test forward pass
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Set seeds for reproducibility
import numpy as np
np.random.seed(42)
torch.manual_seed(42)

# Load training data
try:
    train_data = pd.read_csv("train_ad_text.csv").head(100)
    print("Loaded train_ad_text.csv")
except Exception as e:
    print(f"Error loading train_ad_text.csv: {e}")
    raise

# Initialize tokenizer and model
try:
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    print("Initialized DistilBERT tokenizer and model")
except Exception as e:
    print(f"Error initializing DistilBERT: {e}")
    raise

# Sample ad texts
texts = train_data["ad_text"].tolist()

# Tokenize texts
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=32)

# Dummy labels (binary: 1 for positive)
labels = torch.ones(len(texts), dtype=torch.long)

# Use CPU explicitly
device = torch.device("cpu")
print("Using device:", device)
model.to(device)
inputs = {k: v.to(device) for k, v in inputs.items()}
labels = labels.to(device)

# Forward pass
try:
    outputs = model(**inputs, labels=labels)
    loss = outputs.loss
    print(f"Initial loss: {loss.item()}")
except Exception as e:
    print(f"Error during forward pass: {e}")
    raise

# Clear memory
del model, inputs, labels

2025-05-31 22:22:39.951192: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748730159.968554   11214 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748730159.973670   11214 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748730159.986988   11214 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748730159.987005   11214 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748730159.987008   11214 computation_placer.cc:177] computation placer alr

Initialized DistilBERT tokenizer and model
Using device: cpu
Initial loss: 0.7103846669197083


In [6]:
# Placeholder for CTR evaluation using simulated A/B testing
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

# Set seed for reproducibility
np.random.seed(42)

# Load training data
try:
    train_data = pd.read_csv("train_ad_text.csv").head(100)
    print("Loaded train_ad_text.csv for A/B testing")
except Exception as e:
    print(f"Error loading train_ad_text.csv: {e}")
    raise

# Simulate baseline CTR (2–5%)
baseline_ctr = np.random.uniform(0.02, 0.05, size=len(train_data))

# Simulate DistilBERT CTR (3–7%)
distilbert_ctr = np.random.uniform(0.03, 0.07, size=len(train_data))

# Create DataFrame
ab_test_df = pd.DataFrame({
    "ad_text": train_data["ad_text"],
    "baseline_ctr": baseline_ctr,
    "distilbert_ctr": distilbert_ctr
})

# Calculate mean CTRs
mean_baseline_ctr = baseline_ctr.mean()
mean_distilbert_ctr = distilbert_ctr.mean()

# Perform t-test
t_stat, p_value = ttest_ind(baseline_ctr, distilbert_ctr)

# Save results
ab_test_df.to_csv("ab_test_results.csv", index=False)

# Display results
print("A/B Test Results Summary:")
print(f"Mean Baseline CTR: {mean_baseline_ctr:.4f}")
print(f"Mean DistilBERT CTR: {mean_distilbert_ctr:.4f}")
print(f"T-test p-value: {p_value:.4f}")
print("First 5 rows of A/B test results:\n", ab_test_df.head())
print("\nSaved as ab_test_results.csv")

Loaded train_ad_text.csv for A/B testing
A/B Test Results Summary:
Mean Baseline CTR: 0.0341
Mean DistilBERT CTR: 0.0499
T-test p-value: 0.0000
First 5 rows of A/B test results:
                      ad_text  baseline_ctr  distilbert_ctr
0  Discover item 355908 now!      0.031236        0.031257
1  Discover item 248676 now!      0.048521        0.055456
2  Discover item 318965 now!      0.041960        0.042574
3  Discover item 253185 now!      0.037960        0.050343
4  Discover item 367447 now!      0.024681        0.066303

Saved as ab_test_results.csv
