## Conversational Shopping Assistant with Reinforcement Learning

#### Objective

- Recommends products based on user queries.
- Learns from user feedback over multiple conversation rounds.
- Dynamically adapts recommendations to enhance personalization.

#### Prerequisites
- Pre-trained Language Models: For natural language understanding (NLU), e.g., Hugging Face’s DistilBERT.
- RL Environment: Using gym to simulate the recommendation process.

#### Define User Scenarios and Product data

In [101]:
# %pip install transformers
# %pip install gym
# %pip install stable_baselines3
# %pip install shimmy>=2.0
# %pip install datasets
# %pip install tiktoken

from openai import OpenAI
import yaml

# Read the YAML file
with open('./../../../Curify/curify_api.yaml', 'r') as yaml_file:
    data = yaml.safe_load(yaml_file)

# Access the API keys and other configuration data
openai_api_key = data.get('openai').get('api_key')
client = OpenAI(api_key=openai_api_key)
model="gpt-4o-mini"

#### Step 1: Load Amazon Product Metadata


In [102]:
import pandas as pd
import json
from datasets import load_dataset

import pandas as pd
import random
from openai import OpenAI
import json
from datasets import load_dataset

def get_llm_product_representation(product_title, product_details, user_prefs):
    """
    Use LLM to extract structured product features based on user preferences
    """
    prompt = f"""Extract these attributes from the product below, focusing on aspects relevant to a user who prefers {user_prefs}:
    - product_type: over-ear/in-ear/earbuds
    - price: numeric value only
    - features: list of 3-5 key characteristics
    - brand: manufacturer name
    
    Product: {product_title}
    Details: {product_details}
    
    Return JSON only, with keys: product_title, product_type, features, brand"""
    
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )
    
    try:
        return json.loads(response.choices[0].message.content)
    except:
        return None

def process_products(sampled_df, user_prefs, sample_size=20):
    """
    Process headphone products and generate structured representations
    """
    # Random sample
    
    results = []
    for _, row in sampled_df.iterrows():
        # Get LLM representation
        product_info = get_llm_product_representation(
            product_title=row['title'],
            product_details=str(row['details']),
            user_prefs=user_prefs
        )
        
        if product_info:
            product_info['price'] = row['price']
            results.append(product_info)
    
    return results


def process_data_sample(dataset_str, meta_key, user_prefs, sampling_percent=1.0, sample_size=20):
    """
    Load and process Amazon reviews and metadata, sample a percentage of reviews, and return interactions and product metadata.

    Args:
        dataset_str (str): Hugging Face dataset path (e.g., "McAuley-Lab/Amazon-Reviews-2023")
        review_key (str): Subset key for reviews (e.g., "raw_review_Books")
        meta_key (str): Subset key for metadata (e.g., "raw_meta_Books")
        sampling_percent (float): Percentage of data to load (e.g., 1.0 for 1%)

    Returns:
        interaction_df (pd.DataFrame): Processed review interactions with user/item/rating info
        product_df (pd.DataFrame): Product metadata including item_id, title, category
    """

    # Load full metadata
    metadata = load_dataset(dataset_str, meta_key, split="full", trust_remote_code=True)
    metadata_df = pd.DataFrame(metadata)
    metadata_df = metadata_df[['parent_asin', 'title', 'main_category', 'categories', 'details', 'price']]
    # Process metadata and keep relevant fields
    
    metadata_df.rename(columns={
        'parent_asin': 'item_id'
    }, inplace=True)

    headphone_df = metadata_df[metadata_df['categories'].astype(str).str.contains('Headphone', case=False, na=False)]

    headphone_df['price'] = pd.to_numeric(headphone_df['price'], errors='coerce').fillna(1000)
    headphone_df = headphone_df[headphone_df['price'] < 1000]

    sampled_df = headphone_df.sample(min(sample_size, len(headphone_df)))

    # Generate product representations
    product_reps = process_products(sampled_df, user_prefs)

    # Save results
    print(f"Generated {len(product_reps)} product representations")

    return product_reps


#### Step 2: Process User Conversation into Interaction History


In [103]:
from transformers import pipeline

def analyze_history(history):
    history_str = json.dumps(history)
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": """Analyze conversation history and respond with a JSON object:
            - preferred_brand: string
            - avoided_features: list
            - budget_range: [min,max]
            - implicit_type_pref: string"""},
            {"role": "user", "content": history_str}
        ]
    )
    return json.loads(response.choices[0].message.content.strip())


#### Step 3: Build the Conversational Layer

**Intent Classification**

Use a zero-shot classifier (e.g., Hugging Face's BART) to identify user intent.

In [104]:
from openai import OpenAI
import json

def process_current_query(query):
    response = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": """Extract EXACT preferences from this query as JSON with:
            - type: over-ear/in-ear/earbuds
            - price_max: number
            - features: list
            - use_case: string
            - urgency: high/medium/low"""},
            {"role": "user", "content": query}
        ]
    )
    return json.loads(response.choices[0].message.content.strip())


#### Step 4: Merge History-based with Query-based Preference

In [105]:
def merge_preferences(current, historical):
    return {
        # Current query takes priority
        "product_type": current.get("type") or historical["implicit_type_pref"],
        "price_max": current.get("price_max") or historical["budget_range"][1],
        # Combine features
        "features": list(set(current["features"] + ["long-battery"]))
                    if historical["preferred_brand"] == "Sony" 
                    else current["features"],
        # Use historical brand unless contradicted
        "brand": historical["preferred_brand"],
        }


#### Step 5: Reward Function and RL Framework

** RL Environment Setup**

Define the RL environment using gym. This environment simulates the recommendation process, where:

- State Space: User preferences, conversation history, previous recommendations.
- Action Space: Products available for recommendation.
- Reward Signal: User feedback (e.g., clicks, purchases).

**Reward Function**
- Positive Reward: User clicks or purchases a recommended product.
- Negative Reward: User ignores or dislikes a recommendation.
- Neutral Reward: User views but takes no action.

In [106]:
import gym
from gym import spaces
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3 import PPO

import gym
from gym import spaces
import numpy as np

class RecommendationEnv(gym.Env):
    def __init__(self, products, user_preferences, reward_type='feature_count'):
        super().__init__()
        self.products = products
        self.user_preferences = user_preferences
        self.reward_type = reward_type
        self.action_space = spaces.Discrete(len(self.products))
        self.observation_space = spaces.Box(low=0, high=1, shape=(len(self.products),), dtype=float)
        self.reset()

    def reset(self):
        self.state = np.zeros(len(self.products), dtype=np.float32)  # <-- proper shape and dtype
        self.current_step = 0
        return self.state

    def step(self, action):
        product = self.products[action]
        reward = self._calculate_reward(product)
        self.state[action] += 1
        self.current_step += 1
        done = self.current_step >= 5
        return self.state, reward, done, {}

    def _calculate_reward(self, product):
        """Main reward router"""
        base_score = self._calculate_base_score(product)
        
        if self.reward_type == 'feature_count':
            return base_score
        else:
            return base_score + self._apply_piecewise_rules(product)

    def safe_price(self, p):
        try:
            return float(p)
        except (TypeError, ValueError):
            return 1000

    def _calculate_base_score(self, product):
        """Shared scoring logic for both reward types"""
        # Feature matching
        user_features = set(self.user_preferences.get('features', []))
        product_features = set(product.get('features', []))
        feature_score = len(user_features & product_features) / max(1, len(user_features))
        
        # Type and brand matching
        type_match = product['product_type'] == self.user_preferences.get('product_type', '')
        brand_match = product['brand'].lower() == self.user_preferences.get('brand', '').lower()
        
        price_budget = self.user_preferences.get('price_max', float('inf'))

        if self.safe_price(product.get('price', 1000)) > price_budget:
            return 0.0

        return min(1.0, 0.5 * type_match + 0.3 * type_match+ 0.2 * brand_match)

    def _apply_piecewise_rules(self, product):
        """Gradual price scaling version"""
        price_budget = self.user_preferences.get('price_max', float('inf'))
        price = product.get('price', 1000)

        price_ratio = min(1.0, price / price_budget) if price_budget  > 0 else 0
        
        # Apply price scaling
        if price <= price_budget:
            price_modifier = 0.5 * (1 + price_ratio)  # 0.5-1.0 scaling
        else:
            overshoot = (price - price_budget) / max(1, price_budget)
            price_modifier = max(0, 0.5 - 0.2 * overshoot)  # Penalize overspending
            
        return price_modifier

#### Step 6: Train RL model and recommend products with trained RL.

In [112]:
def train_and_evaluate(products, user_prefs, reward_type='feature_count', 
                      num_episodes=50, episode_steps=200, test_steps=5):
    """
    Train and evaluate PPO model with specified reward function
    
    Args:
        products: List of product dictionaries
        user_prefs: User preference dictionary
        reward_type: 'feature_count' or 'piecewise_linear'
        num_episodes: Number of training episodes
        episode_steps: Steps per episode
        test_steps: Number of test steps to run
        
    Returns:
        Tuple: (rewards_over_time, test_results)
    """
    # Initialize environment with specified reward function
    env = RecommendationEnv(products, user_prefs, reward_type)
    model = PPO("MlpPolicy", env, verbose=0)
    
    # -----------------------------
    # Training Phase
    # -----------------------------
    rewards_over_time = []
    
    for episode in range(num_episodes):
        state = env.reset()
        total_reward = 0
        
        for _ in range(episode_steps):
            action, _ = model.predict(state)
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
                
        rewards_over_time.append(total_reward)
        model.learn(total_timesteps=episode_steps, reset_num_timesteps=False)
    
    # -----------------------------
    # Plotting
    # -----------------------------
    plt.figure(figsize=(8, 6))
    plt.plot(rewards_over_time, marker='o')
    plt.title(f"Reward Over Training ({reward_type} rewards)")
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.grid(True)
    plt.tight_layout()
    
    plot_path = f"reward_plot_{reward_type}.png"
    plt.savefig(plot_path)
    plt.close()
    print(f"✅ Reward plot saved to: {plot_path}")
    
    # -----------------------------
    # Testing
    # -----------------------------
    test_results = []
    state = env.reset()
    
    test_output_path = f"test_output_{reward_type}.txt"
    with open(test_output_path, "w") as f:
        f.write(f"🧪 Test Results ({reward_type} rewards):\n")
        
        for i in range(test_steps):
            action, _ = model.predict(state)
            state, reward, done, _ = env.step(action)
            product = products[action]
            
            output_line = (f"Step {i+1}: Recommended: {product['product_title']} | "
                          f"Type: {product['product_type']} | "
                          f"Price: ${product['price']} | "
                          f"Reward: {reward:.2f}\n")
            
            f.write(output_line)
            test_results.append(output_line.strip())
    
    print(f"✅ Test output saved to: {test_output_path}")
    return rewards_over_time, test_results

In [108]:

# Step 2: Process Conversation into Interaction History
interaction_history = [
    {
        "user": "Which Sony headphones have the best battery life?",
        "bot": "Sony WH-1000XM5 (30hrs). Would you like details?",
        "clicked_product": "Sony WH-1000XM5",
        "purchased": False
    },
    {
        "user": "Show me over-ear headphones under $100",
        "bot": "Recommended: Bose QuietComfort 45 ($79)",
        "clicked_product": "Bose QuietComfort 45",
        "purchased": True
    },
    {
        "user": "I hate how these earbuds hurt my ears",
        "bot": "Shall I suggest over-ear options?",
        "clicked_product": None,
        "purchased": False
    }
]

# Usage
historical_prefs = analyze_history(interaction_history)

# Output user preferences
print(f"User Preferences from interaction history: {historical_prefs}")

# Step 3: Conversational Layer
user_input = "I need comfortable headphones for travel with good noise cancellation. My budget is around $100."

current_prefs = process_current_query(user_input)
final_prefs = merge_preferences(current_prefs, historical_prefs)
final_prefs

from datasets import load_dataset
from stable_baselines3 import PPO

# 1. Load data
products = process_data_sample(dataset_str="McAuley-Lab/Amazon-Reviews-2023", meta_key="raw_meta_Electronics", 
user_prefs=final_prefs, sampling_percent=1, sample_size=20)


User Preferences from interaction history: {'preferred_brand': 'Sony', 'avoided_features': ['earbuds'], 'budget_range': [0, 100], 'implicit_type_pref': 'over-ear'}
Generated 20 product representations


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  headphone_df['price'] = pd.to_numeric(headphone_df['price'], errors='coerce').fillna(1000)


In [116]:
historical_prefs

{'preferred_brand': 'Sony',
 'avoided_features': ['earbuds'],
 'budget_range': [0, 100],
 'implicit_type_pref': 'over-ear'}

In [111]:
products

[{'product_title': 'FIGMASU Headphones Wireless Bluetooth Neckband Wireless Headsets for Sport',
  'product_type': 'on-ear',
  'features': ['built-in microphone', '100 H playtime', 'sweatproof'],
  'brand': 'FIGMASU',
  'price': 29.98},
 {'product_title': 'Earpads Compatible with LS31 LS41 LS35X LS50X Headset with Microphone Foam I Replacement Ear Cushion (Cooling Gel Fabric)',
  'product_type': 'over-ear',
  'features': ['comfortable', 'replacement ear cushion', 'Cooling Gel Fabric'],
  'brand': 'TRANSTEK',
  'price': 29.99},
 {'product_title': '6 Pairs Galaxy Buds 2 Pro/Galaxy Buds 2 Memory Foam Ear Tips Buds',
  'product_type': 'in-ear',
  'features': ['reduce noise',
   'comfortable fit',
   'compatible with Galaxy Buds 2 / Galaxy Buds 2 Pro'],
  'brand': 'IiEXCEL',
  'price': 9.99},
 {'product_title': 'OneOdio A11 Wireless Headphones Over Ear',
  'product_type': 'over-ear',
  'features': ['comfortable', 'long-battery', 'good noise cancellation'],
  'brand': 'OneOdio',
  'price': 1

In [113]:

# Run with feature_count rewards
fc_rewards, fc_results = train_and_evaluate(
    products, final_prefs, reward_type='feature_count'
)

# Run with piecewise_linear rewards
pw_rewards, pw_results = train_and_evaluate(
    products, final_prefs, reward_type='piecewise_linear'
)

✅ Reward plot saved to: reward_plot_feature_count.png
✅ Test output saved to: test_output_feature_count.txt
✅ Reward plot saved to: reward_plot_piecewise_linear.png
✅ Test output saved to: test_output_piecewise_linear.txt


In [76]:
# metadata = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Electronics", split="full", trust_remote_code=True)
# metadata_df = pd.DataFrame(metadata)
# metadata_df = metadata_df[['parent_asin', 'title', 'main_category', 'categories', 'details', 'price']]
# # Process metadata and keep relevant fields

# metadata_df.rename(columns={
#     'parent_asin': 'item_id'
# }, inplace=True)

# headphone_df = metadata_df[metadata_df['categories'].astype(str).str.contains('Headphone', case=False, na=False)]


Unnamed: 0,item_id,title,main_category,categories,details,price
8,B01MCZP7RF,MOSISO Plastic Hard Shell Case & Keyboard Cove...,Computers,"[Electronics, Headphones, Earbuds & Accessorie...","{""Brand"": ""MOSISO"", ""Item Weight"": ""8 ounces"",...",
34,B06XJHRP5G,MXditect EB166 Headphones with Mic Stereo Earp...,All Electronics,"[Electronics, Headphones, Earbuds & Accessorie...","{""Product Dimensions"": ""3.35 x 3.35 x 0.98 inc...",
41,B07Z9PZRCY,HOSONGIN 1/4 Inch Female to 3.5mm TRS Male Ada...,All Electronics,"[Electronics, Headphones, Earbuds & Accessorie...","{""Package Dimensions"": ""4.41 x 3.39 x 0.35 inc...",
51,B07C4YHF4L,"AvimaBasics Ear Buds, Spare Kit Earloops Buds ...",All Electronics,"[Electronics, Headphones, Earbuds & Accessorie...","{""Package Dimensions"": ""5.24 x 3.23 x 0.67 inc...",9.99
53,B07MVKYVVL,10-Pack Replacement Clamp Bluetooth Ear Hook L...,Home Audio & Theater,"[Electronics, Headphones, Earbuds & Accessorie...","{""Package Dimensions"": ""5.75 x 3.86 x 0.31 inc...",3.49


In [109]:
products

[{'product_title': 'FIGMASU Headphones Wireless Bluetooth Neckband Wireless Headsets for Sport',
  'product_type': 'on-ear',
  'features': ['built-in microphone', '100 H playtime', 'sweatproof'],
  'brand': 'FIGMASU',
  'price': 29.98},
 {'product_title': 'Earpads Compatible with LS31 LS41 LS35X LS50X Headset with Microphone Foam I Replacement Ear Cushion (Cooling Gel Fabric)',
  'product_type': 'over-ear',
  'features': ['comfortable', 'replacement ear cushion', 'Cooling Gel Fabric'],
  'brand': 'TRANSTEK',
  'price': 29.99},
 {'product_title': '6 Pairs Galaxy Buds 2 Pro/Galaxy Buds 2 Memory Foam Ear Tips Buds',
  'product_type': 'in-ear',
  'features': ['reduce noise',
   'comfortable fit',
   'compatible with Galaxy Buds 2 / Galaxy Buds 2 Pro'],
  'brand': 'IiEXCEL',
  'price': 9.99},
 {'product_title': 'OneOdio A11 Wireless Headphones Over Ear',
  'product_type': 'over-ear',
  'features': ['comfortable', 'long-battery', 'good noise cancellation'],
  'brand': 'OneOdio',
  'price': 1