In [1]:
import json
import pandas as pd
import numpy as np
import random
from collections import defaultdict
from sklearn.cluster import SpectralBiclustering

In [2]:
def stream_first_n_json_objects(file_path, n):
    """
    Efficiently reads the first `n` JSON objects from a large JSON array.
    """
    records = []
    obj = ''
    depth = 0
    started = False
    count = 0

    with open(file_path, 'r', encoding='utf-8') as f:
        while True:
            char = f.read(1)
            if not char:
                break

            if char == '{':
                depth += 1
                started = True

            if started:
                obj += char

            if char == '}':
                depth -= 1
                if depth == 0 and started:
                    try:
                        records.append(json.loads(obj))
                        count += 1
                        if count >= n:
                            break
                    except json.JSONDecodeError:
                        pass
                    obj = ''
                    started = False

    return records

In [3]:
# Load first 100000 entries efficiently
subset = stream_first_n_json_objects("rl_dataset_train.json", 200000)

In [4]:
train_df = pd.DataFrame(subset)

print(train_df.columns)
print(train_df.head(3))

Index(['user_id', 'state', 'action', 'action_time_segment', 'user_profile',
       'business_checkin', 'business_categories', 'reward', 'next_state'],
      dtype='object')
                  user_id                                              state  \
0  mh_-eMZ6K5RLWhZyISBhwA  [{'business_id': 'WL-0PLW5IzdnyUHGmiOrgQ', 'st...   
1  mh_-eMZ6K5RLWhZyISBhwA  [{'business_id': 'gpYBhnTk4KzvvH83TsZiQg', 'st...   
2  mh_-eMZ6K5RLWhZyISBhwA  [{'business_id': 'VvqYQ98FjO0iYpKHgu91fw', 'st...   

                   action action_time_segment  \
0  rQ1t0zD_TBTqCF06By_UgA           afternoon   
1  XsvxRd2u8iRD_S50ZJ5-QQ           afternoon   
2  sYgyAxvuDP1799oiGXqE_A             evening   

                                        user_profile  \
0  {'review_count': 33, 'average_stars': 4.06, 'f...   
1  {'review_count': 33, 'average_stars': 4.06, 'f...   
2  {'review_count': 33, 'average_stars': 4.06, 'f...   

                                  business_checkin  \
0   {'morning': 0, 'afternoon'

### STEP 2: Build User-Item Matrix for Biclustering

Create a binary matrix where rows are users and columns are businesses. A `1` indicates the user has recently interacted with that business.

In [5]:
# Step 1: Extract user_id and associated business_ids from 'state'
user_history = defaultdict(set)

for row in train_df.itertuples():
    try:
        user_id = row.user_id
        for interaction in row.state:
            business_id = interaction.get("business_id")
            if business_id:
                user_history[user_id].add(business_id)
    except Exception:
        continue

# 1.2 Build binary user-item matrix
all_users = list(user_history.keys())
all_businesses = list({b for bs in user_history.values() for b in bs})

user_item_matrix = pd.DataFrame(0, index=all_users, columns=all_businesses)

for user, businesses in user_history.items():
    user_item_matrix.loc[user, list(businesses)] = 1

print("User-item matrix shape:", user_item_matrix.shape)

User-item matrix shape: (3749, 61386)


In step 1.2:
1. This creates a **binary interaction matrix**: 
- rows = Users
- columns = business

This matrix is used for **biclustering** in the next step:
- It enables the model to group similar users and similar businesses.
- Helps reduce the size of the Q-table by abstracting to **clusters**.

### STEP 3: Apply Biclustering

Transforms the raw user-item interactions into **clustered states**, dramatically simplifying the Q-learning state space.

In [16]:
n_clusters = 50  # tune this
model = SpectralBiclustering(n_clusters=n_clusters, method='log', random_state=42)
model.fit(user_item_matrix)

# Map user and business IDs to their cluster assignments
user_clusters = dict(zip(user_item_matrix.index, model.row_labels_))
business_clusters = dict(zip(user_item_matrix.columns, model.column_labels_))

print("Biclustering complete.")
print(f"User clusters: {len(set(model.row_labels_))}")
print(f"Business clusters: {len(set(model.column_labels_))}")

Biclustering complete.
User clusters: 50
Business clusters: 50


`SpectralBiclustering` finds **co-clusters**:

- Groups of users that behave similarly.
- Groups of businesses that are frequently co-interacted with.

`n_clusters=50`:Get 50 user clusters and 50 business clusters.

`alpha`: Controls how much new information overrides old Q-values.

### STEP 4: Define the Q-Learning Agent

Create a class that learns a value function `Q(state, action)` to recommend items based on learned behavior.

In [17]:
businesses_by_cluster = defaultdict(list)
for b_id, cluster_id in business_clusters.items():
    businesses_by_cluster[cluster_id].append(b_id)

For each business ID `(b_id)` and its corresponding cluster ID `(cluster_id)`:

Appending that business to a list of businesses assigned to that cluster.

In [18]:
import random

class ClusteredQLearningAgent:
    def __init__(self, actions, alpha=0.1, gamma=0.95, epsilon=0.1):
        self.q_table = defaultdict(lambda: defaultdict(float))  # Q[state][action] = value
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.actions = actions  # all business_ids

    def get_state(self, user_id, time_segment):
        user_cluster = user_clusters.get(user_id, -1)
        return (user_cluster, time_segment)

    #def choose_action(self, state):
    #    q_vals = self.q_table[state]
    #    if q_vals:
    #        return max(q_vals, key=q_vals.get)
    #    return random.choice(self.actions)
    
    def choose_action(self, state):
        q_vals = self.q_table[state]
        if q_vals:
            return max(q_vals, key=q_vals.get)

    # fallback: recommend from user’s business cluster
        user_cluster_id = state[0]
        cluster_businesses = businesses_by_cluster.get(user_cluster_id)

        if cluster_businesses:
            return random.choice(cluster_businesses)

        return random.choice(self.actions)
    
    def update(self, state, action, reward, next_state):
        max_q_next = max(self.q_table[next_state].values(), default=0.0)
        current_q = self.q_table[state][action]
        self.q_table[state][action] = current_q + self.alpha * (reward + self.gamma * max_q_next - current_q)

Uses **cluster IDs** to define states.

Implements **Q-learning** with:

- `choose_action`: balances exploration and exploitation.

- `update`: updates Q-values after observing reward.


### STEP 5: Train the Agent on  Dataset

Use each (user, action, reward, next_state) tuple from `train_df` to train the Q-table.


In [19]:
# Get all known business IDs from the matrix
all_business_ids = list(user_item_matrix.columns)

# Instantiate agent
agent = ClusteredQLearningAgent(actions=all_business_ids)

# Train using actual user actions from the dataset
for row in train_df.itertuples():
    try:
        user_id = row.user_id
        business_id = row.action  # actual clicked/engaged business
        reward = row.reward
        time_segment = row.action_time_segment  # e.g. 'morning', 'evening'

        state = agent.get_state(user_id, time_segment)
        next_state = agent.get_state(user_id, time_segment)  # simplified

        agent.update(state, business_id, reward, next_state)

    except Exception as e:
        print("Training error:", e)
        continue

print(f" Training complete. Q-table has {len(agent.q_table)} states.")


 Training complete. Q-table has 150 states.


- `row.action`: This is the actual business ID the user interacted with → this is your action in Q-learning.

- `reward`: Binary reward based on whether the interaction was positive.

- `time_segment`: Time context (e.g., "evening", "morning") — part of your state abstraction.

## Evaluate the Q-learning Agent on `rl_dataset_test.json`


Measure how well the trained agent recommends businesses to users using unseen test data.

In [10]:
# Load first 100000 entries efficiently
test_data = stream_first_n_json_objects("rl_dataset_test.json", 50000 )
test_df = pd.DataFrame(test_data)

 ### Step 2: Evaluate the Agent

In [20]:
# Businesses seen during training
known_businesses = set(train_df["action"])  # All valid actions for evaluation

Only want to recommend businesses that were present in the training set.

Otherwise, the agent might suggest businesses it’s never learned about (which is unfair and unhelpful).

In [21]:
# Before training loop
from collections import Counter

business_counter = Counter(train_df["action"])
most_popular_businesses = [b for b, _ in business_counter.most_common()]


In [22]:
from tqdm import tqdm

# Precompute fallback action space (optional, for speed)
businesses_by_cluster = defaultdict(list)
for b_id, cluster_id in business_clusters.items():
    businesses_by_cluster[cluster_id].append(b_id)


# Evaluate
TOP_K = 50
total_predictions = 0
total_relevant = 0
hits = 0
rewards = []
state_cache = {}

# Evaluate
for entry in tqdm(test_data):
    try:
        user_id = entry["user_id"]

        # Fix if action is a nested dict
        true_business_id = entry["action"] if isinstance(entry["action"], str) else entry["action"]["business_id"]

        # Also fix if time segment is nested
        time_segment = entry.get("action_time_segment", None)
        if not time_segment and isinstance(entry["action"], dict):
            time_segment = entry["action"].get("action_time_segment", "unknown")

        reward = entry["reward"]

        # Cache user-time state
        key = (user_id, time_segment)
        if key not in state_cache:
            state_cache[key] = agent.get_state(user_id, time_segment)
        state = state_cache[key]

        # Main recommendation logic
        #already_seen = user_history.get(user_id, set())
        q_vals = agent.q_table[state]
        q_vals = agent.q_table[state]
        if q_vals:
            top_k_businesses = sorted(q_vals.items(), key=lambda x: -x[1])
            recommended_businesses = [
                b for b, _ in top_k_businesses if b in known_businesses
            ][:TOP_K]

        else:
            # Fallback: recommend from same cluster, filtered to known businesses
            user_cluster = state[0]
            fallback_businesses = businesses_by_cluster.get(user_cluster, agent.actions)
            filtered_fallbacks = [b for b in fallback_businesses if b in known_businesses]

            if filtered_fallbacks:
                recommended_businesses = random.sample(filtered_fallbacks, min(TOP_K, len(filtered_fallbacks)))
            else:
                recommended_businesses = most_popular_businesses[:TOP_K]

        total_predictions += 1
        total_relevant += 1

        if true_business_id in recommended_businesses:
            hits += 1
            rewards.append(reward)
        else:
            rewards.append(0)

    except Exception as e:
        continue  # skip any faulty records

# Compute metrics
precision_at_k = hits / total_predictions if total_predictions > 0 else 0
recall_at_k = hits / total_relevant if total_relevant > 0 else 0
f1_at_k = (
    2 * precision_at_k * recall_at_k / (precision_at_k + recall_at_k)
    if (precision_at_k + recall_at_k) > 0 else 0
)

print(f"\nTop-{TOP_K} Evaluation complete on {total_predictions} test samples")
print(f"Precision@{TOP_K}: {precision_at_k:.3f}")
print(f"Recall@{TOP_K}:    {recall_at_k:.3f}")
print(f"F1 Score@{TOP_K}:  {f1_at_k:.3f}")


100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [03:15<00:00, 255.62it/s]


Top-50 Evaluation complete on 50000 test samples
Precision@50: 0.018
Recall@50:    0.018
F1 Score@50:  0.018



