In [2]:
# ===============================================
# RL Recommender System (Optimized)
# ===============================================
import pandas as pd
import numpy as np
import random
import pickle
from collections import deque
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score

# ==========================
# Load Data
# ==========================
user_item_matrix = pd.read_csv("user_interactions_5000.csv")
predicted_df = pickle.load(open("als_predicted_matrix.pkl", "rb"))

# Create user-item interaction matrix
user_item_matrix_pivot = user_item_matrix.pivot_table(
    index='user_id', columns='product_id', values='interaction_value', fill_value=0
)
users = user_item_matrix_pivot.index.tolist()
products = user_item_matrix_pivot.columns.tolist()

# ==========================
# RL Environment
# ==========================
class RLEnv:
    def __init__(self, user_item_matrix):
        self.user_item_matrix = user_item_matrix.values
        self.n_users, self.n_products = self.user_item_matrix.shape
        self.state = np.zeros(self.n_products)
        self.user_idx = 0

    def reset(self):
        self.user_idx = random.randint(0, self.n_users-1)
        self.state = np.zeros(self.n_products)
        return self.state

    def step(self, action):
        # Reward = 1 if interaction > 0.7, else 0
        reward = 1 if self.user_item_matrix[self.user_idx, action] > 0.7 else 0
        self.state[action] = reward
        done = True
        return self.state, reward, done

# ==========================
# Actor Network
# ==========================
def build_actor(state_size, action_size):
    model = Sequential([
        Input(shape=(state_size,)),
        Dense(128, activation='relu'),
        Dense(128, activation='relu'),
        Dense(action_size, activation='softmax')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy')
    return model

# ==========================
# Experience Replay
# ==========================
class ReplayBuffer:
    def __init__(self, max_len=2000):
        self.buffer = deque(maxlen=max_len)

    def add(self, state, action, reward, next_state):
        self.buffer.append((state, action, reward, next_state))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, min(len(self.buffer), batch_size))
        states, actions, rewards, next_states = zip(*batch)
        return np.array(states), np.array(actions), np.array(rewards), np.array(next_states)

# ==========================
# RL Training
# ==========================
env = RLEnv(user_item_matrix_pivot)
state_size = env.n_products
action_size = env.n_products
actor = build_actor(state_size, action_size)
replay = ReplayBuffer(max_len=5000)

# Pretraining with ALS (imitate ALS predictions)
for user_idx in range(env.n_users):
    state = np.zeros(state_size)
    als_probs = predicted_df.loc[users[user_idx]].values
    als_probs = als_probs / np.sum(als_probs)  # normalize
    actor.fit(state.reshape(1, -1), als_probs.reshape(1, -1), verbose=0, epochs=1)

# RL Training
episodes = 2000
batch_size = 64
gamma = 0.95  # discount factor

for ep in range(1, episodes+1):
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        # Actor chooses action
        action_probs = actor.predict(state.reshape(1,-1), verbose=0)[0]
        action = np.random.choice(np.arange(action_size), p=action_probs)
        next_state, reward, done = env.step(action)
        total_reward += reward

        # Store in replay buffer
        replay.add(state, action, reward, next_state)
        state = next_state

    # Train from replay buffer
    states, actions, rewards, next_states = replay.sample(batch_size)
    if len(states) > 0:
        target = np.zeros((len(states), action_size))
        for i, (s, a, r, ns) in enumerate(zip(states, actions, rewards, next_states)):
            target[i] = actor.predict(s.reshape(1,-1), verbose=0)[0]
            target[i, a] = r + gamma * np.max(actor.predict(ns.reshape(1,-1), verbose=0)[0])
        actor.fit(states, target, verbose=0, epochs=1)

    if ep % 100 == 0:
        print(f"Episode {ep}/{episodes}, Total Reward: {total_reward:.2f}")

# Save model
actor.save("rl_recommender_model.h5")
pickle.dump(users, open("rl_users.pkl", "wb"))
pickle.dump(products, open("rl_products.pkl", "wb"))

# ==========================
# RL Evaluation
# ==========================
test_data = []
for _ in range(200):
    uid = random.choice(users)
    pid = random.choice(products)
    true_interaction = predicted_df.loc[uid, pid]
    true_interaction = np.clip(true_interaction + np.random.normal(0, 0.05), 0, 1)
    test_data.append([uid, pid, true_interaction])

test_df = pd.DataFrame(test_data, columns=['user_id','product_id','interaction_value'])

y_true, y_pred = [], []
for _, row in test_df.iterrows():
    uid, pid = row['user_id'], row['product_id']
    y_true.append(1 if row['interaction_value'] > 0.7 else 0)
    state = np.zeros(state_size)
    action_probs = actor.predict(state.reshape(1,-1), verbose=0)[0]
    action_idx = products.index(pid)
    y_pred.append(1 if action_probs[action_idx] > 0.5 else 0)

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
hit_rate = np.mean([1 if t==p==1 else 0 for t,p in zip(y_true, y_pred)])

print(f"\nðŸ“Š RL Model Evaluation:")
print(f"RMSE: {rmse:.4f}")
print(f"Accuracy: {acc*100:.2f}% | Precision: {prec*100:.2f}% | Recall: {rec*100:.2f}% | F1: {f1*100:.2f}% | Hit Rate: {hit_rate*100:.2f}%")

# ==========================
# Top-N Recommendations
# ==========================
def recommend_items_rl(user_id, top_n=5):
    if user_id not in users:
        return []
    state = np.zeros(state_size)
    action_probs = actor.predict(state.reshape(1,-1), verbose=0)[0]
    recommendations_idx = np.argsort(action_probs)[::-1][:top_n]
    return [products[i] for i in recommendations_idx]

print("\nðŸš€ Top 5 recommendations for 5 random users:")
for _ in range(5):
    sample_user = random.choice(users)
    print(f"{sample_user}: {recommend_items_rl(sample_user, top_n=5)}")


Episode 100/2000, Total Reward: 0.00
Episode 200/2000, Total Reward: 0.00
Episode 300/2000, Total Reward: 0.00
Episode 400/2000, Total Reward: 0.00
Episode 500/2000, Total Reward: 0.00
Episode 600/2000, Total Reward: 1.00
Episode 700/2000, Total Reward: 0.00
Episode 800/2000, Total Reward: 1.00
Episode 900/2000, Total Reward: 0.00
Episode 1000/2000, Total Reward: 0.00
Episode 1100/2000, Total Reward: 0.00
Episode 1200/2000, Total Reward: 0.00
Episode 1300/2000, Total Reward: 0.00
Episode 1400/2000, Total Reward: 0.00
Episode 1500/2000, Total Reward: 0.00
Episode 1600/2000, Total Reward: 0.00
Episode 1700/2000, Total Reward: 0.00
Episode 1800/2000, Total Reward: 0.00
Episode 1900/2000, Total Reward: 0.00




Episode 2000/2000, Total Reward: 1.00

ðŸ“Š RL Model Evaluation:
RMSE: 0.5477
Accuracy: 70.00% | Precision: 25.00% | Recall: 1.72% | F1: 3.23% | Hit Rate: 0.50%

ðŸš€ Top 5 recommendations for 5 random users:
U004: ['P132', 'P150', 'P148', 'P147', 'P146']
U047: ['P132', 'P150', 'P148', 'P147', 'P146']
U047: ['P132', 'P150', 'P148', 'P147', 'P146']
U079: ['P132', 'P150', 'P148', 'P147', 'P146']
U055: ['P132', 'P150', 'P148', 'P147', 'P146']
