### Stock Recommendation System with Reinforcement Learning - RL Models

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import norm
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [36]:
# Load data
# Read the full dataset from data.csv (path is relative to the working directory).
# Later cells read, among others, the "Date", "Stock", "Style", "Price" and
# "Shares" columns from this frame.
df = pd.read_csv("data.csv")

In [None]:
# Stock categories
blue_chip_stocks = ["AAPL", "MSFT", "JNJ", "V", "JPM", "PG", "UNH", "HD", "XOM", "KO"]
growth_stocks = ["TSLA", "NVDA", "AMZN", "META", "SHOP", "NFLX", "AMD", "CRWD", "DDOG", "PLTR"]
midcap_tech = ["UBER", "ABNB", "ROKU", "ZS", "OKTA", "SNOW", "MDB", "NET", "DOCU", "BILL"]
dividend_value = ["T", "VZ", "PFE", "CVX", "WMT", "MCD", "PEP", "MMM", "IBM", "KO"]
cyclical_industrials = ["CAT", "DE", "BA", "HON", "GE", "F", "GM", "LMT", "NOC", "UPS"]

# Initial style mapping: each investor style gets one base category plus a few
# cyclical/industrial names. NOTE(review): "KO" is listed under both
# blue_chip_stocks and dividend_value, so it counts as aligned for both
# "value" and "conservative" — confirm that is intended.
raw_styles = {
    "growth": growth_stocks + ["GE", "F", "GM"],
    "value": blue_chip_stocks + ["CAT", "HON"],
    "conservative": dividend_value + ["MMM", "UPS"],
    "trader": midcap_tech + ["DE", "BA", "LMT", "NOC"]
}

# Keep only tickers that actually occur in the data. The rest of this notebook
# reads tickers from the "Stock" column (long format), so that column is the
# source of truth; comparing against the column *names* would filter every
# style list down to empty. The column-name fallback is kept for wide-format
# frames that have one column per ticker.
if "Stock" in df.columns:
    valid_stocks = set(df["Stock"].unique())
else:
    valid_stocks = set(df.columns)

styles = {
    style: [s for s in stock_list if s in valid_stocks]
    for style, stock_list in raw_styles.items()
}

In [None]:
# Rows dated 2025-01-01 or later. The comparison is against a string literal —
# assumes "Date" holds ISO-formatted date strings (or datetimes) — TODO confirm.
df_eval = df[df["Date"] >= "2025-01-01"].copy()
# Every distinct ticker in the "Stock" column; passed to the environments below
# as the list of actions the agent can choose from.
all_stocks = df['Stock'].unique().tolist()

#### Define Evaluation Metrics

In [40]:
# ------------------------
# Evaluation: Simulated Reward / Sharpe
# ------------------------
def evaluate_sharpe(returns):
    """Return the mean of *returns* divided by its population standard deviation.

    No risk-free rate is subtracted and no annualisation is applied. An empty
    sequence, or one with zero spread, yields ``0.0`` instead of dividing by
    zero.
    """
    series = np.array(returns)
    if len(series) == 0:
        return 0.0

    spread = np.std(series)
    if spread == 0:
        return 0.0

    return np.mean(series) / spread

def preference_alignment_score(recommended_stock, current_style, styles):
    """Return 1 if the recommended stock is listed under the user's current style, else 0.

    ``styles`` maps a style name to its collection of tickers; an unknown
    ``current_style`` raises ``KeyError``.
    """
    preferred = styles[current_style]
    if recommended_stock in preferred:
        return 1
    return 0

#### Training and Evaluating RL Models

In [41]:
# ------------------------
# Reinforcement Learning Environment Setup
# ------------------------
import gymnasium as gym
from gymnasium import spaces
import random

class StockRecommendationEnv(gym.Env):
    """Gymnasium environment that walks a stock table row by row and scores picks.

    The observation is built from the current row ("Daily_Return",
    "Volatility", "Momentum", "MA_10", "Volatility_10", "Shares", "Price")
    plus the risk weight of the row's "Style". The action is an index into
    ``stock_list``. The reward blends the chosen stock's price change
    (scaled by the style's risk weight) with a 0/1 style-alignment bonus.

    Args:
        df: DataFrame with the columns named above plus "Stock" and "Style".
            Row order is treated as time order — TODO confirm the frame is
            sorted chronologically.
        stock_list: Tickers the agent can choose from; action ``i`` selects
            ``stock_list[i]``.
        styles: Mapping of style name to the tickers considered aligned
            with that style.
    """

    def __init__(self, df, stock_list, styles):
        # Zero-argument super() keeps working when the class is redefined
        # under the same name in a later notebook cell.
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.stock_list = stock_list
        self.styles = styles
        self.num_stocks = len(stock_list)

        self.action_space = spaces.Discrete(self.num_stocks)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(8,), dtype=np.float32)

        self.current_index = 0
        # The episode ends once the cursor reaches the second-to-last row.
        self.max_index = len(self.df) - 2
        self.risk_preference_map = {"growth": 1.2, "value": 1.0, "conservative": 0.8, "trader": 1.5}

        # Positional lookup tables so step() does not rescan the whole frame:
        # prices by row position, and the sorted row positions of each ticker.
        self._prices = self.df["Price"].to_numpy()
        self._rows_by_stock = {
            stock: np.flatnonzero((self.df["Stock"] == stock).to_numpy())
            for stock in stock_list
        }

    def reset(self, seed=None, options=None):
        """Rewind to the first row and return ``(observation, info)``."""
        # Forward the seed so the base class can seed self.np_random.
        super().reset(seed=seed)
        self.current_index = 0
        state = self._get_state()
        return state, {}

    def _get_state(self):
        """Build the 8-feature float32 observation for the current row."""
        row = self.df.iloc[self.current_index]
        style = row["Style"]
        risk_pref = self.risk_preference_map[style]
        state = np.array([
            row["Daily_Return"], row["Volatility"], row["Momentum"],
            row["MA_10"], row["Volatility_10"], row["Shares"],
            row["Price"], risk_pref
        ], dtype=np.float32)
        # Replace NaN/inf so the policy network never sees non-finite inputs.
        return np.nan_to_num(state, nan=0.0, posinf=1e6, neginf=-1e6)

    def _price_pair(self, stock):
        """Return ``(price_now, price_next)`` for *stock*, or ``None``.

        ``price_now`` is the stock's latest price at or before the current
        row and ``price_next`` its first price after the current row.
        ``None`` means one of the two does not exist.
        """
        rows = self._rows_by_stock.get(stock)
        if rows is None or len(rows) == 0:
            return None
        # rows[nxt - 1] <= current_index < rows[nxt]
        nxt = int(np.searchsorted(rows, self.current_index, side="right"))
        if nxt == 0 or nxt >= len(rows):
            return None
        return self._prices[rows[nxt - 1]], self._prices[rows[nxt]]

    def step(self, action):
        """Score the chosen stock and advance one row.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``info["alignment"]`` is 1 when the pick belongs to the current
            row's style list, else 0. ``truncated`` is always ``False``.
        """
        row = self.df.iloc[self.current_index]
        style = row["Style"]
        chosen_stock = self.stock_list[action]

        prices = self._price_pair(chosen_stock)
        if prices is None:
            # No usable price move for this pick: no reward, no alignment credit.
            reward = 0.0
            alignment = 0
        else:
            # Both prices belong to the chosen stock. Using the current row's
            # price as the baseline would subtract a different ticker's price
            # whenever the pick differs from the row's own stock.
            price_now, price_next = prices
            return_component = (price_next - price_now) * self.risk_preference_map[style]
            alignment = 1 if chosen_stock in self.styles[style] else 0
            alpha, beta = 0.5, 0.5
            # NOTE(review): the return term is an absolute price difference,
            # not a percentage, so its scale differs from the 0/1 bonus.
            reward = float(alpha * return_component + beta * alignment)

        self.current_index += 1
        done = self.current_index >= self.max_index
        if done:
            # The terminal observation is a zero vector in the declared dtype.
            state = np.zeros(self.observation_space.shape, dtype=np.float32)
        else:
            state = self._get_state()

        return state, reward, done, False, {"alignment": alignment}


In [None]:
# ------------------------
# PPO Setup
# ------------------------
# Actor-Critic
class ActorCritic(nn.Module):
    """Actor and critic as two separate one-hidden-layer MLPs.

    The actor maps a state to a probability vector over actions (softmax
    output); the critic maps the same state to a scalar value estimate.

    Args:
        state_dim: Number of input features.
        action_dim: Number of discrete actions.
        hidden_size: Width of the hidden layer in both networks.
    """

    def __init__(self, state_dim, action_dim, hidden_size=64):
        super().__init__()
        self.actor = nn.Sequential(
            nn.Linear(state_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_dim),
            nn.Softmax(dim=-1)
        )
        self.critic = nn.Sequential(
            nn.Linear(state_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )

    def forward(self, x):
        """Return ``(action_probabilities, state_value)`` for a batch of states."""
        probs = self.actor(x)
        value = self.critic(x)
        return probs, value

    def predict(self, observation, deterministic=True):
        """Choose an action for an observation without tracking gradients.

        The evaluation loops in this file call ``model.predict(state)`` and
        unpack an ``(action, extra)`` pair, then call ``action.item()``;
        this method provides that interface.

        Args:
            observation: One state (1-D) or a batch of states (2-D); anything
                ``torch.as_tensor`` accepts.
            deterministic: Take the most probable action when true, sample
                from the policy distribution otherwise.

        Returns:
            ``(actions, None)`` where ``actions`` is a NumPy integer array
            with one entry per observation.
        """
        obs = torch.as_tensor(observation, dtype=torch.float32)
        if obs.dim() == 1:
            obs = obs.unsqueeze(0)
        with torch.no_grad():
            probs, _ = self(obs)
            if deterministic:
                action = torch.argmax(probs, dim=-1)
            else:
                action = torch.distributions.Categorical(probs).sample()
        return action.cpu().numpy(), None

# Collect Trajectories
def collect_trajectories(env, policy, timesteps_per_batch):
    """Roll the policy through the environment and record one training batch.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and
            whose ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        policy: Callable mapping a ``(1, state_dim)`` float tensor to
            ``(action_probabilities, value)``.
        timesteps_per_batch: Number of environment steps to record.

    Returns:
        Dict of equal-length lists keyed by "states", "actions", "log_probs",
        "values", "rewards" and "dones"; every entry is a tensor for a single
        step. The env is reset whenever an episode ends, so a batch can span
        several episodes.
    """
    state, _ = env.reset()
    buffer = {"states": [], "actions": [], "log_probs": [], "values": [], "rewards": [], "dones": []}
    for _ in range(timesteps_per_batch):
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        # The rollout only records numbers; the update step recomputes
        # everything it differentiates, so no autograd graph is needed here.
        with torch.no_grad():
            probs, value = policy(state_tensor)
            dist = torch.distributions.Categorical(probs)
            action = dist.sample()
            log_prob = dist.log_prob(action)

        next_state, reward, terminated, truncated, _ = env.step(action.item())
        done = terminated or truncated

        buffer["states"].append(state_tensor.squeeze(0))
        buffer["actions"].append(action)
        buffer["log_probs"].append(log_prob)
        buffer["values"].append(value.squeeze(0))
        # Fix the dtype so int and numpy-float rewards do not mix dtypes.
        buffer["rewards"].append(torch.tensor([reward], dtype=torch.float32))
        buffer["dones"].append(torch.tensor([done]))

        if done:
            # reset() returns (state, info); keep only the state.
            state, _ = env.reset()
        else:
            state = next_state
    return buffer

# GAE
def compute_gae(rewards, values, dones, gamma=0.99, lam=0.95):
    """Compute Generalized Advantage Estimation advantages and value targets.

    Args:
        rewards: List of per-step reward tensors.
        values: List of per-step value-estimate tensors, same length.
        dones: List of per-step boolean tensors; a true entry stops both the
            bootstrap and the advantage accumulation at that step.
        gamma: Discount factor.
        lam: GAE smoothing factor.

    Returns:
        ``(advantages, returns)`` — two lists in time order, where each
        return is the advantage plus the value estimate of the same step.
    """
    # A zero value is appended so that index t + 1 exists for the final step.
    padded = values + [torch.tensor([0.0])]

    running = 0
    newest_first = []
    for step in range(len(rewards) - 1, -1, -1):
        not_done = 1.0 - dones[step].float()
        td_error = rewards[step] + gamma * padded[step + 1] * not_done - padded[step]
        running = td_error + gamma * lam * not_done * running
        newest_first.append(running)

    advantages = newest_first[::-1]
    returns = [advantage + value for advantage, value in zip(advantages, values)]
    return advantages, returns

# PPO Update
def ppo_update(policy, optimizer, states, actions, log_probs_old, returns, advantages,
               clip_eps=0.2, critic_coef=0.5, entropy_coef=0.01):
    """Run five epochs of clipped-surrogate PPO updates on one batch.

    Args:
        policy: Callable mapping a ``(B, state_dim)`` tensor to
            ``(action_probabilities, values)``.
        optimizer: Optimizer over the policy's parameters.
        states: List of B per-step state tensors.
        actions: List of B per-step action tensors.
        log_probs_old: List of B log-probabilities recorded at rollout time.
        returns: List of B value targets.
        advantages: List of B advantage estimates.
        clip_eps: Clipping range for the probability ratio.
        critic_coef: Weight of the value loss in the total loss.
        entropy_coef: Weight of the entropy bonus.

    Returns:
        ``(actor_loss, critic_loss)`` of the final epoch as Python floats.
    """
    states = torch.stack(states).detach()
    # Per-step tensors arrive with shape (1,), so stacking yields (B, 1).
    # Flatten every per-sample quantity to (B,): otherwise Categorical.log_prob
    # and the value-loss subtraction broadcast to (B, B) and pair each sample
    # with every other sample's state.
    actions = torch.stack(actions).detach().reshape(-1)
    old_log_probs = torch.stack(log_probs_old).detach().reshape(-1)
    returns = torch.stack(returns).detach().reshape(-1).float()
    advantages = torch.stack(advantages).detach().reshape(-1).float()

    for _ in range(5):  # epoch
        probs, values = policy(states)
        dist = torch.distributions.Categorical(probs)
        log_probs = dist.log_prob(actions)
        entropy = dist.entropy().mean()

        # Clipped surrogate objective: the smaller of the raw and the clipped
        # ratio-weighted advantage.
        ratio = torch.exp(log_probs - old_log_probs)
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantages
        actor_loss = -torch.min(surr1, surr2).mean()
        critic_loss = (returns - values.reshape(-1)).pow(2).mean()
        loss = actor_loss + critic_coef * critic_loss - entropy_coef * entropy

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return actor_loss.item(), critic_loss.item()

# Train PPO
def train_ppo(env, state_dim, action_dim, total_timesteps=100000, batch_size=2048, lr=2e-4):
    """Train a fresh ActorCritic policy with PPO and return it.

    Each iteration collects ``batch_size`` environment steps, computes GAE
    advantages and returns, runs one PPO update on the batch, and prints the
    resulting losses. Iterations continue until at least ``total_timesteps``
    steps have been collected.
    """
    policy = ActorCritic(state_dim, action_dim)
    optimizer = optim.Adam(policy.parameters(), lr=lr)

    timestep = 0
    while timestep < total_timesteps:
        rollout = collect_trajectories(env, policy, batch_size)
        advantages, value_targets = compute_gae(
            rollout["rewards"], rollout["values"], rollout["dones"]
        )
        actor_loss, critic_loss = ppo_update(
            policy,
            optimizer,
            rollout["states"],
            rollout["actions"],
            rollout["log_probs"],
            value_targets,
            advantages,
        )
        timestep += batch_size
        print(f"Step {timestep} | Actor Loss: {actor_loss:.4f} | Critic Loss: {critic_loss:.4f}")

    return policy


#### Single-Agent RL: Scalar

In [None]:
# -----------------------------
# Single-Agent RL Training & Evaluation
# -----------------------------
def _evaluate_policy(model, eval_env):
    """Run one greedy episode of *model* in *eval_env*; return (rewards, alignments)."""
    state, _ = eval_env.reset()
    rewards, alignments = [], []
    done = False
    while not done:
        # The trained model is a plain nn.Module returning (probs, value),
        # so the action is taken from its forward pass directly.
        with torch.no_grad():
            probs, _ = model(torch.as_tensor(state, dtype=torch.float32).unsqueeze(0))
        action = int(torch.argmax(probs, dim=-1).item())
        # Carry the new observation forward so each decision sees the current row.
        state, reward, terminated, truncated, info = eval_env.step(action)
        done = terminated or truncated
        rewards.append(reward)
        alignments.append(info.get("alignment", 0))
    return rewards, alignments


def train_and_evaluate_rl(df, stock_list, styles, holding_days=5):
    """Train one PPO agent on pre-2025 rows and evaluate it on 2025+ rows.

    Args:
        df: Full dataset; split on its "Date" column at "2025-01-01".
        stock_list: Tickers the agent can recommend.
        styles: Mapping of style name to aligned tickers.
        holding_days: Accepted for interface compatibility; not used.

    Returns:
        ``(mean_reward, sharpe, mean_alignment)`` over one evaluation episode.
    """
    train_df = df[df["Date"] < "2025-01-01"].copy()
    test_df = df[df["Date"] >= "2025-01-01"].copy()
    env = StockRecommendationEnv(train_df, stock_list, styles)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    model = train_ppo(env, state_dim, action_dim, total_timesteps=100000)
    eval_env = StockRecommendationEnv(test_df, stock_list, styles)
    returns, alignments = _evaluate_policy(model, eval_env)
    return np.mean(returns), evaluate_sharpe(returns), np.mean(alignments)

# Drop rows with no price or share count before training. Rebinding df means
# later cells see the filtered frame as well.
df = df.dropna(subset=["Price", "Shares"]).copy()
# The train/test split at 2025-01-01 happens inside the function; the three
# results are the mean reward, the Sharpe ratio and the mean alignment.
ret_rl, sharpe_rl, align_rl = train_and_evaluate_rl(df, all_stocks, styles)

In [None]:
# Report the metrics currently held in ret_rl / sharpe_rl / align_rl.
print("[Single-RL Agent]")
print(f"Average Return: {ret_rl:.4f}, Sharpe: {sharpe_rl:.4f}, Alignment: {align_rl:.4f}")

#### Single-Agent RL: One-hot

In [None]:
# ------------------------
# (One-hot) Reinforcement Learning Environment Setup
# ------------------------
# Position of each style in the one-hot segment of the observation.
STYLE_TO_INDEX = {"conservative": 0, "value": 1, "growth": 2, "trader": 3}


class StockRecommendationEnv(gym.Env):
    """Gymnasium environment with a one-hot style encoding in the observation.

    The observation holds the current row's "Daily_Return", "Volatility",
    "Momentum", "MA_10", "Volatility_10", "Shares" and "Price", the risk
    weight of the row's "Style", and a 4-slot one-hot of that style
    (12 values in total). The action is an index into ``stock_list``. The
    reward blends the chosen stock's price change (scaled by the style's
    risk weight) with a 0/1 style-alignment bonus.

    Args:
        df: DataFrame with the columns named above plus "Stock" and "Style".
            Row order is treated as time order — TODO confirm the frame is
            sorted chronologically.
        stock_list: Tickers the agent can choose from; action ``i`` selects
            ``stock_list[i]``.
        styles: Mapping of style name to the tickers considered aligned
            with that style.
    """

    def __init__(self, df, stock_list, styles):
        # Zero-argument super() keeps working when the class is redefined
        # under the same name in another notebook cell.
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.stock_list = stock_list
        self.styles = styles
        self.num_stocks = len(stock_list)

        # 8 original features + 4 style one-hot
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(12,), dtype=np.float32)
        self.action_space = spaces.Discrete(self.num_stocks)

        self.current_index = 0
        # The episode ends once the cursor reaches the second-to-last row.
        self.max_index = len(self.df) - 2
        self.risk_preference_map = {"growth": 1.2, "value": 1.0, "conservative": 0.8, "trader": 1.5}

        # Positional lookup tables so step() does not rescan the whole frame:
        # prices by row position, and the sorted row positions of each ticker.
        self._prices = self.df["Price"].to_numpy()
        self._rows_by_stock = {
            stock: np.flatnonzero((self.df["Stock"] == stock).to_numpy())
            for stock in stock_list
        }

    def reset(self, seed=None, options=None):
        """Rewind to the first row and return ``(observation, info)``."""
        # Forward the seed so the base class can seed self.np_random.
        super().reset(seed=seed)
        self.current_index = 0
        return self._get_state(), {}

    def _get_state(self):
        """Build the 12-value float32 observation for the current row."""
        row = self.df.iloc[self.current_index]
        style = row["Style"]
        risk_pref = self.risk_preference_map[style]

        # float32 one-hot so concatenation does not upcast the observation
        # to float64, which would disagree with observation_space.
        style_one_hot = np.zeros(len(STYLE_TO_INDEX), dtype=np.float32)
        style_one_hot[STYLE_TO_INDEX[style]] = 1.0

        core_features = np.array([
            row["Daily_Return"], row["Volatility"], row["Momentum"],
            row["MA_10"], row["Volatility_10"], row["Shares"],
            row["Price"], risk_pref
        ], dtype=np.float32)

        state = np.concatenate([core_features, style_one_hot])
        # Replace NaN/inf so the policy network never sees non-finite inputs.
        return np.nan_to_num(state, nan=0.0, posinf=1e6, neginf=-1e6)

    def _price_pair(self, stock):
        """Return ``(price_now, price_next)`` for *stock*, or ``None``.

        ``price_now`` is the stock's latest price at or before the current
        row and ``price_next`` its first price after the current row.
        ``None`` means one of the two does not exist.
        """
        rows = self._rows_by_stock.get(stock)
        if rows is None or len(rows) == 0:
            return None
        # rows[nxt - 1] <= current_index < rows[nxt]
        nxt = int(np.searchsorted(rows, self.current_index, side="right"))
        if nxt == 0 or nxt >= len(rows):
            return None
        return self._prices[rows[nxt - 1]], self._prices[rows[nxt]]

    def step(self, action):
        """Score the chosen stock and advance one row.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``info["alignment"]`` is 1 when the pick belongs to the current
            row's style list, else 0. ``truncated`` is always ``False``.
        """
        row = self.df.iloc[self.current_index]
        style = row["Style"]
        chosen_stock = self.stock_list[action]

        prices = self._price_pair(chosen_stock)
        if prices is None:
            # No usable price move for this pick: no reward, no alignment credit.
            reward = 0.0
            alignment = 0
        else:
            # Both prices belong to the chosen stock. Using the current row's
            # price as the baseline would subtract a different ticker's price
            # whenever the pick differs from the row's own stock.
            price_now, price_next = prices
            return_component = (price_next - price_now) * self.risk_preference_map[style]
            alignment = 1 if chosen_stock in self.styles[style] else 0
            alpha, beta = 0.5, 0.5
            # NOTE(review): the return term is an absolute price difference,
            # not a percentage, so its scale differs from the 0/1 bonus.
            reward = float(alpha * return_component + beta * alignment)

        self.current_index += 1
        done = self.current_index >= self.max_index
        if done:
            # The terminal observation is a zero vector in the declared dtype.
            next_state = np.zeros(self.observation_space.shape, dtype=np.float32)
        else:
            next_state = self._get_state()

        return next_state, reward, done, False, {"alignment": alignment}

In [None]:
# -----------------------------
# Single-Agent RL Training & Evaluation
# -----------------------------
def _evaluate_policy(model, eval_env):
    """Run one greedy episode of *model* in *eval_env*; return (rewards, alignments)."""
    state, _ = eval_env.reset()
    rewards, alignments = [], []
    done = False
    while not done:
        # The trained model is a plain nn.Module returning (probs, value),
        # so the action is taken from its forward pass directly.
        with torch.no_grad():
            probs, _ = model(torch.as_tensor(state, dtype=torch.float32).unsqueeze(0))
        action = int(torch.argmax(probs, dim=-1).item())
        # Carry the new observation forward so each decision sees the current row.
        state, reward, terminated, truncated, info = eval_env.step(action)
        done = terminated or truncated
        rewards.append(reward)
        alignments.append(info.get("alignment", 0))
    return rewards, alignments


def train_and_evaluate_rl(df, stock_list, styles, holding_days=5):
    """Train one PPO agent on pre-2025 rows and evaluate it on 2025+ rows.

    Uses whichever ``StockRecommendationEnv`` is currently defined; the
    network input size is read from the environment's observation space.

    Args:
        df: Full dataset; split on its "Date" column at "2025-01-01".
        stock_list: Tickers the agent can recommend.
        styles: Mapping of style name to aligned tickers.
        holding_days: Accepted for interface compatibility; not used.

    Returns:
        ``(mean_reward, sharpe, mean_alignment)`` over one evaluation episode.
    """
    train_df = df[df["Date"] < "2025-01-01"].copy()
    test_df = df[df["Date"] >= "2025-01-01"].copy()
    env = StockRecommendationEnv(train_df, stock_list, styles)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    model = train_ppo(env, state_dim, action_dim, total_timesteps=100000)
    eval_env = StockRecommendationEnv(test_df, stock_list, styles)
    returns, alignments = _evaluate_policy(model, eval_env)
    return np.mean(returns), evaluate_sharpe(returns), np.mean(alignments)

# Drop rows with no price or share count before training. Rebinding df means
# later cells see the filtered frame as well.
df = df.dropna(subset=["Price", "Shares"]).copy()
# Overwrites ret_rl / sharpe_rl / align_rl with this run's mean reward,
# Sharpe ratio and mean alignment. Presumably this run uses the one-hot
# environment defined under the same class name — holds only if the cells
# are executed in order.
ret_rl, sharpe_rl, align_rl = train_and_evaluate_rl(df, all_stocks, styles)

In [None]:
# Report the metrics currently held in ret_rl / sharpe_rl / align_rl.
# NOTE(review): the printed label is identical to the one used in the
# scalar-state section, so the two outputs cannot be told apart by label.
print("[Single-RL Agent]")
print(f"Average Return: {ret_rl:.4f}, Sharpe: {sharpe_rl:.4f}, Alignment: {align_rl:.4f}")

#### Multi-Agent RL

In [None]:
# -----------------------------
# Multi-Agent RL Training & Evaluation
# -----------------------------
def _evaluate_style_agent(model, eval_env):
    """Run one greedy episode of *model* in *eval_env*; return (rewards, alignments)."""
    state, _ = eval_env.reset()
    rewards, alignments = [], []
    done = False
    while not done:
        # The trained model is a plain nn.Module returning (probs, value),
        # so the action is taken from its forward pass directly.
        with torch.no_grad():
            probs, _ = model(torch.as_tensor(state, dtype=torch.float32).unsqueeze(0))
        action = int(torch.argmax(probs, dim=-1).item())
        # Carry the new observation forward so each decision sees the current row.
        state, reward, terminated, truncated, info = eval_env.step(action)
        done = terminated or truncated
        rewards.append(reward)
        alignments.append(info.get("alignment", 0))
    return rewards, alignments


def train_multiagent_by_style(train_df, test_df, styles, stock_list, total_timesteps=200000):
    """Train and evaluate one PPO agent per investor style.

    For every style in ``styles`` the rows of that style are selected from
    both frames (rows containing any NaN are dropped). Styles with fewer
    than 100 training rows are skipped entirely.

    Args:
        train_df: Training rows; must contain a "Style" column.
        test_df: Evaluation rows; must contain a "Style" column.
        styles: Mapping of style name to aligned tickers.
        stock_list: Tickers every agent can recommend.
        total_timesteps: Environment steps used to train each agent.

    Returns:
        ``(agents, results)`` where ``agents`` maps style to its trained
        model and ``results`` maps style to
        ``(mean_reward, sharpe, mean_alignment)``. A style whose test
        subset is empty gets an agent but no entry in ``results``.
    """
    agents = {}
    results = {}

    for style in styles:
        train_style_df = train_df[train_df["Style"] == style].dropna().copy()
        test_style_df = test_df[test_df["Style"] == style].dropna().copy()
        if len(train_style_df) < 100:
            # Too few samples to train a meaningful agent for this style.
            continue

        # Train on this style's rows only; using the full frame would make
        # every "style agent" the same general-purpose agent.
        env = StockRecommendationEnv(train_style_df, stock_list, styles)
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        # Honour the caller's training budget instead of a fixed constant.
        model = train_ppo(env, state_dim, action_dim, total_timesteps=total_timesteps)

        agents[style] = model

        if test_style_df.empty:
            # Nothing to evaluate on; the environment cannot reset on an empty frame.
            continue

        eval_env = StockRecommendationEnv(test_style_df, stock_list, styles)
        returns, alignments = _evaluate_style_agent(model, eval_env)

        avg_ret = np.mean(returns)
        sharpe = evaluate_sharpe(returns)
        align = np.mean(alignments)
        results[style] = (avg_ret, sharpe, align)

    return agents, results

# Chronological split at 2025-01-01: earlier rows train, later rows evaluate.
# Both frames are kept as globals because the weighting cell below reads test_df.
train_df = df[df["Date"] < "2025-01-01"].copy()
test_df = df[df["Date"] >= "2025-01-01"].copy()
# agents: style -> trained model; results: style -> (return, sharpe, alignment).
agents, results = train_multiagent_by_style(train_df, test_df, styles, all_stocks)

In [None]:
# Print one line of metrics per style that has an entry in results.
print("[Multi-Agent RL]")
for style, (ret, sharpe, align) in results.items():
    print(f"Style {style}: Average Return: {ret:.4f}, Sharpe: {sharpe:.4f}, Alignment: {align:.4f}")

In [None]:
# Weight each evaluated style by its number of rows in the test set.
weights = {
    style: int((test_df["Style"] == style).sum())
    for style in results
}
total = sum(weights.values())

if total > 0:
    ret_avg = sum(results[style][0] * weights[style] for style in results) / total
    sharpe_avg = sum(results[style][1] * weights[style] for style in results) / total
    align_avg = sum(results[style][2] * weights[style] for style in results) / total
else:
    # No style was evaluated (or none has test rows): report NaN instead of
    # failing with a division by zero.
    ret_avg = sharpe_avg = align_avg = float("nan")

# NOTE(review): the Sharpe figure is a row-weighted mean of per-style Sharpe
# ratios, not the Sharpe ratio of the pooled rewards.
print("[Weighted Multi-Agent RL]")
print(f"Average Return: {ret_avg:.4f}, Sharpe: {sharpe_avg:.4f}, Alignment: {align_avg:.4f}")