In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
from datetime import datetime, timedelta
import os

# Constants
# Small additive guard used in denominators (reward and Sharpe ratio
# computations) so that a zero value does not cause a division by zero.
EPSILON = 1e-8
# Discrete actions the agent can pick. choose_action() treats the sign as
# buy (positive) / sell (negative) / hold (zero) and the magnitude as the
# fraction of the portfolio value to trade.
action_space = np.array([-1.0, -0.5, -0.1, 0, 0.1, 0.5, 1.0])
temperature = 1.0  # Softmax temperature
# Number of consecutive rows that make up one model input, the capacity of the
# rolling returns buffer used in training, and the default Sharpe window.
WINDOW_SIZE = 10
# NOTE(review): calculate_sharpe_ratio() subtracts this value directly from the
# mean of per-step returns with no period conversion -- confirm whether it is
# meant to be a per-step or an annual rate.
RISK_FREE_RATE = 0.01  # Assuming a 1% risk-free rate

def choose_action(action, portfolio_value, cash, shares, current_price):
    """
    Determine the number of whole shares to trade based on the action.

    Args:
        action: Signed fraction of the portfolio value to trade. Positive
            means buy, negative means sell, zero means hold.
        portfolio_value: Current total portfolio value (cash plus holdings).
        cash: Cash available for purchases; caps the size of a buy.
        shares: Shares currently held; caps the size of a sell.
        current_price: Price per share used to size the order.

    Returns:
        A positive number for buying, a negative number for selling, or 0
        when no trade should (or can) be made.
    """
    # A zero, negative or non-finite price cannot size an order: dividing by
    # it would produce inf/NaN share counts (or a ZeroDivisionError) that then
    # corrupt the caller's cash balance.
    if not np.isfinite(current_price) or current_price <= 0:
        return 0

    target_shares = portfolio_value * abs(action) / current_price
    if action > 0:  # Buy: limited by the cash on hand
        affordable_shares = cash / current_price
        # Clamp at zero so a negative cash balance can never turn a "buy"
        # into a negative (i.e. sell-looking) share count.
        return max(np.floor(min(affordable_shares, target_shares)), 0.0)
    if action < 0:  # Sell: limited by the current position
        return -max(np.floor(min(shares, target_shares)), 0.0)
    return 0

class LSTMTrader(nn.Module):
    """LSTM followed by a linear layer that scores each possible action.

    The network consumes a batch-first sequence and produces one output
    vector per sequence, computed from the LSTM output at the last time step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Layers are created in the same order (LSTM, then linear) so that
        # seeded parameter initialisation is unchanged.
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the linear projection of the LSTM output at the final step."""
        sequence_output, _hidden_state = self.lstm(x)
        last_step_output = sequence_output[:, -1]
        return self.fc(last_step_output)

def load_data(ticker="AAPL", start_date="2020-01-01", end_date="2023-01-01"):
    """
    Download price history for ``ticker`` between ``start_date`` and
    ``end_date`` via yfinance and return it with rows containing missing
    values removed.
    """
    raw_history = yf.download(ticker, start=start_date, end=end_date)
    return raw_history.dropna()

def softmax(q_values, temperature):
    """
    Convert Q-values into a probability distribution with a temperature.

    Args:
        q_values: Tensor of action scores. The normalisation runs over all
            elements of the tensor.
        temperature: Divisor applied to the scores before exponentiation;
            smaller values make the distribution more peaked.

    Returns:
        A tensor of the same shape as ``q_values`` holding probabilities.
        If any input is NaN, a uniform distribution is returned.
    """
    if torch.any(torch.isnan(q_values)):
        print("NaN found in q_values!")
        # Fall back to a uniform distribution. An all-zero vector is not a
        # valid distribution and makes np.random.choice(p=...) raise.
        return torch.full_like(q_values, 1.0 / q_values.numel())
    scaled_qs = q_values / temperature
    exp_qs = torch.exp(scaled_qs - torch.max(scaled_qs))  # Subtract max for numerical stability
    # After max-subtraction the largest term is exp(0) == 1, so the sum is
    # always >= 1 and needs no additive guard against division by zero.
    probs = exp_qs / torch.sum(exp_qs)
    return torch.clamp(probs, min=0.0, max=1.0)  # Ensure valid probabilities

def choose_action_softmax(q_values):
    """
    Sample an action index from the softmax distribution over ``q_values``.

    Uses the module-level ``temperature``. If the resulting probabilities
    contain NaN, or carry no usable probability mass, a uniform distribution
    over the actions is used instead.

    Args:
        q_values: Tensor with one score per entry of ``action_space``.

    Returns:
        An integer index into ``action_space``.
    """
    action_probs = softmax(q_values, temperature).detach().numpy()
    has_nan = np.any(np.isnan(action_probs))
    total_mass = action_probs.sum()
    # Checking only for NaN is not enough: a vector with zero (or infinite)
    # total mass is also rejected by np.random.choice.
    if has_nan or not np.isfinite(total_mass) or total_mass <= 0:
        if has_nan:
            print(f"NaN found in action probabilities: {action_probs}")
        else:
            print(f"Degenerate action probabilities: {action_probs}")
        # Fallback to uniform distribution
        action_probs = np.full(len(action_probs), 1.0 / len(action_probs))
    else:
        # Renormalise in float64 so rounding in the float32 softmax cannot
        # push the sum away from 1.
        action_probs = action_probs.astype(np.float64) / float(total_mass)
    return np.random.choice(len(action_space), p=action_probs)

def reward_function(new_portfolio_value, prev_portfolio_value, returns, window=WINDOW_SIZE):
    """
    Compute the reward for one step.

    The reward is the relative change in portfolio value. Once ``returns``
    holds at least ``window`` entries, the Sharpe ratio of the most recent
    ``window`` entries is added on top.
    """
    step_return = (new_portfolio_value - prev_portfolio_value) / (prev_portfolio_value + EPSILON)
    if len(returns) < window:
        # Not enough history yet for the Sharpe bonus.
        return step_return
    recent_returns = list(returns)[-window:]
    return step_return + calculate_sharpe_ratio(recent_returns)

def calculate_sharpe_ratio(returns):
    """
    Compute the Sharpe ratio of a sequence of returns.

    Args:
        returns: Sequence of numeric returns.

    Returns:
        ``(mean(returns) - RISK_FREE_RATE) / (std(returns) + EPSILON)``, or
        ``0.0`` when ``returns`` is empty or has (numerically) zero variance,
        since the ratio is undefined in those cases.
    """
    returns_array = np.asarray(returns, dtype=float)
    if returns_array.size == 0:
        # np.mean/np.std of an empty array would yield NaN plus a warning.
        return 0.0

    excess_return = returns_array.mean() - RISK_FREE_RATE  # Average return minus risk-free rate
    std_dev = returns_array.std()  # Standard deviation of returns
    if std_dev < EPSILON:
        # A flat window (e.g. no position held) would otherwise divide by
        # EPSILON alone and produce values on the order of
        # -RISK_FREE_RATE / EPSILON, swamping every other reward signal.
        return 0.0
    return excess_return / (std_dev + EPSILON)  # Sharpe ratio calculation


def train_rl_agent(model, optimizer, loss_fn, df, discount_factor, start_date, end_date,
                   initial_portfolio_value=10000, num_episodes=100):
    """
    Train ``model`` with one-step Q-learning on a slice of ``df``.

    Each episode walks once through ``df.loc[start_date:end_date]``. At step
    ``t`` the model sees the previous ``WINDOW_SIZE`` rows, an action is
    sampled with ``choose_action_softmax``, the trade is executed at the
    ``'Close'`` value of row ``t``, and the model is updated towards
    ``reward + discount_factor * max(Q(next_state))``.

    Args:
        model: Network mapping a ``(1, WINDOW_SIZE, n_columns)`` tensor to one
            score per entry of ``action_space``.
        optimizer: Optimizer over ``model``'s parameters.
        loss_fn: Loss comparing the chosen action's Q-value with the target.
        df: DataFrame indexed so that ``df.loc[start_date:end_date]`` selects
            the training rows; must contain a ``'Close'`` column.
        discount_factor: Discount applied to the bootstrap value.
        start_date: First index label of the training slice.
        end_date: Last index label of the training slice.
        initial_portfolio_value: Cash the simulated portfolio starts each
            episode with.
        num_episodes: Number of passes over the training slice.

    Returns:
        None. Progress is printed once per episode.
    """
    train_data = df.loc[start_date:end_date]
    if len(train_data) <= WINDOW_SIZE:
        # Without at least one step after the first window there is nothing
        # to train on (and the per-episode mean reward would be NaN).
        print(f"Error: Not enough data for training. Available data points: {len(train_data)}")
        return

    for episode in range(num_episodes):
        # Every episode replays the same price history from its first window,
        # so the simulated portfolio and the rolling returns buffer must start
        # fresh as well; otherwise positions leak from one episode into the
        # next and the reported portfolio value is not comparable.
        returns = deque(maxlen=WINDOW_SIZE)
        portfolio_value = initial_portfolio_value
        cash = initial_portfolio_value
        shares = 0

        state = train_data.iloc[:WINDOW_SIZE].values
        episode_rewards = []

        for t in range(WINDOW_SIZE, len(train_data)):
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            q_values = model(state_tensor)
            action_idx = choose_action_softmax(q_values[0])
            action = action_space[action_idx]

            current_price = train_data.iloc[t]['Close']
            shares_to_trade = choose_action(action, portfolio_value, cash, shares, current_price)

            # Update portfolio. shares_to_trade is signed (positive = buy,
            # negative = sell), so one update covers both directions.
            cash -= shares_to_trade * current_price
            shares += shares_to_trade

            new_portfolio_value = cash + shares * current_price
            new_portfolio_value = np.clip(new_portfolio_value, 1, 1e10)

            # Plain float keeps the target's dtype under torch's control.
            reward = float(reward_function(new_portfolio_value, portfolio_value, returns))
            episode_rewards.append(reward)
            returns.append(new_portfolio_value / portfolio_value - 1)  # Store relative return

            next_state = train_data.iloc[t - WINDOW_SIZE + 1:t + 1].values
            next_state_tensor = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)
            # The bootstrap target is treated as a constant: no gradient may
            # flow through the next-state estimate, otherwise the update also
            # drags the target towards the current prediction.
            with torch.no_grad():
                next_q_values = model(next_state_tensor)
                target_q_values = (reward + discount_factor * torch.max(next_q_values)).to(q_values.dtype)

            current_q = q_values[0, action_idx]
            loss = loss_fn(current_q, target_q_values)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
            optimizer.step()

            state = next_state
            portfolio_value = new_portfolio_value

        avg_reward = np.mean(episode_rewards)
        print(f'Episode {episode + 1}/{num_episodes} - Portfolio Value: {portfolio_value:.2f}, Avg Reward: {avg_reward:.4f}')

def trade(model, df, start_date, end_date, initial_portfolio_value=10000):
    """
    Performs trading on the specified data range and records trades and portfolio values.

    At each step ``t`` the model sees the previous ``WINDOW_SIZE`` rows, an
    action is sampled from a softmax over its outputs (temperature 0.5), and
    the resulting order is executed at the ``'Close'`` value of row ``t``.

    Args:
        model: Trained network producing one score per entry of
            ``action_space``.
        df: DataFrame indexed so that ``df.loc[start_date:end_date]`` selects
            the trading rows; must contain a ``'Close'`` column.
        start_date: First index label of the trading slice.
        end_date: Last index label of the trading slice.
        initial_portfolio_value: Starting cash.

    Returns:
        ``(trades, daily_values)`` where ``trades`` is a list of
        ``(side, share_count, price)`` tuples with ``side`` in
        ``{'buy', 'sell'}`` and ``daily_values`` is a list of
        ``(index_label, portfolio_value)`` tuples, one per step. Both are
        empty when the slice has no more than ``WINDOW_SIZE`` rows.
    """
    portfolio_value = initial_portfolio_value
    cash = initial_portfolio_value
    shares = 0
    trades = []
    daily_values = []
    trading_temperature = 0.5  # Lower temperature for more deterministic trading

    trade_data = df.loc[start_date:end_date]
    if len(trade_data) <= WINDOW_SIZE:
        print(f"Error: Not enough data for trading. Available data points: {len(trade_data)}")
        return trades, daily_values

    state = trade_data.iloc[:WINDOW_SIZE].values

    for t in range(WINDOW_SIZE, len(trade_data)):
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        # Inference only: skip building an autograd graph that is never used.
        with torch.no_grad():
            q_values = model(state_tensor)

        action_probs = softmax(q_values[0], trading_temperature).detach().numpy()
        # Same safety net as the training path: np.random.choice rejects a
        # probability vector that contains NaN or carries no mass.
        prob_mass = action_probs.sum()
        if not np.isfinite(prob_mass) or prob_mass <= 0:
            action_probs = np.ones_like(action_probs) / len(action_probs)
        action_idx = np.random.choice(len(action_space), p=action_probs)
        action = action_space[action_idx]

        current_price = trade_data.iloc[t]['Close']
        shares_to_trade = choose_action(action, portfolio_value, cash, shares, current_price)

        if shares_to_trade > 0 and cash >= shares_to_trade * current_price:  # Buy
            cash -= shares_to_trade * current_price
            shares += shares_to_trade
            trades.append(('buy', shares_to_trade, current_price))
            print(f"Day {trade_data.index[t]}: Bought {shares_to_trade} shares at {current_price:.2f}")
        elif shares_to_trade < 0 and shares >= abs(shares_to_trade):  # Sell
            cash -= shares_to_trade * current_price  # Remember shares_to_trade is negative
            shares += shares_to_trade
            trades.append(('sell', abs(shares_to_trade), current_price))
            print(f"Day {trade_data.index[t]}: Sold {abs(shares_to_trade)} shares at {current_price:.2f}")

        portfolio_value = cash + shares * current_price
        daily_values.append((trade_data.index[t], portfolio_value))
        # Slide the observation window forward so it ends at row t.
        state = trade_data.iloc[t - WINDOW_SIZE + 1:t + 1].values

    return trades, daily_values

def evaluate_performance(daily_values, initial_portfolio_value, trades=None):
    """
    Evaluates performance metrics based on daily portfolio values.

    Args:
        daily_values: Non-empty sequence of ``(label, portfolio_value)`` pairs
            in chronological order.
        initial_portfolio_value: Starting capital, used for the profit-loss
            ratio.
        trades: Optional sequence of trade tuples whose first element is
            ``'buy'`` or ``'sell'``. When omitted, a module-level ``trades``
            variable is used if one exists (legacy behaviour), otherwise no
            trades are assumed.

    Returns:
        ``(total_return, sharpe_ratio, max_drawdown_percent, profit_loss_ratio)``
        where ``total_return`` is the percentage change from the first to the
        last recorded value.

    Raises:
        ValueError: If ``daily_values`` is empty.
    """
    if not daily_values:
        raise ValueError("daily_values must contain at least one entry")
    if trades is None:
        # Older callers never passed the trade list and relied on a global
        # named `trades`; keep that working without raising NameError when
        # no such global exists.
        trades = globals().get("trades", [])

    values = [value for _, value in daily_values]
    start_value = values[0]
    end_value = values[-1]
    total_return = (end_value - start_value) / start_value * 100

    # Calculate daily returns and Sharpe ratio
    returns = [current / previous - 1 for previous, current in zip(values, values[1:])]
    # A single data point yields no returns; the ratio is undefined there.
    sharpe_ratio = calculate_sharpe_ratio(returns) if returns else 0.0

    # Calculate max drawdown as a percentage
    peak = start_value
    max_drawdown = 0
    for value in values:
        peak = max(peak, value)
        drawdown = (peak - value) / peak
        max_drawdown = max(max_drawdown, drawdown)
    max_drawdown_percent = max_drawdown * 100

    # Calculate Profit-Loss Ratio (using total PnL and total trades)
    total_pnl = end_value - initial_portfolio_value
    total_trades = sum(1 for t in trades if t[0] in ('buy', 'sell'))
    profit_loss_ratio = total_pnl / (total_trades * initial_portfolio_value / 100) if total_trades > 0 else 0

    return total_return, sharpe_ratio, max_drawdown_percent, profit_loss_ratio

# Function to save the trained model and optimizer
def save_agent(model, optimizer, file_path="agent.pth"):
    """
    Save the model and optimizer state dicts to ``file_path``.

    The parent directory is created if it does not exist. The checkpoint is a
    dict with the keys ``'model_state_dict'`` and ``'optimizer_state_dict'``.

    Args:
        model: Object exposing ``state_dict()``.
        optimizer: Object exposing ``state_dict()``.
        file_path: Destination file; may be a bare file name.
    """
    target_dir = os.path.dirname(file_path)
    # A bare file name (such as the default) has an empty dirname, and
    # os.makedirs("") raises FileNotFoundError, so only create real paths.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, file_path)
    print(f"Agent saved successfully to {file_path}")

# Script entry point: train on the default date range, save the agent, then
# trade on a separate date range and print the evaluation metrics.
if __name__ == "__main__":
    # Set random seeds for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)
    random.seed(42)
    
    # Load training data and define model, optimizer, and loss function
    df = load_data()
    # One input feature per DataFrame column, one output per discrete action.
    model = LSTMTrader(input_size=df.shape[1], hidden_size=64, output_size=len(action_space))
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()
    
    # Train the model
    print("Starting training phase...")
    train_rl_agent(model, optimizer, loss_fn, df, discount_factor=0.99, 
                   start_date='2020-01-01', end_date='2023-01-01')
    
    # After training, save the agent
    save_agent(model, optimizer, file_path="saved_models/QLaapl200_agent.pth")

    
    # Load test data
    test_data = load_data(start_date='2023-01-01', end_date='2024-01-01')
    
    # Test the model and calculate metrics on test data
    print("\nStarting trading phase on test data...")
    # NOTE: `trades` must keep this exact module-level name;
    # evaluate_performance() reads it as a global rather than as an argument.
    trades, daily_values = trade(model, test_data, start_date='2023-01-01', end_date='2024-01-01')
    
    # trade() returns an empty daily_values list when the selected slice has
    # no more than WINDOW_SIZE rows, i.e. when no trading step could run.
    if not daily_values:
        print("Error: No trades were executed during the trading period.")
    else:
        # 10000 matches the default initial_portfolio_value used by trade().
        total_return, sharpe_ratio, max_drawdown, profit_loss_ratio = evaluate_performance(daily_values, 10000)
        
        print("\nEvaluation Metrics on Test Data:")
        print(f"Total Return (%): {total_return:.2f}")
        print(f"Sharpe Ratio: {sharpe_ratio:.2f}")
        print(f"Max Drawdown (%): {max_drawdown:.2f}")
        print(f"Profit-Loss Ratio: {profit_loss_ratio:.2f}")

[*********************100%***********************]  1 of 1 completed


Starting training phase...
Episode 1/100 - Portfolio Value: 11477.13, Avg Reward: -1342.3074
Episode 2/100 - Portfolio Value: 8050.27, Avg Reward: -1.2908
Episode 3/100 - Portfolio Value: 10732.82, Avg Reward: -1.3523
Episode 4/100 - Portfolio Value: 8562.96, Avg Reward: -1.6683
Episode 5/100 - Portfolio Value: 8164.82, Avg Reward: -1.6391
Episode 6/100 - Portfolio Value: 12853.12, Avg Reward: -1.4402
Episode 7/100 - Portfolio Value: 10044.91, Avg Reward: -1.3134
Episode 8/100 - Portfolio Value: 13961.31, Avg Reward: -1.4475
Episode 9/100 - Portfolio Value: 23523.83, Avg Reward: -1.2750
Episode 10/100 - Portfolio Value: 23207.08, Avg Reward: -1.2296
Episode 11/100 - Portfolio Value: 22462.99, Avg Reward: -1.2518
Episode 12/100 - Portfolio Value: 19746.82, Avg Reward: -1.4799
Episode 13/100 - Portfolio Value: 17178.98, Avg Reward: -1.7937
Episode 14/100 - Portfolio Value: 19223.99, Avg Reward: -1.4225
Episode 15/100 - Portfolio Value: 14484.05, Avg Reward: -1.7311
Episode 16/100 - Portf

[*********************100%***********************]  1 of 1 completed



Starting trading phase on test data...
Day 2023-01-23 00:00:00: Bought 70.0 shares at 141.11
Day 2023-01-31 00:00:00: Sold 35.0 shares at 144.29
Day 2023-02-03 00:00:00: Sold 33.0 shares at 154.50
Day 2023-02-06 00:00:00: Bought 6.0 shares at 151.73
Day 2023-02-07 00:00:00: Sold 8.0 shares at 154.65
Day 2023-02-10 00:00:00: Bought 70.0 shares at 151.01
Day 2023-02-17 00:00:00: Sold 7.0 shares at 152.55
Day 2023-02-21 00:00:00: Sold 36.0 shares at 148.48
Day 2023-02-24 00:00:00: Sold 27.0 shares at 146.71
Day 2023-03-09 00:00:00: Bought 6.0 shares at 150.59
Day 2023-03-13 00:00:00: Sold 6.0 shares at 150.47
Day 2023-03-28 00:00:00: Bought 32.0 shares at 157.65
Day 2023-03-29 00:00:00: Sold 32.0 shares at 160.77
Day 2023-03-30 00:00:00: Bought 6.0 shares at 162.36
Day 2023-04-03 00:00:00: Bought 6.0 shares at 166.17
Day 2023-04-06 00:00:00: Bought 6.0 shares at 164.66
Day 2023-04-10 00:00:00: Bought 32.0 shares at 162.03
Day 2023-04-12 00:00:00: Bought 6.0 shares at 160.10
Day 2023-04-1