In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import time

from collections import deque
import time
import random
from tqdm import tqdm
import os
from PIL import Image
import cv2
import pandas as pd


In [2]:
class DQN(nn.Module):
    """Small LSTM-based Q-network: one LSTM layer followed by a linear head.

    Args:
        input_dim: number of features per time step fed to the LSTM.
        output_dim: number of Q-values (one per action) produced per sample.
    """

    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.out_steps = output_dim

        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=8, batch_first=True)
        # Kept as an attribute for compatibility; forward() does not apply it.
        self.dropout = nn.Dropout(p=0.2)
        self.fc4 = nn.Linear(8, output_dim)

    def forward(self, x):
        """Compute Q-values for a batch of states.

        Args:
            x: tensor of shape [batch, features] or [batch, seq_len, features].

        Returns:
            Tensor of shape [batch, output_dim, 1]. For 3-D input only the
            LSTM output of the last time step is used.
        """
        if x.dim() == 2:
            # nn.LSTM interprets a 2-D tensor as ONE unbatched sequence, which
            # would carry hidden state from one (unrelated) sample of the
            # minibatch into the next. Give every sample its own sequence of
            # length 1 so rows are evaluated independently.
            x = x.unsqueeze(1)

        lstm_out, _ = self.lstm(x)      # [batch, seq_len, hidden]
        last_step = lstm_out[:, -1, :]  # [batch, hidden]

        q_values = self.fc4(last_step)  # [batch, output_dim]
        return q_values.view(-1, self.out_steps, 1)



# W = (96 + 8)*2 * log(96 + 8) 

class DQNAgent:
    """DQN trading agent with an online network, a target network and replay memory.

    The agent scores `action_space` discrete actions from a state vector of
    `observarion_space` features (attribute name kept, typo included, because
    code outside this class may read it).
    """

    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.observarion_space = 96
        self.action_space = 3

        # Online network (trained) and target network (periodically synced copy).
        self.model = DQN(self.observarion_space, self.action_space).to(self.device)
        self.target_model = DQN(self.observarion_space, self.action_space).to(self.device)
        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

        self.optimizer = optim.Adam(self.model.parameters(), lr=0.0001, weight_decay=1e-2)
        self.loss_fn = nn.MSELoss()

        self.REPLAY_MEMORY_SIZE = 50000
        self.MIN_REPLAY_MEMORY_SIZE = 300
        self.replay_memory = deque(maxlen=self.REPLAY_MEMORY_SIZE)
        self.target_update_counter = 0

        # Model settings
        self.UPDATE_TARGET_EVERY = 2
        self.MINIBATCH_SIZE = 16
        self.DISCOUNT = 0.99

        self.AGGREGATE_STATS_EVERY = 10

    def update_replay_memory(self, transition):
        """Append a `(state, action, reward, next_state, done)` tuple to the replay memory."""
        self.replay_memory.append(transition)

    def train(self, terminal_state):
        """Run one optimisation step on a random minibatch from the replay memory.

        Does nothing until the memory holds at least `MIN_REPLAY_MEMORY_SIZE`
        transitions.

        Args:
            terminal_state: truthy when the current episode has just ended;
                ended episodes are counted to schedule target-network syncs.
        """
        if len(self.replay_memory) < self.MIN_REPLAY_MEMORY_SIZE:
            return

        minibatch = random.sample(self.replay_memory, self.MINIBATCH_SIZE)

        # Unpack the transitions column-wise.
        states, actions, rewards, next_states, dones = zip(*minibatch)

        states_v = torch.from_numpy(np.array(states)).float().to(self.device)
        next_states_v = torch.from_numpy(np.array(next_states)).float().to(self.device)
        actions_v = torch.tensor(actions, dtype=torch.int64, device=self.device)
        rewards_v = torch.tensor(rewards, dtype=torch.float32, device=self.device)
        dones_v = torch.tensor(dones, dtype=torch.bool, device=self.device)

        with torch.no_grad():
            # The network returns [B, A, 1]; flatten to [B, A] before reducing.
            target_qs = self.target_model(next_states_v).flatten(start_dim=1)
            max_future_qs = torch.max(target_qs, dim=1)[0]
            # Terminal transitions get no bootstrapped future value.
            new_qs = rewards_v + (~dones_v * self.DISCOUNT * max_future_qs)

        current_qs = self.model(states_v).flatten(start_dim=1)
        # squeeze(1) rather than squeeze(): a bare squeeze would also drop the
        # batch axis when the minibatch holds a single transition.
        predicted_qs = current_qs.gather(1, actions_v.unsqueeze(1)).squeeze(1)

        loss = self.loss_fn(predicted_qs, new_qs)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if terminal_state:
            self.target_update_counter += 1

        # `>=` so the target is synced every UPDATE_TARGET_EVERY finished
        # episodes (a strict `>` syncs one episode late).
        if self.target_update_counter >= self.UPDATE_TARGET_EVERY:
            self.target_model.load_state_dict(self.model.state_dict())
            self.target_update_counter = 0

    def get_qs(self, state):
        """Return the online network's output for a single state as a tensor on `self.device`."""
        state_v = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
        with torch.no_grad():
            qs = self.model(state_v)
        return qs


In [3]:
# Build the discrete-action trading agent and restore its weights from a checkpoint.
from rl_agent import load_dqn_agent
trader = DQNAgent()

# NOTE(review): load_dqn_agent lives in rl_agent (not shown here); presumably it
# loads the checkpoint into `trader` in place — confirm which networks it restores.
load_dqn_agent(trader, 'aapl_best_agent_vc_dimOPT.pth')

Model załadowany z aapl_best_agent_vc_dimOPT.pth


  checkpoint = torch.load(filename, map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"))


In [4]:
# Load the price data for one ticker and stack all splits into a single frame.
from source.database import read_stock_data


ticker = 'AAPL'
# read_stock_data returns four frames, unpacked here as train / validation / RL / test.
train_df, val_df ,rl_df,test_df = read_stock_data(ticker)
# NOTE(review): concat keeps each split's own index, so row labels may repeat
# across splits — confirm before doing label-based lookups on training_set.
training_set = pd.concat([train_df, val_df ,rl_df,test_df])
training_set

Unnamed: 0,ticker,date,open,high,low,close,volume,train_split
0,AAPL,2024-09-30 09:30:00,229.23,231.54,228.85,230.73,4165945,train
1,AAPL,2024-09-30 09:45:00,230.75,231.09,229.91,230.12,1293996,train
2,AAPL,2024-09-30 10:00:00,230.12,230.52,229.73,229.82,1040607,train
3,AAPL,2024-09-30 10:15:00,229.83,230.19,229.50,230.17,1166165,train
4,AAPL,2024-09-30 10:30:00,230.16,230.69,229.93,230.28,917249,train
...,...,...,...,...,...,...,...,...
474,AAPL,2025-06-26 14:45:00,201.02,201.15,200.88,201.04,659472,test
475,AAPL,2025-06-26 15:00:00,201.03,201.05,200.80,200.84,623028,test
476,AAPL,2025-06-26 15:15:00,200.84,201.18,200.75,201.18,846185,test
477,AAPL,2025-06-26 15:30:00,201.18,201.56,200.99,201.16,1205372,test


In [22]:
import gym
import numpy as np
import pandas as pd
from gym import spaces

class PortfolioEnv(gym.Env):
    """Single-asset portfolio environment with a continuous allocation action.

    The observation is the last `window_size` closes followed by the current
    and the previous allocation. The reward is the log portfolio return minus
    a cost proportional to the change in allocation.

    NOTE(review): `close_data` is annotated as a DataFrame, but the code indexes
    it with plain integers and slices (`close_data[i]`, `close_data[a:b]`), which
    looks like it expects a 1-D positional sequence such as a NumPy array —
    confirm against the caller.
    """

    def __init__(self, close_data: pd.DataFrame, window_size=96, initial_cash=100000.0, transaction_cost=0.001):
        """
        close_data: close prices, indexed by time step
        window_size: how many past steps are part of the observation
        initial_cash: starting portfolio value
        transaction_cost: cost rate applied to the absolute allocation change
        """
        super(PortfolioEnv, self).__init__()

        self.close_data = close_data
        self.window_size = window_size
        self.initial_cash = initial_cash
        self.transaction_cost = transaction_cost
        # The asset universe is hard-wired to a single instrument.
        self.assets = 1
        self.n_assets = 1

        # Action = new portfolio allocation (continuous).
        self.action_space = spaces.Box(low=0, high=1, shape=(self.n_assets,), dtype=np.float32)

        # Observation = price window + current allocation + previous allocation.
        obs_len = self.n_assets * (self.window_size + 2)
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(obs_len,), dtype=np.float32)

        self.reset()

    def _get_obs(self):
        """Concatenate the trailing price window with both allocation vectors."""
        first = self.current_step - self.window_size
        price_window = self.close_data[first:self.current_step]
        return np.concatenate([price_window, self.portfolio, self.prev_action])

    def reset(self):
        """Rewind to the first step that has a full window and return its observation."""
        self.current_step = self.window_size
        self.portfolio_value = self.initial_cash
        # Start from an equal-weight allocation.
        self.portfolio = np.full(self.n_assets, 1.0 / self.n_assets)
        self.prev_action = self.portfolio.copy()
        return self._get_obs()

    def step(self, action):
        """Apply `action` as the new allocation and advance one time step.

        Returns the classic gym 4-tuple `(observation, reward, done, info)`.
        """
        # Clip to [0, 1], then normalise so the weights sum to (about) one.
        clipped = np.clip(action, 0, 1)
        action = clipped / (np.sum(clipped) + 1e-8)

        price_before = self.close_data[self.current_step - 1]
        price_now = self.close_data[self.current_step]
        gross_returns = price_now / price_before  # simple gross returns

        # Cost of moving from the held allocation to the requested one.
        turnover = np.sum(np.abs(action - self.portfolio))
        trans_cost = self.transaction_cost * turnover

        # The period's return is earned on the allocation held so far.
        portfolio_return = np.dot(self.portfolio, gross_returns)
        self.portfolio_value *= portfolio_return

        reward = np.log(portfolio_return + 1e-8) - trans_cost

        self.prev_action = self.portfolio.copy()
        self.portfolio = action
        self.current_step += 1

        done = self.current_step >= len(self.close_data) - 1
        info = {'portfolio_value': self.portfolio_value}

        return self._get_obs(), reward, done, info


In [52]:
import gym
from gym import spaces
import numpy as np
import pandas as pd

class PortfolioEnv(gym.Env):
    """Multi-asset trading environment driven by a trader and a portfolio manager.

    For every asset a step receives a discrete trading decision
    (0 = hold, 1 = buy, 2 = sell) and a continuous allocation in [0, 1] that is
    scaled by `max_allocation`. The environment tracks cash and the number of
    shares held per asset. It follows the classic gym API: `reset()` returns an
    observation and `step()` returns `(observation, reward, done, info)`.
    """

    def __init__(self, close_data: pd.DataFrame, window_size=96, initial_cash=100000.0, 
                 transaction_cost=0.001, max_allocation=0.5):
        """
        close_data: DataFrame whose columns are assets (e.g. ['AAPL', 'GOOG']) and whose rows are time steps
        window_size: how many past rows are part of the observation
        initial_cash: starting cash balance
        transaction_cost: proportional cost charged on every executed trade
        max_allocation: maximum fraction of the portfolio allocated in one transaction
        """
        super(PortfolioEnv, self).__init__()
        self.close_data = close_data
        self.assets = close_data.columns.tolist()
        self.n_assets = len(self.assets)
        self.window_size = window_size
        self.initial_cash = initial_cash
        self.transaction_cost = transaction_cost
        self.max_allocation = max_allocation

        # Actions, per asset: [trading_action, allocation]
        # trading_action: 0=hold, 1=buy, 2=sell (discrete)
        # allocation: 0.0-1.0 (continuous, rescaled to max_allocation in step())
        self.action_space = spaces.Dict({
            'trader': spaces.MultiDiscrete([3] * self.n_assets),  # buy/sell/hold for every asset
            'portfolio_manager': spaces.Box(low=0, high=1, shape=(self.n_assets,), dtype=np.float32)  # allocation percentages
        })

        # Observation = price window + position ratios + cash ratio
        #               + previous trader actions + previous allocations
        obs_len = self.window_size * self.n_assets + self.n_assets + 1 + self.n_assets + self.n_assets
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_len,), dtype=np.float32)

        self.reset()

    def _get_obs(self):
        """Build the flat observation vector for the current step."""
        # Slice of close prices covered by the look-back window.
        window_start = max(0, self.current_step - self.window_size)
        window = self.close_data.iloc[window_start:self.current_step]

        # Normalise prices relative to the first price in the window.
        if len(window) > 0:
            normalized_window = window / window.iloc[0]
            # If the window is shorter than window_size, left-pad with zeros.
            if len(normalized_window) < self.window_size:
                padding = np.zeros((self.window_size - len(normalized_window), self.n_assets))
                market_data = np.vstack([padding, normalized_window.values])
            else:
                market_data = normalized_window.values
        else:
            market_data = np.zeros((self.window_size, self.n_assets))

        market_flat = market_data.flatten()

        # Current positions as a fraction of total portfolio value, priced at
        # the most recent row that is already part of the window.
        price_row = self.current_step - 1 if self.current_step > 0 else 0
        current_prices = self.close_data.iloc[price_row].values
        total_value = self.cash + np.sum(self.positions * current_prices)
        position_ratios = (self.positions * current_prices) / (total_value + 1e-8)

        # Cash as a fraction of total value.
        cash_ratio = self.cash / (total_value + 1e-8)

        # Previous actions (normalised): trader action 0..2 -> 0..1.
        prev_trader_actions = self.prev_trader_actions.astype(np.float32) / 2.0
        prev_allocations = self.prev_allocations

        return np.concatenate([
            market_flat, 
            position_ratios, 
            [cash_ratio],
            prev_trader_actions,
            prev_allocations
        ])

    def reset(self):
        """Reset cash, positions and history; return the first observation."""
        self.current_step = self.window_size
        self.cash = self.initial_cash
        self.positions = np.zeros(self.n_assets)  # number of shares held per asset
        self.portfolio_value_history = []
        self.prev_trader_actions = np.zeros(self.n_assets)
        self.prev_allocations = np.zeros(self.n_assets)
        return self._get_obs()

    def step(self, action):
        """Execute one trading step.

        Args:
            action: mapping with keys 'trader' (per-asset 0/1/2 decisions) and
                'portfolio_manager' (per-asset allocations in [0, 1]).

        Returns:
            `(observation, reward, done, info)`. Once the data is exhausted the
            call returns a zero reward, `done=True` and an empty info dict.
        """
        if self.current_step >= len(self.close_data):
            return self._get_obs(), 0, True, {}

        # Unpack the action. Coerce to an array so plain lists are accepted too:
        # _get_obs() later calls .astype() on the stored copy.
        trader_actions = np.asarray(action['trader'])
        allocation_percentages = action['portfolio_manager']

        # Rescale allocations to max_allocation.
        allocation_percentages = np.clip(allocation_percentages, 0, 1) * self.max_allocation

        curr_prices = self.close_data.iloc[self.current_step].values
        prev_prices = self.close_data.iloc[self.current_step - 1].values

        # Portfolio value before any trade, at the previous prices.
        prev_portfolio_value = self.cash + np.sum(self.positions * prev_prices)

        total_transaction_cost = 0
        executed_actions = []

        # Execute the requested action for every asset.
        for i, (trader_action, allocation) in enumerate(zip(trader_actions, allocation_percentages)):
            action_info = {
                'asset': self.assets[i],
                'trader_action': trader_action,
                'requested_allocation': allocation,
                'executed': False,
                'shares_traded': 0,
                'cost': 0
            }

            if trader_action == 1:  # BUY
                # Buy shares worth `allocation` of the current portfolio value.
                current_portfolio_value = self.cash + np.sum(self.positions * curr_prices)
                investment_amount = current_portfolio_value * allocation
                transaction_cost = investment_amount * self.transaction_cost

                # The fee is paid from cash as well, so the affordability check
                # must cover investment + fee; otherwise cash could go negative.
                if investment_amount > 0 and investment_amount + transaction_cost <= self.cash:
                    shares_to_buy = investment_amount / curr_prices[i]

                    self.positions[i] += shares_to_buy
                    self.cash -= (investment_amount + transaction_cost)
                    total_transaction_cost += transaction_cost

                    action_info.update({
                        'executed': True,
                        'shares_traded': shares_to_buy,
                        'cost': transaction_cost,
                        'investment_amount': investment_amount
                    })

            elif trader_action == 2:  # SELL
                # Sell `allocation` of the shares currently held.
                shares_to_sell = self.positions[i] * allocation

                if shares_to_sell > 0:
                    sale_amount = shares_to_sell * curr_prices[i]
                    transaction_cost = sale_amount * self.transaction_cost

                    self.positions[i] -= shares_to_sell
                    self.cash += (sale_amount - transaction_cost)
                    total_transaction_cost += transaction_cost

                    action_info.update({
                        'executed': True,
                        'shares_traded': -shares_to_sell,  # negative for sell
                        'cost': transaction_cost,
                        'sale_amount': sale_amount
                    })

            # trader_action == 0 (HOLD) - nothing to do
            executed_actions.append(action_info)

        # Portfolio value after the trades, at the current prices.
        curr_portfolio_value = self.cash + np.sum(self.positions * curr_prices)

        # Simple return over the step.
        portfolio_return = (curr_portfolio_value - prev_portfolio_value) / prev_portfolio_value

        # Reward = log return - transaction costs - penalty for failed trades.
        # NOTE(review): fees were already deducted from cash above, so they also
        # lower portfolio_return; cost_penalty charges them a second time —
        # confirm this extra shaping is intended.
        base_reward = np.log(1 + portfolio_return + 1e-8)
        cost_penalty = total_transaction_cost / prev_portfolio_value

        # Penalty for trades the trader requested but that could not be executed.
        failed_actions = sum(1 for entry in executed_actions 
                           if entry['trader_action'] != 0 and not entry['executed'])
        failure_penalty = failed_actions * 0.01  # small penalty

        reward = base_reward - cost_penalty - failure_penalty

        # Remember the actions for the next observation.
        self.prev_trader_actions = trader_actions.copy()
        self.prev_allocations = allocation_percentages.copy()

        self.portfolio_value_history.append(curr_portfolio_value)
        self.current_step += 1

        done = self.current_step >= len(self.close_data) - 1

        info = {
            'portfolio_value': curr_portfolio_value,
            'cash': self.cash,
            'positions': self.positions.copy(),
            'transaction_cost': total_transaction_cost,
            'portfolio_return': portfolio_return,
            'executed_actions': executed_actions,
            'failed_actions': failed_actions
        }

        return self._get_obs(), reward, done, info

    def get_portfolio_allocation(self):
        """Return the current portfolio allocation, or None when current_step is 0."""
        if self.current_step > 0:
            curr_prices = self.close_data.iloc[self.current_step - 1].values
            total_value = self.cash + np.sum(self.positions * curr_prices)

            # Allocation per asset.
            asset_values = self.positions * curr_prices
            allocation = asset_values / (total_value + 1e-8)

            # Cash allocation.
            cash_allocation = self.cash / (total_value + 1e-8)

            return {
                'assets': dict(zip(self.assets, allocation)),
                'cash': cash_allocation,
                'total_value': total_value,
                'positions': dict(zip(self.assets, self.positions))
            }
        return None

    def sample_action(self):
        """Draw a random action from the action space (for testing)."""
        return {
            'trader': self.action_space['trader'].sample(),
            'portfolio_manager': self.action_space['portfolio_manager'].sample()
        }

In [53]:

# Build the training and validation environments from the close-price column.
# NOTE(review): TimeSeriesEnv_simple is imported but not used in this cell.
from enviroments import TimeSeriesEnv_simple

#data = training_set['close'].values
#data = None
# NOTE(review): the three lines below copy the 'close' Series, store the same
# Series under the ticker label and wrap the result in a DataFrame. The intent
# appears to be a one-column price frame for PortfolioEnv — verify the resulting
# column name and consider a direct column selection instead.
data = training_set['close'].copy()
data[ticker] = training_set['close']
data = pd.DataFrame(data[ticker])


# Chronological 80/20 split (no shuffling): first part trains, the rest validates.
data_split = int(len(data)  * 0.8)

train_data = data[:data_split]
valid_data = data[data_split:]

# Number of past rows in every observation.
WINDOW_SIZE = 96
env = PortfolioEnv(train_data, window_size=WINDOW_SIZE)
valid_env = PortfolioEnv(valid_data,window_size=WINDOW_SIZE)

In [55]:
# Sanity check: display the observation vector the training environment currently produces.
env._get_obs()

array([1.        , 0.99735622, 0.996056  , 0.99757292, 0.99804967,
       1.00030338, 1.00108352, 1.00173363, 1.00221038, 1.00307719,
       1.00342392, 1.00251376, 1.00247042, 1.00108352, 0.99965327,
       1.00047675, 0.99982664, 1.00130022, 0.99986998, 0.99770294,
       1.00112686, 1.00091015, 0.99943657, 0.9986131 , 0.99934989,
       1.0062844 , 0.97863303, 0.97663936, 0.97334547, 0.97165518,
       0.97265202, 0.97312877, 0.97083171, 0.97382222, 0.97473237,
       0.97269536, 0.97299874, 0.97308542, 0.96966151, 0.97065835,
       0.96840463, 0.96714775, 0.96974819, 0.97230529, 0.97369219,
       0.97373554, 0.97377888, 0.9743423 , 0.97538248, 0.97672604,
       0.97685607, 0.97720279, 0.96593421, 0.9652841 , 0.96983487,
       0.97356217, 0.97568587, 0.97642266, 0.97707277, 0.97702943,
       0.98058337, 0.97971655, 0.97962987, 0.9801933 , 0.97958653,
       0.97884974, 0.98010662, 0.98023664, 0.97906644, 0.98088675,
       0.98067005, 0.97962987, 0.97967321, 0.9799766 , 0.98084

In [25]:
class DQNPortfolio(nn.Module):
    """LSTM network that maps a state to a softmax-normalised allocation vector.

    Args:
        input_dim: number of features per time step fed to the LSTM.
        output_dim: number of allocation weights produced per sample.
    """

    def __init__(self, input_dim, output_dim):
        super(DQNPortfolio, self).__init__()
        self.out_steps = output_dim
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=32, batch_first=True)
        # Kept as an attribute for compatibility; forward() does not apply it.
        self.dropout = nn.Dropout(p=0.2)
        self.fc = nn.Linear(32, output_dim)

    def forward(self, x):
        """Return allocations of shape [batch, output_dim, 1] that sum to 1 over dim 1.

        Args:
            x: tensor of shape [batch, features] or [batch, seq_len, features].
                For 3-D input only the last time step's LSTM output is used.
        """
        if x.dim() == 2:
            # nn.LSTM treats a 2-D tensor as ONE unbatched sequence, which would
            # leak hidden state between unrelated samples of a minibatch. Make
            # every sample its own sequence of length 1 instead.
            x = x.unsqueeze(1)

        lstm_out, _ = self.lstm(x)        # [batch, seq_len, hidden]
        last_hidden = lstm_out[:, -1, :]  # take the last step -> [batch, hidden]

        x = self.fc(last_hidden)          # [batch, output_dim]
        x = x.view(-1, self.out_steps, 1)
        # Portfolio allocation expressed as a probability distribution.
        return torch.softmax(x, dim=1)


class Agent_portfolio:
    """Portfolio-manager agent: online/target `DQNPortfolio` networks plus replay memory.

    Args:
        input_dim: number of features in a state vector.
        action_dim: number of outputs of the network (size of the action axis).
    """

    def __init__(self, input_dim=96 + 1, action_dim=1):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.input_dim = input_dim
        self.action_dim = action_dim

        # Online network (trained) and target network (periodically synced copy).
        self.model = DQNPortfolio(input_dim, action_dim).to(self.device)
        self.target_model = DQNPortfolio(input_dim, action_dim).to(self.device)
        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

        self.optimizer = optim.Adam(self.model.parameters(), lr=0.0001, weight_decay=1e-2)
        self.loss_fn = nn.MSELoss()

        self.replay_memory = deque(maxlen=50000)
        self.MIN_REPLAY_MEMORY_SIZE = 300
        self.UPDATE_TARGET_EVERY = 2
        self.MINIBATCH_SIZE = 16
        self.DISCOUNT = 0.99
        self.target_update_counter = 0

    def update_replay_memory(self, transition):
        """Append a `(state, action, reward, next_state, done)` tuple to the replay memory."""
        self.replay_memory.append(transition)

    def train(self, terminal_state):
        """Run one optimisation step on a random minibatch from the replay memory.

        Does nothing until the memory holds at least `MIN_REPLAY_MEMORY_SIZE`
        transitions.

        Args:
            terminal_state: truthy when the current episode has just ended;
                ended episodes are counted to schedule target-network syncs.

        Raises:
            ValueError: if a stored action is not a single index in
                `[0, action_dim - 1]`, or a reward is not a single value.
        """
        if len(self.replay_memory) < self.MIN_REPLAY_MEMORY_SIZE:
            return

        minibatch = random.sample(self.replay_memory, self.MINIBATCH_SIZE)
        states, actions, rewards, next_states, dones = zip(*minibatch)
        batch_size = len(minibatch)

        # Tensor preparation (via NumPy: building tensors from nested Python
        # sequences element by element is slow).
        states = torch.from_numpy(np.array(states, dtype=np.float32)).to(self.device)
        next_states = torch.from_numpy(np.array(next_states, dtype=np.float32)).to(self.device)
        rewards = torch.from_numpy(
            np.array(rewards, dtype=np.float32).reshape(batch_size)
        ).to(self.device)  # [B]
        dones = torch.tensor(dones, dtype=torch.bool, device=self.device)  # [B]

        # Actions are used as indices into the network output.
        # NOTE(review): the int64 cast truncates fractional values, as the
        # original code did — confirm that replayed actions are meant to be
        # discrete indices rather than continuous allocations.
        action_idx = np.array(actions).astype(np.int64).reshape(batch_size, -1)
        if action_idx.shape[1] != 1:
            raise ValueError(
                f"expected one action index per transition, got shape {action_idx.shape}"
            )
        # Validate on the CPU: an out-of-range index passed to gather() on the
        # GPU only surfaces as an opaque "device-side assert triggered".
        if action_idx.min() < 0 or action_idx.max() >= self.action_dim:
            raise ValueError(
                f"action indices must be in [0, {self.action_dim - 1}], "
                f"got range [{action_idx.min()}, {action_idx.max()}]"
            )
        action_idx = torch.from_numpy(action_idx).to(self.device)  # [B, 1]

        with torch.no_grad():
            # The network returns [B, A, 1]; flatten to [B, A] so the max and
            # the target below stay one value per transition.
            target_qs = self.target_model(next_states).flatten(start_dim=1)
            max_future_qs = torch.max(target_qs, dim=1)[0]  # [B]
            # Terminal transitions get no bootstrapped future value.
            target = rewards + (~dones * self.DISCOUNT * max_future_qs)

        current_qs = self.model(states).flatten(start_dim=1)  # [B, A]
        predicted = current_qs.gather(1, action_idx).squeeze(1)  # [B]

        loss = self.loss_fn(predicted, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if terminal_state:
            self.target_update_counter += 1
        if self.target_update_counter >= self.UPDATE_TARGET_EVERY:
            self.target_model.load_state_dict(self.model.state_dict())
            self.target_update_counter = 0

    def get_action(self, state):
        """Return the online network's output for one state as a NumPy array (batch axis removed)."""
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
        with torch.no_grad():
            action = self.model(state)
        return action.cpu().numpy()[0]


In [39]:
def evaluate_steps_portfolio(env, model, device="cuda:0"):
    """Roll `model` through one full episode of `env` and return the summed reward.

    Args:
        env: environment whose `reset()` returns a state and whose `step()`
            returns `(state, reward, done)`, optionally followed by extra
            items such as the gym `info` dict. Each state must unpack into
            `(close_prices, trader_action)` with `close_prices` 2-D.
        model: callable mapping a `[1, seq_len, features]` float tensor to an
            allocation tensor with a leading batch axis of 1.
        device: torch device on which the model input is created.

    Returns:
        Sum of the rewards collected until the environment reports `done`.
    """
    state = env.reset()  # state = (close_prices, trader_decision)
    total_reward = 0
    done = False

    while not done:
        close_prices, trader_action = state  # unpack input

        # Repeat the trader's decision for every time step of the price window.
        seq_len = close_prices.shape[0]
        trader_seq = np.tile(trader_action.flatten(), (seq_len, 1))  # [seq_len, trader features]

        # Join prices and decision feature-wise and add the batch axis.
        model_input = np.concatenate([close_prices, trader_seq], axis=1)  # [seq_len, features]
        model_input = torch.tensor(model_input, dtype=torch.float32, device=device).unsqueeze(0)

        with torch.no_grad():
            allocation = model(model_input)
            allocation = allocation.squeeze(0).cpu().numpy()

        # Gym-style environments return (state, reward, done, info); accept that
        # as well as a bare 3-tuple instead of failing on the extra item.
        state, reward, done, *_ = env.step(allocation)
        total_reward += reward

    return total_reward


In [41]:
# Training-run setup: debug flag, result buffers, the agent to train and the
# starting exploration rate.
import os
# NOTE(review): this variable is presumably only honoured if it is set before
# CUDA is initialised in the process — confirm, and move it to the first cell if so.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

reward_all = []        # training reward per episode
evaluate_revards = []  # validation reward per episode (identifier spelling kept)

from copy import deepcopy
# NOTE(review): the default input_dim is 96 + 1; check it against the length of
# the state vectors the environment and add_trader_action actually produce.
portfolio_manager = Agent_portfolio()
epsilon = 1  # start fully exploratory

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# CUDA debugging switches, added while chasing the "device-side assert" error.
# NOTE(review): these are presumably read when CUDA initialises (and
# TORCH_USE_CUDA_DSA, per the error text, at PyTorch compile time), so setting
# them this late in a running kernel may have no effect — confirm.
import os

os.environ['CUDA_LAUNCH_BLOCKING']="1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

def add_trader_action(trader, current_state):
    """Append the trader's greedy action to a state vector.

    The state is run through `trader.target_model`; the index of the largest
    output is appended to the state, whose values are rounded to 3 decimals.

    Args:
        trader: object exposing `device` and a callable `target_model`.
        current_state: numeric state vector.

    Returns:
        A Python list: the rounded, flattened state followed by the action index.
    """
    state_batch = torch.tensor(current_state, dtype=torch.float32).unsqueeze(0).to(trader.device)

    with torch.no_grad():
        best_action = torch.argmax(trader.target_model(state_batch)).item()

    flat_state = state_batch.cpu().numpy().flatten()
    augmented = np.round(flat_state, 3).tolist()
    augmented.append(best_action)
    return augmented

#EPSILON_DECAY = 0.998
EPSILON_DECAY = 0.99

def train_episode(episode, epsilon):
    """Play one epsilon-greedy training episode and return its summed reward.

    Uses the notebook globals `env`, `trader` and `portfolio_manager`. Every
    state is extended with the trader's greedy action before it is stored in
    the replay memory; the portfolio manager is trained after each step.

    Args:
        episode: 1-based episode number (a summary is printed every 5th episode).
        epsilon: probability of taking a random action instead of the model's.

    Returns:
        Sum of the rewards collected during the episode.
    """
    episode_reward = 0

    current_state = env.reset()
    current_state = add_trader_action(trader, np.array(current_state, dtype='float32'))

    done = False
    while not done:
        if np.random.rand() < epsilon:
            # Random allocation over a single asset.
            action = np.random.dirichlet(np.ones(1))
        else:
            action = portfolio_manager.get_action(current_state)

        # NOTE(review): the PortfolioEnv variant with a Dict action space reads
        # action['trader'] / action['portfolio_manager']; a bare array as built
        # here does not match that interface — confirm which env is intended.
        # Gym-style environments return (state, reward, done, info); accept
        # that as well as a bare 3-tuple.
        new_state, reward, done, *_ = env.step(action)
        new_state = add_trader_action(trader, np.array(new_state, dtype='float32'))

        episode_reward += reward
        portfolio_manager.update_replay_memory((current_state, action, reward, new_state, done))
        portfolio_manager.train(done)

        current_state = new_state

    if not episode % 5:
        # Not every environment exposes `total_profit`; fall back to the
        # episode's accumulated reward instead of raising AttributeError.
        shown_reward = getattr(env, 'total_profit', episode_reward)
        print(f"Episode: {episode} Total Reward: {shown_reward} Epsilon: {epsilon:.2f}")

    return episode_reward

# Author's note: worked great with 200 episodes and batch size 64.
EPISODES = 100
MIN_EPSILON = 0.001

# Train the portfolio manager, validate after every episode, keep a snapshot of
# the best-validating agent and roll back to it when validation degrades.
# The agent being trained (inside train_episode) is `portfolio_manager`, so it
# is also the one that is evaluated, snapshotted and restored here.
max_agent = deepcopy(portfolio_manager)
max_reward = 0
evaluate_every = 1
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
    reward = train_episode(episode, epsilon)

    # Exponential epsilon decay with a floor at MIN_EPSILON.
    if epsilon > MIN_EPSILON:
        epsilon = max(MIN_EPSILON, epsilon * EPSILON_DECAY)

    reward_all.append(reward)

    # evaluate_steps_portfolio resets the environment itself.
    reward_valid_dataset = evaluate_steps_portfolio(
        valid_env, portfolio_manager.target_model, device=portfolio_manager.device
    )
    evaluate_revards.append(reward_valid_dataset)

    # New best validation result (ignoring the first 10 warm-up episodes).
    if reward_valid_dataset > max_reward and episode > 10:
        max_reward = reward_valid_dataset
        max_agent = deepcopy(portfolio_manager)

    # Validation dropped to 70% of the best or lower: restore the best snapshot.
    # Rebinding the global is what train_episode picks up on its next call.
    if max_reward > 0 and episode > 10 and reward_valid_dataset / max_reward <= .7:
        portfolio_manager = deepcopy(max_agent)


# TODO (author's note): take Q from the target model.

# Author's note: before optimisation a run took about 18 min.

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
