In [None]:
import numpy as np
import random
from collections import deque
import torch
import torch.optim as optim

class DQNTrainer:
    """Train a DQN with experience replay and a softly-updated target network.

    The trainer owns the online ``model``, the ``target_model``, an AdamW
    optimizer, a bounded replay memory and the epsilon-greedy exploration
    schedule. Actions are encoded as ``0`` = Buy, ``1`` = Sell, ``2`` = Hold.
    """

    def __init__(self, model, target_model, device='cuda'):
        """Move both networks to ``device`` and set up training state.

        Args:
            model: Online Q-network; it is called with the normalized feature
                tensor and must return one Q-value per action.
            target_model: Network used to evaluate next-state values.
            device: Torch device (string or ``torch.device``) for the networks
                and for every tensor the trainer creates.
        """
        self.device = device
        self.model = model.to(device)
        self.target_model = target_model.to(device)
        # Optimize the parameters of the module that actually lives on `device`.
        self.optimizer = optim.AdamW(self.model.parameters(), lr=0.0001, weight_decay=1e-5)
        self.loss_fn = torch.nn.SmoothL1Loss()  # Huber loss

        # Match notebook's hyperparameters
        self.memory = deque(maxlen=10000)
        self.batch_size = 128
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.997
        self.tau = 0.001  # For soft target updates
        self.transaction_cost = 0.0001  # 0.01% per transaction

        # Technical indicators from notebook. The two Bollinger band entries
        # are folded into a single feature by `_normalize_features`, so the
        # normalized vector has 9 entries although 10 names are listed here.
        self.feature_order = [
            'MA50', 'RSI', 'MACD', 'BB_upper', 'BB_lower',
            'ADX', 'CCI', 'ATR', 'ROC', 'OBV'
        ]

    def _normalize_features(self, state):
        """Turn a raw indicator mapping into a 9-element float32 tensor.

        Args:
            state: Mapping indexable by ``'close'``, ``'MA50'``, ``'RSI'``,
                ``'MACD'``, ``'BB_upper'``, ``'BB_lower'``, ``'ADX'``,
                ``'CCI'``, ``'ATR'``, ``'ROC'`` and ``'OBV'``.

        Returns:
            1-D tensor of 9 scaled features on ``self.device``.
        """
        close = state['close']
        band_width = state['BB_upper'] - state['BB_lower'] + 1e-8  # avoid division by zero
        features = [
            state['MA50'] / close,                      # Moving average ratio
            state['RSI'] / 100,                         # RSI normalized 0-1
            state['MACD'] * 100,                        # MACD scaled
            (close - state['BB_lower']) / band_width,   # Position inside Bollinger bands
            state['ADX'] / 100,                         # ADX normalized
            state['CCI'] / 200,                         # CCI scaled
            state['ATR'] / close,                       # ATR ratio
            state['ROC'] / 100,                         # ROC percentage
            state['OBV'] / 1e6,                         # OBV scaled
        ]
        return torch.tensor(features, dtype=torch.float32, device=self.device)

    def _calculate_reward(self, current_price, next_price, action, position):
        """Compute the scaled reward for taking ``action`` over one step.

        Buy earns the relative price change minus the transaction fee, Sell
        earns the negated change minus the fee, and any other action (Hold)
        earns the change multiplied by ``position`` with no fee.

        Returns:
            The reward multiplied by 100.
        """
        price_change = (next_price - current_price) / current_price
        fee = self.transaction_cost

        if action == 0:  # Buy
            reward = price_change - fee
        elif action == 1:  # Sell
            reward = -price_change - fee
        else:  # Hold
            reward = price_change * position

        return reward * 100  # Scaling factor from notebook

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory.

        States are stored already normalized. For terminal transitions
        (``done`` truthy) the next state is stored as ``None``.
        """
        self.memory.append((
            self._normalize_features(state),
            action,
            reward,
            self._normalize_features(next_state) if not done else None,
            done
        ))

    def act(self, state, current_position):
        """Choose an action epsilon-greedily.

        Args:
            state: Raw indicator mapping (see ``_normalize_features``).
            current_position: Accepted for interface compatibility; it is not
                used when selecting the action.

        Returns:
            ``0``, ``1`` or ``2`` when exploring, otherwise the index of the
            largest Q-value predicted by the online model.
        """
        if np.random.rand() <= self.epsilon:
            return random.choice([0, 1, 2])  # Buy, Sell, Hold

        state_tensor = self._normalize_features(state)
        with torch.no_grad():
            q_values = self.model(state_tensor)
            return torch.argmax(q_values).item()

    def replay(self):
        """Run one optimization step on a random minibatch from memory.

        Returns:
            The scalar loss as a float, or ``None`` when the memory holds
            fewer than ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return None

        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Convert to tensors
        states = torch.stack(states)
        actions = torch.tensor(actions, dtype=torch.long, device=self.device)
        rewards = torch.tensor(rewards, dtype=torch.float32, device=self.device)
        dones = torch.tensor(dones, dtype=torch.bool, device=self.device)

        # Terminal transitions store `None` as next state. Keep a mask so the
        # bootstrapped values stay aligned with the batch instead of silently
        # shrinking the tensor (which breaks the target computation).
        has_next = torch.tensor(
            [s is not None for s in next_states],
            dtype=torch.bool,
            device=self.device,
        )

        # Current Q values of the actions that were actually taken.
        # squeeze(1) keeps the batch dimension even for a batch of one.
        current_q = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Target Q values; entries without a next state keep a value of zero.
        with torch.no_grad():
            next_q = torch.zeros_like(rewards)
            if has_next.any():
                valid_next = torch.stack([s for s in next_states if s is not None])
                next_q[has_next] = self.target_model(valid_next).max(1)[0]
            target_q = rewards + (1 - dones.float()) * self.gamma * next_q

        # Compute loss
        loss = self.loss_fn(current_q, target_q)

        # Optimize model
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
        self.optimizer.step()

        # Soft target network update: target <- tau * online + (1 - tau) * target
        with torch.no_grad():
            for target_param, param in zip(self.target_model.parameters(), self.model.parameters()):
                target_param.copy_(self.tau * param + (1.0 - self.tau) * target_param)

        # Decay epsilon, never dropping below the configured floor.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        return loss.item()

    def save_checkpoint(self, path):
        """Write both networks, the optimizer state and epsilon to ``path``."""
        torch.save({
            'model_state': self.model.state_dict(),
            'target_state': self.target_model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'epsilon': self.epsilon
        }, path)

    def load_checkpoint(self, path):
        """Restore the state written by ``save_checkpoint`` from ``path``."""
        # map_location lets a checkpoint saved on another device (e.g. CUDA)
        # be restored onto the device this trainer was created with.
        checkpoint = torch.load(path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state'])
        self.target_model.load_state_dict(checkpoint['target_state'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.epsilon = checkpoint['epsilon']


In [None]:
# Build the online network, its target network and the trainer.
# The network input width equals the number of normalized features.
from dqnModel import DQN, TargetNetwork
input_size = 9  # Number of normalized features
model = DQN(input_size)
target_model = TargetNetwork(model)
trainer = DQNTrainer(model, target_model)

# Training loop: each episode runs the environment until it reports `done`.
# NOTE(review): `env` is not defined in this cell - presumably created in
# another cell; confirm before running.
for episode in range(1000):
    state = env.reset()
    position = 0  # Position carried into the next step
    total_reward = 0
    done = False

    while not done:
        action = trainer.act(state, position)
        next_state, price_change, done = env.step(action)

        # A Buy sets the position to 1 and a Sell to -1; any other action
        # (Hold) leaves the position untouched.
        if action == 0:
            position = 1
        elif action == 1:
            position = -1

        # The reward is computed from the close prices with the position
        # already updated for this step's action.
        reward = trainer._calculate_reward(
            state['close'], next_state['close'], action, position
        )

        # Store the transition, then take one optimization step.
        trainer.remember(state, action, reward, next_state, done)
        loss = trainer.replay()

        total_reward += reward
        state = next_state

    # Write a checkpoint on every 100th episode, starting with episode 0.
    if episode % 100 == 0:
        trainer.save_checkpoint(f'dqn_checkpoint_{episode}.pth')
