# Importing Libraries and Helper Classes and Functions

In [None]:
import sys
import os
import json
import numpy as np
from tqdm import tqdm
import torch
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(parent_dir)
from utils_data import  generateTargetDataBuySide, generateTargetDataSellSide, getTechnicalIndicators, normalize_dataframe_with_mean_std
from utils_data import UpstoxStockDataFetcher
import pandas as pd

# Loading Data

In [None]:
# Pull historical price data for one symbol through the project's Upstox
# fetcher helper. How the date bounds are interpreted (inclusive/exclusive,
# candle interval) is decided inside `get_stock_data` and is not visible here.
fetcher = UpstoxStockDataFetcher()
stock_symbol = "SUZLON"
start_date = "2025-06-15"
end_date = "2025-06-18"
# `df` is the working frame that every later cell extends and consumes.
df = fetcher.get_stock_data(stock_symbol, start_date, end_date)
# Bare expression: renders the frame when executed as a notebook cell.
df


In [None]:
# Load the separately captured 2025-06-19 rows, dropping exact duplicate rows.
df_today = pd.read_csv(
    r"C:\Users\srija\Assignment\Trading\json_files\suzlon_2025-06-19.csv"
).drop_duplicates()

# The CSV timestamps carry no UTC offset; append the +05:30 offset as text and
# parse the result so the column becomes offset-aware datetimes.
df_today['time'] = pd.to_datetime(df_today['time'].astype(str) + "+05:30")

# Stack the new rows underneath the fetched history with a fresh 0..n-1 index.
df = pd.concat([df, df_today], ignore_index=True)
df


In [None]:
# Enrich the price frame via the project helper. Later cells read columns such
# as MA50, RSI, MACD, BB_upper/BB_lower, ADX, CCI, ATR, ROC and OBV from the
# result, so those are presumably added here -- confirm in utils_data.
df = getTechnicalIndicators(df)
# Pre-compute, for every row, how a trade opened at that row would end.
# Later cells read the columns 'action' ("Target Hit" / "Stop Loss Hit" /
# "End of Day"), 'next_state_index' and 'delay' from both tables.
# NOTE(review): the two numbers look like price multipliers for the target and
# the stop loss (buy: +0.5% / -1%, sell: -0.5% / +1%) -- confirm against the
# helper signatures.
target_buy = generateTargetDataBuySide(df,1.005,0.99)
target_sell = generateTargetDataSellSide(df,0.995,1.01)

# Data Analysis

In [None]:
# Tally how the pre-computed buy-side trades ended, one count per outcome label.
buy_outcomes = target_buy['action']
count1 = int((buy_outcomes == 'End of Day').sum())
count2 = int((buy_outcomes == 'Target Hit').sum())
count3 = int((buy_outcomes == 'Stop Loss Hit').sum())

print(f"End of Day: {count1}")
print(f"Target Hit: {count2}")
print(f"Stop Loss Hit: {count3}")

In [None]:
# Tally how the pre-computed sell-side trades ended, one count per outcome label.
sell_outcomes = target_sell['action']
count4 = int((sell_outcomes == 'End of Day').sum())
count5 = int((sell_outcomes == 'Target Hit').sum())
count6 = int((sell_outcomes == 'Stop Loss Hit').sum())

print(f"End of Day: {count4}")
print(f"Target Hit: {count5}")
print(f"Stop Loss Hit: {count6}")

# Normalizing and Storing Params

In [None]:
# Normalize the feature frame; the helper also hands back the parameters it
# used so the same scaling can be reproduced later.
df_normalized, norm_param = normalize_dataframe_with_mean_std(df)

# Keep the normalization parameters on disk so inference can reuse them.
norm_params_path = "C:/Users/srija/Assignment/Trading/json_files/suzlon_14_june_norm_params.json"
with open(norm_params_path, "w") as f:
    json.dump(norm_param, f)
df_normalized

# Loading Models

In [None]:
from Models.DQN import DQN,DQNAgent
from trading_environment import StockTradingEnv

# 16 matches the number of features assembled by get_state(); 3 is the number
# of actions the training loop handles (0, 1 and 2).
state_dim, n_actions = 16, 3

policy_net = DQN(state_dim, n_actions)
target_net = DQN(state_dim, n_actions)

# The target network starts as an exact copy of the policy network and is kept
# in evaluation mode.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

In [None]:
def get_state(df, current_step):
    """Build the observation vector for one row of ``df``.

    Args:
        df: DataFrame holding (at least) the 16 feature columns listed below.
        current_step: Positional row index passed to ``df.iloc``; negative
            values count from the end.

    Returns:
        A 1-D ``float32`` NumPy array with the features in a fixed order,
        independent of the column order of ``df``.
    """
    # The order here defines the layout of the network input.
    feature_order = (
        'time', 'open', 'high', 'low', 'close', 'volume',
        'MA50', 'RSI', 'MACD', 'BB_upper', 'BB_lower',
        'ADX', 'CCI', 'ATR', 'ROC', 'OBV',
    )
    record = df.iloc[current_step]
    return np.array([record[name] for name in feature_order], dtype=np.float32)


In [None]:
# Folder that receives new checkpoints, and the checkpoint training resumes from.
save_folder = r"C:\Users\srija\Assignment\Trading\Models\trained_models\suzlon_14_june"
model_path = r"C:\Users\srija\Assignment\Trading\Models\trained_models\suzlon_14_june\suzlon_14_june_1496.pth"

# Choose the device first so the checkpoint can be mapped onto it directly.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# map_location keeps a checkpoint that was saved from a GPU loadable on a
# CPU-only machine (and vice versa).
policy_net.load_state_dict(torch.load(model_path, map_location=device))

# The target network was last synced with the freshly initialised policy
# network; copy the restored weights so the first resumed episode does not
# bootstrap from an untrained target.
target_net.load_state_dict(policy_net.state_dict())

# Move both networks before the agent is built so anything the agent derives
# from them (e.g. an optimizer) sees the parameters on their final device.
policy_net.to(device)
target_net.to(device)
policy_net.train()

env = StockTradingEnv(df_normalized)
agent = DQNAgent(env, policy_net, target_net)

# Defining Reward Function

In [None]:
def calculate_optimized_scalping_reward(delay, action_type, success_base_reward=1500, 
                                     failure_base_penalty=1000, min_delay_threshold=60, 
                                     max_reward=2500, decay_rate=0.3,
                                     opportunity_cost_factor=0.2,
                                     missed_opp_multiplier=2.0,
                                     consecutive_successes=0, 
                                     consecutive_success_bonus=0.15,
                                     no_action_reward=100):
    """Compute the reward for one decision of the scalping agent.

    ``delay`` is divided by 60 on entry; every threshold and rate below is
    applied to that scaled value (presumably seconds converted to minutes --
    confirm against the code that produces the 'delay' column).

    Args:
        delay: Time until the outcome materialised, before scaling by 1/60.
        action_type: One of ``'success'``, ``'failure'``,
            ``'missed_opportunity'`` or ``'no_action'``.
        success_base_reward: Base reward of a successful trade; also the
            scale of the success-side opportunity cost.
        failure_base_penalty: Magnitude of the failure / missed-opportunity
            penalty; also the scale of the failure-side opportunity cost.
        min_delay_threshold: Scaled delay up to which the base reward is
            interpolated from ``max_reward`` down to ``success_base_reward``.
        max_reward: Base reward of a success with zero delay.
        decay_rate: Exponent rate of the ``exp(-decay_rate * delay)`` decay.
        opportunity_cost_factor: Per-unit-delay cost as a fraction of the
            respective base amount.
        missed_opp_multiplier: Multiplier on the penalty for a missed
            opportunity.
        consecutive_successes: Current winning streak; only used for
            ``'success'``.
        consecutive_success_bonus: Fractional bonus per streak element.
        no_action_reward: Reward returned for ``'no_action'``.

    Returns:
        The reward (positive) or penalty (negative) as a number.

    Raises:
        ValueError: If ``action_type`` is not one of the four known values.
            (Previously this case silently returned ``None``.)
    """
    delay = delay / 60

    if action_type == 'success':
        # Faster wins start from a larger base: linear from max_reward at zero
        # delay down to success_base_reward at the threshold.
        if delay <= min_delay_threshold:
            base_reward = max_reward - (max_reward - success_base_reward) * (delay / min_delay_threshold)
        else:
            base_reward = success_base_reward

        # Exponential time decay on top of the base.
        reward = base_reward * np.exp(-decay_rate * delay)

        # Opportunity cost grows with delay but may eat at most 80% of the
        # decayed reward, so a success never turns into a penalty.
        opportunity_cost = opportunity_cost_factor * delay * success_base_reward
        opportunity_cost = min(opportunity_cost, reward * 0.8)
        reward = reward - opportunity_cost

        # Streak bonus, proportional to the number of consecutive wins.
        if consecutive_successes > 0:
            reward += reward * (consecutive_success_bonus * consecutive_successes)

        return reward

    if action_type == 'failure':
        # Decayed base penalty plus an (uncapped) opportunity cost.
        penalty = -failure_base_penalty * np.exp(-decay_rate * delay)
        opportunity_cost = opportunity_cost_factor * delay * failure_base_penalty
        return penalty - opportunity_cost

    if action_type == 'missed_opportunity':
        # Staying out while a target would have been hit: amplified, decayed penalty.
        return -failure_base_penalty * missed_opp_multiplier * np.exp(-decay_rate * delay)

    if action_type == 'no_action':
        # Correctly staying out of the market.
        return no_action_reward

    raise ValueError(f"Unknown action_type: {action_type!r}")


# Training the Model 

In [None]:
num_episodes = 3000
start_episode = 1500  # episode numbering continues from the earlier training run

# Outcome label stored in the target tables -> category understood by the
# reward function. Running out of time ("End of Day") is scored as a failure.
outcome_reward_type = {
    "Target Hit": "success",
    "Stop Loss Hit": "failure",
    "End of Day": "failure",
}

for episode in range(start_episode, num_episodes):
    # Per-episode statistics.
    total_reward = 0.0
    number_trans = 0   # trades opened (buy or sell)
    wins = 0           # trades ending in "Target Hit"
    lose = 0           # trades ending in "End of Day"
    defeat = 0         # trades ending in "Stop Loss Hit"
    consecutive_success = 0
    step = 0

    # Context manager so the bar is closed even if a step raises.
    with tqdm(total=len(df_normalized)) as pbar:
        while step < len(df_normalized):
            state = get_state(df_normalized, step)
            action = agent.select_action(state)
            done = False

            if action == 1 or action == 2:
                # 1 = buy, 2 = sell short. Both are scored from the
                # pre-computed outcome of a trade opened at this row.
                outcome = (target_buy if action == 1 else target_sell).iloc[step]
                label = outcome["action"]
                next_state_index = outcome["next_state_index"]

                if label == "Target Hit":
                    wins += 1
                    consecutive_success += 1
                elif label == "Stop Loss Hit":
                    defeat += 1
                    consecutive_success = 0
                elif label == "End of Day":
                    lose += 1
                    consecutive_success = 0
                    done = True

                # The streak counter is updated first so a win is rewarded
                # with the streak that includes it. Unknown labels score 0.
                reward = 0.0
                if label in outcome_reward_type:
                    reward = float(calculate_optimized_scalping_reward(
                        delay=outcome["delay"],
                        action_type=outcome_reward_type[label],
                        consecutive_successes=consecutive_success,
                    ))

                # Build the next state with get_state() so it has exactly the
                # same features, in the same order, as `state`.
                next_state = get_state(df_normalized, next_state_index)
                agent.store_transition(state, action, reward, next_state, done)
                agent.update_policy()
                number_trans += 1
                # Resume right after the row where the trade was closed.
                next_step = next_state_index + 1

            elif action == 0:
                # Hold: penalised when either side would have hit its target
                # (buy side checked first), rewarded otherwise.
                buy_side = target_buy.iloc[step]
                sell_side = target_sell.iloc[step]
                if buy_side["action"] == "Target Hit":
                    reward = calculate_optimized_scalping_reward(
                        delay=buy_side["delay"],
                        action_type="missed_opportunity",
                        consecutive_successes=consecutive_success,
                    )
                elif sell_side["action"] == "Target Hit":
                    reward = calculate_optimized_scalping_reward(
                        delay=sell_side["delay"],
                        action_type="missed_opportunity",
                        consecutive_successes=consecutive_success,
                    )
                else:
                    # Same value as before (100), taken from the reward
                    # function so it is defined in a single place.
                    reward = calculate_optimized_scalping_reward(
                        delay=0, action_type="no_action"
                    )
                reward = float(reward)

                if step + 1 < len(df_normalized):
                    next_state = get_state(df_normalized, step + 1)
                else:
                    # Last row: the episode ends and the final row doubles
                    # as the next state.
                    done = True
                    next_state = get_state(df_normalized, -1)

                agent.store_transition(state, action, reward, next_state, done)
                agent.update_policy()
                next_step = step + 1

            else:
                # Without this the loop would never advance `step`.
                raise ValueError(f"Unexpected action from agent: {action!r}")

            total_reward += reward
            pbar.update(next_step - step)
            step = next_step

    # Every 5th episode: sync the target network, report and checkpoint.
    if episode % 5 == 0:
        target_net.load_state_dict(policy_net.state_dict())
        model_save_path = os.path.join(save_folder, f'suzlon_14_june_{episode+1}.pth')
        print(f'Episode {episode + 1}')
        print(f'Number of transcations: {number_trans}, Wins: {wins}, Lose: {lose}, Defeat: {defeat} ')

        # Save the model state_dict
        torch.save(policy_net.state_dict(), model_save_path)
