In [29]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch_geometric.data import Data
import torch_geometric.nn as pyg_nn
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
import os
from torch_geometric.nn import GATConv
import torch.nn.functional as F

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Subset
from collections import defaultdict

In [30]:
from models import HierarchicalStockDataset, ShortTermSequentialLearner, IntraSectorGAT ,AttentiveGRU, LongTermSequentialLearner, InterSectorGAT, EmbeddingFusion, FinGAT, MultiTaskLoss 


In [31]:
HIDDEN_SIZE = 16

In [32]:
model_path = 'fingat_model.pth'


model = FinGAT(
    attentive_dim=HIDDEN_SIZE,
    graph_dim=HIDDEN_SIZE,
    sector_dim=HIDDEN_SIZE
)

model.load_state_dict(torch.load(model_path))


<All keys matched successfully>

In [33]:
df = pd.read_parquet('merged_stock_data_with_enhanced_features.parquet')
df.head()

df.index = pd.to_datetime(df.index)  

start_date = "2024-01-01"
end_date = "2024-03-22"


df_val_date  = df.loc[start_date:end_date]


In [34]:

val_dataset = HierarchicalStockDataset(df_val_date)
print(f"Validation dataset contains {val_dataset.__len__()} samples")

val_company_sequences = defaultdict(list)
sector_stock_map = defaultdict(set)
stock_sector_map = defaultdict(set)

for idx in range(len(val_dataset)):
    features, industry_id, company_id, return_ratio, movements = val_dataset[idx]
    company_id = company_id.item()
    industry_id = industry_id.item()
    
    val_company_sequences[company_id].append({
        'features': features,
        'industry_id': industry_id,
        'return_ratio': return_ratio.item(),
        'movements': movements.item(),
        'idx': idx
    })
    sector_stock_map[industry_id].add(company_id)
    stock_sector_map[company_id].add(industry_id)


val_attentive_embeddings = {}
val_stock_returns_map = defaultdict(list)
val_stock_movements_map = defaultdict(list)


stsl_model = ShortTermSequentialLearner(
    input_size=val_dataset[0][0].shape[1],  
    hidden_size=HIDDEN_SIZE
)
stsl_model.eval()

with torch.no_grad():
    for company_id, sequences in val_company_sequences.items():

        sequences.sort(key=lambda x: x['idx'])

        company_attentive_embeddings = []
        
        for seq in sequences:
            features = seq['features'].unsqueeze(0)
            industry_id = seq['industry_id']
            return_ratio = seq['return_ratio']
            movements = seq['movements']

            val_stock_returns_map[company_id].append(return_ratio)
            val_stock_movements_map[company_id].append(movements)

            context, _ = stsl_model(features)
            company_attentive_embeddings.append(context.squeeze(0).cpu().numpy())

        if company_attentive_embeddings:  
            val_attentive_embeddings[company_id] = np.stack(company_attentive_embeddings, axis=0)
print(f"Validation attentive embeddings processed: {len(val_attentive_embeddings)} companies")

# Function to evaluate on validation data
def evaluate_on_validation(trained_model):

    print("Creating validation embeddings from scratch...")
    global val_weekly_long_term_embeddings, val_weekly_inter_sector_embeddings

    val_weekly_intra_sector_graphs = defaultdict(lambda: defaultdict(list))

    max_val_weeks = max([len(emb) for emb in val_attentive_embeddings.values()]) if val_attentive_embeddings else 0

    for week_idx in range(max_val_weeks):


        for sector_id, sector_stocks in sector_stock_map.items():
            sector_features = []
            valid_indices = []

            for stock_idx in sector_stocks:
                if stock_idx in val_attentive_embeddings and week_idx < len(val_attentive_embeddings[stock_idx]):
                    tensor_embedding = torch.tensor(val_attentive_embeddings[stock_idx][week_idx], dtype=torch.float32)
                    sector_features.append(tensor_embedding)
                    valid_indices.append(stock_idx)

            if len(sector_features) >= 2:
                edge_index = []
                num_nodes = len(valid_indices)

                for i in range(num_nodes):
                    for j in range(num_nodes):
                        if i != j:
                            edge_index.append([i, j])

                val_weekly_intra_sector_graphs[week_idx][sector_id] = Data(
                    x=torch.stack(sector_features),
                    edge_index=torch.tensor(edge_index).t().contiguous(),
                    original_indices=valid_indices,
                    sector_id=sector_id
                )

    val_sector_embeddings = []
    gat_model = IntraSectorGAT()
    gat_model.eval()
    
    with torch.no_grad():
        for week_idx in val_weekly_intra_sector_graphs:
            for sector_id, graph in val_weekly_intra_sector_graphs[week_idx].items():
                graph = graph
                out = gat_model(graph)
                val_sector_embeddings.append({
                    'embeddings': out.cpu(),
                    'original_indices': graph.original_indices,
                    'sector_id': graph.sector_id,
                    'week_idx': week_idx
                })

    val_weekly_sector_pooled_embeddings = defaultdict(dict)
    
    if val_sector_embeddings:
        embedding_dim = val_sector_embeddings[0]['embeddings'].shape[1]
        
        for week_idx in sorted(val_weekly_intra_sector_graphs.keys()):
            for sector_id in val_weekly_intra_sector_graphs[week_idx]:
                graph = val_weekly_intra_sector_graphs[week_idx][sector_id]
                pooled_embedding = torch.max(graph.x, dim=0)[0]  
                val_weekly_sector_pooled_embeddings[week_idx][sector_id] = pooled_embedding

    val_weekly_inter_sector_embeddings = defaultdict(dict)
    
    if val_sector_embeddings:
        inter_sector_model = InterSectorGAT(
            in_channels=embedding_dim,
            hidden_channels=HIDDEN_SIZE
        )
        inter_sector_model.eval()
        
        with torch.no_grad():
            for week_idx in sorted(val_weekly_sector_pooled_embeddings.keys()):
                sectors_this_week = val_weekly_sector_pooled_embeddings[week_idx]

                num_sectors = len(sectors_this_week)
                if num_sectors < 2:
                    continue

                sector_ids = list(sectors_this_week.keys())
                sector_id_to_idx = {sector_id: idx for idx, sector_id in enumerate(sector_ids)}

                sector_features = torch.zeros((num_sectors, embedding_dim), dtype=torch.float32)

                for sector_id, idx in sector_id_to_idx.items():
                    sector_features[idx] = sectors_this_week[sector_id]

                edge_indices = []
                for i in range(num_sectors):
                    for j in range(num_sectors):
                        if i != j: 
                            edge_indices.append([i, j])
                
                edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()

                sector_features = sector_features
                edge_index = edge_index
                inter_sector_embeddings = inter_sector_model(sector_features, edge_index)

                inter_sector_embeddings = inter_sector_embeddings.cpu()
                for sector_id, idx in sector_id_to_idx.items():
                    val_weekly_inter_sector_embeddings[week_idx][sector_id] = inter_sector_embeddings[idx]

    val_company_sector_embeddings = defaultdict(lambda: defaultdict(list))
    
    for sector_data in val_sector_embeddings:
        week_idx = sector_data['week_idx']
        original_indices = sector_data['original_indices']
        embeddings = sector_data['embeddings']
        
        for i, company_idx in enumerate(original_indices):
            val_company_sector_embeddings[company_idx][week_idx] = embeddings[i]

    val_weekly_long_term_embeddings = defaultdict(lambda: defaultdict(dict))
    SEQ_LENGTH = 4 
    
    if val_sector_embeddings:

        long_term_g = LongTermSequentialLearner(
            input_size=val_sector_embeddings[0]['embeddings'].shape[1],
            hidden_size=HIDDEN_SIZE,
            lookback_weeks=SEQ_LENGTH
        )
        
        long_term_a = LongTermSequentialLearner(
            input_size=next(iter(val_attentive_embeddings.values())).shape[1],
            hidden_size=HIDDEN_SIZE,
            lookback_weeks=SEQ_LENGTH
        )
        
        with torch.no_grad():

            if val_company_sector_embeddings:
                max_weeks = max([max(weekly_data.keys()) for weekly_data in val_company_sector_embeddings.values()])

                for current_week in range(SEQ_LENGTH, max_weeks + 1):
                    

                    for company_idx, weekly_data in val_company_sector_embeddings.items():

                        if current_week not in weekly_data:
                            continue

                        history_weeks = [w for w in range(current_week - SEQ_LENGTH, current_week)]
                        if not all(w in weekly_data for w in history_weeks):
                            continue

                        graph_window = []
                        for w in history_weeks:
                            if w in weekly_data:
                                graph_emb = torch.tensor(weekly_data[w]).unsqueeze(0)
                                graph_window.append(graph_emb)

                        if company_idx in val_attentive_embeddings:
                            attentive_seqs = val_attentive_embeddings[company_idx]
                            
                            if len(attentive_seqs) >= SEQ_LENGTH:
                                attentive_window = []
                                for i in range(SEQ_LENGTH):
                                    seq_idx = len(attentive_seqs) - SEQ_LENGTH + i
                                    if seq_idx >= 0 and seq_idx < len(attentive_seqs):
                                        att_emb = torch.tensor(attentive_seqs[seq_idx], dtype=torch.float32).unsqueeze(0)
                                        attentive_window.append(att_emb)
                                
                                if len(graph_window) == SEQ_LENGTH and len(attentive_window) == SEQ_LENGTH:
                                    tau_G = long_term_g(graph_window)
                                    tau_A = long_term_a(attentive_window)

                                    val_weekly_long_term_embeddings[current_week][company_idx] = {
                                        'graph': tau_G.cpu().numpy(),
                                        'attentive': tau_A.cpu().numpy()
                                    }

    val_data = {
        'attentive_embs': [],
        'graph_embs': [],
        'sector_embs': [],
        'returns': [],
        'movements': [],
        'companies': []
    }
    
  
    if val_weekly_long_term_embeddings:
        last_week = max(val_weekly_long_term_embeddings.keys())

        
        for company_idx, company_data in val_weekly_long_term_embeddings[last_week].items():

            if company_idx not in stock_sector_map or len(stock_sector_map[company_idx]) == 0:
                continue
                
            sector_id = list(stock_sector_map[company_idx])[0]

            if sector_id not in val_weekly_inter_sector_embeddings[last_week]:
                continue
                
   
            attentive_emb = torch.tensor(company_data['attentive'], dtype=torch.float32)
            graph_emb = torch.tensor(company_data['graph'], dtype=torch.float32)
            sector_emb = val_weekly_inter_sector_embeddings[last_week][sector_id]
            
       
            if company_idx in val_stock_returns_map and val_stock_returns_map[company_idx]:
                return_ratio = val_stock_returns_map[company_idx][0]
                movement = 1.0 if return_ratio > 0 else 0.0
                
           
                val_data['attentive_embs'].append(attentive_emb)
                val_data['graph_embs'].append(graph_emb)
                val_data['sector_embs'].append(sector_emb)
                val_data['companies'].append(company_idx)
                val_data['returns'].append(return_ratio)
                val_data['movements'].append(movement)
    
    
    if val_data['returns']:
        val_data['attentive_embs'] = torch.stack(val_data['attentive_embs'])
        val_data['graph_embs'] = torch.stack(val_data['graph_embs'])
        val_data['sector_embs'] = torch.stack(val_data['sector_embs'])
        val_data['returns'] = torch.tensor(val_data['returns'], dtype=torch.float32)
        val_data['movements'] = torch.tensor(val_data['movements'], dtype=torch.float32)
        
    
        trained_model.eval()
        with torch.no_grad():
        
            attentive_embs = val_data['attentive_embs']
            graph_embs = val_data['graph_embs']
            sector_embs = val_data['sector_embs']

            return_preds, movement_preds = trained_model(attentive_embs, graph_embs, sector_embs)
   
            return_preds_np = return_preds.cpu().numpy()
            return_targets_np = val_data['returns'].numpy()
            movement_preds_np = movement_preds.cpu().numpy()
            movement_targets_np = val_data['movements'].numpy()
            
 
            from scipy.stats import spearmanr
            corr, _ = spearmanr(return_preds_np, return_targets_np)
  
            threshold = 0.5
            binary_preds = [1 if p > threshold else 0 for p in movement_preds_np]
            accuracy = sum(p == t for p, t in zip(binary_preds, movement_targets_np)) / len(binary_preds)
 
            from sklearn.metrics import r2_score
            r2 = r2_score(return_targets_np, return_preds_np)
   
            pred_target_pairs = list(zip(return_preds_np, return_targets_np, val_data['companies']))
            pred_ranking = sorted(pred_target_pairs, key=lambda x: x[0], reverse=True)
            company_to_pred_rank = {company: i+1 for i, (_, _, company) in enumerate(pred_ranking)}
            
            true_ranking = sorted(
                [(target, company) for (_, target, company) in pred_target_pairs],
                key=lambda x: x[0], 
                reverse=True
            )
            
            mrr_values = {}
            top_k_values = [5, 10, 20, 30, 100]
            
            for k in top_k_values:
                mrr_at_k = []
                for j, (_, company) in enumerate(true_ranking[:min(k, len(true_ranking))]):
                    if company in company_to_pred_rank:
                        mrr_at_k.append(1.0 / company_to_pred_rank[company])
                
                mrr_values[k] = sum(mrr_at_k) / len(mrr_at_k) if mrr_at_k else 0

            print(f"\nValidation Metrics (using fresh embeddings):")
            print(f"Number of validation samples: {len(return_preds_np)}")
            print(f"Return prediction correlation: {corr:.4f}")
            print(f"Movement prediction accuracy: {accuracy:.4f}")
            
            
            print("\nMean Reciprocal Rank (MRR) Metrics:")
            for k in top_k_values:
                print(f"MRR@{k}: {mrr_values[k]:.4f}")
            irr_values = {}
            top_k_values = [5, 10, 20, 30, 100]  # Simplified: removed unnecessary conditional

            for k in top_k_values:
                # For each window size, sort predictions and get top and bottom k stocks
                pred_sorted_indices = np.argsort(return_preds_np)[::-1]  # FIXED: Use return_preds_np
                
                # Get actual returns for top k and bottom k predicted stocks
                if len(pred_sorted_indices) >= k*2:
                    top_k_returns = [return_targets_np[i] for i in pred_sorted_indices[:k]]  # FIXED: Use return_targets_np
                    bottom_k_returns = [return_targets_np[i] for i in pred_sorted_indices[-k:]]
                    
                    # Calculate metrics
                    top_k_mean = np.mean(top_k_returns)
                    bottom_k_mean = np.mean(bottom_k_returns)
                    excess_return = top_k_mean - bottom_k_mean
                    
                    # Calculate tracking error as the standard deviation of return differences
                    if len(top_k_returns) > 1:
                        tracking_error = np.std(np.array(top_k_returns) - np.array(bottom_k_returns))
                        ir = excess_return / tracking_error if tracking_error > 0 else 0
                    else:
                        ir = 0
                        
                    irr_values[k] = {
                        'ir': ir,
                        'top_returns': top_k_mean,
                        'bottom_returns': bottom_k_mean,
                        'excess_return': excess_return
                    }

            # Print IRR metrics
            print("\nInformation Ratio (IRR) Metrics:")
            for k in top_k_values:
                if k in irr_values:
                    print(f"IRR@{k}: {irr_values[k]['ir']:.4f} (Excess Return: {irr_values[k]['excess_return']:.4f}, "
                        f"Top-{k} Avg: {irr_values[k]['top_returns']:.4f}, Bottom-{k} Avg: {irr_values[k]['bottom_returns']:.4f})")
                        
            return corr, accuracy, r2, mrr_values
    else:
        print("No validation data available for evaluation")
        return None, None, None, None
    

print("\n--- Evaluating on Validation Data with Fresh Embeddings ---")
val_metrics = evaluate_on_validation(model)
print("\n--- Evaluation Complete ---")


Validation dataset contains 22491 samples
Validation attentive embeddings processed: 441 companies

--- Evaluating on Validation Data with Fresh Embeddings ---
Creating validation embeddings from scratch...


  graph_emb = torch.tensor(weekly_data[w]).unsqueeze(0)



Validation Metrics (using fresh embeddings):
Number of validation samples: 441
Return prediction correlation: 0.0017
Movement prediction accuracy: 0.4104

Mean Reciprocal Rank (MRR) Metrics:
MRR@5: 0.1034
MRR@10: 0.0887
MRR@20: 0.0486
MRR@30: 0.0348
MRR@100: 0.0174

Information Ratio (IRR) Metrics:
IRR@5: 1.0446 (Excess Return: 0.0415, Top-5 Avg: 0.0307, Bottom-5 Avg: -0.0109)
IRR@10: 0.4018 (Excess Return: 0.0156, Top-10 Avg: 0.0138, Bottom-10 Avg: -0.0017)
IRR@20: 0.2582 (Excess Return: 0.0101, Top-20 Avg: 0.0085, Bottom-20 Avg: -0.0016)
IRR@30: 0.2040 (Excess Return: 0.0075, Top-30 Avg: 0.0069, Bottom-30 Avg: -0.0006)
IRR@100: -0.0132 (Excess Return: -0.0004, Top-100 Avg: 0.0030, Bottom-100 Avg: 0.0034)

--- Evaluation Complete ---


In [35]:
def generate_predictions_csv_with_names(model):
    print("Generating predictions for each company across all dates...")
    
    # Create a mapping from company_id to company name
    # Assuming the DataFrame has columns for company ID and name (like 'symbol' or 'company_name')
    # If your DataFrame columns are different, adjust accordingly
    company_id_to_name = {}
    
    # Choose one of these approaches based on your DataFrame structure:
    
    # Option 1: If company identifiers are in the index and names are in a column
    if 'company_name' in df_val_date.columns:
        for idx, row in df_val_date.reset_index().drop_duplicates(['company_id']).iterrows():
            company_id_to_name[row['company_id']] = row['company_name']
    
    # Option 2: If there's a 'symbol' or 'ticker' column that has readable names
    elif 'symbol' in df_val_date.columns:
        for idx, row in df_val_date.reset_index().drop_duplicates(['company_id']).iterrows():
            company_id_to_name[row['company_id']] = row['symbol']
    
    # Default: If no name column is found, use the IDs but convert to string
    if not company_id_to_name:
        print("Warning: Could not find company names in the dataset. Using IDs.")
        # Will use str(company_id) as fallback
    
    predictions_data = {
        'date': [],
        'company_id': [],
        'company_name': [],  # New column for company names
        'actual_return': [],
        'predicted_return': [],
        'movement_actual': [],
        'movement_predicted': []
    }
    
    if val_weekly_long_term_embeddings:
        model.eval()
        with torch.no_grad():
            for week_idx in sorted(val_weekly_long_term_embeddings.keys()):
                try:
                    dates = df_val_date.index.unique()
                    if week_idx < len(dates):
                        current_date = dates[week_idx]
                    else:
                        current_date = pd.Timestamp(end_date)
                except:
                    current_date = pd.Timestamp(start_date) + pd.Timedelta(days=7*week_idx)
                
                for company_idx, company_data in val_weekly_long_term_embeddings[week_idx].items():
                    if (company_idx not in stock_sector_map or 
                        len(stock_sector_map[company_idx]) == 0):
                        continue
                    
                    sector_id = list(stock_sector_map[company_idx])[0]
                    if sector_id not in val_weekly_inter_sector_embeddings[week_idx]:
                        continue
                    
                    attentive_emb = torch.tensor(company_data['attentive'], dtype=torch.float32).unsqueeze(0)
                    graph_emb = torch.tensor(company_data['graph'], dtype=torch.float32).unsqueeze(0)
                    sector_emb = val_weekly_inter_sector_embeddings[week_idx][sector_id].unsqueeze(0)
                    
                    if company_idx in val_stock_returns_map and len(val_stock_returns_map[company_idx]) > week_idx:
                        actual_return = val_stock_returns_map[company_idx][week_idx]
                        actual_movement = 1.0 if actual_return > 0 else 0.0
                        
                        return_pred, movement_pred = model(attentive_emb, graph_emb, sector_emb)
                        
                        # Get company name from mapping, or use ID as string if not found
                        company_name = company_id_to_name.get(company_idx, f"Company_{company_idx}")
                        
                        predictions_data['date'].append(current_date)
                        predictions_data['company_id'].append(company_idx)
                        predictions_data['company_name'].append(company_name)
                        predictions_data['actual_return'].append(actual_return)
                        predictions_data['predicted_return'].append(return_pred.item())
                        predictions_data['movement_actual'].append(actual_movement)
                        predictions_data['movement_predicted'].append(movement_pred.item())
    
    if predictions_data['date']:
        pred_df = pd.DataFrame(predictions_data)
        
        # Convert company_id to string for better readability
        pred_df['company_id'] = pred_df['company_id'].astype(str)
        
        output_file = 'stock_return_predictions.csv'
        pred_df.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")
        
        print("\nSample predictions:")
        print(pred_df.head())
        
        return pred_df
    else:
        print("No prediction data available to save")
        return None

In [37]:
predictions_df = generate_predictions_csv_with_names(model)

if predictions_df is not None:
    print(f"\nTotal predictions generated: {len(predictions_df)}")
    print(f"Number of unique companies: {predictions_df['company_id'].nunique()}")
    print(f"Date range: {predictions_df['date'].min()} to {predictions_df['date'].max()}")


Generating predictions for each company across all dates...
Predictions saved to stock_return_predictions.csv

Sample predictions:
                       date company_id company_name  actual_return  \
0 2024-01-05 00:00:00+05:30        131  Company_131      -0.018340   
1 2024-01-05 00:00:00+05:30        136  Company_136      -0.035540   
2 2024-01-05 00:00:00+05:30        392  Company_392      -0.004899   
3 2024-01-05 00:00:00+05:30        270  Company_270       0.012167   
4 2024-01-05 00:00:00+05:30        400  Company_400       0.026711   

   predicted_return  movement_actual  movement_predicted  
0         -0.156015              0.0            0.488052  
1         -0.155235              0.0            0.491528  
2         -0.155392              0.0            0.491395  
3         -0.156076              1.0            0.488107  
4         -0.156069              1.0            0.488050  

Total predictions generated: 20727
Number of unique companies: 441
Date range: 2024-01-05 00: