In [53]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch_geometric.data import Data
import torch_geometric.nn as pyg_nn
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
import os
from torch_geometric.nn import GATConv
import torch.nn.functional as F
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Subset
from collections import defaultdict

In [54]:
from models import HierarchicalStockDataset, TransformerSequentialLearner, IntraSectorGAT, LongTermTransformerLearner, EmbeddingFusion, FinGAT, MultiTaskLoss


In [55]:
HIDDEN_SIZE = 16

In [56]:
model_path = 'fingat_tranformer_model.pth'


model = FinGAT(
    attentive_dim=HIDDEN_SIZE,
    graph_dim=HIDDEN_SIZE,
    sector_dim=HIDDEN_SIZE
)

model.load_state_dict(torch.load(model_path))


<All keys matched successfully>

In [57]:
df = pd.read_parquet('stock_data/processed/merged_stock_data_with_enhanced_features.parquet')
df.head()

df.index = pd.to_datetime(df.index)  

start_date = "2025-01-01"
end_date = "2025-03-10"


df_val_date  = df.loc[start_date:end_date]
print(df_val_date.shape)


(49, 10680)


In [59]:
# Create the validation dataset
val_dataset = HierarchicalStockDataset(df_val_date)
print(f"Validation dataset contains {val_dataset.__len__()} samples")

# Process validation data similarly to training data
val_company_sequences = defaultdict(list)
sector_stock_map = defaultdict(set)
stock_sector_map = defaultdict(set)

# Organize validation samples by company
for idx in range(len(val_dataset)):
    features, industry_id, company_id, return_ratio, movements = val_dataset[idx]
    company_id = company_id.item()
    industry_id = industry_id.item()
    
    val_company_sequences[company_id].append({
        'features': features,
        'industry_id': industry_id,
        'return_ratio': return_ratio.item(),
        'movements': movements.item(),
        'idx': idx
    })
    sector_stock_map[industry_id].add(company_id)
    stock_sector_map[company_id].add(industry_id)

# Process validation embeddings
val_attentive_embeddings = {}
val_stock_returns_map = defaultdict(list)
val_stock_movements_map = defaultdict(list)

# Create attentive embeddings for validation data using TransformerSequentialLearner
stsl_model = TransformerSequentialLearner(
    input_size=val_dataset[0][0].shape[1],
    hidden_size=HIDDEN_SIZE
)
stsl_model.eval()

with torch.no_grad():
    for company_id, sequences in val_company_sequences.items():
        # Sort by original index to maintain temporal ordering
        sequences.sort(key=lambda x: x['idx'])
        
        # Process all sequences for this company in order
        company_attentive_embeddings = []
        
        for seq in sequences:
            features = seq['features'].unsqueeze(0)
            industry_id = seq['industry_id']
            return_ratio = seq['return_ratio']
            movements = seq['movements']
            
            # Store return and movement information
            val_stock_returns_map[company_id].append(return_ratio)
            val_stock_movements_map[company_id].append(movements)
            
            # Get embeddings using the sequential model
            context, _ = stsl_model(features)
            company_attentive_embeddings.append(context.squeeze(0).cpu().numpy())
        
        # Store the entire temporal sequence for this company
        if company_attentive_embeddings:  # Only store if we have data
            val_attentive_embeddings[company_id] = np.stack(company_attentive_embeddings, axis=0)
print(f"Validation attentive embeddings processed: {len(val_attentive_embeddings)} companies")

# Add enhanced sector pooling function
def enhanced_sector_pooling(graph_data):
    x = graph_data.x
    sector_id = graph_data.sector_id
    

    torch.manual_seed(sector_id * 100)

    if sector_id % 4 == 0:  
 
        combined_pool = 0.5 * torch.max(x, dim=0)[0] + 0.2 * torch.mean(x, dim=0) + 0.3 * torch.std(x, dim=0)
    elif sector_id % 4 == 1:

        combined_pool = 0.2 * torch.max(x, dim=0)[0] + 0.5 * torch.min(x, dim=0)[0] + 0.3 * torch.quantile(x, 0.25, dim=0)
    elif sector_id % 4 == 2: 

        combined_pool = 0.3 * torch.mean(x, dim=0) + 0.7 * torch.std(x, dim=0)
    else:  

        combined_pool = 0.4 * torch.quantile(x, 0.5, dim=0) + 0.3 * torch.quantile(x, 0.75, dim=0) + 0.3 * torch.quantile(x, 0.25, dim=0)

    sector_factor = ((sector_id % 7) + 1) / 4.0  
    if sector_id % 3 == 0:

        combined_pool = torch.tanh(sector_factor * combined_pool)
    elif sector_id % 3 == 1:

        combined_pool = F.leaky_relu(sector_factor * combined_pool, negative_slope=0.1)
    else:

        combined_pool = torch.clamp(torch.exp(sector_factor * combined_pool * 0.1) - 1, -5, 5)

    dim = combined_pool.size(0)

    noise_scaling = 0.5 + (sector_id % 10) * 0.1 
    orthogonal_noise = torch.zeros_like(combined_pool)

    for i in range(dim):
        phase = (sector_id * 0.1) + (i * 0.3 * (1 + sector_id % 5))
        orthogonal_noise[i] = torch.sin(torch.tensor(phase * math.pi))

    noise = (torch.randn_like(combined_pool) * 0.4) + (orthogonal_noise * noise_scaling)
    combined_pool = combined_pool + noise

    if sector_id % 2 == 0:

        norm = torch.norm(combined_pool)
        if norm > 0:
            combined_pool = combined_pool / norm
    
    return combined_pool
# Function to evaluate on validation data
def evaluate_on_validation(trained_model):
    """
    Evaluate the TransformerFinGAT model on validation data by creating new embeddings from validation data
    """
    print("Creating validation embeddings from scratch...")
    global val_weekly_long_term_embeddings, val_weekly_inter_sector_embeddings
    
    val_weekly_intra_sector_graphs = defaultdict(lambda: defaultdict(list))
    
    # Find maximum weeks in validation data
    max_val_weeks = max([len(emb) for emb in val_attentive_embeddings.values()]) if val_attentive_embeddings else 0
    
    # Process each timepoint (week) separately
    for week_idx in range(max_val_weeks):
        print(f"Processing validation week {week_idx}")
        
        # Group by sector
        for sector_id, sector_stocks in sector_stock_map.items():
            sector_features = []
            valid_indices = []
            
            # Collect stock embeddings in this sector-week
            for stock_idx in sector_stocks:
                if stock_idx in val_attentive_embeddings and week_idx < len(val_attentive_embeddings[stock_idx]):
                    tensor_embedding = torch.tensor(val_attentive_embeddings[stock_idx][week_idx], dtype=torch.float32)
                    sector_features.append(tensor_embedding)
                    valid_indices.append(stock_idx)
            
            # Create graph if >= 2 stocks
            if len(sector_features) >= 2:
                edge_index = []
                num_nodes = len(valid_indices)
                
                # Fully-connected edges without self-loops
                for i in range(num_nodes):
                    for j in range(num_nodes):
                        if i != j:
                            edge_index.append([i, j])
                
                # Store the graph with additional sector_id attribute
                val_weekly_intra_sector_graphs[week_idx][sector_id] = Data(
                    x=torch.stack(sector_features),
                    edge_index=torch.tensor(edge_index).t().contiguous(),
                    original_indices=valid_indices,
                    sector_id=sector_id
                )
    
    # STEP 2: Process validation graphs through IntraSectorGAT
    val_sector_embeddings = []
    gat_model = IntraSectorGAT(HIDDEN_SIZE=HIDDEN_SIZE)  # Reuse the model architecture
    gat_model.eval()
    
    with torch.no_grad():
        for week_idx in val_weekly_intra_sector_graphs:
            for sector_id, graph in val_weekly_intra_sector_graphs[week_idx].items():
                graph = graph
                out = gat_model(graph)
                val_sector_embeddings.append({
                    'embeddings': out.cpu(),
                    'original_indices': graph.original_indices,
                    'sector_id': graph.sector_id,
                    'week_idx': week_idx
                })
    
    # STEP 3: Create pooled embeddings for sectors using enhanced pooling
    val_weekly_sector_pooled_embeddings = defaultdict(dict)
    
    if val_sector_embeddings:
        embedding_dim = val_sector_embeddings[0]['embeddings'].shape[1]
        
        for week_idx in sorted(val_weekly_intra_sector_graphs.keys()):
            for sector_id in val_weekly_intra_sector_graphs[week_idx]:
                graph = val_weekly_intra_sector_graphs[week_idx][sector_id]
                pooled_embedding = enhanced_sector_pooling(graph)
                val_weekly_sector_pooled_embeddings[week_idx][sector_id] = pooled_embedding
    
    # STEP 4: Create sector embeddings using the direct approach instead of InterSectorGAT
    val_weekly_inter_sector_embeddings = defaultdict(dict)
    
    if val_sector_embeddings:
        # Get number of sectors and create the embedding layer
        num_sectors = len(sector_stock_map)
        sector_embedding = nn.Embedding(num_sectors + 10, HIDDEN_SIZE)
        
        # Add a projection layer to get the desired output dimension
        projection = nn.Sequential(
            nn.Linear(HIDDEN_SIZE, HIDDEN_SIZE * 2),
            nn.LayerNorm(HIDDEN_SIZE * 2),
            nn.ReLU(),
            nn.Linear(HIDDEN_SIZE * 2, HIDDEN_SIZE)
        )
        
        # Use the same set of weeks as in the pooled embeddings
        for week_idx in sorted(val_weekly_sector_pooled_embeddings.keys()):
            sectors_this_week = val_weekly_sector_pooled_embeddings[week_idx]
            
            # Process each sector
            for sector_id in sectors_this_week.keys():
                # Use direct sector ID for embedding lookup
                sector_tensor = torch.tensor([sector_id])
                
                # Get the sector embedding and apply projection
                with torch.no_grad():
                    embedding = sector_embedding(sector_tensor)
                    embedding = projection(embedding)
                    
                # Store the sector embedding for this week
                val_weekly_inter_sector_embeddings[week_idx][sector_id] = embedding.squeeze(0).cpu()
    
    # STEP 5: Organize company sector embeddings
    val_company_sector_embeddings = defaultdict(lambda: defaultdict(list))
    
    for sector_data in val_sector_embeddings:
        week_idx = sector_data['week_idx']
        original_indices = sector_data['original_indices']
        embeddings = sector_data['embeddings']
        
        for i, company_idx in enumerate(original_indices):
            val_company_sector_embeddings[company_idx][week_idx] = embeddings[i]
    
    # STEP 6: Process long-term embeddings using transformer learners
    val_weekly_long_term_embeddings = defaultdict(lambda: defaultdict(dict))
    SEQ_LENGTH = 4  # Same as training
    
    if val_sector_embeddings:
        # Initialize long term transformer learners
        long_term_g = LongTermTransformerLearner(
            input_size=val_sector_embeddings[0]['embeddings'].shape[1],
            hidden_size=HIDDEN_SIZE,
            lookback_weeks=SEQ_LENGTH
        )
        
        long_term_a = LongTermTransformerLearner(
            input_size=next(iter(val_attentive_embeddings.values())).shape[1],
            hidden_size=HIDDEN_SIZE,
            lookback_weeks=SEQ_LENGTH
        )
        
        with torch.no_grad():
            # Get the maximum week index
            if val_company_sector_embeddings:
                max_weeks = max([max(weekly_data.keys()) for weekly_data in val_company_sector_embeddings.values()])
                
                # Process each week from SEQ_LENGTH onwards
                for current_week in range(SEQ_LENGTH, max_weeks + 1):
                    print(f"Processing validation long-term embeddings for week {current_week}")
                    # Process each company for this week
                    for company_idx, weekly_data in val_company_sector_embeddings.items():
                        # Check if we have data for this company in this week
                        if current_week not in weekly_data:
                            continue
                            
                        # Check if we have enough history
                        history_weeks = [w for w in range(current_week - SEQ_LENGTH, current_week)]
                        if not all(w in weekly_data for w in history_weeks):
                            continue
                            
                        # Gather the sliding window of embeddings
                        graph_window = []
                        for w in history_weeks:
                            if w in weekly_data:
                                graph_emb = torch.tensor(weekly_data[w]).unsqueeze(0)
                                graph_window.append(graph_emb)
                        
                        # Process attentive embeddings if available
                        if company_idx in val_attentive_embeddings:
                            attentive_seqs = val_attentive_embeddings[company_idx]
                            
                            if len(attentive_seqs) >= SEQ_LENGTH:
                                attentive_window = []
                                for i in range(SEQ_LENGTH):
                                    seq_idx = len(attentive_seqs) - SEQ_LENGTH + i
                                    if seq_idx >= 0 and seq_idx < len(attentive_seqs):
                                        att_emb = torch.tensor(attentive_seqs[seq_idx], dtype=torch.float32).unsqueeze(0)
                                        attentive_window.append(att_emb)
                                
                                if len(graph_window) == SEQ_LENGTH and len(attentive_window) == SEQ_LENGTH:
                                    tau_G = long_term_g(graph_window)
                                    tau_A = long_term_a(attentive_window)
                                    
                                    # Store the results
                                    val_weekly_long_term_embeddings[current_week][company_idx] = {
                                        'graph': tau_G.cpu().numpy(),
                                        'attentive': tau_A.cpu().numpy()
                                    }
    
    # STEP 7: Create final evaluation data using the latest week
    val_data = {
        'attentive_embs': [],
        'graph_embs': [],
        'sector_embs': [],
        'returns': [],
        'movements': [],
        'companies': []
    }
    
    # Use the last week for prediction
    if val_weekly_long_term_embeddings:
        last_week = max(val_weekly_long_term_embeddings.keys())
        print(f"Using week {last_week} for final evaluation")
        
        for company_idx, company_data in val_weekly_long_term_embeddings[last_week].items():
            # Get the stock's sector
            if company_idx not in stock_sector_map or len(stock_sector_map[company_idx]) == 0:
                continue
                
            sector_id = list(stock_sector_map[company_idx])[0]
            
            # Check if we have the sector embedding for this week
            if sector_id not in val_weekly_inter_sector_embeddings[last_week]:
                continue
                
            # Extract embeddings
            attentive_emb = torch.tensor(company_data['attentive'], dtype=torch.float32)
            graph_emb = torch.tensor(company_data['graph'], dtype=torch.float32)
            sector_emb = val_weekly_inter_sector_embeddings[last_week][sector_id]
            
            # Get return and movement labels
            if company_idx in val_stock_returns_map and val_stock_returns_map[company_idx]:
                return_ratio = val_stock_returns_map[company_idx][0]
                movement = 1.0 if return_ratio > 0 else 0.0
                
                # Add to evaluation data
                val_data['attentive_embs'].append(attentive_emb)
                val_data['graph_embs'].append(graph_emb)
                val_data['sector_embs'].append(sector_emb)
                val_data['companies'].append(company_idx)
                val_data['returns'].append(return_ratio)
                val_data['movements'].append(movement)
    
    # Perform evaluation if we have data
    if val_data['returns']:
        val_data['attentive_embs'] = torch.stack(val_data['attentive_embs'])
        val_data['graph_embs'] = torch.stack(val_data['graph_embs'])
        val_data['sector_embs'] = torch.stack(val_data['sector_embs'])
        val_data['returns'] = torch.tensor(val_data['returns'], dtype=torch.float32)
        val_data['movements'] = torch.tensor(val_data['movements'], dtype=torch.float32)
        
        # Evaluate
        trained_model.eval()
        with torch.no_grad():
            # Move to device
            attentive_embs = val_data['attentive_embs']
            graph_embs = val_data['graph_embs']
            sector_embs = val_data['sector_embs']
            
            # Get predictions - using the transformer model
            return_preds, movement_preds = trained_model(attentive_embs, graph_embs, sector_embs)
            
            # Calculate metrics
            return_preds_np = return_preds.cpu().numpy()
            return_targets_np = val_data['returns'].numpy()
            movement_preds_np = movement_preds.cpu().numpy()
            movement_targets_np = val_data['movements'].numpy()
            
            # Correlation for returns
            from scipy.stats import spearmanr
            corr, _ = spearmanr(return_preds_np, return_targets_np)
            
            # Accuracy for movement prediction
            threshold = 0.5
            binary_preds = [1 if p > threshold else 0 for p in movement_preds_np]
            accuracy = sum(p == t for p, t in zip(binary_preds, movement_targets_np)) / len(binary_preds)
            

            
            # Calculate MRR metrics
            pred_target_pairs = list(zip(return_preds_np, return_targets_np, val_data['companies']))
            pred_ranking = sorted(pred_target_pairs, key=lambda x: x[0], reverse=True)
            company_to_pred_rank = {company: i+1 for i, (_, _, company) in enumerate(pred_ranking)}
            
            true_ranking = sorted(
                [(target, company) for (_, target, company) in pred_target_pairs],
                key=lambda x: x[0], 
                reverse=True
            )
            
            mrr_values = {}
            top_k_values = [5, 10, 20, 30, 100]
            
            for k in top_k_values:
                mrr_at_k = []
                for j, (_, company) in enumerate(true_ranking[:min(k, len(true_ranking))]):
                    if company in company_to_pred_rank:
                        mrr_at_k.append(1.0 / company_to_pred_rank[company])
                
                mrr_values[k] = sum(mrr_at_k) / len(mrr_at_k) if mrr_at_k else 0
            
            # Print results
            print(f"\nValidation Metrics (using transformer model):")
            print(f"Number of validation samples: {len(return_preds_np)}")
            print(f"Return prediction correlation: {corr:.4f}")
            print(f"Movement prediction accuracy: {accuracy:.4f}")

            
            print("\nMean Reciprocal Rank (MRR) Metrics:")
            for k in top_k_values:
                print(f"MRR@{k}: {mrr_values[k]:.4f}")
            
            # Calculate IRR metrics
            irr_values = {}
            
            for k in top_k_values:
                # Sort predictions and get top and bottom k stocks
                pred_sorted_indices = np.argsort(return_preds_np)[::-1]
                
                # Get actual returns for top k and bottom k predicted stocks
                if len(pred_sorted_indices) >= k*2:
                    top_k_returns = [return_targets_np[i] for i in pred_sorted_indices[:k]]
                    bottom_k_returns = [return_targets_np[i] for i in pred_sorted_indices[-k:]]
                    
                    # Calculate metrics
                    top_k_mean = np.mean(top_k_returns)
                    bottom_k_mean = np.mean(bottom_k_returns)
                    excess_return = top_k_mean - bottom_k_mean
                    
                    # Calculate tracking error as standard deviation of return differences
                    if len(top_k_returns) > 1:
                        tracking_error = np.std(np.array(top_k_returns) - np.array(bottom_k_returns))
                        ir = excess_return / tracking_error if tracking_error > 0 else 0
                    else:
                        ir = 0
                        
                    irr_values[k] = {
                        'ir': ir,
                        'top_returns': top_k_mean,
                        'bottom_returns': bottom_k_mean,
                        'excess_return': excess_return
                    }
            
            # Print IRR metrics
            print("\nInformation Ratio (IRR) Metrics:")
            for k in top_k_values:
                if k in irr_values:
                    print(f"IRR@{k}: {irr_values[k]['ir']:.4f} (Excess Return: {irr_values[k]['excess_return']:.4f}, "
                          f"Top-{k} Avg: {irr_values[k]['top_returns']:.4f}, Bottom-{k} Avg: {irr_values[k]['bottom_returns']:.4f})")
                    
            return corr, accuracy, mrr_values
    else:
        print("No validation data available for evaluation")
        return None, None, None, None

print("\n--- Evaluating on Validation Data with Fresh Embeddings ---")
val_metrics = evaluate_on_validation(model)
print("\n--- Evaluation Complete ---")


Validation dataset contains 19135 samples
Validation attentive embeddings processed: 445 companies

--- Evaluating on Validation Data with Fresh Embeddings ---
Creating validation embeddings from scratch...
Processing validation week 0
Processing validation week 1
Processing validation week 2
Processing validation week 3
Processing validation week 4
Processing validation week 5
Processing validation week 6
Processing validation week 7
Processing validation week 8
Processing validation week 9
Processing validation week 10
Processing validation week 11
Processing validation week 12
Processing validation week 13
Processing validation week 14
Processing validation week 15
Processing validation week 16
Processing validation week 17
Processing validation week 18
Processing validation week 19
Processing validation week 20
Processing validation week 21
Processing validation week 22
Processing validation week 23
Processing validation week 24
Processing validation week 25
Processing validation w

  graph_emb = torch.tensor(weekly_data[w]).unsqueeze(0)


Processing validation long-term embeddings for week 5
Processing validation long-term embeddings for week 6
Processing validation long-term embeddings for week 7
Processing validation long-term embeddings for week 8
Processing validation long-term embeddings for week 9
Processing validation long-term embeddings for week 10
Processing validation long-term embeddings for week 11
Processing validation long-term embeddings for week 12
Processing validation long-term embeddings for week 13
Processing validation long-term embeddings for week 14
Processing validation long-term embeddings for week 15
Processing validation long-term embeddings for week 16
Processing validation long-term embeddings for week 17
Processing validation long-term embeddings for week 18
Processing validation long-term embeddings for week 19
Processing validation long-term embeddings for week 20
Processing validation long-term embeddings for week 21
Processing validation long-term embeddings for week 22
Processing vali

In [None]:
def generate_predictions_csv_with_names(model):
    print("Generating predictions for each company across all dates...")
    
    # Create a mapping from company_id to company name
    # Assuming the DataFrame has columns for company ID and name (like 'symbol' or 'company_name')
    # If your DataFrame columns are different, adjust accordingly
    company_id_to_name = {}
    
    # Choose one of these approaches based on your DataFrame structure:
    
    # Option 1: If company identifiers are in the index and names are in a column
    if 'company_name' in df_val_date.columns:
        for idx, row in df_val_date.reset_index().drop_duplicates(['company_id']).iterrows():
            company_id_to_name[row['company_id']] = row['company_name']
    
    # Option 2: If there's a 'symbol' or 'ticker' column that has readable names
    elif 'symbol' in df_val_date.columns:
        for idx, row in df_val_date.reset_index().drop_duplicates(['company_id']).iterrows():
            company_id_to_name[row['company_id']] = row['symbol']
    
    # Default: If no name column is found, use the IDs but convert to string
    if not company_id_to_name:
        print("Warning: Could not find company names in the dataset. Using IDs.")
        # Will use str(company_id) as fallback
    
    predictions_data = {
        'date': [],
        'company_id': [],
        'company_name': [],  # New column for company names
        'actual_return': [],
        'predicted_return': [],
        'movement_actual': [],
        'movement_predicted': []
    }
    
    if val_weekly_long_term_embeddings:
        model.eval()
        with torch.no_grad():
            for week_idx in sorted(val_weekly_long_term_embeddings.keys()):
                try:
                    dates = df_val_date.index.unique()
                    if week_idx < len(dates):
                        current_date = dates[week_idx]
                    else:
                        current_date = pd.Timestamp(end_date)
                except:
                    current_date = pd.Timestamp(start_date) + pd.Timedelta(days=7*week_idx)
                
                for company_idx, company_data in val_weekly_long_term_embeddings[week_idx].items():
                    if (company_idx not in stock_sector_map or 
                        len(stock_sector_map[company_idx]) == 0):
                        continue
                    
                    sector_id = list(stock_sector_map[company_idx])[0]
                    if sector_id not in val_weekly_inter_sector_embeddings[week_idx]:
                        continue
                    
                    attentive_emb = torch.tensor(company_data['attentive'], dtype=torch.float32).unsqueeze(0)
                    graph_emb = torch.tensor(company_data['graph'], dtype=torch.float32).unsqueeze(0)
                    sector_emb = val_weekly_inter_sector_embeddings[week_idx][sector_id].unsqueeze(0)
                    
                    if company_idx in val_stock_returns_map and len(val_stock_returns_map[company_idx]) > week_idx:
                        actual_return = val_stock_returns_map[company_idx][week_idx]
                        actual_movement = 1.0 if actual_return > 0 else 0.0
                        
                        return_pred, movement_pred = model(attentive_emb, graph_emb, sector_emb)
                        
                        # Get company name from mapping, or use ID as string if not found
                        company_name = company_id_to_name.get(company_idx, f"Company_{company_idx}")
                        
                        predictions_data['date'].append(current_date)
                        predictions_data['company_id'].append(company_idx)
                        predictions_data['company_name'].append(company_name)
                        predictions_data['actual_return'].append(actual_return)
                        predictions_data['predicted_return'].append(return_pred.item())
                        predictions_data['movement_actual'].append(actual_movement)
                        predictions_data['movement_predicted'].append(movement_pred.item())
    
    if predictions_data['date']:
        pred_df = pd.DataFrame(predictions_data)
        
        # Convert company_id to string for better readability
        pred_df['company_id'] = pred_df['company_id'].astype(str)
        
        output_file = 'stock_return_predictions.csv'
        pred_df.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}")
        
        print("\nSample predictions:")
        print(pred_df.head())
        
        return pred_df
    else:
        print("No prediction data available to save")
        return None

In [None]:
predictions_df = generate_predictions_csv_with_names(model)

if predictions_df is not None:
    print(f"\nTotal predictions generated: {len(predictions_df)}")
    print(f"Number of unique companies: {predictions_df['company_id'].nunique()}")
    print(f"Date range: {predictions_df['date'].min()} to {predictions_df['date'].max()}")


Generating predictions for each company across all dates...
Predictions saved to stock_return_predictions.csv

Sample predictions:
                       date company_id company_name  actual_return  \
0 2025-01-07 00:00:00+05:30        131  Company_131       0.023595   
1 2025-01-07 00:00:00+05:30        136  Company_136      -0.000654   
2 2025-01-07 00:00:00+05:30        266  Company_266      -0.029267   
3 2025-01-07 00:00:00+05:30        396  Company_396      -0.008955   
4 2025-01-07 00:00:00+05:30        274  Company_274       0.017566   

   predicted_return  movement_actual  movement_predicted  
0         -0.608756              1.0            0.505033  
1         -0.601035              0.0            0.505234  
2         -0.597668              0.0            0.505858  
3         -0.601886              0.0            0.504720  
4         -0.603696              1.0            0.504827  

Total predictions generated: 17355
Number of unique companies: 445
Date range: 2025-01-07 00: