In [1]:
# Install and import graph embedding libraries
try:
    import node2vec
    print("node2vec is already installed")
except ImportError:
    print("Installing node2vec and dependencies...")
    import subprocess
    subprocess.check_call(['pip', 'install', 'node2vec', 'networkx', 'gensim', '-q'])
    print("Installation complete!")
    import node2vec

import networkx as nx
from node2vec import Node2Vec

print("Libraries loaded successfully")

node2vec is already installed
Libraries loaded successfully


In [2]:
import os
import numpy as np
import pandas as pd
import pywt  # PyWavelets for DWT
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Configuration
BASE_DIR = Path('/home/ubuntu/rajnish/Multitask-Stockformer')
FACTOR_DIR = BASE_DIR / 'data/NIFTY200/Alpha_158_2022-01-01_2024-08-31'
LABEL_FILE = BASE_DIR / 'data/NIFTY200/Stock_NIFTY_2022-01-01_2024-08-31/label.csv'
OUTPUT_DIR = BASE_DIR / 'data/NIFTY200/Stock_NIFTY_2022-01-01_2024-08-31'

# Wavelet parameters (matching original paper)
WAVELET_TYPE = 'sym2'  # Symlet 2
WAVELET_LEVEL = 1      # Single-level decomposition

# Correlation threshold for adjacency matrix
CORR_THRESHOLD = 0.3

print(f"Base directory: {BASE_DIR}")
print(f"Factor directory: {FACTOR_DIR}")
print(f"Label file: {LABEL_FILE}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"\nWavelet: {WAVELET_TYPE}, Level: {WAVELET_LEVEL}")
print(f"Correlation threshold: {CORR_THRESHOLD}")

Base directory: /home/ubuntu/rajnish/Multitask-Stockformer
Factor directory: /home/ubuntu/rajnish/Multitask-Stockformer/data/NIFTY200/Alpha_158_2022-01-01_2024-08-31
Label file: /home/ubuntu/rajnish/Multitask-Stockformer/data/NIFTY200/Stock_NIFTY_2022-01-01_2024-08-31/label.csv
Output directory: /home/ubuntu/rajnish/Multitask-Stockformer/data/NIFTY200/Stock_NIFTY_2022-01-01_2024-08-31

Wavelet: sym2, Level: 1
Correlation threshold: 0.3


## 1. Setup & Environment

# Phase 4: Stockformer Input Preprocessing

Transform Phase 3 outputs (22 Alpha158 factors) into Stockformer model input files:
1. `flow.npz` - Wavelet-decomposed factors
2. `trend_indicator.npz` - Binary trend labels
3. `corr_adj.npy` - Correlation adjacency matrix
4. `128_corr_struc2vec_adjgat.npy` - Graph embeddings (MANDATORY)
5. `label.csv` - Already exists from Phase 3

## 2. Load Factor Data

In [3]:
# Load selected factor names
factor_list_file = FACTOR_DIR / 'selected_factors.txt'
with open(factor_list_file, 'r') as f:
    selected_factors = [line.strip() for line in f.readlines()]

print(f"Number of factors: {len(selected_factors)}")
print(f"\nFactor list:")
for i, factor in enumerate(selected_factors, 1):
    print(f"{i:2d}. {factor}")

Number of factors: 22

Factor list:
 1. STD20
 2. KLEN
 3. BETA60
 4. BETA20
 5. HIGH0
 6. STD10
 7. MIN60
 8. KUP
 9. RESI20
10. QTLD60
11. BETA30
12. STD30
13. IMIN30
14. MA60
15. MAX5
16. MIN30
17. MIN20
18. CORD30
19. ROC30
20. CORR60
21. IMXD30
22. STD5


In [4]:
# Load all factor CSVs into a 3D structure: [time, stock, factor]
print("Loading factor CSVs...")

factor_dfs = {}
for factor_name in selected_factors:
    csv_path = FACTOR_DIR / f"{factor_name}.csv"
    if csv_path.exists():
        df = pd.read_csv(csv_path, index_col=0)  # index is date
        factor_dfs[factor_name] = df
    else:
        print(f"Warning: {csv_path} not found")

print(f"\nLoaded {len(factor_dfs)} factor files")

# Get dimensions
first_factor = list(factor_dfs.values())[0]
num_dates = len(first_factor)
num_stocks = len(first_factor.columns)
num_factors = len(factor_dfs)

print(f"\nData dimensions:")
print(f"  Time steps: {num_dates}")
print(f"  Stocks: {num_stocks}")
print(f"  Factors: {num_factors}")
print(f"\nDate range: {first_factor.index[0]} to {first_factor.index[-1]}")
print(f"Sample stocks: {list(first_factor.columns[:5])}")

Loading factor CSVs...

Loaded 22 factor files

Data dimensions:
  Time steps: 660
  Stocks: 191
  Factors: 22

Date range: 2022-01-03 to 2024-08-30
Sample stocks: ['360ONE', 'ABB', 'ABCAPITAL', 'ACC', 'ADANIENSOL']


## 3. Create 3D Factor Tensor

Convert dictionary of DataFrames to numpy array: `[time, stock, factor]`

In [5]:
# Create 3D tensor: [time, stock, factor]
factor_tensor = np.zeros((num_dates, num_stocks, num_factors))

for factor_idx, factor_name in enumerate(selected_factors):
    if factor_name in factor_dfs:
        factor_tensor[:, :, factor_idx] = factor_dfs[factor_name].values

print(f"Factor tensor shape: {factor_tensor.shape}")
print(f"  [time={num_dates}, stock={num_stocks}, factor={num_factors}]")
print(f"\nTensor statistics:")
print(f"  Min: {factor_tensor.min():.6f}")
print(f"  Max: {factor_tensor.max():.6f}")
print(f"  Mean: {factor_tensor.mean():.6f}")
print(f"  Std: {factor_tensor.std():.6f}")
print(f"  NaN count: {np.isnan(factor_tensor).sum()}")
print(f"  Inf count: {np.isinf(factor_tensor).sum()}")

Factor tensor shape: (660, 191, 22)
  [time=660, stock=191, factor=22]

Tensor statistics:
  Min: nan
  Max: nan
  Mean: nan
  Std: nan
  NaN count: 201078
  Inf count: 0


## 4. Wavelet Decomposition (DWT)

Apply single-level Discrete Wavelet Transform (sym2) to decompose each factor time series into:
- **Approximation coefficients (cA):** Low-frequency component (trend)
- **Detail coefficients (cD):** High-frequency component (noise/volatility)

Original paper uses this to separate slow-changing trends from rapid fluctuations.

In [6]:
# Apply DWT to each stock-factor time series
print(f"Applying {WAVELET_TYPE} wavelet decomposition (level={WAVELET_LEVEL})...")
print("Processing each stock-factor combination...\n")

# Initialize arrays for low and high frequency components
# After decomposition, length will be approximately half
approx_length = pywt.dwt_coeff_len(num_dates, pywt.Wavelet(WAVELET_TYPE).dec_len, mode='smooth')

flow_low_freq = np.zeros((approx_length, num_stocks, num_factors))
flow_high_freq = np.zeros((approx_length, num_stocks, num_factors))

# Apply DWT to each time series
for stock_idx in range(num_stocks):
    if (stock_idx + 1) % 50 == 0:
        print(f"  Processing stock {stock_idx + 1}/{num_stocks}...")
    
    for factor_idx in range(num_factors):
        # Extract time series for this stock-factor combination
        time_series = factor_tensor[:, stock_idx, factor_idx]
        
        # Handle NaN/Inf values
        if np.isnan(time_series).any() or np.isinf(time_series).any():
            time_series = np.nan_to_num(time_series, nan=0.0, posinf=0.0, neginf=0.0)
        
        # Apply DWT
        cA, cD = pywt.dwt(time_series, WAVELET_TYPE, mode='smooth')
        
        # Store coefficients
        flow_low_freq[:, stock_idx, factor_idx] = cA  # Approximation (low freq)
        flow_high_freq[:, stock_idx, factor_idx] = cD  # Detail (high freq)

print(f"\nWavelet decomposition complete!")
print(f"\nLow-frequency component shape: {flow_low_freq.shape}")
print(f"  Min: {flow_low_freq.min():.6f}, Max: {flow_low_freq.max():.6f}")
print(f"  Mean: {flow_low_freq.mean():.6f}, Std: {flow_low_freq.std():.6f}")

print(f"\nHigh-frequency component shape: {flow_high_freq.shape}")
print(f"  Min: {flow_high_freq.min():.6f}, Max: {flow_high_freq.max():.6f}")
print(f"  Mean: {flow_high_freq.mean():.6f}, Std: {flow_high_freq.std():.6f}")

Applying sym2 wavelet decomposition (level=1)...
Processing each stock-factor combination...

  Processing stock 50/191...
  Processing stock 100/191...
  Processing stock 150/191...

Wavelet decomposition complete!

Low-frequency component shape: (331, 191, 22)
  Min: -20.082091, Max: 16.627052
  Mean: 0.000000, Std: 1.312890

High-frequency component shape: (331, 191, 22)
  Min: -6.299598, Max: 8.825853
  Mean: 0.000000, Std: 0.355082


In [7]:
# Save flow.npz
flow_output_path = OUTPUT_DIR / 'flow.npz'

np.savez_compressed(
    flow_output_path,
    low_freq=flow_low_freq,
    high_freq=flow_high_freq
)

print(f"Saved flow.npz to: {flow_output_path}")
print(f"File size: {flow_output_path.stat().st_size / 1024 / 1024:.2f} MB")

# Verify
loaded = np.load(flow_output_path)
print(f"\nVerification:")
print(f"  Keys: {list(loaded.keys())}")
print(f"  low_freq shape: {loaded['low_freq'].shape}")
print(f"  high_freq shape: {loaded['high_freq'].shape}")

Saved flow.npz to: /home/ubuntu/rajnish/Multitask-Stockformer/data/NIFTY200/Stock_NIFTY_2022-01-01_2024-08-31/flow.npz
File size: 18.95 MB

Verification:
  Keys: ['low_freq', 'high_freq']
  low_freq shape: (331, 191, 22)
  high_freq shape: (331, 191, 22)


## 5. Generate Trend Indicators

Binary classification labels: 1 if return > 0 (up), else 0 (down)

In [8]:
# Load label.csv (daily returns)
print(f"Loading labels from: {LABEL_FILE}")
label_df = pd.read_csv(LABEL_FILE, index_col=0)

print(f"\nLabel DataFrame shape: {label_df.shape}")
print(f"Date range: {label_df.index[0]} to {label_df.index[-1]}")
print(f"Stocks: {len(label_df.columns)}")

# Create binary trend indicators
trend_indicator = (label_df.values > 0).astype(np.int32)

print(f"\nTrend indicator shape: {trend_indicator.shape}")
print(f"  [time={trend_indicator.shape[0]}, stock={trend_indicator.shape[1]}]")
print(f"\nTrend statistics:")
print(f"  Up days (1): {(trend_indicator == 1).sum()} ({(trend_indicator == 1).mean()*100:.2f}%)")
print(f"  Down days (0): {(trend_indicator == 0).sum()} ({(trend_indicator == 0).mean()*100:.2f}%)")

Loading labels from: /home/ubuntu/rajnish/Multitask-Stockformer/data/NIFTY200/Stock_NIFTY_2022-01-01_2024-08-31/label.csv

Label DataFrame shape: (95, 191)
Date range: 2024-04-15 to 2024-08-30
Stocks: 191

Trend indicator shape: (95, 191)
  [time=95, stock=191]

Trend statistics:
  Up days (1): 9555 (52.66%)
  Down days (0): 8590 (47.34%)


In [9]:
# Save trend_indicator.npz
trend_output_path = OUTPUT_DIR / 'trend_indicator.npz'

np.savez_compressed(
    trend_output_path,
    trend=trend_indicator
)

print(f"Saved trend_indicator.npz to: {trend_output_path}")
print(f"File size: {trend_output_path.stat().st_size / 1024:.2f} KB")

# Verify
loaded_trend = np.load(trend_output_path)
print(f"\nVerification:")
print(f"  Keys: {list(loaded_trend.keys())}")
print(f"  trend shape: {loaded_trend['trend'].shape}")

Saved trend_indicator.npz to: /home/ubuntu/rajnish/Multitask-Stockformer/data/NIFTY200/Stock_NIFTY_2022-01-01_2024-08-31/trend_indicator.npz
File size: 4.76 KB

Verification:
  Keys: ['trend']
  trend shape: (95, 191)


## 6. Build Correlation Adjacency Matrix

Calculate stock-stock correlation using factor values as features.
Apply threshold to create sparse adjacency matrix for Graph Attention Network.

In [10]:
# Calculate correlation matrix
# Use average factor values over time as features for each stock
print("Calculating stock-stock correlation matrix...")
print(f"Using {num_factors} factors as features\n")

# Average factor values across time for each stock: [stock, factor]
stock_features = np.nanmean(factor_tensor, axis=0)  # [num_stocks, num_factors]

print(f"Stock features shape: {stock_features.shape}")
print(f"  [stock={stock_features.shape[0]}, factor={stock_features.shape[1]}]")

# Calculate correlation matrix between all stock pairs
corr_matrix = np.corrcoef(stock_features)

print(f"\nCorrelation matrix shape: {corr_matrix.shape}")
print(f"Correlation statistics:")
print(f"  Min: {corr_matrix.min():.4f}")
print(f"  Max: {corr_matrix.max():.4f}")
print(f"  Mean: {corr_matrix.mean():.4f}")
print(f"  Median: {np.median(corr_matrix):.4f}")

Calculating stock-stock correlation matrix...
Using 22 factors as features

Stock features shape: (191, 22)
  [stock=191, factor=22]

Correlation matrix shape: (191, 191)
Correlation statistics:
  Min: -1.0000
  Max: 1.0000
  Mean: 0.0057
  Median: 0.0027


In [11]:
# Apply threshold to create sparse adjacency matrix
# Set correlations < threshold to 0 (remove weak edges)
corr_adj = corr_matrix.copy()
corr_adj[np.abs(corr_adj) < CORR_THRESHOLD] = 0

# Set diagonal to 1 (self-loops)
np.fill_diagonal(corr_adj, 1.0)

print(f"\nApplied threshold: |correlation| >= {CORR_THRESHOLD}")
print(f"\nAdjacency matrix statistics:")
print(f"  Non-zero entries: {(corr_adj != 0).sum()}")
print(f"  Sparsity: {(corr_adj == 0).sum() / corr_adj.size * 100:.2f}%")
print(f"  Average edges per node: {(corr_adj != 0).sum() / num_stocks:.2f}")

# Check distribution of correlation strengths
non_diag_corr = corr_adj[~np.eye(corr_adj.shape[0], dtype=bool)]
non_zero_corr = non_diag_corr[non_diag_corr != 0]

if len(non_zero_corr) > 0:
    print(f"\nNon-zero correlation distribution:")
    print(f"  Min: {non_zero_corr.min():.4f}")
    print(f"  Max: {non_zero_corr.max():.4f}")
    print(f"  Mean: {non_zero_corr.mean():.4f}")
    print(f"  Median: {np.median(non_zero_corr):.4f}")


Applied threshold: |correlation| >= 0.3

Adjacency matrix statistics:
  Non-zero entries: 27735
  Sparsity: 23.97%
  Average edges per node: 145.21

Non-zero correlation distribution:
  Min: -1.0000
  Max: 0.9959
  Mean: 0.0004
  Median: -0.3008


In [12]:
# Save correlation adjacency matrix
corr_adj_path = OUTPUT_DIR / 'corr_adj.npy'

np.save(corr_adj_path, corr_adj)

print(f"Saved corr_adj.npy to: {corr_adj_path}")
print(f"File size: {corr_adj_path.stat().st_size / 1024:.2f} KB")

# Verify
loaded_corr = np.load(corr_adj_path)
print(f"\nVerification:")
print(f"  Shape: {loaded_corr.shape}")
print(f"  Non-zero entries: {(loaded_corr != 0).sum()}")

Saved corr_adj.npy to: /home/ubuntu/rajnish/Multitask-Stockformer/data/NIFTY200/Stock_NIFTY_2022-01-01_2024-08-31/corr_adj.npy
File size: 285.13 KB

Verification:
  Shape: (191, 191)
  Non-zero entries: 27735


## 7. Generate Graph Embeddings (MANDATORY)

**Important:** Graph embeddings are MANDATORY for Stockformer model.
- Model architecture uses `adjgat` as spatial positional encoding
- Added directly to features in `spatialAttention` layer: `x_ = x + adjgat`
- Required shape: [num_stocks, 128]

We'll use node2vec to generate 128-dimensional embeddings from the correlation adjacency matrix.

In [13]:
# Step 1: Convert correlation adjacency matrix to NetworkX graph
print("Creating graph from correlation adjacency matrix...")
print(f"Adjacency matrix shape: {corr_adj.shape}")
print(f"Non-zero edges: {(corr_adj != 0).sum()}")

# Create weighted graph from adjacency matrix
G = nx.Graph()

# Add nodes (stocks)
G.add_nodes_from(range(num_stocks))

# Add edges with correlation weights
edges_added = 0
for i in range(num_stocks):
    for j in range(i + 1, num_stocks):  # Only upper triangle to avoid duplicates
        weight = corr_adj[i, j]
        if weight != 0:  # Only add non-zero edges
            G.add_edge(i, j, weight=abs(weight))  # Use absolute value as weight
            edges_added += 1

print(f"\nGraph created:")
print(f"  Nodes: {G.number_of_nodes()}")
print(f"  Edges: {G.number_of_edges()}")
print(f"  Average degree: {sum(dict(G.degree()).values()) / G.number_of_nodes():.2f}")

# Check connectivity
if nx.is_connected(G):
    print(f"  Graph is connected ✓")
else:
    num_components = nx.number_connected_components(G)
    print(f"  Graph has {num_components} connected components")
    largest_cc = max(nx.connected_components(G), key=len)
    print(f"  Largest component size: {len(largest_cc)} nodes")

Creating graph from correlation adjacency matrix...
Adjacency matrix shape: (191, 191)
Non-zero edges: 27735

Graph created:
  Nodes: 191
  Edges: 13772
  Average degree: 144.21
  Graph is connected ✓


In [14]:
# Step 2: Generate node2vec embeddings
# Parameters matching original paper's struc2vec setup
print("\nGenerating node2vec embeddings...")
print("Parameters:")
print("  Dimensions: 128")
print("  Walk length: 80")
print("  Number of walks: 10")
print("  Workers: 4")
print("  p=1, q=1 (balanced BFS/DFS)")

# Initialize node2vec
node2vec_model = Node2Vec(
    G,
    dimensions=128,        # Embedding dimension (matches original 128_corr_struc2vec_adjgat.npy)
    walk_length=80,        # Length of random walk (original paper parameter)
    num_walks=10,          # Number of walks per node (original paper parameter)
    workers=4,             # Parallel workers
    p=1,                   # Return parameter (1 = balanced)
    q=1,                   # In-out parameter (1 = balanced)
    weight_key='weight',   # Use correlation weights
    quiet=False
)

print("\nTraining Word2Vec model on walks...")
# Fit the model (this generates random walks and trains embeddings)
model = node2vec_model.fit(
    window=10,             # Context size
    min_count=1,           # Minimum word count
    batch_words=4,         # Batch size
    epochs=20,             # Training epochs
    sg=1,                  # Skip-gram (1) vs CBOW (0)
    workers=4
)

print("Embedding generation complete!")


Generating node2vec embeddings...
Parameters:
  Dimensions: 128
  Walk length: 80
  Number of walks: 10
  Workers: 4
  p=1, q=1 (balanced BFS/DFS)


Computing transition probabilities:   0%|          | 0/191 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 3/3 [00:00<00:00,  3.34it/s]
Generating walks (CPU: 2): 100%|██████████| 3/3 [00:00<00:00,  3.34it/s]
Generating walks (CPU: 3): 100%|██████████| 2/2 [00:00<00:00,  3.42it/s]
Generating walks (CPU: 4): 100%|██████████| 2/2 [00:00<00:00,  3.16it/s]



Training Word2Vec model on walks...
Embedding generation complete!


In [15]:
# Step 3: Extract embeddings for all nodes
print("Extracting embeddings for all stocks...")

# Create embedding matrix [num_stocks, 128]
adjgat = np.zeros((num_stocks, 128))

for node_id in range(num_stocks):
    try:
        adjgat[node_id] = model.wv[str(node_id)]
    except KeyError:
        # If node not in vocabulary (isolated node), use zero vector
        print(f"Warning: Node {node_id} not in vocabulary, using zero vector")
        adjgat[node_id] = np.zeros(128)

print(f"\nEmbedding matrix shape: {adjgat.shape}")
print(f"  [stocks={adjgat.shape[0]}, dimensions={adjgat.shape[1]}]")
print(f"\nEmbedding statistics:")
print(f"  Min: {adjgat.min():.6f}")
print(f"  Max: {adjgat.max():.6f}")
print(f"  Mean: {adjgat.mean():.6f}")
print(f"  Std: {adjgat.std():.6f}")
print(f"  Zero vectors: {(np.abs(adjgat).sum(axis=1) == 0).sum()}")

Extracting embeddings for all stocks...

Embedding matrix shape: (191, 128)
  [stocks=191, dimensions=128]

Embedding statistics:
  Min: -0.546400
  Max: 0.672861
  Mean: -0.001186
  Std: 0.102895
  Zero vectors: 0


In [16]:
# Step 4: Save graph embeddings
adjgat_path = OUTPUT_DIR / '128_corr_struc2vec_adjgat.npy'

np.save(adjgat_path, adjgat)

print(f"Saved 128_corr_struc2vec_adjgat.npy to: {adjgat_path}")
print(f"File size: {adjgat_path.stat().st_size / 1024:.2f} KB")

# Verify
loaded_adjgat = np.load(adjgat_path)
print(f"\nVerification:")
print(f"  Shape: {loaded_adjgat.shape}")
print(f"  Expected: ({num_stocks}, 128)")
print(f"  Match: {loaded_adjgat.shape == (num_stocks, 128)}")

Saved 128_corr_struc2vec_adjgat.npy to: /home/ubuntu/rajnish/Multitask-Stockformer/data/NIFTY200/Stock_NIFTY_2022-01-01_2024-08-31/128_corr_struc2vec_adjgat.npy
File size: 191.12 KB

Verification:
  Shape: (191, 128)
  Expected: (191, 128)
  Match: True


## 8. Summary & Verification

In [17]:
# Summary of generated files
print("=" * 80)
print("PHASE 4 PREPROCESSING COMPLETE")
print("=" * 80)

print(f"\nOutput directory: {OUTPUT_DIR}")
print(f"\nGenerated files:")

files_to_check = [
    ('flow.npz', 'Wavelet-decomposed factors (low + high freq)'),
    ('trend_indicator.npz', 'Binary trend labels (up/down)'),
    ('corr_adj.npy', 'Stock correlation adjacency matrix'),
    ('128_corr_struc2vec_adjgat.npy', 'Graph embeddings (node2vec, 128-dim)'),
    ('label.csv', 'Daily returns (from Phase 3)'),
]

for filename, description in files_to_check:
    filepath = OUTPUT_DIR / filename
    if filepath.exists():
        size_mb = filepath.stat().st_size / 1024 / 1024
        size_str = f"{size_mb:.2f} MB" if size_mb > 1 else f"{filepath.stat().st_size / 1024:.2f} KB"
        print(f"  ✓ {filename:35s} - {description} ({size_str})")
    else:
        print(f"  ✗ {filename:35s} - MISSING")

print(f"\nData dimensions:")
print(f"  Time steps (wavelet): {approx_length}")
print(f"  Time steps (original): {num_dates}")
print(f"  Stocks: {num_stocks}")
print(f"  Factors: {num_factors}")
print(f"  Graph embedding dims: 128")

print(f"\n" + "=" * 80)
print("ALL REQUIRED FILES GENERATED!")
print("=" * 80)
print("\nREADY FOR PHASE 5: Model Configuration")
print("\nNext steps:")
print("1. Create config/Multitask_NIFTY200_Alpha158.conf")
print("   - Copy from config/Multitask_Stock.conf as template")
print("   - Update file paths to NIFTY200 data directory")
print("   - Set train/val/test splits")
print("\n2. Modify dataset loader: lib/Multitask_Stockformer_utils.py")
print("   - Update factor count: 360 → 22")
print("   - Update stock count: 255 → 191")
print("   - Fix hardcoded Alpha_360 paths to use config")
print("\n3. Test data loading")
print("   - Verify all files load correctly")
print("   - Check tensor shapes match model expectations")
print("\n4. Proceed to training (Phase 6)")

PHASE 4 PREPROCESSING COMPLETE

Output directory: /home/ubuntu/rajnish/Multitask-Stockformer/data/NIFTY200/Stock_NIFTY_2022-01-01_2024-08-31

Generated files:
  ✓ flow.npz                            - Wavelet-decomposed factors (low + high freq) (18.95 MB)
  ✓ trend_indicator.npz                 - Binary trend labels (up/down) (4.76 KB)
  ✓ corr_adj.npy                        - Stock correlation adjacency matrix (285.13 KB)
  ✓ 128_corr_struc2vec_adjgat.npy       - Graph embeddings (node2vec, 128-dim) (191.12 KB)
  ✓ label.csv                           - Daily returns (from Phase 3) (380.53 KB)

Data dimensions:
  Time steps (wavelet): 331
  Time steps (original): 660
  Stocks: 191
  Factors: 22
  Graph embedding dims: 128

ALL REQUIRED FILES GENERATED!

READY FOR PHASE 5: Model Configuration

Next steps:
1. Create config/Multitask_NIFTY200_Alpha158.conf
   - Copy from config/Multitask_Stock.conf as template
   - Update file paths to NIFTY200 data directory
   - Set train/val/test spli