In [1]:
!pip install plotly kaleido
!pip install seaborn

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl.metadata (15 kB)
Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [2]:
import os
import torch
import pandas as pd
from torch import nn
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
from torch import Tensor
import math
from math import sqrt
from transformers import BertTokenizer, BertModel
import logging
from typing import Dict, List, Tuple
import tqdm
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import sys
import json
from collections import defaultdict
import seaborn as sns
import plotly.graph_objects as go

def print_gpu_memory():
    if torch.cuda.is_available():
        print(f"GPU memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
        print(f"GPU memory reserved: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")

def set_seed(seed=42):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

set_seed(42)
print_gpu_memory()

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('forecasting.log'),
        logging.StreamHandler(sys.stdout)
    ]
)

GPU memory allocated: 0.00 MB
GPU memory reserved: 0.00 MB


In [3]:
class FullAttention(nn.Module):
    def __init__(
        self,
        mask_flag=True,
        factor=5,
        scale=None,
        attention_dropout=0.1,
        output_attention=False,
    ):
        super(FullAttention, self).__init__()
        self.scale = scale
        self.mask_flag = mask_flag
        self.output_attention = output_attention
        self.dropout = nn.Dropout(attention_dropout)

    def forward(self, queries, keys, values, attn_mask, tau=None, delta=None):
        B, L, H, E = queries.shape
        _, S, _, D = values.shape
        scale = self.scale or 1.0 / sqrt(E)

        scores = torch.einsum("blhe,bshe->bhls", queries, keys)

        if self.mask_flag:
            if attn_mask is None:
                attn_mask = TriangularCausalMask(B, L, device=queries.device)
            scores.masked_fill_(attn_mask.mask, -np.inf)

        A = self.dropout(torch.softmax(scale * scores, dim=-1))
        V = torch.einsum("bhls,bshd->blhd", A, values)
        if self.output_attention:
            return V.contiguous(), A
        else:
            return V.contiguous(), None

class AttentionLayer(nn.Module):
    def __init__(self, attention, d_model, n_heads, d_keys=None, d_values=None):
        super(AttentionLayer, self).__init__()
        d_keys = d_keys or (d_model // n_heads)
        d_values = d_values or (d_model // n_heads)

        self.inner_attention = attention
        self.query_projection = nn.Linear(d_model, d_keys * n_heads)
        self.key_projection = nn.Linear(d_model, d_keys * n_heads)
        self.value_projection = nn.Linear(d_model, d_values * n_heads)
        self.out_projection = nn.Linear(d_values * n_heads, d_model)
        self.n_heads = n_heads

    def forward(self, queries, keys, values, attn_mask, tau=None, delta=None):
        B, L, _ = queries.shape
        _, S, _ = keys.shape
        H = self.n_heads

        queries = self.query_projection(queries).view(B, L, H, -1)
        keys = self.key_projection(keys).view(B, S, H, -1)
        values = self.value_projection(values).view(B, S, H, -1)

        out, attn = self.inner_attention(
            queries, keys, values, attn_mask, tau=tau, delta=delta
        )
        out = out.view(B, L, -1)
        return self.out_projection(out), attn

class EncoderLayer(nn.Module):
    def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
        super(EncoderLayer, self).__init__()
        d_ff = d_ff or 4 * d_model
        self.attention = attention
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = F.relu if activation == "relu" else F.gelu

    def forward(self, x, attn_mask=None, tau=None, delta=None):
        new_x, attn = self.attention(x, x, x, attn_mask=attn_mask, tau=tau, delta=delta)
        x = x + self.dropout(new_x)
        y = x = self.norm1(x)
        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
        y = self.dropout(self.conv2(y).transpose(-1, 1))
        return self.norm2(x + y), attn

class Encoder(nn.Module):
    def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
        super(Encoder, self).__init__()
        self.attn_layers = nn.ModuleList(attn_layers)
        self.conv_layers = (
            nn.ModuleList(conv_layers) if conv_layers is not None else None
        )
        self.norm = norm_layer

    def forward(self, x, attn_mask=None, tau=None, delta=None):
        attns = []
        if self.conv_layers is not None:
            for i, (attn_layer, conv_layer) in enumerate(
                zip(self.attn_layers, self.conv_layers)
            ):
                delta = delta if i == 0 else None
                x, attn = attn_layer(x, attn_mask=attn_mask, tau=tau, delta=delta)
                x = conv_layer(x)
                attns.append(attn)
            x, attn = self.attn_layers[-1](x, tau=tau, delta=None)
            attns.append(attn)
        else:
            for attn_layer in self.attn_layers:
                x, attn = attn_layer(x, attn_mask=attn_mask, tau=tau, delta=delta)
                attns.append(attn)

        if self.norm is not None:
            x = self.norm(x)
        return x, attns

class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEmbedding, self).__init__()
        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
        pe = torch.zeros(max_len, d_model).float()
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        pe.require_grad = False
        self.register_buffer("pe", pe)

    def forward(self, x):
        return self.pe[:, : x.size(1)]

In [4]:
class MultiHorizonFinancialDataset(Dataset):
    def __init__(self, time_series_path, text_path, input_window=5, forecast_horizon=1, max_len=390):
        self.input_window = input_window
        self.forecast_horizon = forecast_horizon
        self.max_len = max_len

        time_series_data = pd.read_csv(time_series_path)
        print("Processing time series data...")
        for col in ['open', 'high', 'low', 'close', 'volume']:
            time_series_data[col] = time_series_data[col].apply(
                lambda x: self._process_list(x, max_len)
            )

        print("Loading text data...")
        text_data = pd.read_csv(text_path)
        print("Merging datasets...")
        self.data = pd.merge(
            time_series_data,
            text_data,
            on='date',
            how='inner'
        )

        self.dates = self.data['date'].values
        self._process_features()
        self._create_forecast_targets()

    def _process_list(self, x, max_len):
        try:
            if isinstance(x, str):
                values = eval(x)
            else:
                values = x
            values = np.array(values, dtype=np.float32)
            if np.any(np.isnan(values)) or np.any(np.isinf(values)):
                values = np.nan_to_num(values, 0)

            if len(values) > max_len:
                return values[:max_len]
            elif len(values) < max_len:
                padding = np.full(max_len - len(values), values[-1])
                return np.concatenate([values, padding])
            return values
        except Exception as e:
            print(f"Error processing value: {str(e)}")
            return np.zeros(max_len)

    def _compute_feature_stats(self):
        try:
            self.feature_stats = {
                'mean': np.nanmean(self.features[:, :4, :], axis=(0, 2)),
                'std': np.nanstd(self.features[:, :4, :], axis=(0, 2)),
                'volume_mean': np.nanmean(self.features[:, 4, :]),
                'volume_std': np.nanstd(self.features[:, 4, :])
            }
            self.feature_stats['std'] = np.where(
                self.feature_stats['std'] == 0,
                1e-6,
                self.feature_stats['std']
            )
            if self.feature_stats['volume_std'] == 0:
                self.feature_stats['volume_std'] = 1e-6
            self._normalize_features()
        except Exception as e:
            raise RuntimeError(f"Error computing feature statistics: {str(e)}")

    def _normalize_features(self):
        for i in range(4):
            self.features[:, i, :] = (
                (self.features[:, i, :] - self.feature_stats['mean'][i]) /
                self.feature_stats['std'][i]
            )
        self.features[:, 4, :] = (
            (np.log1p(self.features[:, 4, :]) - np.log1p(self.feature_stats['volume_mean'])) /
            (self.feature_stats['volume_std'] + 1e-8)
        )

    def _process_features(self):
        print("Processing features...")
        self.features = []
        for _, row in self.data.iterrows():
            try:
                daily_features = np.stack([
                    row['open'],
                    row['high'],
                    row['low'],
                    row['close'],
                    row['volume']
                ])
                self.features.append(daily_features)
            except Exception as e:
                print(f"Error processing row: {str(e)}")
                daily_features = np.zeros((5, self.max_len))
                self.features.append(daily_features)
        self.features = np.array(self.features)
        self._compute_feature_stats()

    def _create_forecast_targets(self):
        print("Creating forecast targets...")
        self.targets = []
        self.valid_indices = []
        for i in range(len(self.features) - self.input_window - self.forecast_horizon):
            current_close = self.features[i + self.input_window - 1][3, -1]
            future_idx = i + self.input_window
            future_price = self.features[future_idx][3, -1]
            target = 1 if future_price > current_close else 0
            self.targets.append(target)
            self.valid_indices.append(i)
        self.targets = np.array(self.targets)

    def __len__(self):
        return len(self.valid_indices)

    def __getitem__(self, idx):
        real_idx = self.valid_indices[idx]
        x = self.features[real_idx:real_idx + self.input_window]
        text_window = self.data['text'].iloc[real_idx:real_idx + self.input_window].tolist()
        target = self.targets[idx]
        dates = self.data['date'].iloc[real_idx:real_idx + self.input_window + self.forecast_horizon].tolist()
        return {
            "x_enc": torch.tensor(x, dtype=torch.float32),
            "text": text_window,
            "targets": torch.tensor(target, dtype=torch.long),
            "dates": dates
        }

def custom_collate_fn(batch):
    x_enc = torch.stack([item['x_enc'] for item in batch])
    targets = torch.tensor([item['targets'] for item in batch], dtype=torch.long)
    dates = [item['dates'] for item in batch]
    text = [item['text'] for item in batch]
    return {
        'x_enc': x_enc,
        'text': text,
        'targets': targets,
        'dates': dates
    }

In [5]:
class PatchEmbedding(nn.Module):
    def __init__(self, d_model, patch_len, stride, padding, dropout):
        super().__init__()
        self.padding_patch_layer = nn.ReplicationPad1d((0, padding))
        self.value_embedding = nn.Linear(patch_len * 5, d_model, bias=False)
        self.position_embedding = PositionalEmbedding(d_model, max_len=1024)
        self.dropout = nn.Dropout(dropout)
        self.patch_len = patch_len
        self.stride = stride
        self.padding = padding

    def forward(self, x):
        batch_size, window_size, n_features, n_minutes = x.shape
        x = x.reshape(-1, n_features, n_minutes)
        x = self.padding_patch_layer(x)
        x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride)
        num_patches = x.size(2)
        x = x.permute(0, 2, 1, 3)
        x = x.reshape(batch_size * window_size, num_patches, -1)
        x = self.value_embedding(x)
        x = x + self.position_embedding(x)
        x = self.dropout(x)
        x = x.reshape(batch_size, window_size, num_patches, -1)
        return x, n_features

class PatchTSTWithBERT(nn.Module):
    def __init__(self, config, dataset, bert_model="bert-base-uncased"):
        super().__init__()
        self.config = config
        self.model_config = config['models']['patchtst']

        if not hasattr(dataset, 'features'):
            raise ValueError("Dataset must have 'features' attribute")

        self.enc_in = dataset.features.shape[1]  # 5
        self.num_class = 2
        self.max_seq_len = dataset.features.shape[2]
        self.forecast_horizon = 1
        self.input_window = config['input_window']

        patch_len = self.model_config['patching']['patch_len']
        stride = self.model_config['patching']['stride']
        padding = self.model_config['patching']['padding']
        self.n_patches = ((self.max_seq_len + padding) - patch_len) // stride + 1
        self.projection_dim = self.input_window * self.n_patches * self.model_config['d_model']

        print(f"Input features shape: {dataset.features.shape}")
        print(f"Patch length: {patch_len}")
        print(f"Stride: {stride}")
        print(f"Number of patches: {self.n_patches}")
        print(f"Final projection dim: {self.projection_dim}")

        self.patch_embedding = PatchEmbedding(
            d_model=self.model_config['d_model'],
            patch_len=patch_len,
            stride=stride,
            padding=padding,
            dropout=config['training']['dropout'],
        )

        self.encoder = Encoder(
            [
                EncoderLayer(
                    AttentionLayer(
                        FullAttention(False, factor=3, attention_dropout=config['training']['dropout']),
                        self.model_config['d_model'],
                        self.model_config['n_heads'],
                    ),
                    self.model_config['d_model'],
                    self.model_config['d_ff'],
                    dropout=config['training']['dropout'],
                    activation="gelu",
                )
                for _ in range(self.model_config['e_layers'])
            ],
            norm_layer=nn.LayerNorm(self.model_config['d_model']),
        )

        self.prediction_head = nn.Sequential(
            nn.Linear(self.projection_dim, self.model_config['d_model']),
            nn.ReLU(),
            nn.Dropout(config['training']['dropout']),
            nn.Linear(self.model_config['d_model'], self.num_class)
        )

        self.bert_tokenizer = BertTokenizer.from_pretrained(bert_model)
        self.bert = BertModel.from_pretrained(bert_model)

        if config.get('freeze_bert', True):
            for param in self.bert.parameters():
                param.requires_grad = False

        self.bert_projection = nn.Linear(768, self.model_config['d_model'])

    def forward(self, x, prompts=None):
        B, W, n_features, n_minutes = x.shape
        x_patched, _ = self.patch_embedding(x)  # [B, W, P, D]

        if prompts is not None:
            text_embeds = self.encode_prompts(prompts)  # [B, W, D]
            x_patched = x_patched + text_embeds.unsqueeze(2).expand(-1, -1, x_patched.size(2), -1)

        B, W, P, D = x_patched.shape
        x_reshaped = x_patched.reshape(B * W, P, D)
        encoded_output, _ = self.encoder(x_reshaped)
        encoded_output = encoded_output.reshape(B, W, P, D)

        x_encoded = encoded_output.reshape(B, W * P * D)
        output = self.prediction_head(x_encoded)  # [B, 2]
        return output

    def encode_prompts(self, prompts):
        batch_size = len(prompts)
        window_size = len(prompts[0])
        all_embeddings = []

        for batch_idx in range(batch_size):
            window_embeddings = []
            for window_idx in range(window_size):
                encoded = self.bert_tokenizer(
                    prompts[batch_idx][window_idx],
                    padding='max_length',
                    truncation=True,
                    max_length=128,
                    return_tensors="pt"
                ).to(next(self.bert.parameters()).device)

                with torch.no_grad():
                    outputs = self.bert(**encoded)
                    embedding = outputs.last_hidden_state[:, 0, :]
                    window_embeddings.append(embedding)

            batch_embeddings = torch.cat(window_embeddings, dim=0)
            all_embeddings.append(batch_embeddings)

        all_embeddings = torch.stack(all_embeddings)  # [B, W, 768]
        return self.bert_projection(all_embeddings)

In [6]:
class MetricTracker:
    def __init__(self):
        self.metrics = defaultdict(list)

    def update(self, metrics_dict):
        for k, v in metrics_dict.items():
            self.metrics[k].append(v)

    def get_latest(self, metric_name):
        return self.metrics[metric_name][-1]

    def get_history(self, metric_name):
        return self.metrics[metric_name]

class SingleHorizonTrainer:
    def __init__(self, model, train_loader, val_loader, config):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(
            model.parameters(),
            lr=config['lr'],
            weight_decay=config['weight_decay']
        )

        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            mode='min',
            factor=0.5,
            patience=5,
            verbose=True
        )

    def train_epoch(self, epoch):
        self.model.train()
        total_loss = 0

        for batch in tqdm.tqdm(self.train_loader):
            self.optimizer.zero_grad()
            x_enc = batch['x_enc'].to(self.device)
            targets = batch['targets'].to(self.device)
            text = batch['text']

            outputs = self.model(x_enc, text)
            loss = self.criterion(outputs, targets)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config['training']['gradient_clip'])
            self.optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(self.train_loader)
        return {'loss': avg_loss}

    def validate(self):
        self.model.eval()
        total_loss = 0
        all_predictions = []
        all_targets = []

        with torch.no_grad():
            for batch in self.val_loader:
                x_enc = batch['x_enc'].to(self.device)
                targets = batch['targets'].to(self.device)
                text = batch['text']

                outputs = self.model(x_enc, text)
                loss = self.criterion(outputs, targets)
                total_loss += loss.item()

                preds = outputs.argmax(dim=1).cpu().numpy()
                all_predictions.extend(preds)
                all_targets.extend(targets.cpu().numpy())

        avg_loss = total_loss / len(self.val_loader)
        accuracy = accuracy_score(all_targets, all_predictions)
        f1 = f1_score(all_targets, all_predictions, average='weighted')

        return {'loss': avg_loss, 'accuracy': accuracy, 'f1': f1, 'predictions': all_predictions, 'targets': all_targets}

def create_temporal_split(dataset, val_ratio=0.2, window_size=5):
    total_size = len(dataset)
    split_idx = int((1 - val_ratio) * total_size)
    split_idx = max(split_idx - window_size, 0)

    train_indices = list(range(split_idx))
    val_indices = list(range(split_idx + window_size, total_size))

    train_dataset = Subset(dataset, train_indices)
    val_dataset = Subset(dataset, val_indices)

    train_start = dataset.dates[dataset.valid_indices[0]]
    train_end = dataset.dates[dataset.valid_indices[split_idx]] if split_idx < len(dataset.valid_indices) else dataset.dates[dataset.valid_indices[-1]]
    val_start = dataset.dates[dataset.valid_indices[split_idx + window_size]] if (split_idx + window_size) < len(dataset.valid_indices) else dataset.dates[dataset.valid_indices[-1]]
    val_end = dataset.dates[dataset.valid_indices[-1]]

    print(f"Training period: {train_start} to {train_end}")
    print(f"Validation period: {val_start} to {val_end}")

    return train_dataset, val_dataset

def plot_single_horizon_results(predictions, targets):
    fig, ax = plt.subplots(figsize=(15, 5))
    ax.plot(targets, label='Actual (0=Down, 1=Up)', alpha=0.6)
    ax.plot(predictions, label='Predicted (0=Down, 1=Up)', alpha=0.6)
    ax.set_title('Single Horizon Predictions')
    ax.set_xlabel('Time Step')
    ax.set_ylabel('Direction')
    ax.legend()
    acc = accuracy_score(targets, predictions)
    ax.text(0.02, 0.98, f'Accuracy: {acc:.4f}',
            transform=ax.transAxes,
            verticalalignment='top')
    plt.tight_layout()
    return fig

def plot_interactive_predictions(predictions, targets):
    correct = (predictions == targets)
    accuracy = np.mean(correct)
    timesteps = np.arange(len(predictions))
    direction_map = {0: "Down", 1: "Up"}
    pred_directions = [direction_map[p] for p in predictions]
    target_directions = [direction_map[t] for t in targets]
    colors = np.where(correct, 'green', 'red')

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=timesteps,
        y=predictions,
        mode='markers',
        marker=dict(
            size=12,
            color=colors,
            opacity=0.7,
            line=dict(width=1, color='DarkSlateGrey')
        ),
        text=[
            f"Step: {ts}<br>Predicted: {pd}<br>Actual: {td}"
            for ts, pd, td in zip(timesteps, pred_directions, target_directions)
        ],
        hovertemplate="<b>%{text}</b><extra></extra>",
        name="Predictions"
    ))

    fig.add_hline(y=0.5, line_color="gray", opacity=0.3)
    fig.update_layout(
        title={
            'text': f"Stock Direction Predictions (Accuracy: {accuracy*100:.2f}%)",
            'y':0.9,
            'x':0.5,
            'xanchor':'center',
            'yanchor':'top'
        },
        xaxis_title="Time Step",
        yaxis_title="Direction (0=Down, 1=Up)",
        template="plotly_dark"
    )
    return fig


In [7]:
if __name__ == "__main__":
    import torch
    import logging
    from torch.utils.data import DataLoader
    import numpy as np
    from pathlib import Path

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler('forecast_training.log'),
            logging.StreamHandler()
        ]
    )
    logger = logging.getLogger(__name__)

    config = {
        'models': {
            'patchtst': {
                'd_model': 128,
                'patching': {
                    'patch_len': 30,
                    'stride': 15,
                    'padding': 15
                },
                'n_heads': 4,
                'd_ff': 512,
                'e_layers': 3,
            }
        },
        'training': {
            'dropout': 0.1,
            'batch_size': 16,
            'epochs': 20,
            'gradient_clip': 1.0,
            'early_stopping_patience': 5,
            'validation_split': 0.2,
        },
        'forecast_horizon': 1,
        'input_window': 5,
        'lr': 1e-4,
        'weight_decay': 1e-4,
        'freeze_bert': True
    }

    os.makedirs('checkpoints', exist_ok=True)
    os.makedirs('results', exist_ok=True)

    try:
        logger.info("Creating dataset...")
        dataset = MultiHorizonFinancialDataset(
            time_series_path='AAPL_train_data_aggregated.csv',
            text_path='AAPL_tweets_train.csv',
            input_window=config['input_window'],
            forecast_horizon=config['forecast_horizon']
        )

        logger.info("Creating train/val split using temporal split...")
        train_dataset, val_dataset = create_temporal_split(
            dataset,
            val_ratio=config['training']['validation_split'],
            window_size=config['input_window']
        )

        train_loader = DataLoader(
            train_dataset,
            batch_size=config['training']['batch_size'],
            shuffle=True,
            collate_fn=custom_collate_fn,
            num_workers=2,
            pin_memory=True
        )

        val_loader = DataLoader(
            val_dataset,
            batch_size=config['training']['batch_size'],
            shuffle=False,
            collate_fn=custom_collate_fn,
            num_workers=2,
            pin_memory=True
        )

        logger.info("Initializing model...")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = PatchTSTWithBERT(config=config, dataset=dataset).to(device)

        trainer = SingleHorizonTrainer(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            config=config
        )

        logger.info("Starting training...")
        best_val_loss = float('inf')
        patience_counter = 0

        train_losses = []
        val_losses = []
        val_accuracies = []
        val_f1_scores = []

        for epoch in range(config['training']['epochs']):
            train_metrics = trainer.train_epoch(epoch)
            logger.info(f"Epoch {epoch + 1}/{config['training']['epochs']}")
            logger.info(f"Train Loss: {train_metrics['loss']:.4f}")

            val_metrics = trainer.validate()
            logger.info(f"Validation Loss: {val_metrics['loss']:.4f}")
            logger.info(f"Validation Accuracy: {val_metrics['accuracy']:.4f}, Validation F1: {val_metrics['f1']:.4f}")

            train_losses.append(train_metrics['loss'])
            val_losses.append(val_metrics['loss'])
            val_accuracies.append(val_metrics['accuracy'])
            val_f1_scores.append(val_metrics['f1'])

            # Scheduler step on validation loss
            trainer.scheduler.step(val_metrics['loss'])

            if val_metrics['loss'] < best_val_loss:
                best_val_loss = val_metrics['loss']
                patience_counter = 0
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': trainer.optimizer.state_dict(),
                    'val_loss': val_metrics['loss'],
                    'config': config,
                }, 'checkpoints/best_model.pth')
                logger.info(f"Saved new best model with val_loss: {val_metrics['loss']:.4f}")
            else:
                patience_counter += 1
                if patience_counter >= config['training']['early_stopping_patience']:
                    logger.info("Early stopping triggered")
                    break

        logger.info("Performing final evaluation with best model...")
        # Load best model
        checkpoint = torch.load('checkpoints/best_model.pth', map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        model.eval()

        final_predictions = []
        final_targets = []
        with torch.no_grad():
            for batch in val_loader:
                x_enc = batch['x_enc'].to(device)
                text = batch['text']
                targets = batch['targets']
                outputs = model(x_enc, text)
                preds = outputs.argmax(dim=1).cpu().numpy()
                final_predictions.extend(preds)
                final_targets.extend(targets.numpy())

        final_predictions = np.array(final_predictions)
        final_targets = np.array(final_targets)
        final_acc = accuracy_score(final_targets, final_predictions)
        final_f1 = f1_score(final_targets, final_predictions)

        final_metrics = {
            'accuracy': float(final_acc),
            'f1': float(final_f1)
        }

        with open('results/final_metrics.json', 'w') as f:
            json.dump(final_metrics, f, indent=4)

        # Plot and save train/val loss
        plt.figure(figsize=(10,6))
        plt.plot(train_losses, label='Train Loss', marker='o')
        plt.plot(val_losses, label='Validation Loss', marker='o')
        plt.title("Training and Validation Loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig('results/train_val_loss.png')
        plt.close()

        # Confusion Matrix
        cm = confusion_matrix(final_targets, final_predictions)
        plt.figure(figsize=(6,5))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
        plt.title("Confusion Matrix")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.tight_layout()
        plt.savefig('results/confusion_matrix.png')
        plt.close()

        # Plot and save final interactive predictions
        interactive_fig = plot_interactive_predictions(final_predictions, final_targets)
        # Save as HTML
        interactive_fig.write_html('results/final_forecast_interactive.html')
        # Save as PNG (Requires kaleido: pip install kaleido)
        interactive_fig.write_image('results/final_forecast_interactive.png')

        # Optionally, also save the original static plot
        fig = plot_single_horizon_results(final_predictions, final_targets)
        plt.savefig('results/final_forecast_results.png')
        plt.close()

        logger.info("All results saved. Training completed successfully!")

    except Exception as e:
        logger.error(f"Error during training: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())
        raise


Processing time series data...
Loading text data...
Merging datasets...
Processing features...
Creating forecast targets...
Training period: 2020-01-02 to 2021-08-18
Validation period: 2021-08-25 to 2022-01-21
Input features shape: (525, 5, 390)
Patch length: 30
Stride: 15
Number of patches: 26
Final projection dim: 16640


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

100%|██████████| 26/26 [01:49<00:00,  4.20s/it]
100%|██████████| 26/26 [01:45<00:00,  4.04s/it]
100%|██████████| 26/26 [01:45<00:00,  4.06s/it]
100%|██████████| 26/26 [01:45<00:00,  4.05s/it]
100%|██████████| 26/26 [01:45<00:00,  4.04s/it]
100%|██████████| 26/26 [01:45<00:00,  4.06s/it]
100%|██████████| 26/26 [01:45<00:00,  4.04s/it]
100%|██████████| 26/26 [01:45<00:00,  4.04s/it]
100%|██████████| 26/26 [01:45<00:00,  4.05s/it]
100%|██████████| 26/26 [01:44<00:00,  4.03s/it]
  checkpoint = torch.load('checkpoints/best_model.pth', map_location=device)
