In [49]:
from momentfm import MOMENTPipeline
# linear_prob
model_lp = MOMENTPipeline.from_pretrained(
                "/hy-tmp/better464/MOMENT-1-large",
                model_kwargs={
                    'task_name': 'forecasting',
                    'forecast_horizon': 50,
                    'head_dropout': 0.1,
                    'weight_decay': 0,
                    'freeze_encoder': True,  # Freeze the patch embedding layer
                    'freeze_embedder': True,  # Freeze the transformer encoder
                    'freeze_head': False,  # The linear forecasting head must be trained
                },
            )

Loading weights from local directory


In [50]:
print(model_lp)

MOMENTPipeline(
  (normalizer): RevIN()
  (tokenizer): Patching()
  (patch_embedding): PatchEmbedding(
    (value_embedding): Linear(in_features=8, out_features=1024, bias=False)
    (position_embedding): PositionalEmbedding()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
  

In [51]:
model_lp.init()
print(model_lp)

MOMENTPipeline(
  (normalizer): RevIN()
  (tokenizer): Patching()
  (patch_embedding): PatchEmbedding(
    (value_embedding): Linear(in_features=8, out_features=1024, bias=False)
    (position_embedding): PositionalEmbedding()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
  

In [40]:
import torch
def format_size(size):
    # 对总参数量做格式优化
    K, M, B = 1e3, 1e6, 1e9
    if size == 0:
        return '0'
    elif size < M:
        return f"{size / K:.1f}K"
    elif size < B:
        return f"{size / M:.1f}M"
    else:
        return f"{size / B:.1f}B"

def get_pytorch_model_info(model: torch.nn.Module) -> (dict, list):
    """
    输入一个PyTorch Model对象，返回模型的总参数量（格式化为易读格式）以及每一层的名称、尺寸、精度、参数量、是否可训练和层的类别。

    :param model: PyTorch Model
    :return: (总参数量信息, 参数列表[包括每层的名称、尺寸、数据类型、参数量、是否可训练和层的类别])
    """
    params_list = []
    total_params = 0
    total_params_non_trainable = 0

    for name, param in model.named_parameters():
        # 获取参数所属层的名称
        layer_name = name.split('.')[0]
        # 获取层的对象
        layer = dict(model.named_modules())[layer_name]
        # 获取层的类名
        layer_class = layer.__class__.__name__

        params_count = param.numel()
        trainable = param.requires_grad
        params_list.append({
            'tensor': name,
            'layer_class': layer_class,
            'shape': str(list(param.size())),
            'precision': str(param.dtype).split('.')[-1],
            'params_count': str(params_count),
            'trainable': str(trainable),
        })
        total_params += params_count
        if not trainable:
            total_params_non_trainable += params_count

    total_params_trainable = total_params - total_params_non_trainable
    
    total_params_info = {
        'total_params': format_size(total_params),
        'total_params_trainable': format_size(total_params_trainable),
        'total_params_non_trainable': format_size(total_params_non_trainable)
    }

    return total_params_info, params_list

def filter_dic(it):
    ret_list = []
    for tup in it:
        if tup['trainable'] == 'True':
            ret_list.append(tup)
    return ret_list

In [52]:
# model_lp.init()
# filter_dic(get_pytorch_model_info(model_lp))
total_params_info, param_list = get_pytorch_model_info(model_lp)
filter_dic(param_list)

[{'tensor': 'head.linear.weight',
  'layer_class': 'ForecastingHead',
  'shape': '[50, 65536]',
  'precision': 'float32',
  'params_count': '3276800',
  'trainable': 'True'},
 {'tensor': 'head.linear.bias',
  'layer_class': 'ForecastingHead',
  'shape': '[50]',
  'precision': 'float32',
  'params_count': '50',
  'trainable': 'True'}]

In [42]:
total_params_info, param_list = get_pytorch_model_info(model_test)
filter_dic(param_list)

[]

In [43]:
model_lp.init()
total_params_info, param_list = get_pytorch_model_info(model_lp)
filter_dic(param_list)

[{'tensor': 'head.linear.weight',
  'layer_class': 'ForecastingHead',
  'shape': '[192, 65536]',
  'precision': 'float32',
  'params_count': '12582912',
  'trainable': 'True'},
 {'tensor': 'head.linear.bias',
  'layer_class': 'ForecastingHead',
  'shape': '[192]',
  'precision': 'float32',
  'params_count': '192',
  'trainable': 'True'}]

In [44]:
model_test.init()
total_params_info, param_list = get_pytorch_model_info(model_test)
filter_dic(param_list)

[{'tensor': 'head.linear.weight',
  'layer_class': 'ForecastingHead',
  'shape': '[192, 65536]',
  'precision': 'float32',
  'params_count': '12582912',
  'trainable': 'True'},
 {'tensor': 'head.linear.bias',
  'layer_class': 'ForecastingHead',
  'shape': '[192]',
  'precision': 'float32',
  'params_count': '192',
  'trainable': 'True'}]

In [17]:
# 模型的参数量  341248520
# model_lp.init() 
total_params = sum(param.numel() for param in model_lp.parameters())
print(f'模型总参数量为：{total_params}')
# 修改bias变成可微调
# for name, param in model_lp.named_parameters():
#     if "bias" in name:  # 如果不是bias
#         param.requires_grad = True  # 如果之前本来就不反传了
#     else:
#         pass

requires_grad_num = 0
for name, param in model_lp.named_parameters():
    if param.requires_grad == False:  # 不进行反传的
        pass
    else:  # 进行反传的
        requires_grad_num += param.numel()
pct_grad = requires_grad_num / total_params * 100
print(f'当前模型可训练的参数量:{requires_grad_num}, 占总可训练的参数量的{pct_grad}%')

模型总参数量为：341248520
当前模型可训练的参数量:8200, 占总可训练的参数量的0.002402940824475957%


In [15]:
# 模型的参数量  341248520
model_lp.init() 
total_params = sum(param.numel() for param in model_lp.parameters())
print(f'模型总参数量为：{total_params}')
# 修改bias变成可微调
# for name, param in model_lp.named_parameters():
#     if "bias" in name:  # 如果不是bias
#         param.requires_grad = True  # 如果之前本来就不反传了
#     else:
#         pass

requires_grad_num = 0
for name, param in model_lp.named_parameters():
    if param.requires_grad == False:  # 不进行反传的
        pass
    else:  # 进行反传的
        requires_grad_num += param.numel()
pct_grad = requires_grad_num / total_params * 100
print(f'当前模型可训练的参数量:{requires_grad_num}, 占总可训练的参数量的{pct_grad}%')

模型总参数量为：353823424
当前模型可训练的参数量:12583104, 占总可训练的参数量的3.5563230545188547%


In [None]:
from momentfm import MOMENTPipeline
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import random
import numpy as np
import os
import sys
import numpy as np
import torch
import torch.cuda.amp
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import OneCycleLR
from tqdm import tqdm
import argparse
from momentfm.utils.utils import control_randomness
from momentfm.data.informer_dataset import InformerDataset
from momentfm.utils.forecasting_metrics import get_forecasting_metrics



class MOMENT_Trainer:
    def __init__(self, seed, batch_size, epochs, forecast_horizon, mode, output_path):
        # initialize ptbxl classification dataset
        self.mode = mode
        self.forecast_horizon = forecast_horizon
        self.batch_size = batch_size
        self.epochs = epochs
        self.output_path = output_path
        self.train_dataset = InformerDataset(data_split="train", random_seed=seed, forecast_horizon=self.forecast_horizon)
        self.train_loader = DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

        self.test_dataset = InformerDataset(data_split="test", random_seed=seed, forecast_horizon=self.forecast_horizon)
        self.test_loader = DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=True)
         #create log file to store training logs 
        if not os.path.exists(self.output_path):
            os.makedirs(self.output_path)
        self.log_file = open(os.path.join(self.output_path, f'log_{self.mode}.txt'), 'w')
        sys.stdout = self.log_file
        # linear probing: only train classification head
        # finetuning: train both encoder and classification head
        # unsupervised learning: train SVM on top of MOMENT embeddings
        if self.mode == 'linear_probing':
            self.model = MOMENTPipeline.from_pretrained(
                "/hy-tmp/better464/MOMENT-1-large",
                model_kwargs={
                    'task_name': 'forecasting',
                    'forecast_horizon': self.forecast_horizon,
                    'head_dropout': 0.1,
                    'weight_decay': 0,
                    'freeze_encoder': True,  # Freeze the patch embedding layer
                    'freeze_embedder': True,  # Freeze the transformer encoder
                    'freeze_head': False,  # The linear forecasting head must be trained
                },
            )

        self.model.init()
        print('Model initialized, training mode: ', self.mode)
        #using cross MSE loss for forecasting
        self.criterion = torch.nn.MSELoss()

        if self.mode == 'linear_probing':
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-4)
            # Create a OneCycleLR scheduler
            max_lr = 1e-4
            total_steps = len(self.train_loader) * self.epochs
            self.scheduler = OneCycleLR(self.optimizer, max_lr=max_lr, total_steps=total_steps, pct_start=0.3)
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        

    
    def train(self):
        for epoch in range(self.epochs):
            print(f'Epoch {epoch+1}/{self.epochs}')
            # self.log_file.write(f'Epoch {epoch+1}/{self.epochs}\n')
            self.epoch = epoch + 1

            if self.mode == 'linear_probing':
                self.train_epoch_lp()
                self.evaluate_epoch()
            else:
                raise ValueError('Invalid mode, please choose linear_probing, full_finetuning, or unsupervised_representation_learning')

                
    ####################################Training function##################################
    def train_epoch_lp(self):
        '''
        Train only forecasting head-linear_probing
        '''
        self.model.train()
        self.model.to(self.device)

        # Move the model to the GPU
        self.model = self.model.to(self.device)

        # Move the loss function to the GPU
        self.criterion = self.criterion.to(self.device)
        
        # Enable mixed precision training
        scaler = torch.cuda.amp.GradScaler()
        
        # Gradient clipping value
        max_norm = 5.0
        
        losses = []
        for timeseries, forecast, input_mask in tqdm(self.train_loader, total=len(self.train_loader)):
            # Move the data to the GPU
            timeseries = timeseries.float().to(self.device)
            input_mask = input_mask.to(self.device)
            forecast = forecast.float().to(self.device)
    
            with torch.cuda.amp.autocast():
                output = self.model(timeseries, input_mask)
                
            loss = self.criterion(output.forecast, forecast)
    
            # Scales the loss for mixed precision training
            scaler.scale(loss).backward()
    
            # Clip gradients
            scaler.unscale_(self.optimizer)
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm)
    
            scaler.step(self.optimizer)
            scaler.update()
            self.optimizer.zero_grad(set_to_none=True)
    
            losses.append(loss.item())
    
        losses = np.array(losses)
        average_loss = np.average(losses)
        print(f"Train loss: {average_loss:.3f}")
        # Step the learning rate scheduler
        self.scheduler.step()
  

    ####################################Evaluate function##################################
    def evaluate_epoch(self):        
        # Evaluate the model on the test split
        trues, preds, histories, losses = [], [], [], []
        self.model.eval()
        with torch.no_grad():
            for timeseries, forecast, input_mask in tqdm(self.test_loader, total=len(self.test_loader)):
            # Move the data to the GPU
                timeseries = timeseries.float().to(self.device)
                input_mask = input_mask.to(self.device)
                forecast = forecast.float().to(self.device)
    
                with torch.cuda.amp.autocast():
                    output = self.model(timeseries, input_mask)
                
                loss = self.criterion(output.forecast, forecast)                
                losses.append(loss.item())
    
                trues.append(forecast.detach().cpu().numpy())
                preds.append(output.forecast.detach().cpu().numpy())
                histories.append(timeseries.detach().cpu().numpy())
        
        losses = np.array(losses)
        average_loss = np.average(losses)
        self.model.train()
    
        trues = np.concatenate(trues, axis=0)
        preds = np.concatenate(preds, axis=0)
        histories = np.concatenate(histories, axis=0)
        
        metrics = get_forecasting_metrics(y=trues, y_hat=preds, reduction='mean')
    
        print(f"Test Loss: {average_loss:.3f}| Test MSE: {metrics.mse:.3f} | Test MAE: {metrics.mae:.3f}")

if __name__ == '__main__':
    seed = 13
    control_randomness(seed)
    batch_size = 16
    epochs = 2
    forecast_horizon = 192
    output_path = '/root/moment/tuning_exp/logs'
    mode = 'linear_probing'
    trainer = MOMENT_Trainer(seed, batch_size, epochs, forecast_horizon, mode, output_path)
    trainer.train()
    trainer.log_file.close()

100%|██████████| 497/497 [03:11<00:00,  2.59it/s]
100%|██████████| 169/169 [01:04<00:00,  2.63it/s]
100%|██████████| 497/497 [03:24<00:00,  2.43it/s]
100%|██████████| 169/169 [01:04<00:00,  2.64it/s]


In [58]:
from momentfm import MOMENTPipeline
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import random
import numpy as np
import os
import numpy as np
import torch
import torch.cuda.amp
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import OneCycleLR
from tqdm import tqdm
import argparse
from momentfm.utils.utils import control_randomness
from momentfm.data.informer_dataset import InformerDataset
from momentfm.utils.forecasting_metrics import get_forecasting_metrics



class MOMENT_Trainer:
    def __init__(self, seed, batch_size, forecast_horizon, mode):
        # initialize ptbxl classification dataset
        self.mode = mode
        self.forecast_horizon = forecast_horizon
        self.batch_size = batch_size
        train_dataset = InformerDataset(data_split="train", random_seed=seed, forecast_horizon=self.forecast_horizon)
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)

        test_dataset = InformerDataset(data_split="test", random_seed=seed, forecast_horizon=self.forecast_horizon)
        test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)

        # linear probing: only train classification head
        # finetuning: train both encoder and classification head
        # unsupervised learning: train SVM on top of MOMENT embeddings
        if self.mode == 'linear_probing':
            self.model = MOMENTPipeline.from_pretrained(
                "/hy-tmp/better464/MOMENT-1-large",
                model_kwargs={
                    'task_name': 'forecasting',
                    'forecast_horizon': 192,
                    'head_dropout': 0.1,
                    'weight_decay': 0,
                    'freeze_encoder': True,  # Freeze the patch embedding layer
                    'freeze_embedder': True,  # Freeze the transformer encoder
                    'freeze_head': False,  # The linear forecasting head must be trained
                },
            )

        self.model.init()
        print('Model initialized, training mode: ', self.mode)
        #using cross MSE loss for forecasting
        criterion = torch.nn.MSELoss()

        if self.mode == 'linear_probing':
            optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-4)
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        cur_epoch = 0
        max_epoch = 1

        # Move the model to the GPU
        self.model = self.model.to(device)

        # Move the loss function to the GPU
        criterion = criterion.to(device)
        
        # Enable mixed precision training
        scaler = torch.cuda.amp.GradScaler()
        
        # Create a OneCycleLR scheduler
        max_lr = 1e-4
        total_steps = len(train_loader) * max_epoch
        scheduler = OneCycleLR(optimizer, max_lr=max_lr, total_steps=total_steps, pct_start=0.3)
        
        # Gradient clipping value
        max_norm = 5.0
        
        while cur_epoch < max_epoch:
            losses = []
            for timeseries, forecast, input_mask in tqdm(train_loader, total=len(train_loader)):
                # Move the data to the GPU
                timeseries = timeseries.float().to(device)
                input_mask = input_mask.to(device)
                forecast = forecast.float().to(device)
        
                with torch.cuda.amp.autocast():
                    output = self.model(timeseries, input_mask)
                
                loss = criterion(output.forecast, forecast)
        
                # Scales the loss for mixed precision training
                scaler.scale(loss).backward()
        
                # Clip gradients
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm)
        
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)
        
                losses.append(loss.item())
        
            losses = np.array(losses)
            average_loss = np.average(losses)
            print(f"Epoch {cur_epoch}: Train loss: {average_loss:.3f}")
        
            # Step the learning rate scheduler
            scheduler.step()
            cur_epoch += 1
            
            # Evaluate the model on the test split
            trues, preds, histories, losses = [], [], [], []
            self.model.eval()
            with torch.no_grad():
                for timeseries, forecast, input_mask in tqdm(test_loader, total=len(test_loader)):
                # Move the data to the GPU
                    timeseries = timeseries.float().to(device)
                    input_mask = input_mask.to(device)
                    forecast = forecast.float().to(device)
        
                    with torch.cuda.amp.autocast():
                        output = self.model(timeseries, input_mask)
                    
                    loss = criterion(output.forecast, forecast)                
                    losses.append(loss.item())
        
                    trues.append(forecast.detach().cpu().numpy())
                    preds.append(output.forecast.detach().cpu().numpy())
                    histories.append(timeseries.detach().cpu().numpy())
            
            losses = np.array(losses)
            average_loss = np.average(losses)
            self.model.train()
        
            trues = np.concatenate(trues, axis=0)
            preds = np.concatenate(preds, axis=0)
            histories = np.concatenate(histories, axis=0)
            
            metrics = get_forecasting_metrics(y=trues, y_hat=preds, reduction='mean')
        
            print(f"Epoch {cur_epoch}: Test MSE: {metrics.mse:.3f} | Test MAE: {metrics.mae:.3f}")
        
        # 模型的参数量  341248520
        # total_params = sum(param.numel() for param in  self.model.parameters())
        # print(f'模型总参数量为：{total_params}')

        # # 修改bias变成可微调
        # for name, param in self.model.named_parameters():
        #     if "bias" in name:  # 如果不是bias
        #         param.requires_grad = True  # 如果之前本来就不反传了
        #     else:
        #         pass

        # requires_grad_num = 0
        # for name, param in self.model.named_parameters():
        #     if param.requires_grad == False:  # 不进行反传的
        #         pass
        #     else:  # 进行反传的
        #         requires_grad_num += param.numel()
        # pct_grad = requires_grad_num / total_params * 100
        # print(f'当前模型可训练的参数量:{requires_grad_num}, 占总可训练的参数量的{pct_grad}%')

        
        # print(get_pytorch_model_info(self.model))
if __name__ == '__main__':
    seed = 13
    control_randomness(seed)
    batch_size = 8
    forecast_horizon = 192
    mode = 'linear_probing'
    MOMENT_Trainer(seed, batch_size, forecast_horizon, mode)

Loading weights from local directory
Model initialized, training mode:  linear_probing


100%|██████████| 993/993 [02:55<00:00,  5.65it/s]


Epoch 0: Train loss: 0.467


100%|██████████| 337/337 [00:57<00:00,  5.87it/s]


Epoch 1: Test MSE: 0.421 | Test MAE: 0.431


In [59]:
import matplotlib.pyplot as plt

# Assuming histories, trues, and preds are your lists containing the data
# Extracting the first data point

channel_idx = np.random.randint(0, 7) # There are 7 channels in this dataset
time_index = np.random.randint(0, trues.shape[0]) 

history = histories[time_index, channel_idx, :] 
true = trues[time_index, channel_idx, :]
pred = preds[time_index, channel_idx, :]

plt.figure(figsize=(12, 4))

# Plotting the first time series from history
plt.plot(range(len(history)), history, label='History (512 timesteps)', c='darkblue')

# Plotting ground truth and prediction
num_forecasts = len(true)

offset = len(history)
plt.plot(range(offset, offset + len(true)), true, label='Ground Truth (192 timesteps)', color='darkblue', linestyle='--', alpha=0.5)
plt.plot(range(offset, offset + len(pred)), pred, label='Forecast (192 timesteps)', color='red', linestyle='--')

plt.title(f"ETTh1 (Hourly) -- (idx={time_index}, channel={channel_idx})", fontsize=18)
plt.xlabel('Time', fontsize=14)
plt.ylabel('Value', fontsize=14)
plt.legend(fontsize=14)
plt.show()

NameError: name 'trues' is not defined