In [1]:
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from momentfm import MOMENTPipeline

lora_config = LoraConfig(
                                        r=64,
                                        lora_alpha=32,
                                        target_modules=["q", "v"],
                                        lora_dropout=0.05,
                                        )

model_base = MOMENTPipeline.from_pretrained(
                "/hy-tmp/better464/MOMENT-1-large", #hy-tmp/better464/MOMENT-1-large
                model_kwargs={
                    'task_name': 'forecasting',
                    'forecast_horizon': 20,
                    'head_dropout': 0.1,
                    'weight_decay': 0,
                    'freeze_encoder': True,  # Freeze the patch embedding layer
                    'freeze_embedder': True,  # Freeze the transformer encoder
                    'freeze_head': False,  # The linear forecasting head must be trained
                },
                #use_safetensors = False
            )
# 观察当前模型的参数量
total_params = sum(param.numel() for param in model_base.parameters())
print(f'模型总参数量为：{total_params}')
requires_grad_num = 0
for name, param in model_base.named_parameters():
    if param.requires_grad == False:  # 不进行反传的
        pass
    else:  # 进行反传的
        requires_grad_num += param.numel()
pct_grad = requires_grad_num / total_params * 100
print(f'当前模型可训练的参数量:{requires_grad_num}, 占总可训练的参数量的{pct_grad}%')

# LoRA初始化
model = get_peft_model(model_base, lora_config)
print('LoRA enabled')
model.print_trainable_parameters()
# 'trainable params: 6,291,456 || all params: 347,539,976 || trainable%: 1.810282682415792'

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


Loading weights from local directory
模型总参数量为：341248520
当前模型可训练的参数量:8200, 占总可训练的参数量的0.002402940824475957%
LoRA enabled
trainable params: 6,291,456 || all params: 347,539,976 || trainable%: 1.810282682415792


In [6]:
print(model)

PeftModel(
  (base_model): LoraModel(
    (model): MOMENTPipeline(
      (normalizer): RevIN()
      (tokenizer): Patching()
      (patch_embedding): PatchEmbedding(
        (value_embedding): Linear(in_features=8, out_features=1024, bias=False)
        (position_embedding): PositionalEmbedding()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 1024)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=1024, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_features=64, bias=False)
          

In [8]:
import torch
def format_size(size):
    # 对总参数量做格式优化
    K, M, B = 1e3, 1e6, 1e9
    if size == 0:
        return '0'
    elif size < M:
        return f"{size / K:.1f}K"
    elif size < B:
        return f"{size / M:.1f}M"
    else:
        return f"{size / B:.1f}B"

def get_pytorch_model_info(model: torch.nn.Module) -> (dict, list):
    """
    输入一个PyTorch Model对象，返回模型的总参数量（格式化为易读格式）以及每一层的名称、尺寸、精度、参数量、是否可训练和层的类别。

    :param model: PyTorch Model
    :return: (总参数量信息, 参数列表[包括每层的名称、尺寸、数据类型、参数量、是否可训练和层的类别])
    """
    params_list = []
    total_params = 0
    total_params_non_trainable = 0

    for name, param in model.named_parameters():
        # 获取参数所属层的名称
        layer_name = name.split('.')[0]
        # 获取层的对象
        layer = dict(model.named_modules())[layer_name]
        # 获取层的类名
        layer_class = layer.__class__.__name__

        params_count = param.numel()
        trainable = param.requires_grad
        params_list.append({
            'tensor': name,
            'layer_class': layer_class,
            'shape': str(list(param.size())),
            'precision': str(param.dtype).split('.')[-1],
            'params_count': str(params_count),
            'trainable': str(trainable),
        })
        total_params += params_count
        if not trainable:
            total_params_non_trainable += params_count

    total_params_trainable = total_params - total_params_non_trainable
    
    total_params_info = {
        'total_params': format_size(total_params),
        'total_params_trainable': format_size(total_params_trainable),
        'total_params_non_trainable': format_size(total_params_non_trainable)
    }

    return total_params_info, params_list

def filter_dic(it):
    ret_list = []
    for tup in it:
        if tup['trainable'] == 'True':
            ret_list.append(tup)
    return ret_list

In [9]:
total_params_info, param_list = get_pytorch_model_info(model)
filter_dic(param_list)

[{'tensor': 'base_model.model.encoder.block.0.layer.0.SelfAttention.q.lora_A.default.weight',
  'layer_class': 'LoraModel',
  'shape': '[64, 1024]',
  'precision': 'float32',
  'params_count': '65536',
  'trainable': 'True'},
 {'tensor': 'base_model.model.encoder.block.0.layer.0.SelfAttention.q.lora_B.default.weight',
  'layer_class': 'LoraModel',
  'shape': '[1024, 64]',
  'precision': 'float32',
  'params_count': '65536',
  'trainable': 'True'},
 {'tensor': 'base_model.model.encoder.block.0.layer.0.SelfAttention.v.lora_A.default.weight',
  'layer_class': 'LoraModel',
  'shape': '[64, 1024]',
  'precision': 'float32',
  'params_count': '65536',
  'trainable': 'True'},
 {'tensor': 'base_model.model.encoder.block.0.layer.0.SelfAttention.v.lora_B.default.weight',
  'layer_class': 'LoraModel',
  'shape': '[1024, 64]',
  'precision': 'float32',
  'params_count': '65536',
  'trainable': 'True'},
 {'tensor': 'base_model.model.encoder.block.1.layer.0.SelfAttention.q.lora_A.default.weight',
  

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader
from tqdm import tqdm

class InformerDataset:
    def __init__(
        self,
        forecast_horizon = 192,
        data_split = "train",
        data_stride_len = 1,
        task_name = "forecasting",
        random_seed = 42,
    ):
        """
        Parameters
        ----------
        forecast_horizon : int
            Length of the prediction sequence.
        data_split : str
            Split of the dataset, 'train' or 'test'.
        data_stride_len : int
            Stride length when generating consecutive
            time series windows.
        task_name : str
            The task that the dataset is used for. One of
            'forecasting', or  'imputation'.
        random_seed : int
            Random seed for reproducibility.
        """

        self.seq_len = 512
        self.forecast_horizon = forecast_horizon
        self.full_file_path_and_name = "../../data/ETTh1.csv"
        self.data_split = data_split
        self.data_stride_len = data_stride_len
        self.task_name = task_name
        self.random_seed = random_seed

        # Read data
        self._read_data()

    def _get_borders(self):
        n_train = 12 * 30 * 24
        n_val = 4 * 30 * 24
        n_test = 4 * 30 * 24

        train_end = n_train
        val_end = n_train + n_val
        test_start = val_end - self.seq_len
        test_end = test_start + n_test + self.seq_len

        train = slice(0, train_end)
        test = slice(test_start, test_end)

        return train, test

    def _read_data(self):
        self.scaler = StandardScaler()
        df = pd.read_csv(self.full_file_path_and_name)
        self.length_timeseries_original = df.shape[0]
        self.n_channels = df.shape[1] - 1

        df.drop(columns=["date"], inplace=True)
        df = df.infer_objects(copy=False).interpolate(method="cubic")

        data_splits = self._get_borders()

        train_data = df[data_splits[0]]
        self.scaler.fit(train_data.values)
        df = self.scaler.transform(df.values)

        if self.data_split == "train":
            self.data = df[data_splits[0], :]
        elif self.data_split == "test":
            self.data = df[data_splits[1], :]

        self.length_timeseries = self.data.shape[0]

    def __getitem__(self, index):
        seq_start = self.data_stride_len * index
        seq_end = seq_start + self.seq_len
        input_mask = np.ones(self.seq_len)

        if self.task_name == "forecasting":
            pred_end = seq_end + self.forecast_horizon

            if pred_end > self.length_timeseries:
                pred_end = self.length_timeseries
                seq_end = seq_end - self.forecast_horizon
                seq_start = seq_end - self.seq_len

            timeseries = self.data[seq_start:seq_end, :].T
            forecast = self.data[seq_end:pred_end, :].T

            return timeseries, forecast, input_mask

        elif self.task_name == "imputation":
            if seq_end > self.length_timeseries:
                seq_end = self.length_timeseries
                seq_end = seq_end - self.seq_len

            timeseries = self.data[seq_start:seq_end, :].T

            return timeseries, input_mask

    def __len__(self):
        if self.task_name == "imputation":
            return (self.length_timeseries - self.seq_len) // self.data_stride_len + 1
        elif self.task_name == "forecasting":
            return (
                self.length_timeseries - self.seq_len - self.forecast_horizon
            ) // self.data_stride_len + 1

In [5]:
import numpy as np
import torch
import torch.cuda.amp
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import OneCycleLR
from tqdm import tqdm

from momentfm.utils.utils import control_randomness
from momentfm.utils.forecasting_metrics import get_forecasting_metrics

# Set random seeds for PyTorch, Numpy etc.
control_randomness(seed=13) 

# Load data
train_dataset = InformerDataset(data_split="train", random_seed=13, forecast_horizon=192)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

test_dataset = InformerDataset(data_split="test", random_seed=13, forecast_horizon=192)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cur_epoch = 0
max_epoch = 1

# Move the model to the GPU
model = model.to(device)

# Move the loss function to the GPU
criterion = criterion.to(device)

# Enable mixed precision training
scaler = torch.cuda.amp.GradScaler()

# Create a OneCycleLR scheduler
max_lr = 1e-4
total_steps = len(train_loader) * max_epoch
scheduler = OneCycleLR(optimizer, max_lr=max_lr, total_steps=total_steps, pct_start=0.3)