In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import numpy as np

import torch
import torch.nn.functional as F

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [3]:
# Load the model
from model import Mamba, ModelArgs
from transformers import AutoTokenizer

from S5.dataloading import create_wikitext_dataset

pretrained_model_name_list = ["state-spaces/mamba-2.8b-slimpj", "state-spaces/mamba-2.8b", "state-spaces/mamba-1.4b", "state-spaces/mamba-790m", "state-spaces/mamba-370m", "state-spaces/mamba-130m"]
# pretrained_model_name = pretrained_model_name_list[-1]
# pretrained_model_name = pretrained_model_name_list[-2]
pretrained_model_name = pretrained_model_name_list[-3]
print("The pretrained model I'm using is:", pretrained_model_name)

model = Mamba.from_pretrained(pretrained_model_name)
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')

# Get the data
# from datasets import load_dataset
# dataset = load_dataset("wikitext", "wikitext-103-raw-v1")
# text = dataset["train"][3]

loader_names = ["val", "test", "train"]
ppl_over_loaders = {"val":[], "test":[], "train":[]}

# l_max_power_range = 16 # 120M
l_max_power_range = 15 # 370M or 790M
# l_max_power_range = 14 # up to 2.8B

for l_max_power in range(4, l_max_power_range+1): # 16->32768
    l_max = 2 ** l_max_power

    config = {
        "l_max": l_max, 
        "data_dir": "./",
        "batch_size": int(2**l_max_power_range / l_max),
        "batch_size_eval": int(2**l_max_power_range / l_max), 
        "num_workers": 4, 
        "pin_memory": False, 
        "tokenizer": "EleutherAI/gpt-neox-20b",
    }
    assert config["batch_size"] > 0, "batch_size must be positive"

    train_loader, val_loader, test_loader = create_wikitext_dataset(config)

    # Evaluate the perplexity
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Plan
    total_loss = 0.0
    total_tokens = 0

    loaders = [val_loader, test_loader, train_loader]

    for loader, loader_name in zip(loaders, loader_names):
        if loader_name == "train":
            continue
        # print("loader_name is ", loader_name)
        # Iterate through the test data loader
        for batch in loader:
            input_ids = batch[0].to(device)  # Move input to GPU
            output_ids = batch[1].to(device)  # Move output to GPU
            
            # Forward pass to get the logits
            with torch.no_grad():
                logits = model(input_ids)
            
            loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), output_ids.view(-1), reduction='sum')
            
            # Update the total loss and token count
            total_loss += loss.item()
            total_tokens += input_ids.numel()

            perplexity = np.exp(total_loss / total_tokens)
        ppl_over_loaders[loader_name].append(perplexity)

            # print(f"Running l_max = {l_max}\n  perplexity: {perplexity:.2f}\n  total loss is {total_loss}\n  total tokens is {total_tokens}")
for loader_name in loader_names:
    print(f"loader_name is {loader_name}, ppl_over_loaders is {ppl_over_loaders[loader_name]}")

The pretrained model I'm using is: state-spaces/mamba-790m


config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.17G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


[*] Creating wikitext-103 Dataset
[*] Creating wikitext-103 Dataset
[*] Creating wikitext-103 Dataset
[*] Creating wikitext-103 Dataset
[*] Creating wikitext-103 Dataset
[*] Creating wikitext-103 Dataset
[*] Creating wikitext-103 Dataset
[*] Creating wikitext-103 Dataset
[*] Creating wikitext-103 Dataset
[*] Creating wikitext-103 Dataset
[*] Creating wikitext-103 Dataset
[*] Creating wikitext-103 Dataset
loader_name is val, ppl_over_loaders is [139.7777302648935, 77.87176517564225, 49.15296807324841, 35.92156886417661, 29.673878463686194, 26.759916905323472, 25.512165196557973, 24.873523371930595, 24.655187453214698, 25.111255620856603, 31.407989949322545, 79.33681365279371]
loader_name is test, ppl_over_loaders is [139.84366749627696, 77.13766870371919, 48.4299414388772, 35.29418653114845, 29.17634450252662, 26.310642725285618, 25.007266227870147, 24.37497590900105, 24.156721249503253, 24.630436824077655, 30.78681211981892, 79.74084877967272]
loader_name is train, ppl_over_loaders is 

In [4]:
import pandas as pd
# store the data into csv

df = pd.DataFrame(ppl_over_loaders, columns = ['val', 'test'])
df["length"] = [2 ** aha for aha in range(4, l_max_power_range+1)]
# print(f"ppl_wikitext_{pretrained_model_name[13:]}.csv")
df.to_csv(f"ppl_wikitext_{pretrained_model_name[13:]}.csv", index=False)