In [None]:
# ! pip install huggingface_hub



In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, MambaForCausalLM
import torch
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling
from tqdm import tqdm
from hybrid_model import HybridModel
from datasets import load_dataset
# from transformers import MambaModel

  @custom_fwd
  @custom_bwd
  @custom_fwd
  @custom_bwd
  @custom_fwd
  @custom_bwd
  @custom_fwd
  @custom_bwd


In [2]:
# seed, buffer_size = 42, 10_000
# dataset = load_dataset('HuggingFaceFW/fineweb', split='train', streaming=True)
# dataset = dataset.shuffle(seed, buffer_size=buffer_size)

In [3]:
transformer_tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125M',padding_side="left")
# if transformer_tokenizer.mask_token is None:
#     transformer_tokenizer.add_special_tokens({'mask_token': '[MASK]'})

In [4]:
#dataset = dataset.with_format("torch")
dataset = load_dataset("eli5_category", split="train")

In [7]:
if transformer_tokenizer.pad_token is None:
    #transformer_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    transformer_tokenizer.pad_token = transformer_tokenizer.eos_token

In [7]:
# changed the dataset to eli5  It's an English-language dataset of questions and answers gathered from the 
# r/explainlikeimfive subreddit where users ask factual questions requiring paragraph-length or longer answers.


In [8]:
#train-test split
dataset = dataset.train_test_split(test_size=0.2)

In [9]:
train_dataset = dataset["train"].flatten()

In [10]:
max_seq_length =  100
def tokenize(example):
    return transformer_tokenizer(example, truncation=True, max_length=max_seq_length, padding="max_length")

In [11]:
def preprocess_function(examples):
    result = tokenize([" ".join(x) for x in examples["answers.text"]])
    result["labels"] = result["input_ids"].copy()
    return result

In [13]:
tokenized_eli5 = train_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=train_dataset.column_names
    )

Map (num_proc=4): 100%|██████████| 73417/73417 [00:45<00:00, 1602.21 examples/s]


In [22]:
tokenized_eli5

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 73417
})

In [23]:
from transformers import DataCollatorForLanguageModeling

#tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=transformer_tokenizer, mlm=False)

In [24]:
dataloader = DataLoader(tokenized_eli5, batch_size=8, collate_fn=data_collator,shuffle=True)

In [25]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [26]:
#device = torch.cuda.device(device=device)

In [2]:
device = 'cpu'

In [3]:
transformer_model = AutoModelForCausalLM.from_pretrained(
    'EleutherAI/gpt-neo-125M'
)
mamba_model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
transformer_model = transformer_model.to(device)
mamba_model = mamba_model.to(device)

In [29]:
for param in transformer_model.parameters():
    param.requires_grad = False

for param in mamba_model.parameters():
    param.requires_grad = False

# model.to(device) ? unfreeze : not
hybrid_model = HybridModel(transformer_model=transformer_model.transformer, mamba_model=mamba_model.backbone,proj_type= "gressf", n_hybrid_blocks=2).to(device)
trainer_params = []
# for combiner in model.combiners.parameters():
#     trainer_params.append(combiner)

# for spliter in model.splitters.parameters():
#     trainer_params.append(spliter)

for param in hybrid_model.parameters():
    if param.requires_grad:
       trainer_params.append(param)
    
#trainer_params = torch.T(trainer_params)
print(trainer_params[0])
len(trainer_params)

50257
Parameter containing:
tensor([1.3569], device='cuda:0', requires_grad=True)


23

In [30]:
for param in trainer_params:
    print(param.shape)

torch.Size([1])
torch.Size([768, 768])
torch.Size([768])
torch.Size([768, 768])
torch.Size([768])
torch.Size([1])
torch.Size([768, 768])
torch.Size([768])
torch.Size([768, 768])
torch.Size([768])
torch.Size([1])
torch.Size([768, 768])
torch.Size([768])
torch.Size([768, 768])
torch.Size([768])
torch.Size([1])
torch.Size([768, 768])
torch.Size([768])
torch.Size([768, 768])
torch.Size([768])
torch.Size([50257, 768])
torch.Size([768])
torch.Size([768])


In [None]:
hybrid_model.train()
optimizer = torch.optim.AdamW(params=trainer_params, lr=1e-5)
for e in range(50):
    # shuffle the dataset
    for i, batch in enumerate(dataloader):
        #batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
        outputs = hybrid_model(input_ids = batch['input_ids'].to(device), attention_mask = batch['attention_mask'].to(device), labels = batch['labels'].to(device))
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if i % 10 == 0:
            print(f"loss: {loss}")

loss: 120.59175109863281
loss: 107.79784393310547
loss: 104.14891052246094
loss: 100.83552551269531
loss: 98.62007141113281
loss: 92.268798828125
loss: 92.0339584350586
loss: 88.72013854980469
loss: 86.01180267333984
loss: 84.80916595458984
loss: 82.48648071289062
loss: 79.53721618652344
loss: 78.9300765991211
loss: 76.74812316894531
loss: 75.66899108886719
loss: 71.90830993652344
loss: 73.82843780517578
loss: 71.40892791748047
loss: 69.50786590576172
loss: 66.57719421386719
loss: 67.7821044921875
loss: 65.99663543701172
loss: 66.26202392578125
loss: 64.588134765625
loss: 62.754669189453125
loss: 60.79283905029297
loss: 61.52482223510742
loss: 61.24024200439453
loss: 61.829978942871094
loss: 59.567996978759766
loss: 57.48764419555664
loss: 57.59124755859375
loss: 57.535152435302734
loss: 57.88046646118164
loss: 57.814266204833984
loss: 54.71114730834961
loss: 55.82950973510742
loss: 55.323822021484375
loss: 53.26999282836914
loss: 56.11664962768555
loss: 52.3409423828125
loss: 52.54449

KeyboardInterrupt: 

In [32]:
test_dataset = dataset["test"].flatten()

In [33]:
test = test_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=test_dataset.column_names
    )

Map (num_proc=4): 100%|██████████| 18355/18355 [00:07<00:00, 2513.09 examples/s]


In [34]:
testloader = DataLoader(test, batch_size=8, collate_fn=data_collator,shuffle=True)

In [35]:
hybrid_model.eval()
with torch.no_grad():
    # shuffle the dataset
    for i, batch in enumerate(testloader):
        #batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
        outputs = hybrid_model(input_ids = batch['input_ids'].to(device), attention_mask = batch['attention_mask'].to(device), labels = batch['labels'].to(device))
        loss = outputs.loss
        if i % 10 == 0:
            print(f"loss: {loss}")

loss: 11.6471586227417
loss: 11.454437255859375
loss: 12.421030044555664
loss: 11.016474723815918
loss: 11.603528022766113
loss: 11.723335266113281
loss: 9.749794006347656
loss: 12.039562225341797
loss: 11.02593994140625
loss: 10.021510124206543
loss: 11.257247924804688
loss: 12.059006690979004
loss: 9.701502799987793
loss: 11.042990684509277
loss: 10.825369834899902
loss: 9.80688762664795
loss: 11.146143913269043
loss: 10.33370590209961
loss: 10.876644134521484
loss: 10.777077674865723
loss: 10.729930877685547
loss: 10.406472206115723
loss: 10.843860626220703
loss: 11.41571044921875
loss: 10.354085922241211
loss: 11.644628524780273
loss: 11.275474548339844
loss: 9.680427551269531
loss: 10.180423736572266
loss: 12.415082931518555
loss: 10.509671211242676
loss: 10.893714904785156
loss: 10.448331832885742
loss: 10.659329414367676
loss: 10.888725280761719
loss: 10.161733627319336
loss: 10.17746639251709
loss: 9.743611335754395
loss: 11.232806205749512
loss: 10.32297134399414
loss: 11.4154

In [None]:
torch.save(trainer_params, "projector_eli5.pth")

In [37]:
torch.save(hybrid_model.state_dict(), "hybrid_model_eli5.pth")

In [32]:
transformer_tokenizer.eos_token_id

50256

In [None]:
print(len(tokenized_eli5[0]["input_ids"]))
print(len(tokenized_eli5[-1]["input_ids"] ))

100
100


In [4]:
hybrid_model = HybridModel(transformer_model=transformer_model.transformer, mamba_model=mamba_model.backbone,proj_type= "gressf", n_hybrid_blocks=2).to(device)

50257


In [5]:
state_dict_loaded  = torch.load("./hybrid_model_eli5.pth", weights_only=True)

In [6]:
hybrid_model.load_state_dict(state_dict_loaded)

<All keys matched successfully>

In [7]:
# ! pip install ipywidgets

In [None]:
# from huggingface_hub import login
# login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
hybrid_model.push_to_hub("fahad-touseef/manticore-hybrid-gptneo-mamba")

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/fahad-touseef/manticore-hybrid-gptneo-mamba/commit/24480fc634e02953777895713fccc325753549c6', commit_message='Push model using huggingface_hub.', commit_description='', oid='24480fc634e02953777895713fccc325753549c6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/fahad-touseef/manticore-hybrid-gptneo-mamba', endpoint='https://huggingface.co', repo_type='model', repo_id='fahad-touseef/manticore-hybrid-gptneo-mamba'), pr_revision=None, pr_num=None)

In [None]:
hybrid_model.eval()
with torch.no_grad():
    # shuffle the dataset
    for i, batch in enumerate(testloader):
        #batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
        outputs = hybrid_model(input_ids = batch['input_ids'].to(device), attention_mask = batch['attention_mask'].to(device), labels = batch['labels'].to(device))
        loss = outputs.loss
        if i % 10 == 0:
            print(f"loss: {loss}")

AttributeError: 'collections.OrderedDict' object has no attribute 'eval'

In [None]:
def tokenize_function(examples):
    return transformer_tokenizer(examples["text"],truncation=True)
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['text', 'id', 'dump', 'url', 'date', 'file_path', 'language', 'language_score', 'token_count'])
# Create the data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=transformer_tokenizer, mlm=False)

# Create the dataloader
dataloader = DataLoader(tokenized_dataset, batch_size=8, collate_fn=data_collator)

In [None]:
for token in tokenized_dataset:
    print(token['input_ids'])
    print(token.keys())
    break

tensor([ 1532,   345,   467,  7374,   736, 41291,    11,  6729,  1477,  2577,
          278,    11,   393,  3817,  5718,  1586,    11,   345,   389,  1016,
          284,   765,   284,  2222,   379,  1551,   530,  5166,   286,   285,
        34978,    13,  2773,   661,   481,   772,  2222,   734, 14729,   287,
         1339,   530,  3011,  9583,    13, 31342,    11,   314,  2222,   257,
         5166,   286,   285, 34978,   329, 23125,   290,   257,  5166,   286,
        18051,   329,  2087, 50003,    13,   198,  2215, 17246,   285, 34978,
          290, 18051,   329,  7374,  3403,    11,   994,   389,   257,  3155,
          286,  1276,    12, 14150,  3033,   326,   345,   815,   804,   329,
           25,   198,  8413, 21985,  9493,   364,    25,   921,   761,   257,
          285,  2621,   393, 29144,  1080,   326,   468,  9493,   364,   290,
          257, 35973,  7582,   523,   345,   836,   447,   247,    83,   625,
        25080,   290, 15488,    13, 19372,   265,   287,  7374, 

  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})


In [None]:
# from projector import Combiner
# c = Combiner(10,10).to(device)

# c.in_proj1.weight.device

# cm = torch.nn.ModuleList([Combiner(10, 10) for _ in range(12)]).to(device)




In [None]:

device = 'cuda' if torch.cuda.is_available() else 'cpu' 

for param in transformer_backbone.parameters():
    param.requires_grad = False

for param in mamba_backbone.parameters():
    param.requires_grad = False
    
model = HybridModel(transformer_backbone, mamba_backbone,device) 

# model.to(device) ? unfreeze : not
model.train()

trainer_params = []
for combiner in model.combiners.parameters():
    trainer_params.append(combiner)

for spliter in model.splitters.parameters():
    trainer_params.append(spliter)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)
for e in range(3):
    # shuffle the dataset
    dataset.set_epoch(e)
    for i, batch in enumerate(dataloader):
        if i == 5:
            break
        #batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
        outputs = model(input_data = batch['input_ids'])
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if i % 10 == 0:
            print(f"loss: {loss}")

WEIGHT cuda:0


  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
dataset.__getitem__

<bound method Dataset.__getitem__ of IterableDataset({
    features: ['text', 'id', 'dump', 'url', 'date', 'file_path', 'language', 'language_score', 'token_count'],
    n_shards: 23781
})>

In [None]:
#dataloader = DataLoader(tokenized_dataset,batch_size= 32, collate_fn=DataCollatorForLanguageModeling(transformer_tokenizer,mlm=False))




  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
  0%|          | 0/5 [00:02<?, ?it/s]

{'language_score': tensor([0.9335, 0.9738, 0.9514, 0.9592, 0.8248, 0.9719, 0.8839, 0.9658, 0.9717,
        0.8748, 0.9487, 0.9108, 0.9771, 0.8554, 0.9519, 0.9758],
       device='cuda:0'), 'token_count': tensor([539, 391, 119, 658,  71, 192, 352, 391, 796, 513, 109, 104, 294, 207,
        194, 629], device='cuda:0')}





RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)

In [None]:
model.splitters[0].out_proj1.weight.device

device(type='cuda', index=0)

In [None]:
model.transformer_model.device

device(type='cuda', index=0)

In [None]:
t['input_ids'].to(device)

tensor([[50256, 50256, 50256,  ...,   285,  2621, 19679],
        [50256, 50256, 50256,  ...,  1708,  6050,    30],
        [50256, 50256, 50256,  ...,   286, 11278,    13],
        ...,
        [50256, 50256, 50256,  ...,    13,  1157, 22199],
        [50256, 50256, 50256,  ...,    13,   447,   251],
        [50256, 50256, 50256,  ...,   329,  3555,     0]], device='cuda:0')

In [None]:

optimizer = torch.optim.AdamW(params=trainer_params , lr=1e-5)
for epoch in range(3):
    dataset.set_epoch(epoch)
    for i, batch in enumerate(tqdm(batched_dataset, total=5)):
        if i == 5:
            break

        #batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
        print(batch)
        t = transformer_tokenizer(batch["text"], padding="max_length", truncation=True, return_tensors= "pt")
        outputs = transformer_backbone(t['input_ids'])
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if i % 10 == 0:
            print(f"loss: {loss}")

  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
  0%|          | 0/5 [00:02<?, ?it/s]


{'text': ['If you go winter backpacking, snowshoeing, or mountaineering, you are going to want to bring at least one pair of mittens. Some people will even bring two pairs in case one gets wet. Personally, I bring a pair of mittens for warmth and a pair of gloves for added dexterity.\nWhen selecting mittens and gloves for winter conditions, here are a couple of must-have features that you should look for:\nRemovable liners: You need a mitten or glove system that has liners and a removable shell so you don’t overheat and sweat. Sweat in winter is nasty because it freezes in place and chills you, so you have to be extra vigilant about removing layers when you get hot and slow your activity levels to the point where you can stay dry. You can also put removable liners in your sleeping bag at night to dry them.\nWaterproof, breathable shell: When you sweat you need a shell fabric that has the ability to vent moisture and prevent it from getting in. Higher denier (tougher) Gore-Tex shells wo

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)