In [None]:
from huggingface_hub import notebook_login
import os
# Debug toggle, left disabled; re-enable when needed:
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Set TOKENIZERS_PARALLELISM before the tokenizer library is imported in the next cell.
os.environ.update({"TOKENIZERS_PARALLELISM": "true"})

In [None]:
import torch
import torch.nn as nn
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from tqdm import tqdm
from utils import *
import gc

# Checkpoint name shared by the tokenizer and the causal-LM backbone.
model_name = "gpt2-large"

# Reuse the EOS token as the padding token so that batches can be padded.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained weights and move the model to the target device
# (`device` is not defined in this notebook; presumably it comes from utils).
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

# Adapter flavour injected into every attention block: "lora" selects
# LinearWithLoRA, any other value selects LinearWithCURLoRA.
ltype = "curlora"

# Freeze every pretrained weight before the adapters are inserted.
for param in model.parameters():
    param.requires_grad_(False)

# Wrap the c_attn projection of each attention block with the chosen adapter.
# The positional arguments (8, 1) are passed straight to the adapter class;
# presumably rank and alpha -- confirm against utils.
adapter_cls = LinearWithLoRA if ltype == "lora" else LinearWithCURLoRA
attn_cls = type(model.transformer.h[0].attn)
for module in model.modules():
    if isinstance(module, attn_cls):
        module.c_attn = adapter_cls(module.c_attn, 8, 1)

# Report how many parameters still require gradients after wrapping.
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters after: {total_params:,}")

model.to(device)
    
# Perplexity of the adapter-wrapped model on `txt`, measured before any task training.
ppl = calculate_perplexity(model, tokenizer, txt)
print("Perplexity:", round(ppl, 2))

# Replace the language-model head with a freshly initialised 2-way classifier.
torch.manual_seed(1311)  # fixed seed so the head initialisation is reproducible
num_classes = 2
lm_head = model.lm_head  # keep the original head; it is restored at the end of the notebook
# Read the input width from the existing head instead of hard-coding 1280, so the
# classification heads keep matching the backbone if `model_name` is changed.
in_features = lm_head.in_features
model.lm_head = torch.nn.Linear(in_features=in_features, out_features=num_classes)
# Moves the new head to the device; as the last expression it also displays the model.
model.to(device)

In [None]:
# Collect unreachable Python objects first so any tensors they hold are freed,
# then release the cached, now-unused CUDA memory. The original order left
# memory freed by the collector sitting in the cache.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Optimizer and cosine learning-rate schedule for the MRPC run.
# NOTE(review): num_training_steps is computed from len(mrpc_dataset["train"]),
# which the training loop treats as the number of examples, while
# scheduler.step() is called once per batch -- confirm the intended schedule
# length and whether 500 warmup steps fit within the run.
num_epochs = 3
num_training_steps = num_epochs * len(mrpc_dataset["train"])
optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,
    num_training_steps=num_training_steps,
)

In [None]:
# MRPC training split; mini-batches are taken by slicing it directly.
train_dataset = mrpc_dataset["train"]
batch_size = 32

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    num_batches = 0
    for i in tqdm(range(0, len(train_dataset), batch_size)):
        batch = train_dataset[i:i+batch_size]
        # Encode the sentence pair as one padded/truncated sequence per example.
        inputs = tokenizer(batch["sentence1"], batch["sentence2"], return_tensors="pt",
                           truncation=True, padding=True, max_length=max_len).to(device)
        optimizer.zero_grad()
        # Classify from the logits at the final sequence position.
        # NOTE(review): with padding=True the final position may be a padding
        # token for shorter sequences -- confirm this matches evaluate_mrpc.
        outputs = model(**inputs)["logits"][:, -1, :]
        y = torch.LongTensor(batch["label"]).to(device)
        loss = torch.nn.functional.cross_entropy(outputs, y)
        total_loss += loss.item()
        num_batches += 1
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Collect garbage first so the cache release can return the freed memory.
        gc.collect()
        torch.cuda.empty_cache()

    # cross_entropy already averages within each batch, so the epoch average
    # is taken over batches (previously divided by the number of examples).
    print(f"Epoch {epoch + 1}, Average loss: {total_loss / max(num_batches, 1)}")

In [None]:
# Evaluate on MRPC right after training the MRPC head.
# evaluate_mrpc is not defined in this notebook (presumably imported from utils).
print("Evaluating on MRPC...")
mrpc_accuracy = evaluate_mrpc(model, tokenizer, mrpc_dataset, device)
print(f"MRPC Accuracy: {mrpc_accuracy:.4f}")

In [None]:
# Keep a reference to the trained MRPC head so it can be swapped back in for later re-evaluation.
mrpc_head = model.lm_head

In [None]:
# Fresh 2-way classification head for the SST-2 task, created under the same
# fixed seed as the earlier head and placed on the training device.
num_classes = 2
torch.manual_seed(1311)
model.lm_head = torch.nn.Linear(in_features, num_classes).to(device)

In [None]:
# Optimizer and cosine learning-rate schedule for the SST-2 run.
# NOTE(review): num_training_steps is computed from len(sst_dataset["train"]),
# but the SST-2 training loop covers only range(0, 5000, batch_size) and
# steps the scheduler once per batch -- confirm the intended schedule length.
num_epochs = 3
num_training_steps = num_epochs * len(sst_dataset["train"])
optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,
    num_training_steps=num_training_steps,
)

In [None]:
# SST-2 training split; only the first 5000 examples are used per epoch.
train_dataset = sst_dataset["train"]
# Clamp the cap so a shorter split never yields empty batches.
train_limit = min(5000, len(train_dataset))

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    num_batches = 0
    for i in tqdm(range(0, train_limit, batch_size)):
        batch = train_dataset[i:i+batch_size]
        inputs = tokenizer(batch["sentence"], return_tensors="pt",
                           truncation=True, padding=True, max_length=max_len).to(device)
        optimizer.zero_grad()
        # Classify from the logits at the final sequence position.
        # NOTE(review): with padding=True the final position may be a padding
        # token for shorter sequences -- confirm this matches evaluate_sst2.
        outputs = model(**inputs)["logits"][:, -1, :]
        y = torch.LongTensor(batch["label"]).to(device)
        loss = torch.nn.functional.cross_entropy(outputs, y)
        total_loss += loss.item()
        num_batches += 1
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Collect garbage first so the cache release can return the freed memory.
        gc.collect()
        torch.cuda.empty_cache()

    # cross_entropy already averages within each batch, so the epoch average
    # is taken over the batches actually processed (previously divided by the
    # length of the full training split).
    print(f"Epoch {epoch + 1}, Average loss: {total_loss / max(num_batches, 1)}")

In [None]:
# Evaluate on SST-2 right after training the SST-2 head.
# evaluate_sst2 is not defined in this notebook (presumably imported from utils).
print("Evaluating on SST-2...")
accuracy = evaluate_sst2(model, tokenizer, sst_dataset, device)
print(f"SST-2 Accuracy: {accuracy:.4f}")

In [None]:
# Collect unreachable Python objects first so any tensors they hold are freed,
# then release the cached, now-unused CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Stash the trained SST-2 head and put the saved MRPC head back on the model.
sst_head, model.lm_head = model.lm_head, mrpc_head

In [None]:
# Re-evaluate MRPC with its saved head after the SST-2 training run
# (presumably to see how much MRPC performance was retained).
print("Evaluating on MRPC...")
mrpc_accuracy = evaluate_mrpc(model, tokenizer, mrpc_dataset, device)
print(f"MRPC Accuracy: {mrpc_accuracy:.4f}")

In [None]:
# Fresh 3-way classification head for the sentiment task, created under the
# same fixed seed as the earlier heads and placed on the training device.
num_classes = 3
torch.manual_seed(1311)
model.lm_head = torch.nn.Linear(in_features, num_classes).to(device)

In [None]:
# Optimizer for the sentiment run (5 epochs).
# NOTE(review): no scheduler is created here and the sentiment training loop
# does not step one, so num_training_steps is not consumed by any visible
# cell -- confirm whether a schedule was intended.
num_epochs = 5
optimizer = AdamW(model.parameters(), lr=lr)
num_training_steps = len(sentiment_dataset["test"]) * num_epochs

In [None]:
# The "test" split of the sentiment dataset is used as the training data here.
train_dataset = sentiment_dataset["test"]

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    num_batches = 0
    for i in tqdm(range(0, len(train_dataset), batch_size)):
        batch = train_dataset[i:i+batch_size]
        inputs = tokenizer(batch["text"], return_tensors="pt",
                           truncation=True, padding=True, max_length=max_len).to(device)
        optimizer.zero_grad()
        # Classify from the logits at the final sequence position.
        # NOTE(review): with padding=True the final position may be a padding
        # token for shorter sequences -- confirm this matches evaluate_sentiment.
        outputs = model(**inputs)["logits"][:, -1, :]
        # Integer-divide the raw sentiment value by 4 to obtain the class index.
        # NOTE(review): the head has 3 outputs; if the raw labels are 0/2/4
        # then 0 and 2 both map to class 0 -- confirm against evaluate_sentiment.
        y = torch.LongTensor(batch["sentiment"]).to(device) // 4
        loss = torch.nn.functional.cross_entropy(outputs, y)
        total_loss += loss.item()
        num_batches += 1
        loss.backward()
        optimizer.step()
        # Collect garbage first so the cache release can return the freed memory.
        gc.collect()
        torch.cuda.empty_cache()

    # cross_entropy already averages within each batch, so the epoch average
    # is taken over batches (previously divided by the number of examples).
    print(f"Epoch {epoch + 1}, Average loss: {total_loss / max(num_batches, 1)}")

In [None]:
# Evaluate on the sentiment dataset right after training the sentiment head.
# evaluate_sentiment is not defined in this notebook (presumably imported from utils).
print("Evaluating on Sentiment140...")
sentiment_accuracy = evaluate_sentiment(model, tokenizer, sentiment_dataset, device)
print(f"Sentiment Accuracy: {sentiment_accuracy:.4f}")

In [None]:
# Keep a reference to the trained sentiment head before other heads are swapped back in.
sentiment_head = model.lm_head
# Bare expression: the notebook displays the head module as the cell output.
sentiment_head

In [None]:
# Swap the saved MRPC head back in and re-evaluate MRPC after all three training runs.
model.lm_head = mrpc_head
print("Evaluating on MRPC...")
mrpc_accuracy = evaluate_mrpc(model, tokenizer, mrpc_dataset, device)
print(f"MRPC Accuracy: {mrpc_accuracy:.4f}")

In [None]:
# Swap the saved SST-2 head back in and re-evaluate SST-2 after the sentiment training run.
model.lm_head = sst_head
print("Evaluating on SST-2...")
accuracy = evaluate_sst2(model, tokenizer, sst_dataset, device)
print(f"SST-2 Accuracy: {accuracy:.4f}")

In [None]:
# Collect unreachable Python objects first so any tensors they hold are freed,
# then release the cached, now-unused CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Restore the original language-model head saved before the first task and
# recompute perplexity on `txt` after all fine-tuning runs.
model.lm_head = lm_head
ppl = calculate_perplexity(model, tokenizer, txt)
print("Perplexity:", round(ppl, 2))

In [None]:
# Greedy generation from a short prompt.
text = "every effort moves you"
# Put the prompt on the same `device` the model was moved to, rather than
# hard-coding GPU index 0, which fails when the model lives elsewhere.
input_ids = tokenizer.encode(text, return_tensors="pt").to(device)

# do_sample=False selects deterministic decoding; max_length bounds the output.
output = model.generate(input_ids, do_sample=False, max_length=500)
print(tokenizer.decode(output[0], skip_special_tokens=True))

In [None]:
# Collect unreachable Python objects first so any tensors they hold are freed,
# then release the cached, now-unused CUDA memory.
gc.collect()
torch.cuda.empty_cache()