In [1]:
from dotenv import load_dotenv
load_dotenv("../.env")

True

In [2]:
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

In [3]:
import torch 
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(1) # using cuda:1
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"INFO: gpu memory occupied: {info.used // 1024 ** 2} MB")

def initialize_gpu_environment():
    torch.cuda.set_device(device)
    print(f"INFO: gpu environment set to {torch.cuda.current_device()}")
    print(f"INFO: device name {torch.cuda.get_device_name()}")
    print(f"INFO: device capability {torch.cuda.get_device_capability()}")
    print_gpu_utilization()

def cleanup_the_mess():
    import gc
    gc.collect()
    torch.cuda.empty_cache()
    
    try: 
        del model
    except NameError as e:
        print(f"ERROR: {e}")

In [11]:
model_id = "google/gemma-2b-it"
models_dir = "/data/sumanth/models"
device = "cuda"

In [12]:
import torch
import os 

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(f"{models_dir}/{model_id}", token=os.environ['HUGGING_FACE_TOKEN'])
model = AutoModelForCausalLM.from_pretrained(f"{models_dir}/{model_id}", quantization_config=bnb_config, device_map="auto", token=os.environ['HUGGING_FACE_TOKEN'])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
text = "Quote: Imagination is more"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Quote: Imagination is more than just a dream. It's the blueprint for a better future." - Buck Brann




In [14]:
from datasets import load_dataset

data = load_dataset("json", data_files="../utils/testrun.jsonl")
data = data.map(lambda samples: tokenizer(samples["response"]), batched=True)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

In [15]:
data

DatasetDict({
    train: Dataset({
        features: ['system_prompt', 'question', 'response', 'input_ids', 'attention_mask'],
        num_rows: 91
    })
})

In [26]:
import transformers
from trl import SFTTrainer

def formatting_func(example):
    output_texts = []
    for i in range(len(example)):
        text = f"""<start_of_turn>user
        {example['system_prompt'][i]} {example['question'][i]}<end_of_turn>
        <start_of_turn>model
        {example['response'][i]}<end_of_turn>
        """
        output_texts.append(text)
    return output_texts

trainer = SFTTrainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=20,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    peft_config=lora_config,
    formatting_func=formatting_func,
)
trainer.train()

Step,Training Loss
1,2.4513
2,2.4421
3,2.3491
4,2.1162
5,1.9227
6,1.7478
7,1.6491
8,1.5347
9,1.5451
10,1.3723


TrainOutput(global_step=20, training_loss=1.4931232810020447, metrics={'train_runtime': 22.3928, 'train_samples_per_second': 3.573, 'train_steps_per_second': 0.893, 'total_flos': 978955586764800.0, 'train_loss': 1.4931232810020447, 'epoch': 16.0})

In [27]:
# read in the prompt files
SYSTEM_PROMPT = "../prompts/zeroshot-oss-vanilla-system-prompt.txt"
USER_PROMPT = "../prompts/zeroshot-vanilla-user-prompt-template.txt"

system_prompt = str()
with open(SYSTEM_PROMPT, "r") as file:
  system_prompt = file.read()

user_prompt = str()
with open(USER_PROMPT, "r") as file:
  user_prompt = file.read()

topics = """
- Logistic Regression
- The sigmoid function
- Classification with Logistic Regression
- Multinomial logistic regression
"""

user_prompt += topics

In [28]:
user_prompt

'Create a multiple choice question (MCQ) and solution that covers one or more of the following topics:\n\n- Logistic Regression\n- The sigmoid function\n- Classification with Logistic Regression\n- Multinomial logistic regression\n'

In [33]:
text = f"""<start_of_turn>user
        {system_prompt} {user_prompt}<end_of_turn>
        <start_of_turn>model
        """
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=4096, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

user
        You are a lecturer for an advanced undergraduate natural language processing course.
Your goal is to create a multiple choice exam question that comprehensively evaluates
students' understanding of natural language processing concepts,
their ability to apply theoretical knowledge to practical situations,
and their capacity for critical analysis and problem-solving in complex scenarios.

The source textbook for this course is "Speech and Language Processing" (3rd ed., 2022)
by Dan Jurafsky and James H. Martin. However, the questions should go beyond the scope
of the textbook and require a synthesis of ideas from various sources, including research
papers, lectures, and other supplementary materials covered in the course.

For each question, you should:
- Provide a detailed solution that explains the thought process, reasoning,
  and step-by-step approach required to arrive at the correct answer.
- The solution should demonstrate a deep understanding of the underlying
  conc