In [None]:
!nvidia-smi

Mon Dec  4 01:46:47 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    26W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
!pip install -Uqqq pip --progress-bar off
!pip install -qqq transformers datasets peft sentencepiece accelerate bitsandbytes --progress-bar off

[0m

In [5]:
import json
import re
from pprint import pprint

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model, get_peft_model_state_dict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq
)

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def create_model_and_tokenizer():
  compute_dtype = getattr(torch, "bfloat16")
  bnb_config = BitsAndBytesConfig(
          load_in_4bit=True,
          bnb_4bit_quant_type="nf4",
          bnb_4bit_compute_dtype=compute_dtype,
          bnb_4bit_use_double_quant=True,
  )


  model = AutoModelForCausalLM.from_pretrained(
      MODEL_NAME,
      # use_safetensors=True,
      # load_in_8bit=True,
      # torch_dtype=torch.float16,
      # trust_remote_code=True,
      quantization_config=bnb_config,
      device_map="auto",
  )

  # model.to(DEVICE)

  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
  tokenizer.pad_token = tokenizer.unk_token
  tokenizer.pad_token_id =  tokenizer.unk_token_id
  # tokenizer.pad_token_id = 0 # unk. we want this to be different from the eos token
  tokenizer.padding_side = "left"

  return model, tokenizer

In [None]:
notebook_login()

In [None]:
model, tokenizer = create_model_and_tokenizer()

In [None]:
def tokenize_prompt(prompt, add_eos_token=True) -> dict:

  result = tokenizer(
        prompt,
        truncation=True,
        max_length=1024,
        padding=False,
        return_tensors=None,
    )
  if (
      result["input_ids"][-1] != tokenizer.eos_token_id
      and len(result["input_ids"]) < 1024
      and add_eos_token
  ):
      result["input_ids"].append(tokenizer.eos_token_id)
      result["attention_mask"].append(1)
  result["labels"] = result["input_ids"].copy()

  # result["input_ids"] = torch.from_numpy(np.array(result["input_ids"])).to(DEVICE)
  # result["attention_mask"] = torch.from_numpy(np.array(result["attention_mask"])).to(DEVICE)
  # result["labels"] = torch.from_numpy(np.array(result["labels"])).to(DEVICE)

  return result


In [2]:
train_data = pd.read_csv("/content/drive/Othercomputers/My Laptop/Desktop/Stevens/SEM1/Deep Learning - CS 583 A/CS 583 Project/datasets/movie_datasets/imdb/train_llm_ds_sm_v2.csv")
val_data = pd.read_csv("/content/drive/Othercomputers/My Laptop/Desktop/Stevens/SEM1/Deep Learning - CS 583 A/CS 583 Project/datasets/movie_datasets/imdb/val_llm_ds_sm_v2.csv")

NameError: ignored

In [3]:
train_data.dropna(axis=0, inplace=True)
val_data.dropna(axis=0, inplace=True)

NameError: ignored

In [None]:
train_ds = Dataset.from_pandas(train_data)
val_ds = Dataset.from_pandas(val_data)

In [None]:
train_ds = train_ds.map(lambda sample: tokenize_prompt(sample["prompt"]))
val_ds = val_ds.map(lambda sample: tokenize_prompt(sample["prompt"]))

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

ValueError: ignored

In [None]:
model

In [None]:
model.config.use_cache = False
model.config.quantization_config.to_dict()

In [None]:
LORA_R = 16
LORA_ALPHA = 16
LORA_DROPOUT= 0.05
LORA_TARGET_MODULES = [
    "q_proj",
    "v_proj",
    "k_proj",
    "o_proj",
    "gate_proj",
    "down_proj",
    "up_proj"
]

BATCH_SIZE = 128
MICRO_BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
LEARNING_RATE = 2e-4
# TRAIN_STEPS = 100
OUTPUT_DIR = "experiments"

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, config)
model.print_trainable_parameters()

In [None]:
%load_ext tensorboard
%tensorboard --logdir experiments/runs

In [None]:
training_arguments = TrainingArguments(
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    gradient_accumulation_steps=MICRO_BATCH_SIZE,
    lr_scheduler_type="linear",
    # max_steps=1000,
    optim="paged_adamw_8bit",
    logging_steps=1,
    learning_rate=LEARNING_RATE,
    fp16=True,
    evaluation_strategy="steps",
    eval_steps=20,
    save_steps=20,
    warmup_ratio=0.05,
    save_strategy="steps",
    group_by_length=True,
    output_dir=OUTPUT_DIR,
    report_to="tensorboard",
    save_safetensors=True,
    seed=42,
    load_best_model_at_end=True,
    num_train_epochs=1,
    neftune_noise_alpha=0.1,
)

In [None]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    max_length=1024,
    pad_to_multiple_of=8,
    return_tensors="pt",
    padding=True,

)

In [None]:
trainer = Trainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    args=training_arguments,
    data_collator=data_collator,
    # neftune_noise_alpha=0.1,
)
model.config.use_cache = False

In [None]:
with torch.autocast("cuda"):
  trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
10,1.2314,1.365619
20,1.4524,1.151991
30,1.0636,1.113009




In [None]:
model.save_pretrained("/content/drive/Othercomputers/My Laptop/Desktop/Stevens/SEM1/Deep Learning - CS 583 A/CS 583 Project/model_dump/llama-7b-v1/", safe_serialization=False)

In [None]:
!cp -r experiments/ /content/drive/MyDrive/Text\ Mining\ and\ Information\ Retrieval\ -\ CS\ 589\ A/

NotImplementedError: ignored

In [None]:
!ls

drive  experiments  sample_data


In [None]:
model.save_pretrained("/content/drive/MyDrive/Text Mining and Information Retrieval - CS 589 A/meta-llama-Llama-2-7b-hf/", safe_serialization=False)

In [None]:
test_data = pd.read_csv("/content/drive/Othercomputers/My Laptop/Desktop/Stevens/SEM1/Deep Learning - CS 583 A/CS 583 Project/datasets/movie_datasets/imdb/test_llm_ds_vsm_v2.csv")

In [None]:
idx = 14
prompt = test_data.iloc[idx].prompt
print(prompt)

Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.
###Instruction: What is the main conflict in the movie “Ang henerasyong sumuko sa love”?
###Input: Description: It tells the story of different teenagers who have different priorities and encounter different challenges as they enter adult life. 
Release Year: 2019 
Runtime(in minutes): 100 
Genre: Drama,Romance 
Rating: 6.5 
Votes: 71.0
###Response:


In [None]:
prompt = prompt.split("###Response:")[0]+"###Response:"
prompt = prompt.strip()
print(prompt)

Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.
###Instruction: What is the main conflict in the movie “Ang henerasyong sumuko sa love”?
###Input: Description: It tells the story of different teenagers who have different priorities and encounter different challenges as they enter adult life. 
Release Year: 2019 
Runtime(in minutes): 100 
Genre: Drama,Romance 
Rating: 6.5 
Votes: 71.0
###Response:


In [None]:
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"].to(DEVICE)

In [None]:
with torch.no_grad():
    generation_output = trainer.model.generate(
                    input_ids=input_ids,
                    return_dict_in_generate=True,
                    output_scores=True,
                    max_new_tokens=512,
                )
s = generation_output.sequences[0]
output = tokenizer.decode(s, skip_special_tokens=True)

In [None]:
print(output)

Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.
###Instruction: What is the main conflict in the movie “Ang henerasyong sumuko sa love”?
###Input: Description: It tells the story of different teenagers who have different priorities and encounter different challenges as they enter adult life. 
Release Year: 2019 
Runtime(in minutes): 100 
Genre: Drama,Romance 
Rating: 6.5 
Votes: 71.0
###Response: The main conflict in the movie “Ang henerasyong sumuko sa love” is the conflict between the teenagers’ priorities and the challenges they encounter as they enter adult life.


In [None]:
trainer.model.save_pretrained("/content/drive/MyDrive/Shaun/openlm-research-open_llama_3b_v2-pre/", save_adapter=True, save_config=True)

In [None]:
trainer.save_model("/content/drive/MyDrive/Shaun/openlm-research-open_llama_3b_v2/")