In [1]:
import json
import re
from pprint import pprint

import pandas as pd
import torch
from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel
from transformers import (
    BitsAndBytesConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
    TrainingArguments
)

# Hugging Face Hub id of the base checkpoint loaded further down.
MODEL_NAME = "openlm-research/open_llama_3b_v2"

# Target device for input tensors: first CUDA GPU when available, else CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"

bin C:\Users\smend\miniconda3\envs\stevens\Lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll


In [2]:
# bitsandbytes settings handed to `from_pretrained` below. Only the fp32 CPU
# offload flag is set here; no 8-bit/4-bit loading flag is enabled in this
# config.
quantization_config = BitsAndBytesConfig(
    llm_int8_enable_fp32_cpu_offload=True,
)

In [3]:
# Keyword arguments for loading the base checkpoint: fp16 weights, automatic
# device placement, and the bitsandbytes config defined above.
base_model_kwargs = {
    "load_in_8bit": False,
    "torch_dtype": torch.float16,
    "quantization_config": quantization_config,
    "device_map": "auto",
}

# Base causal-LM; the PEFT checkpoint is attached to it in a later cell.
model = LlamaForCausalLM.from_pretrained(MODEL_NAME, **base_model_kwargs)

In [4]:
# Tokenizer matching the base checkpoint (same hub id as the model).
tokenizer = LlamaTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
# Attach the PEFT adapter saved at training checkpoint 500 to the base model.
#
# This cell rebinds `model` to a wrapper around itself, so running it a second
# time without reloading the base model would feed an already-wrapped
# PeftModel back into `from_pretrained` and nest the wrappers. The guard keeps
# the cell idempotent: the adapter is attached only while `model` is still the
# plain base model.
if not isinstance(model, PeftModel):
    model = PeftModel.from_pretrained(
        model,
        "model_dump/checkpoint-500/",
        torch_dtype=torch.float16,
    )

In [6]:
# Held-out evaluation split, stored as a Feather file relative to the
# notebook's working directory.
test_path = "datasets/movie_datasets/imdb/test_llm_ds_v1.feather"
test = pd.read_feather(test_path)

In [7]:
# Preview the first five rows of the test split.
test.head(n=5)

Unnamed: 0,tconst,originalTitle,data,question,answer,prompt
0,tt16252240,The Pitch,Description: A tech geek and a gorgeous secret...,What is the secretary's name?,Unknown,Below is a question regarding movies and shows...
1,tt11172868,Unbreakable,Description: Mariel and Deena have been best f...,What is the movie's genre?,"Comedy, Drama, Romance",Below is a question regarding movies and shows...
2,tt12448312,Posts to the Pope,Description: RTE News asked a range of people ...,What is the name of the person who is committe...,Unanswerable,Below is a question regarding movies and shows...
3,tt11229886,Les Misérables: The Staged Concert,Description: Seen by over 120 million people w...,"Where can you watch ""Les Misérables: The Stage...",cinemas,Below is a question regarding movies and shows...
4,tt11994944,"Plymouth, Michigan - A Rich History","Description: Founded in 1825, the Plymouth com...","What fires are mentioned in ""Plymouth, Michiga...","The Great Fire of 1871, the Plymouth Train Sta...",Below is a question regarding movies and shows...


In [29]:
# Positional index of the test example to inspect.
idx = 450

# Fetch the row once, then pull out the model prompt and the reference answer.
row = test.iloc[idx]
prompt = row["prompt"]
answer = row["answer"]

# Prompt first, reference answer on the following line.
print(prompt, answer, sep="\n")

Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.
###Instruction: What is the runtime of the movie?
###Input: Description: 23rd installment in the "Yami Douga" series. 
Release Year: 2019 
Runtime(in minutes): 56 
Genre: Horror
###Response:
56 minutes


In [30]:
# Encode the prompt as PyTorch tensors.
inputs = tokenizer(prompt, return_tensors="pt")

# Move the token ids to the device chosen at the top of the notebook.
input_ids = inputs.input_ids.to(DEVICE)

In [31]:
# Generation settings: structured output object (so `.sequences` is available
# afterwards), scores requested alongside the tokens, and a cap of 512 newly
# generated tokens.
generation_kwargs = dict(
    input_ids=input_ids,
    return_dict_in_generate=True,
    output_scores=True,
    max_new_tokens=512,
)

# Inference only, so gradient tracking is disabled for the call.
with torch.no_grad():
    generation_output = model.generate(**generation_kwargs)

In [32]:
# Only one prompt was submitted, so take the first (and only) sequence.
sequences = generation_output.sequences
s = sequences[0]

# Convert token ids back to text, dropping special tokens.
output = tokenizer.decode(s, skip_special_tokens=True)

In [33]:
# Display the decoded text held in `output`.
print(output)

Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.
###Instruction: What is the runtime of the movie?
###Input: Description: 23rd installment in the "Yami Douga" series. 
Release Year: 2019 
Runtime(in minutes): 56 
Genre: Horror
###Response: 56 minutes
