In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM

models_dir = "/data/sumanth/models"
device = "cuda:1"

model_id = "google/gemma-7b-it"

In [13]:
import torch 
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(1) # using cuda:1
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"INFO: gpu memory occupied: {info.used // 1024 ** 2} MB")

def initialize_gpu_environment():
    torch.cuda.set_device(device)
    print(f"INFO: gpu environment set to {torch.cuda.current_device()}")
    print(f"INFO: device name {torch.cuda.get_device_name()}")
    print(f"INFO: device capability {torch.cuda.get_device_capability()}")
    print_gpu_utilization()

def cleanup_the_mess():
    import gc
    gc.collect()
    torch.cuda.empty_cache()
    
    try: 
        del model
    except NameError as e:
        print(f"ERROR: {e}")

In [14]:
initialize_gpu_environment()

INFO: gpu environment set to 1
INFO: device name NVIDIA L40
INFO: device capability (8, 9)
INFO: gpu memory occupied: 2006 MB


In [4]:
tokenizer = AutoTokenizer.from_pretrained(f"{models_dir}/{model_id}")
model = AutoModelForCausalLM.from_pretrained(f"{models_dir}/{model_id}", device_map=device)

input_text = "Give me a mutliple choice question on NLP for advanced undergraduate students."
input_ids = tokenizer(input_text, return_tensors="pt").to(device)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
outputs = model.generate(**input_ids, max_length=1000)
print(tokenizer.decode(outputs[0]))

<bos>Give me a mutliple choice question on NLP for advanced undergraduate students.

Sure, here is the question:

In NLP, which technique is commonly used to identify the sentiment of a text document?

a) Lexical Analysis
b) Part-of-Speech Tagging
c) Named Entity Recognition
d) Sentiment Analysis

The answer is d) Sentiment Analysis.<eos>


In [None]:
template = """
You are a friendly chatbot assistant that responds conversationally to users' questions.
Keep the answers short, unless specifically asked by the user to elaborate on something.

Question: {question}

Answer:"""

prompt = PromptTemplate(template=template, input_variables=["question"])

llm_chain = LLMChain(prompt=prompt, llm=llm)

In [11]:
cleanup_the_mess()

ERROR: cannot access local variable 'model' where it is not associated with a value


In [10]:
del model