In [1]:
import sys

import fire
import gradio as gr
import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer


  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /common/home/jj635/anaconda3/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /common/home/jj635/anaconda3/envs/llama/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


In [2]:
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
try:
    if torch.backends.mps.is_available():
        device = "mps"
except:  # noqa: E722
    pass


In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

load_8bit: bool = False,
base_model = '/common/users/jj635/llama/llama-7b/'
lora_weights = '/common/users/jj635/llama/standard/'
model = LlamaForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=load_8bit,
    torch_dtype=torch.float16,
    device_map='auto',
)
model = PeftModel.from_pretrained(
    model,
    lora_weights,
    torch_dtype=torch.float16,
)

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 33/33 [00:32<00:00,  1.00it/s]


In [5]:
tokenizer = LlamaTokenizer.from_pretrained(base_model)

In [6]:
def generate_prompt(data_point):
    # sorry about the formatting disaster gotta move fast
    if data_point["input"]:
        return f""" # noqa: E501
{data_point["instruction"]}

### input:
{data_point["input"]}

### Response:
{data_point["output"]}"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.  # noqa: E501

### Instruction:
{data_point["instruction"]}

### Response:
{data_point["output"]}"""
    
    
def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenize(full_prompt)
    user_prompt = generate_prompt({**data_point, "output": ""})
    tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)
    user_prompt_len = len(tokenized_user_prompt["input_ids"])

    tokenized_full_prompt["labels"] = [
        -100
    ] * user_prompt_len + tokenized_full_prompt["labels"][
        user_prompt_len:
    ]  # could be sped up, probably
    return tokenized_full_prompt


def tokenize(prompt, add_eos_token=True):
    # there's probably a way to do this with the tokenizer settings
    # but again, gotta move fast
    cutoff_len = 1024
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < cutoff_len
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result


In [7]:
import json
import random

from datasets import load_dataset
#with open("movie.json",'r', encoding='UTF-8') as f:
#    data = json.load(f)

generation_config = GenerationConfig(
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=10,
    num_return_sequences=10,
)
data = load_dataset('./',data_files="testset.json")
print(data)

#temp = data['train'].shuffle().map(generate_and_tokenize_prompt)

Found cached dataset json (/common/home/jj635/.cache/huggingface/datasets/json/.-8b4951d6efb741c8/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 89.92it/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 162541
    })
})





In [9]:
from torch.utils.data import DataLoader, Sampler
from tqdm import tqdm
#test = DataLoader(data['train'],batch_size=4)
#print(data['train'][0])
#test = generate_and_tokenize_prompt(data['train'][0])
"""
test =generate_prompt({**data['train'][0], "output": ""})
print(test)
inputs = tokenizer(test, return_tensors="pt")
#input_ids = test['input_ids']
input_ids = inputs['input_ids'].to('cuda')
print(input_ids)
with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=128,
    )
"""  
for i, cur in tqdm(enumerate(data['train'])):
    label = cur['output']
    print(label)
    inputs = generate_prompt({**cur, "output": ""})
    inputs = tokenizer(inputs, return_tensors="pt")
    input_ids = inputs['input_ids'].to('cuda')
    print(input_ids)
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=128,
        )
    res = []
    for i in range(10):
        temp = generation_output.sequences[i]
        cur = tokenizer.decode(temp,skip_special_tokens=True).split("### Response:")[1].strip()
        cur = cur.split("⁇")[0].strip()
        res.append(cur)
    print(res)
    print(label in res)
#for i, batch in tqdm(enumerate(data['train'])):
#    cur = generate_and_tokenize_prompt(batch)
    #print(generate_and_tokenize_prompt[batch[0]])

0it [00:00, ?it/s]

Eternal Sunshine of the Spotless Mind (2004)
tensor([[    1, 29871,   396,   694, 25621, 29901,   382, 29945, 29900, 29896,
            13, 15156,   278,  4940, 14064, 19995, 29892,   508,   366,  8500,
           278,  2446,   896,   674,  5517,  6505, 29973,    13,    13,  2277,
         29937,  1881, 29901,    13, 14959,   472,  2191,   313,  1523,  1004,
          1597,  1967, 29897,   313, 29906, 29900, 29900, 29946,   511, 29925,
          3328,  1920, 11665, 29892,   450,   313,  5661, 16782,  2488, 29897,
           313, 29906, 29900, 29900, 29896,   511,  1204, 29875,  1862, 29892,
           450,   313,  1204, 24414, 11795, 29897,   313, 29896, 29929, 29929,
         29947,   511, 29911, 20593, 29871, 29906,   313, 29906, 29900, 29900,
         29900,   511, 13695, 29939, 12602, 29939,   271,  1039,   313, 29906,
         29900, 29900, 29906,   511,  7129,   291,   310,   278,  2819, 29892,
           450,   313, 29906, 29900, 29900, 29946,   511, 29925, 15323,   383,
       

0it [00:01, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB (GPU 6; 23.69 GiB total capacity; 4.00 GiB already allocated; 12.56 MiB free; 4.19 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
print(len(data['train']))

In [None]:
out = tokenizer.decode(generation_output.sequences[1],skip_special_tokens=True).split("### Response:")[1].strip()
print(out)
res = []
for i in range(10):
    temp = generation_output.sequences[i]
    cur = tokenizer.decode(temp,skip_special_tokens=True).split("### Response:")[1].strip()
    cur = cur.split("⁇")[0].strip()
    res.append(cur)
print('Delicatessen (1991)' in res)

In [None]:
#input_ids = inputs["input_ids"].to('cuda')
input_ids = inputs["input_ids"].to('cuda')
print(input_ids)

In [None]:
with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=128,
    )

In [None]:
tokenizer.decode(generation_output.sequences[0])

In [None]:
def generate(data):
    print(data)
data = {'a':'a','b':'b'}
generate({**data,'b':''})

In [None]:
generation_config['num_beams']