In [1]:
from adapters import AutoAdapterModel
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, LlamaTokenizer, LlamaForCausalLM
from peft import PeftModel
import transformers
import torch
import pandas as pd
from adapters import AdapterConfig


def generate_prompt(instruction: str, input_ctxt: str = None) -> str:
    """Build the text prompt that is fed to the model.

    The prompt is assembled from explicit lines joined with ``"\\n"`` so that
    every section header (``### Instruction:``, ``### Input:``,
    ``### Response:``) starts at column 0. Writing the template as an indented
    triple-quoted string would leak the source file's indentation (spaces and
    tabs) into the prompt text.

    Args:
        instruction: The task description placed under ``### Instruction:``.
        input_ctxt: Optional question/context placed under ``### Input:``.
            When it is ``None`` or empty, the ``### Input:`` section is
            omitted and the instruction-only wording is used.

    Returns:
        The prompt string, ending with ``### Response:`` and no trailing
        newline.
    """
    if input_ctxt:
        lines = [
            "Below is an instruction that describes a task, paired with a question. "
            "Write a response that appropriately answers the question.",
            "### Instruction:",
            f"{instruction}",
            "### Input:",
            f"{input_ctxt}",
            "### Response:",
        ]
    else:
        lines = [
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.",
            "### Instruction:",
            f"{instruction}",
            "### Response:",
        ]
    return "\n".join(lines)


def loadDataset(path: str = "/bigwork/nhwpshaa/Fine-tuning/Alpaca Native/data/test.csv"):
    """Read the test split from a comma-separated, UTF-8 encoded CSV file.

    Args:
        path: Location of the CSV file. Defaults to the location this notebook
            has always read from, so calling ``loadDataset()`` with no
            arguments behaves as before.

    Returns:
        A dict with a single key ``"test"`` mapped to the loaded
        ``pandas.DataFrame``.
    """
    test = pd.read_csv(path, sep=",", encoding="utf-8")
    return {"test": test}

def generateTrainingInstructions(dataset):
    """Turn a question/answer table into parallel instruction/input/output lists.

    Args:
        dataset: A ``pandas.DataFrame`` with the columns ``'Question'``,
            ``'Answer'`` and ``'Question Type'``.

    Returns:
        A dict with the keys ``"instruction"``, ``"input"`` and ``"output"``,
        each holding one list entry per row, in row order. ``"input"`` is the
        question and ``"output"`` is the answer. Rows whose question type is
        ``"yes/no question"`` (case-insensitive, surrounding whitespace
        ignored) get the yes/no instruction; all other rows get the
        open-ended instruction.
    """
    yes_no_instruction = "Answer the following questions with 'Yes' or 'No'"
    open_instruction = (
        "Answer the following questions to the best of your knowledge. "
        "Answer the questions as briefly as possible using only one causal sentence."
    )

    data = {"instruction": [], "input": [], "output": []}
    for _, record in dataset.iterrows():
        question_type = record['Question Type']
        # An empty CSV cell is read as a float NaN, which has no .lower();
        # anything that is not a string is treated as "not a yes/no question".
        is_yes_no = (
            isinstance(question_type, str)
            and question_type.strip().lower() == "yes/no question"
        )
        data["instruction"].append(yes_no_instruction if is_yes_no else open_instruction)
        data["output"].append(record['Answer'])
        data["input"].append(record['Question'])
    return data


# Load the test split and convert it into instruction/input/output lists.
# Note that `dataset` is rebound: first to the dict returned by loadDataset(),
# then to the dict of lists returned by generateTrainingInstructions().
dataset = loadDataset()
dataset = generateTrainingInstructions(dataset['test'])
# Load the base model weights from a local directory.
model = LlamaForCausalLM.from_pretrained("/bigwork/nhwpshaa/alpaca-native")
#adapter_config = AdapterConfig.load("pfeiffer")
# Attach two adapters from local directories under distinct adapter names.
model.load_adapter("/bigwork/nhwpshaa/Fine-tuned AlpacaNative/Jobs/30 epochs/LoRA config/", adapter_name="adapter1") #load the first adapter as adapter1
model.load_adapter("/bigwork/nhwpshaa/Fine-tuned AlpacaNative/Jobs/30 epochs/LoRA config/secondOrder2", adapter_name="adapter2") #load the second adapter as adapter2
# NOTE(review): the intent is to run with both adapters active. Confirm that
# enable_adapters() alone activates adapter1 AND adapter2 in the installed
# library version; an explicit set_adapter([...]) call may be required.
model.enable_adapters() #enable both adapters
model.eval() #change to evaluation mode for inference
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
tokenizer = LlamaTokenizer.from_pretrained("/bigwork/nhwpshaa/alpaca-native/")
# Empty accumulators; nothing in the cells visible here appends to them.
labels= []
preds = []

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
# Wrap the already-instantiated model and tokenizer in a text-generation pipeline.
# NOTE(review): `model` is passed as an object that was already loaded and moved
# to `device` in an earlier cell. torch_dtype / device_map / trust_remote_code are
# presumably only consulted when the pipeline loads a model from a name or path —
# confirm whether they have any effect here (e.g. whether the weights really end
# up in bfloat16).
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)

In [None]:
# Build one prompt from an instruction plus an input topic and sample four
# candidate generations for it.
instruction = "Act as an expert debater and generate an essay of around 400 words arguing against the following topic. Structure the essay into well-organized paragraphs and provide evidence to support the claim"
input_ctxt = "Schools should abolish homework"
# NOTE(review): min_length / max_length presumably count the prompt tokens as
# well as the generated ones — confirm, and consider min_new_tokens /
# max_new_tokens if only the essay length should be bounded.
# NOTE(review): no random seed is set in the cells visible here, so with
# do_sample=True the outputs will likely differ from run to run.
sequences = pipeline(
    generate_prompt(instruction, input_ctxt),
    min_length=500,
    max_length=900,
    do_sample=True,
    top_k=90,
    num_return_sequences=4,
    eos_token_id=tokenizer.eos_token_id,
)
# Print the 'generated_text' field of every returned sequence.
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
