In [1]:
import pandas as pd
import torch
import transformers
from peft import PeftConfig, PeftModel
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

Helper method: parse a benchmark sample file into prompts and their ideal responses.

In [2]:
def parse_sample_files(file, encoding=None):
    """Parse a sample file into prompts and ideal responses.

    The file is read line by line (each line is stripped first):

    * A line containing ``"CONTEXT: "`` starts a prompt; the prompt text is
      everything after that marker.
    * The line immediately following a context line is appended to the prompt
      (separated by a newline) and the prompt is stored.
    * Any other line containing ``"Response: "`` is stored as an ideal
      response with every occurrence of that marker removed.

    Args:
        file: Path of the sample file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A ``(prompts, ideal)`` tuple of two lists of strings. The lists are
        filled independently, so their lengths are only equal when the file
        has one response per context.
    """
    context_tag = "CONTEXT: "
    response_tag = "Response: "
    prompts = []
    ideal = []
    # Holds the context text while waiting for the line that follows it;
    # None means "not currently inside a prompt".
    pending_context = None

    # `with` guarantees the handle is closed even if reading raises.
    with open(file, "r", encoding=encoding) as f:
        for raw_line in f:
            line = raw_line.strip()
            if context_tag in line:
                start = line.index(context_tag) + len(context_tag)
                pending_context = line[start:]
            elif pending_context is not None:
                prompts.append(pending_context + "\n" + line)
                pending_context = None
            elif response_tag in line:
                ideal.append(line.replace(response_tag, ""))
    return prompts, ideal

## Llama2 via Huggingface: Benchmark Responses

In [3]:
# Hugging Face hub id of the base chat model used for the benchmark responses.
model_id = "meta-llama/Llama-2-7b-chat-hf"
# Load the causal-LM checkpoint for that id (it is moved to the GPU in the next cell).
model = AutoModelForCausalLM.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Move the base model onto the CUDA device before the pipeline is built.
model = model.cuda()

In [5]:
# Tokenizer matching the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Build the text-generation pipeline around the already-loaded model.
# The factory is reached through the `transformers` module on purpose: this
# cell binds the name `pipeline` to the resulting pipeline object, so any later
# bare `pipeline(...)` factory call (including re-running this cell) would call
# that object instead of the factory.
# NOTE(review): `model` is already instantiated, so `torch_dtype` may not
# re-cast its weights -- confirm the model dtype if fp16 inference is intended.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device=0
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [6]:
def get_response(text):
    """Generate one sampled completion for ``text`` using the global ``pipeline``.

    ``text`` is placed in an ``[INST]`` / ``<<SYS>>`` prompt template together
    with a fixed system prompt, and the formatted prompt is sent to the
    pipeline with sampling enabled.

    Args:
        text: User prompt inserted before the closing ``[/INST]`` tag.

    Returns:
        The ``generated_text`` fields of all returned sequences concatenated
        into one string (a single sequence is requested).
    """
    system_prompt = "You are a system that is able to adapt to different scenarios and provide English responses that are culturally sensitive to values in Chinese communication. These values include indirect communication, which relies on subtlety and hints and context, as well as showing respect in front of elders, and maintaining humility."
    template = "<s>[INST]\n<<SYS>>\n{system_prompt}\n<</SYS>>\n\n{user_prompt}[/INST]"
    full_prompt = template.format(system_prompt=system_prompt, user_prompt=text)

    generations = pipeline(
        full_prompt,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=250,
        return_full_text=False,
        temperature=0.5,
    )

    return "".join(generation['generated_text'] for generation in generations)

In [7]:
# Load Benchmark Samples:
bm_prompts, bm_ideals = parse_sample_files("data/benchmark_samples.txt")
# The generation loop pairs these lists with zip(), which silently stops at the
# shorter one; fail loudly here if the file did not yield one response per prompt.
assert len(bm_prompts) == len(bm_ideals), (
    f"parsed {len(bm_prompts)} prompts but {len(bm_ideals)} ideal responses"
)

In [None]:
# Collect base-model responses: every benchmark prompt is generated several
# times because get_response samples (do_sample=True).
bm_data = []
RUNS_PER_PROMPT = 3
# enumerate supplies the 1-based prompt id; `total` gives tqdm a bar length,
# which it cannot infer from a zip object. The loop variable is named `prompt`
# so the builtin `input` is not shadowed for the rest of the kernel session.
for prompt_id, (prompt, ideal_output) in enumerate(
    tqdm(zip(bm_prompts, bm_ideals), total=min(len(bm_prompts), len(bm_ideals))),
    start=1,
):
    for _ in range(RUNS_PER_PROMPT):
        cur_sample = {
            'prompt_id': prompt_id,
            'prompt': prompt,
            'output': get_response(prompt),
            'ideal_output': ideal_output,
        }
        bm_data.append(cur_sample)
        print(cur_sample )

In [45]:
# Build the frame directly from the list of per-sample dicts (one row each);
# no throwaway empty DataFrame is needed to reach a constructor.
bm_df = pd.DataFrame(bm_data)
# Persist the base-model benchmark results without the index column.
bm_df.to_csv("data/bm_10.csv",index=False)

## Testing llama-2-7b-cncomm

#### Load Adapter via PeftModel

In [5]:
# Hub ids of the base checkpoint and of the adapter applied on top of it.
model_id = "meta-llama/Llama-2-7b-chat-hf"
adapter_model_id = "cheungra/llama-2-7b-cncomm"

# The tokenizer is loaded from the base checkpoint id, not the adapter id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [6]:
# Reload the base model, then wrap it with the adapter weights through PEFT.
model = AutoModelForCausalLM.from_pretrained(model_id)
ft_model = PeftModel.from_pretrained(model, adapter_model_id)

In [8]:
# Merge the adapter into the base model (PEFT `merge_and_unload`) and keep the
# returned model under the same name.
ft_model = ft_model.merge_and_unload()

In [9]:
# Move the merged model onto the CUDA device before the pipeline is built.
ft_model = ft_model.cuda()

### Test Benchmark Samples


In [10]:
# Build the text-generation pipeline around the fine-tuned model.
# The factory is reached through the `transformers` module on purpose: the
# base-model section binds the name `pipeline` to a pipeline object, so a bare
# `pipeline(...)` call here would invoke that object instead of the factory
# when the notebook is run top to bottom (and likewise when re-running this cell).
# NOTE(review): `ft_model` is already instantiated, so `torch_dtype` may not
# re-cast its weights -- confirm the model dtype if fp16 inference is intended.
pipeline = transformers.pipeline(
    "text-generation",
    model=ft_model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device=0
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [11]:
def get_response(text):
    """Generate one sampled completion for ``text`` using the global ``pipeline``.

    ``text`` is placed in an ``[INST]`` / ``<<SYS>>`` prompt template together
    with a fixed system prompt, and the formatted prompt is sent to the
    pipeline with sampling enabled.

    Args:
        text: User prompt inserted before the closing ``[/INST]`` tag.

    Returns:
        The ``generated_text`` fields of all returned sequences concatenated
        into one string (a single sequence is requested).
    """
    system_prompt = "You are a system that is able to adapt to different scenarios and provide English responses that are culturally sensitive to values in Chinese communication. These values include indirect communication, which relies on subtlety and hints and context, as well as showing respect in front of elders, and maintaining humility."
    template = "<s>[INST]\n<<SYS>>\n{system_prompt}\n<</SYS>>\n\n{user_prompt}[/INST]"
    full_prompt = template.format(system_prompt=system_prompt, user_prompt=text)

    generations = pipeline(
        full_prompt,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=250,
        return_full_text=False,
        temperature=0.5,
    )

    return "".join(generation['generated_text'] for generation in generations)

In [12]:
# Load Benchmark Prompts:
bm_prompts, bm_ideals = parse_sample_files("data/benchmark_samples.txt")
# The generation loop pairs these lists with zip(), which silently stops at the
# shorter one; fail loudly here if the file did not yield one response per prompt.
assert len(bm_prompts) == len(bm_ideals), (
    f"parsed {len(bm_prompts)} prompts but {len(bm_ideals)} ideal responses"
)

In [13]:
# Accumulator for the fine-tuned model's benchmark records; the generation loop
# in the next cell appends to it.
cncomm_bm_data = []

In [None]:
# Collect fine-tuned-model responses: every benchmark prompt is generated
# several times because get_response samples (do_sample=True).
# Records are appended to `cncomm_bm_data`, which this cell does not reset, so
# running the cell again adds further rows to whatever the list already holds.
RUNS_PER_PROMPT = 3
# enumerate supplies the 1-based prompt id; `total` gives tqdm a bar length,
# which it cannot infer from a zip object. The loop variable is named `prompt`
# so the builtin `input` is not shadowed for the rest of the kernel session.
for prompt_id, (prompt, ideal_output) in enumerate(
    tqdm(zip(bm_prompts, bm_ideals), total=min(len(bm_prompts), len(bm_ideals))),
    start=1,
):
    for _ in range(RUNS_PER_PROMPT):
        cur_sample = {
            'prompt_id': prompt_id,
            'prompt': prompt,
            'output': get_response(prompt),
            'ideal_output': ideal_output,
        }
        cncomm_bm_data.append(cur_sample)
        print(cur_sample)

In [55]:
# Build the frame directly from the list of per-sample dicts (one row each);
# no throwaway empty DataFrame is needed to reach a constructor.
bm_df = pd.DataFrame(cncomm_bm_data)
# Persist the fine-tuned-model benchmark results without the index column.
bm_df.to_csv("data/cncomm_bm_10.csv",index=False)