## GPT Sample Generation

In [6]:
from openai import OpenAI
import time
import os
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment, then
# build the API client from the OPEN_AI_KEY entry found there.
load_dotenv()

# os.getenv yields None when the variable is absent; the value is handed to the
# client unchanged.
OPEN_AI_KEY = os.getenv('OPEN_AI_KEY')
client = OpenAI(api_key=OPEN_AI_KEY)

In [7]:
# Ask the chat model for synthetic training samples. The request carries two
# system messages (the persona, then three worked examples) followed by a user
# message asking for 40 new scenarios in the same layout as the examples.
completion = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    # Persona: a prompt generator that reflects indirectness, respect for elders and humility.
    {"role": "system", "content": "You are a system that can generate chat prompts that understands values in Chinese communication. This includes indirect communication, which relies on subtlety and hints and context, as well as showing respect in front of elders, as well as humility. "},
    # Few-shot examples: each one is an "Input:" with a CONTEXT line and an opening line, then a "Response:".
    {"role": "system", "content": "Here are 3 examples: Input:\nCONTEXT: You are a caregiver in a nursing home, conversing with an elderly Chinese resident, Mrs. Li, in English. You notice she hasn't finished her food from yesterday.\n Mrs. Li: Hello!\n Response:\n Hello Mrs. Li. Was the food from yesterday to your liking?\n \n Input:\n CONTEXT: You are a hotel receptionist, speaking to an elderly Chinese couple that is checking out. They have not paid yet, but are insisting they have. \n Woman: We would like to check out, we paid yesterday.\n Response:\n I apologize if there's been a misunderstanding. I will double check our system again.\n \n Input:\n CONTEXT: You are conversing with a Chinese coworker to provide feedback on their work, which is a little lacking. \n Coworker: Hello!\n Response:\n Hello, how are you? I've reviewed your work. It's quite comprehensive. Would you like to go over this section though?"},
    # The actual request for new samples.
    {"role": "user", "content": "Give me 40 chat prompts of unique scenarios that exhibit values in Chinese communication, especially the values of indirectness and respect to elders. Use the format of the examples given with the context."}
  ]
)

In [9]:
# Pull the generated text of the first returned choice out of the response object.
output = completion.choices[0].message.content

"1. CONTEXT: You are at a family gathering and your elderly grandmother has cooked a meal for everyone.\nGrandmother: Please, have a taste of my dishes.\nResponse: Grandmother, your cooking is always a delight. I'll be sure to savor every bite.\n\n2. CONTEXT: You are at a meeting with your Chinese boss, who is suggesting changes to your project that you strongly disagree with.\nBoss: I suggest we take a different approach to this project.\nResponse: I appreciate your insights, and I will consider different perspectives moving forward.\n\n3. CONTEXT: You are chatting with a Chinese friend who is feeling unwell but doesn't want to admit it directly.\nFriend: I'm just feeling a bit tired today.\nResponse: I understand, please take care of yourself and get some rest if you need to.\n\n4. CONTEXT: You are discussing a potential business partnership with an elderly Chinese investor who seems hesitant.\nInvestor: I have some concerns about this proposal.\nResponse: I value your insights and t

In [11]:
# Append the raw generation to the samples file. Mode 'a' keeps whatever is
# already in the file, so each run of this cell adds to it rather than
# replacing it.
# The encoding is pinned so that any non-ASCII characters in the model output
# are written the same way on every platform instead of depending on the
# locale's default encoding.
with open('data/gpt_samples.txt', 'a', encoding='utf-8') as f:
    f.write(output)

## Parse Data 
Convert the example `.txt` files into a CSV whose rows follow the Llama 2 prompt format.

In [10]:
import pandas as pd
import csv

In [4]:
# Parse examples.txt into parallel lists, one entry per example.
# The loop below recognises three kinds of lines:
#   * a line containing "CONTEXT: "  -> the scenario text after the marker
#   * the line right after it        -> the other speaker's opening line
#   * a line containing "Response: " -> the reply, with the marker removed
# Any other line is ignored.
is_question = False  # True only while the next line is the opening line of an example
prompts = []         # scenario text of each example
first_line = []      # opening line of each example
labels = []          # reply of each example
type = []            # source tag per example ("N" for this file); the name shadows the
                     # builtin but is kept because later cells read it

# The context manager closes the file as soon as parsing is done; a bare
# open() would keep the handle alive for the rest of the kernel session.
with open("examples.txt", "r") as f:
    for line in f:
        if "CONTEXT: " in line:
            marker = "CONTEXT: "
            ctxt = line.strip()
            # Drop everything up to and including the first marker (e.g. a "1. " prefix).
            ctxt = ctxt[ctxt.index(marker) + len(marker):]
            prompts.append(ctxt)
            is_question = True
            type.append("N")

        elif is_question:
            is_question = False
            first_line.append(line.strip())

        elif "Response: " in line:
            resp = line.strip().replace("Response: ", "")
            labels.append(resp)


In [5]:
# Parse examples2.txt and append its examples to the lists started by the
# previous parsing cell (prompts, first_line, labels, type), tagging each one
# with "G". `is_question` also carries over from that cell.
# The context manager closes the file as soon as parsing is done; a bare
# open() would keep the handle alive for the rest of the kernel session.
with open("examples2.txt", "r") as f:
    for line in f:
        if "CONTEXT: " in line:
            marker = "CONTEXT: "
            ctxt = line.strip()
            # Drop everything up to and including the first marker (e.g. a "1. " prefix).
            ctxt = ctxt[ctxt.index(marker) + len(marker):]
            prompts.append(ctxt)
            is_question = True
            type.append("G")

        elif is_question:
            # The line directly after a CONTEXT line is the opening line of the example.
            is_question = False
            first_line.append(line.strip())

        elif "Response: " in line:
            resp = line.strip().replace("Response: ", "")
            labels.append(resp)


In [9]:
# Gather the four parallel lists under their column names and turn them into
# a table with one row per parsed example.
df_dict = dict(context=prompts, user_line=first_line, ast_line=labels, type=type)

df = pd.DataFrame(data=df_dict)

In [21]:
# Render every row of `df` as one training string: a <<SYS>> header carrying
# the shared system prompt, then the scenario and the opening line wrapped in
# [INST] ... [/INST], then the reply followed by the end-of-sequence marker.
# NOTE(review): the reference Llama 2 chat template places the <<SYS>> block
# inside [INST], after <s>; here it comes before <s>. Confirm this ordering is
# intended before regenerating the dataset.
sys_info = "You are a system that is able to adapt to different scenarios and provide English responses that are culturally sensitive to values in Chinese communication. These values include indirect communication, which relies on subtlety and hints and context, as well as showing respect in front of elders, and maintaining humility."
bos_token = "<s>"
eos_token = "</s>"

cncomm = []
for context, user_line, ast_line in zip(df['context'], df['user_line'], df['ast_line']):
    cur_row = ''.join([
        '<<SYS>>\n', sys_info, '\n<</SYS>>\n\n',
        bos_token, '[INST] ', context, '\n', user_line, ' [/INST]',
        ' ', ast_line, ' ', eos_token,
    ])
    # Each string is wrapped in its own list, i.e. a record with a single field.
    cncomm.append([cur_row])

In [22]:
# Spot-check the first formatted example (a one-element list holding the full string).
print(cncomm[0])

['<<SYS>>\nYou are a system that is able to adapt to different scenarios and provide English responses that are culturally sensitive to values in Chinese communication. These values include indirect communication, which relies on subtlety and hints and context, as well as showing respect in front of elders, and maintaining humility.\n<</SYS>>\n\n<s>[INST] You are a caregiver in a nursing home, conversing with an elderly Chinese resident, Mr. Wang, who seems a little down today.\nMr. Wang: Good afternoon. [/INST] Good afternoon, Mr. Wang. Is there anything in particular on your mind today? </s>']


In [23]:
# Write the formatted examples to cn_comm.csv: a "text" header row followed by
# one single-column row per example.
fields = ["text"]
# newline='' hands line-ending control to the csv module, as its documentation
# requires. Every row here contains embedded "\n" characters; without it the
# platform's newline translation can alter them and the row terminators
# (for example, blank lines between rows on Windows).
# The encoding is pinned so the file does not depend on the locale default.
with open('cn_comm.csv', 'w', newline='', encoding='utf-8') as f:
    write = csv.writer(f)

    write.writerow(fields)
    write.writerows(cncomm)

## LoRA

In [None]:
# Quiet install of the libraries used in the LoRA section, each pinned to an exact version.
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 datasets==2.16.0

In [None]:
# NOTE(review): this installs transformers==4.38.2 over the 4.31.0 pin from the
# previous install cell, and installs accelerate and bitsandbytes without a
# version pin — confirm which versions are actually intended.
!pip install transformers==4.38.2
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes

In [1]:
import os, torch, logging
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
# from transformers import LlamaForCausalLM, LlamaTokenizer

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [5]:
# Log in to the Hugging Face Hub from inside the notebook; the call renders a login widget.
# presumably required to download the base model used below — confirm
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# Dataset
# Load the "train" split of the dataset identified by `data_name`.
data_name = "cheungra/CNCOMM-500-1.0"
training_data = load_dataset(data_name, split="train")

In [7]:
# Model and tokenizer names
base_model_name = "meta-llama/Llama-2-7b-chat-hf"  # checkpoint the tokenizer (and the model cell below) load from
refined_model = "llama-2-7b-cncomm"  # presumably the name for the fine-tuned model; not used in this cell
# Tokenizer
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16

In [8]:
# Quantization Config
# 4-bit NF4 weights, float16 compute, double quantization disabled.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)
# Model
# Loaded exactly once. `base_model_name` and the tokenizer come from the
# tokenizer cell above; repeating that setup and this load a second time in the
# same cell only re-downloads/re-quantizes the 7B checkpoint for no benefit.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0}  # map the whole model (the "" root module) to device 0
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`