# Data

In [None]:
import pandas as pd
import csv

### Sample Generation with `gpt-3.5-turbo`

In [None]:
from openai import OpenAI
import os
from dotenv import load_dotenv

# Pull settings from the .env file into the process environment
load_dotenv()

# Look up the API key in the environment and hand it to the OpenAI client
OPEN_AI_KEY = os.getenv("OPEN_AI_KEY")
client = OpenAI(api_key=OPEN_AI_KEY)

In [None]:
# Request one batch of synthetic samples from gpt-3.5-turbo. The conversation
# sent to the model is: a system message framing the task, a second system
# message holding three worked examples, and the user request for 40 new
# scenarios written in the same format as those examples.
completion = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    # Task framing: which communication values the generated prompts should show
    {"role": "system", "content": "You are a system that can generate chat prompts that understands values in Chinese communication. This includes indirect communication, which relies on subtlety and hints and context, as well as showing respect in front of elders, as well as humility. "},
    # Few-shot examples: each has a CONTEXT line, one speaker line, and a Response
    {"role": "system", "content": "Here are 3 examples: Input:\nCONTEXT: You are a caregiver in a nursing home, conversing with an elderly Chinese resident, Mrs. Li, in English. You notice she hasn't finished her food from yesterday.\n Mrs. Li: Hello!\n Response:\n Hello Mrs. Li. Was the food from yesterday to your liking?\n \n Input:\n CONTEXT: You are a hotel receptionist, speaking to an elderly Chinese couple that is checking out. They have not paid yet, but are insisting they have. \n Woman: We would like to check out, we paid yesterday.\n Response:\n I apologize if there's been a misunderstanding. I will double check our system again.\n \n Input:\n CONTEXT: You are conversing with a Chinese coworker to provide feedback on their work, which is a little lacking. \n Coworker: Hello!\n Response:\n Hello, how are you? I've reviewed your work. It's quite comprehensive. Would you like to go over this section though?"},
    # The actual request
    {"role": "user", "content": "Give me 40 chat prompts of unique scenarios that exhibit values in Chinese communication, especially the values of indirectness and respect to elders. Use the format of the examples given with the context."}
  ]
)

In [None]:
# Keep only the message text of the first choice in the response
output = completion.choices[0].message.content

In [None]:
# Write samples to data file (appending).
# The file is opened in append mode, so each batch must end with a newline;
# otherwise the first line of the next batch would be glued onto the last line
# of this one and could no longer be read back line by line.
with open('data/gpt_samples.txt', 'a') as f:
    f.write(output)
    if output and not output.endswith("\n"):
        f.write("\n")

### Parse Generated Data

In [None]:
def parse_sample_files(file):
    """Extract prompts and ideal responses from a generated-samples text file.

    The file is read line by line and every line is stripped of surrounding
    whitespace before it is inspected:

    * A line containing ``"CONTEXT: "`` starts a new prompt; only the text
      after the marker is kept.
    * The line directly after a context line is appended to that context
      (separated by a newline) and the combined text is stored as a prompt.
    * Any other line containing ``"Response: "`` is stored as an ideal
      response, with every occurrence of the marker removed.

    All other lines are ignored.

    Args:
        file: Path of the text file to parse.

    Returns:
        A ``(prompts, ideal)`` tuple of two lists of strings. The two lists
        are filled independently, so they only line up when the file pairs
        each context with exactly one response line.
    """
    context_marker = "CONTEXT: "
    response_marker = "Response: "
    prompts = []
    ideal = []
    # Holds the context text while waiting for the line that follows it;
    # None means no context is currently pending.
    pending_context = None

    # The context manager closes the file even if reading raises part-way.
    with open(file, "r") as f:
        for raw_line in f:
            line = raw_line.strip()
            if context_marker in line:
                start = line.index(context_marker) + len(context_marker)
                pending_context = line[start:]
            elif pending_context is not None:
                prompts.append(pending_context + "\n" + line)
                pending_context = None
            elif response_marker in line:
                ideal.append(line.replace(response_marker, ""))
    return prompts, ideal

In [None]:
# Load the nursing-home samples and the general samples from their own files
nh_prompts, nh_ideal = parse_sample_files("data/gpt_samples_nh.txt")
gen_prompts, gen_ideal = parse_sample_files("data/gpt_samples_gen.txt")

# Merge both sets into single lists, nursing-home entries first
all_prompts = [*nh_prompts, *gen_prompts]
all_ideal = [*nh_ideal, *gen_ideal]

In [None]:
# Wrap every (prompt, ideal response) pair in the Llama2 prompt template

bos_token = "<s>"
eos_token = "</s>"
sys_info = "You are a system that is able to adapt to different scenarios and provide English responses that are culturally sensitive to values in Chinese communication. These values include indirect communication, which relies on subtlety and hints and context, as well as showing respect in front of elders, and maintaining humility."
# Each formatted sample is stored as a one-element list (one row per sample);
# zip stops at the shorter of the two input lists.
cncomm = [
    [f"{bos_token}[INST]\n<<SYS>>{sys_info}<</SYS>>\n{question}\n[/INST]\n{answer}{eos_token}"]
    for question, answer in zip(all_prompts, all_ideal)
]

In [None]:
# Show the first formatted sample (a one-element list) as a quick sanity check
print(cncomm[0])

In [None]:
# Save the formatted samples as a single-column CSV with a "samples" header
fields = ["samples"]
# newline='' hands line-ending control to the csv module, as its documentation
# requires. Without it, rows end in "\r\r\n" on platforms that translate "\n",
# and the newlines embedded in the quoted samples are not written verbatim.
with open('data/cn_comm.csv', 'w', newline='') as f:

    # using csv.writer method from CSV package
    write = csv.writer(f)

    # Header row first, then one row per sample
    write.writerow(fields)
    write.writerows(cncomm)