## Training and Validation Set data preparation

In [1]:
import pandas as pd
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

In [2]:
# Hugging Face Hub id of the checkpoint whose tokenizer is loaded below.
model_name = "HuggingFaceH4/zephyr-7b-beta"

# Only the tokenizer is loaded here; a later cell uses its `apply_chat_template` method
# to render conversations as text.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

### Loading the RAW Dataset

In [3]:
# Read the raw CSV, restricting the load to the five columns listed in `usecols`.
df = pd.read_csv("rogers_data/dataset.csv", usecols=['company_key', 'message_order', 'case_id', 'text', 'response_text'])

In [4]:
# Display the column labels of the loaded frame.
df.columns

Index(['company_key', 'case_id', 'text', 'response_text', 'message_order'], dtype='object')

In [5]:
# Number of distinct `case_id` values in the raw data (a missing id, if any, counts once).
df["case_id"].nunique(dropna=False)

57078

### Grouping by the conversation ids

In [6]:
# Identity `apply` over the `case_id` groups: the applied function returns each group
# unchanged, so this step does not modify any values.
# NOTE(review): presumably intended to arrange rows conversation by conversation. The
# resulting row order, and whether the grouping column is handed to the applied function,
# appears to depend on the installed pandas version — confirm, because later cells rely on
# `nf['case_id']` and on the row order (the dataset split is not shuffled).
nf = df.groupby(by=["case_id"], group_keys=False).apply(lambda x: x)

In [7]:
# Remove rows in which BOTH `text` and `response_text` are missing (how='all').
# `nf` is rebound to the filtered result rather than mutated with `inplace=True`, so the
# frame produced by the previous cell is left untouched.
nf = nf.dropna(subset=['text', 'response_text'], how='all')

# Per-conversation difference between consecutive `message_order` values; the first row of
# every `case_id` group has no predecessor and therefore gets NaN.
nf = nf.assign(MessageDiff=nf.groupby('case_id')['message_order'].diff())

In [8]:
# Preview the first rows after filtering, including the new `MessageDiff` column.
nf.head()

Unnamed: 0,company_key,case_id,text,response_text,message_order,MessageDiff
0,rogers,10617686,@Rogers @top_employers @RogersCareers Please p...,,1,
2,rogers,10617686,,We appreciate you taking the time to bring you...,6,5.0
3,rogers,16892578,This shirt speaks volumes. 🎤 You saw the power...,,1,
4,rogers,16892578,Hey! Guess what? Fido’s on TikTok! Something b...,,3,2.0
5,rogers,16892578,🏳️‍⚧️Today is #TransDayOfVisibility! We’re pro...,,5,2.0


### Extracting conversations from the raw dataset and preparing the prompts according to the LLM's chat template

In [9]:
def extract_conversation(df):
    """Split the rows of one conversation into customer messages and agent responses.

    Rows are processed in the order they appear in ``df``; only the ``text`` and
    ``response_text`` columns are read.

    * A row whose ``text`` is missing is recorded as an agent response.
    * A row whose ``response_text`` is missing is recorded as a customer message.
    * The two checks are independent: a row with both values present is ignored,
      and a row with both values missing is recorded on both sides.

    Missing values are detected with ``pd.isna`` so that ``NaN``, ``None`` and
    ``pd.NA`` are all recognised (the previous ``type(x) == float`` test only
    caught float ``NaN`` and would also have misread genuine float values).

    Args:
        df: DataFrame holding the rows of a single conversation.

    Returns:
        dict with two keys:
            ``"customer"``: list of customer message texts, in row order.
            ``"agent"``: list of ``(response_text, order)`` tuples, where ``order``
            is the number of customer messages seen before that response.
    """
    conv = {"customer": [], "agent": []}
    customers_seen = 0
    # Iterate over the two relevant columns directly; no per-row Series is needed.
    for text, response in zip(df["text"], df["response_text"]):
        if pd.isna(text):
            # Tag the response with the customer turn it follows.
            conv["agent"].append((response, customers_seen))
        if pd.isna(response):
            conv["customer"].append(text)
            customers_seen += 1
    return conv


def prepare_prompt(conversation: dict):
    """Interleave customer messages and agent responses into a chat message list.

    Customer messages are numbered from 1 in list order. After each customer
    message, every agent response whose ``order`` equals that number is appended,
    keeping the responses in their original relative order. Responses whose
    ``order`` matches no customer message (for example ``0``, i.e. a response
    recorded before any customer message) are left out.

    Args:
        conversation: dict with ``"customer"`` (list of message texts) and
            ``"agent"`` (list of ``(response_text, order)`` pairs).

    Returns:
        list of ``{"role": ..., "content": ...}`` dicts using the roles
        ``"user"`` and ``"assistant"``.
    """
    # Bucket the responses by turn once, instead of rescanning the whole agent list
    # for every customer message.
    responses_by_turn = {}
    for response_text, turn in conversation["agent"]:
        responses_by_turn.setdefault(turn, []).append(response_text)

    messages = []
    for turn, user_message in enumerate(conversation["customer"], start=1):
        messages.append({"role": "user", "content": user_message})
        messages.extend(
            {"role": "assistant", "content": response_text}
            for response_text in responses_by_turn.get(turn, [])
        )
    return messages


def extract_and_prepare_text(data_df, conversation_ids):
    """Build one chat message list per requested conversation id.

    For every id in ``conversation_ids`` (in the given order) the rows of
    ``data_df`` whose ``case_id`` equals that id are passed through
    ``extract_conversation`` and ``prepare_prompt``. An id with no matching rows
    is processed with an empty selection of rows.

    Args:
        data_df: DataFrame with a ``case_id`` column plus the columns read by
            ``extract_conversation``.
        conversation_ids: iterable of conversation ids to process.

    Returns:
        ``(prompts, conv_id)``: two lists of equal length, where ``prompts[i]`` is
        the message list built for ``conv_id[i]``.
    """
    # Compute the row positions of every conversation once. Filtering the whole frame
    # with a boolean mask for each id is quadratic in the number of conversations.
    positions_by_case = data_df.groupby("case_id", sort=False).indices
    no_rows = data_df.iloc[0:0]  # stand-in for ids that have no rows

    prompts, conv_id = [], []
    for conversation_id in conversation_ids:
        positions = positions_by_case.get(conversation_id)
        sample = no_rows if positions is None else data_df.iloc[positions]
        prompts.append(prepare_prompt(extract_conversation(sample)))
        conv_id.append(conversation_id)
    return prompts, conv_id


def prepare_df_from_prompts(conv_id, prompts):
    """Pair conversation ids with their prompts in a two-column DataFrame.

    Args:
        conv_id: sequence of conversation ids.
        prompts: sequence of prompts, aligned with ``conv_id``.

    Returns:
        DataFrame with the columns ``conversation_id`` and ``instruction``.
    """
    return pd.DataFrame({"conversation_id": conv_id, "instruction": prompts})

In [10]:
# Distinct conversation ids that remain in `nf` after the row filtering.
nf_conv_ids = nf["case_id"].unique().tolist()

# `prompts[i]` is the chat message list built for conversation id `convs[i]`.
prompts, convs = extract_and_prepare_text(nf, nf_conv_ids)

In [11]:
# System message placed at the start of every conversation.
# NOTE(review): "coversation" is a typo, deliberately left as-is here because this string
# becomes part of the generated dataset text; correct it only together with every consumer.
system_prompt = {"role": "system",
                 "content": "You are a helpful assistant. Your task is to help an assistant to write responses quickly for a customer-agent conversation based on the coversation history"}

# Prepend the system message in place, but only when the conversation does not already
# start with one, so that re-running this cell cannot stack duplicate system messages.
for prompt in prompts:
    if not prompt or prompt[0].get("role") != "system":
        prompt.insert(0, system_prompt)

# Render each conversation to text with the tokenizer's chat template and keep only the
# conversations that contain at least one assistant turn.
# Because of this filter, `formatted_prompts` is no longer index-aligned with `prompts`.
assistant_tag = "<|assistant|>"
formatted_prompts = []
for prompt in prompts:
    rendered = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=False)
    if assistant_tag in rendered:
        formatted_prompts.append(rendered)

### Split the data into train, validation and test sets

In [12]:
# The same options are used for both splits: 10% held out, no shuffling requested.
split_options = dict(shuffle=False, test_size=0.1, random_state=42)

# First hold out the test set, then split what is left into train and validation.
train_data, test_ds = train_test_split(formatted_prompts, **split_options)
train_ds, val_ds = train_test_split(train_data, **split_options)
print(f"Train dataset {len(train_ds)}, and Validation dataset {len(val_ds)}, and Test dataset: {len(test_ds)}")

Train dataset 25419, and Validation dataset 2825, and Test dataset: 3139


### Load the formatted prompts into DataFrames

In [13]:
# Wrap each split in a single-column ("text") frame.
train_df = pd.DataFrame({"text": train_ds})
val_df = pd.DataFrame({"text": val_ds})

### Save the train and validation sets as CSV files

In [14]:
# Number of leading training rows left out of the saved training set.
# NOTE(review): the reason for skipping these rows is not recorded in this notebook — confirm.
N_SKIPPED_TRAIN_ROWS = 10

# Rebuild the frame from `train_ds` before slicing, so that re-running this cell always
# drops the same rows instead of removing a further 10 on every execution.
train_df = pd.DataFrame(train_ds, columns=["text"]).iloc[N_SKIPPED_TRAIN_ROWS:]

# Write both splits without the index column.
train_df.to_csv("rogers_data/rogers_train_df.csv", index=False)
val_df.to_csv("rogers_data/rogers_val_df.csv", index=False)

## Test data preparation

In [19]:
# Cut every test conversation right after its LAST "<|assistant|>" tag:
#   - everything up to and including the tag becomes the prompt,
#   - everything after the single character following the tag becomes the target.
assistant_tag = "<|assistant|>"
cut_points = [text.rfind(assistant_tag) + len(assistant_tag) for text in test_ds]

save_prompt = [text[:cut] for text, cut in zip(test_ds, cut_points)]
targets = [text[cut + 1:] for text, cut in zip(test_ds, cut_points)]

In [20]:
# Column layout of the test set: the truncated prompt and the response cut off from it.
test_data = {"text": save_prompt, "targets": targets}

In [21]:
# Two-column frame ("text", "targets") built from the lists collected above.
test_df = pd.DataFrame(test_data)

In [22]:
# Preview the first prompt/target pairs of the test set.
test_df.head()

Unnamed: 0,text,targets
0,<|system|>\nYou are a helpful assistant. Your ...,"With the 'Silence Unknown Callers' feature, it..."
1,<|system|>\nYou are a helpful assistant. Your ...,I do see the request has been sent! The number...
2,<|system|>\nYou are a helpful assistant. Your ...,I can go ahead and document this offer on your...
3,<|system|>\nYou are a helpful assistant. Your ...,You're truly welcome <PERSON>. <PERSON> happy ...
4,<|system|>\nYou are a helpful assistant. Your ...,"The pleasure was all mine! Please, don't hesit..."


In [23]:
# Save the test set (prompt text and target columns) to a CSV file, without the index column.

test_df.to_csv("rogers_data/rogers_test_df.csv", index=False)

In [None]:
# def format_prompt(customer, agent):
#     system_prompt = """You are a text completion assistant. \
#     Given the conversation between Human and Assistant your task is to help assistant to complete his response."""
#     prompt = "<s>"
#     prompt += f"[INST] {system_prompt} "
#     i = 0
#     for cust, agnt in zip(customer, agent):
#         if i != 0 and type(cust) == str and cust != '':
#             prompt += f"[INST] {cust} [/INST]"
#             prompt += f" {agnt} </s>"
#         if i == 0:
#             prompt += f"{cust} [/INST]"
#             prompt += f" {agnt} </s>"
#         i += 1
#     return prompt


# def format_new_prompt(customer, agent):
#     prompt = ""
#     for cust, agnt in zip(customer, agent):
#         if type(cust) == str and cust != ' ':
#             prompt += f"### Human: {cust}."
#             prompt += f"### Assistant: {agnt}."
#     return prompt

# def prepare_test_chat_template(chat_history):
#     chat = []
#     convs = {"history": [], "target_utterance": []}
#     i = 0
#     for user, assistant in zip(chat_history["customer"], chat_history["agent"]):
#         if i != 0 and type(user) == str and user == str(""):
#             chat.append({"role": "user", "content": user.strip(" ")})
#             chat_template = tokenizer.apply_chat_template(chat, tokenize=False)
#             chat.append({"role": "assistant", "content": assistant})
#             convs["history"].append(chat_template)
#             convs["target_utterance"].append(assistant)
#         if i == 0:
#             chat.append({"role": "user", "content": user.strip(" ")})
#             chat_template = tokenizer.apply_chat_template(chat, tokenize=False)
#             chat.append({"role": "assistant", "content": assistant})
#             convs["history"].append(chat_template)
#             convs["target_utterance"].append(assistant)
#         i += 1
#     return convs


# def extract_conversation(df):
#     conv = df["last_conversation"].tolist()
#     target = df["target_utterance"].tolist()
#     idx = [i for i in range(len(conv))]

#     chat = []
#     last = conv.copy()
#     target = target.copy()

#     try:
#         for ids, last_conv, target_utter in zip(idx, conv, target):
#             if ids == 0:
#                 chat.append(last_conv)
#                 chat.append(target_utter)
#             if ids >= 1:
#                 if type(last[ids]) == str and last[ids] != '':
#                 idx = len(target[ids-1]) + last[ids].index(target[ids-1]) + 1
#                 updated_conv = last_conv[idx:]
#                 chat.append(updated_conv)
#                 chat.append(target_utter)
#     except ValueError:
#         pass
#     return chat


# def prepare_dialogue(example, tokenizer):
#     text = ""
#     for idx, msg in enumerate(example):
#         if idx % 2 == 0:
#             text += f"<|user|>\n{msg}{tokenizer.eos_token}\n"
#         else:
#             text += f"<|assistant|>\n{msg}{tokenizer.eos_token}\n"
#     return text