# Synthetic Dataset generator

In [21]:
%pip install ollama langchain langchain_community

Collecting langchain
  Downloading langchain-0.2.5-py3-none-any.whl.metadata (7.0 kB)
Collecting langchain_community
  Downloading langchain_community-0.2.5-py3-none-any.whl.metadata (2.5 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading SQLAlchemy-2.0.30-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting langchain-core<0.3.0,>=0.2.7 (from langchain)
  Downloading langchain_core-0.2.7-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.1-py3-none-any.whl.metadata (2.2 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.77-py3-none-any.whl.metadata (13 kB)
Collecting pydantic<3,>=1 (from langchain)
  Downloading pydantic-2.7.4-py3-none-any.whl.metadata (109 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.4/109.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting tenacity<9.0.0,>=8.1.

In [91]:
import json
# Read the prompt templates, keyed by task name, from disk.
with open('prompts.json', "r", encoding="utf-8") as prompts_file:
    task_prompts = json.load(prompts_file)

# Iterating a dict yields its keys, so this lists the available task names.
list(task_prompts)

['emotion_detection_and_classification',
 'emotion_generation',
 'emotion_analysis',
 'conversational_context',
 'emotional_summarization',
 'emotion_comparison',
 'emotion_explanation',
 'emotion_detection_in_social_media',
 'emotion_engagement',
 'emotion_based_recommendations',
 'emotion_and_context',
 'cross_domain_emotion_detection',
 'creative_writing_with_emotions',
 'combining_emotions_with_other_data',
 'multi_modal_emotion_analysis',
 'emotion_based_querying',
 'complex_emotion_tasks',
 'integrating_emotions_in_machine_learning_workflows']

In [19]:
import re 
def find_placeholders(prompt):
    """Return the names of the ``{placeholder}`` fields found in ``prompt``.

    Names are returned in order of appearance and duplicates are kept.
    An empty ``{}`` yields an empty string.
    """
    return [match.group(1) for match in re.finditer(r'\{([^\}]*)\}', prompt)]


# Validate every prompt and record which placeholders each task relies on.
placeholders = set()      # every placeholder name seen across all tasks
task_placeholders = {}    # task name -> sorted, de-duplicated placeholder names
for task, prompts in task_prompts.items():
    names_for_task = set()
    for prompt in prompts:
        # A prompt must be exactly an input/output template pair.
        assert "input_template" in prompt, f"input_template missing for {task}"
        assert "output_template" in prompt, f"output_template missing for {task}"
        assert prompt.keys() == {"input_template", "output_template"}, f"unknown keys in prompt for {task}"
        found = (
            find_placeholders(prompt["input_template"])
            + find_placeholders(prompt["output_template"])
        )
        placeholders.update(found)
        names_for_task.update(found)
    task_placeholders[task] = sorted(names_for_task)

print(task_placeholders)

{'emotion_detection_and_classification': ['emotions', 'sentence'], 'emotion_generation': ['emotions', 'sentence'], 'emotion_analysis': ['emotions', 'sentence'], 'conversational_context': ['emotions', 'response', 'response_style', 'second_sentence', 'sentence'], 'emotional_summarization': ['emotions', 'sentence'], 'emotion_comparison': ['emotions', 'second_sentence', 'sentence'], 'emotion_explanation': ['emotions', 'response', 'sentence'], 'emotion_detection_in_social_media': ['emotions', 'sentence'], 'emotion_based_recommendations': ['response', 'sentence'], 'emotion_and_context': ['context', 'emotions', 'response', 'sentence'], 'cross_domain_emotion_detection': ['emotions', 'sentence'], 'creative_writing_with_emotions': ['emotions', 'sentence'], 'combining_emotions_with_other_data': ['emotions', 'sentence'], 'multi_modal_emotion_analysis': ['emotions', 'image_description', 'sentence'], 'emotion_based_querying': ['emotions', 'response'], 'complex_emotion_tasks': ['emotions', 'sentence'

In [92]:
# Keep only the tasks whose templates use exactly the placeholders
# ['emotions', 'sentence'].
# The loop variable is named `names` rather than `task_placeholders`: reusing
# the dict's own name only worked because a comprehension has its own scope,
# and it made the filter hard to read.
tasks = [
    task
    for task, names in task_placeholders.items()
    if names == ['emotions', 'sentence']
]
tasks

['emotion_detection_and_classification',
 'emotion_generation',
 'emotion_analysis',
 'emotional_summarization',
 'emotion_detection_in_social_media',
 'cross_domain_emotion_detection',
 'creative_writing_with_emotions',
 'combining_emotions_with_other_data',
 'complex_emotion_tasks',
 'integrating_emotions_in_machine_learning_workflows']

In [21]:
import ollama
from ollama import Client
from langchain_community.chat_models.ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain.memory.chat_message_histories import ChatMessageHistory

from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from tqdm import tqdm


In [22]:
import pandas as pd
import numpy as np

# Load both splits from the local parquet files.
train, test = (
    pd.read_parquet(path)
    for path in ("go_emotions_train.parquet", "go_emotions_test.parquet")
)
# Columns 1..28 (positional) are taken as the emotion label columns;
# NOTE(review): assumes column 0 holds the text — confirm against the parquet schema.
emotions = list(train.columns[1:29])


In [23]:
# System message text for the emotion tasks. A plain string is enough here:
# the text contains no `{}` fields, so no f-string interpolation takes place.
system = """\
You are an expert in emotions detection and generation, you will be given different textual tasks to related to the go_emotions dataset.
Only perform the required task.
"""

In [40]:
def decode_emotions_to_string(array_of_labels, emotion_names=None):
    """Return the names of the emotions that are active in ``array_of_labels``.

    Args:
        array_of_labels: object indexable by emotion name (for example a
            DataFrame row restricted to the emotion columns, or a dict).
            A value equal to 1 marks an active label; anything else is
            treated as inactive.
        emotion_names: ordered emotion names to look up. When omitted, the
            notebook-level ``emotions`` list is used, so existing
            one-argument calls behave exactly as before.

    Returns:
        list of str: the active emotion names, in the order of the names
        that were looked up.
    """
    names = emotions if emotion_names is None else emotion_names
    return [name for name in names if array_of_labels[name] == 1]



In [97]:
task_prompts["emotion_detection_and_classification"]

[{'input_template': "Identify the emotional tone present in the given statement: '{sentence}'.",
  'output_template': "The sentence expresses: '{emotions}'."},
 {'input_template': "Classify the emotions expressed in the following sentence: '{sentence}'",
  'output_template': '{emotions}'},
 {'input_template': "Identify all the emotions present in the following sentence: '{sentence}'",
  'output_template': '{emotions}'},
 {'input_template': "Classify the emotions and their intensity in the following sentence: '{sentence}'",
  'output_template': '{emotions}'}]

In [42]:
emotions

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [94]:
model_detections = []

In [98]:

import random

model_name = "phi3"
all_emotions = emotions  # unused in this cell; kept in case another cell reads it

target_tasks = ["emotion_detection_and_classification", "emotion_engagement", "emotion_generation", "emotion_analysis", "emotion_based_recommendations"]

# Start from an empty list so that re-running this cell does not append a
# second copy of every record to a list left over from a previous run.
model_detections = []
for target_task in target_tasks:
    # Bug fix: sample templates from the task being processed. The original
    # always drew from task_prompts["emotion_engagement"], so `target_task`
    # was never used and the same templates were repeated for every task.
    candidate_prompts = list(task_prompts[target_task])
    for index, row in train.iterrows():
        prompt = random.choice(candidate_prompts)
        matching = decode_emotions_to_string(row[emotions])
        emotions_text = ", ".join(matching)
        # str.format ignores keyword arguments a template does not reference,
        # so one set of fields serves every template in these tasks.
        fields = {"emotions": emotions_text, "sentence": row["text"], "response": ""}
        formatted_input = prompt["input_template"].format(**fields)
        formatted_output = prompt["output_template"].format(**fields)
        model_detections.append({
            "model": model_name,
            "text": row["text"],
            "formatted_input": formatted_input,
            "emotions": emotions_text,
            "formatted_output": formatted_output,
            "response": "",
        })
print(len(model_detections),"\n")


309090 



In [99]:

# One DataFrame row per formatted prompt record.
new_df = pd.DataFrame(model_detections)

# Persist as JSON Lines (one JSON object per line).
with open("model_prompts.jsonl", "w", encoding="utf-8") as jsonl_file:
    jsonl_payload = new_df.to_json(orient="records", lines=True)
    jsonl_file.write(jsonl_payload)

# Also store the records in a local SQLite table, replacing any existing one.
# Left as the last expression so the cell displays to_sql's return value.
new_df.to_sql("model_prompts", "sqlite:///model_prompts.db", if_exists="replace")

309090

In [101]:
import random
import concurrent.futures
from tqdm import tqdm
import pandas as pd

# Chat model used for the detection run below; note this rebinds `model_name`.
model_name = "llama3:8b"
chat = ChatOllama(model=model_name, temperature=0.5)
# Pipe the prompt into the chat model. process_task() reads this global
# `generator` each time it is called.
# NOTE(review): `prompt_for_response` is not defined in any visible cell; it
# must already exist in the kernel for this line to run — confirm where it is
# defined so the notebook survives Restart & Run All.
generator = prompt_for_response | chat

# Function to process each task
def process_task(index):
    """Ask the model to detect emotions for one row of ``train``.

    A template is picked at random from the
    "emotion_detection_and_classification" prompts, filled with the row's
    text (column 0 of ``train``, addressed positionally) and sent through
    the global ``generator``. An emotion counts as detected when its name
    appears wrapped in ``**`` in the lower-cased response.

    Args:
        index: positional row index into ``train``.

    Returns:
        dict with the model name, source text, formatted input, the list of
        matched emotions, the raw response text and the formatted output.
    """
    template = random.choice(task_prompts["emotion_detection_and_classification"])
    text = train.iloc[index, 0]
    formatted_input = template["input_template"].format(sentence=text)
    answer = generator.invoke({"input": formatted_input}).content

    # Lower-case once instead of once per candidate emotion.
    answer_lower = answer.lower()
    matching = []
    for emotion in emotions:
        if f"**{emotion}**" in answer_lower:
            matching.append(emotion)

    record = {
        "model": model_name,
        "text": text,
        "formatted_input": formatted_input,
        "emotions": matching,
        "response": answer,
        "formatted_output": template["output_template"].format(emotions=", ".join(matching)),
    }
    return record

# Running tasks in parallel
N_SAMPLES = 3000   # how many leading rows of `train` to send to the model
MAX_WORKERS = 4    # number of concurrent worker threads

# process_task() indexes `train` positionally, so never ask for more rows
# than the frame actually holds (the original hard-coded 3000 twice).
n_tasks = min(N_SAMPLES, len(train))

# Reset kept from the original cell; this run's outputs are collected in `results`.
model_detections = []
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # executor.map yields results in input order; tqdm wraps that iterator to
    # show progress, and list() drains it before the pool shuts down.
    results = list(tqdm(executor.map(process_task, range(n_tasks)), total=n_tasks))

# Converting results to DataFrame
new_df = pd.DataFrame(results)
new_df


100%|██████████| 3000/3000 [44:16<00:00,  1.13it/s] 


Unnamed: 0,model,text,formatted_input,emotions,response,formatted_output
0,llama3:8b,"As long as the minority aren't ""minorities"" I ...",Classify the emotions and their intensity in t...,"[anger, disapproval]",Here are the detected emotions:\n\n* **disappr...,"anger, disapproval"
1,llama3:8b,"Same, I still need to sleep to go to work tomo...",Classify the emotions expressed in the followi...,"[disappointment, sadness, neutral]",The emotions present in this text are:\n\n* **...,"disappointment, sadness, neutral"
2,llama3:8b,"Bide your time, keep your eyes open, and build...",Identify all the emotions present in the follo...,[],Here are the emotions I detect:\n\n* **patienc...,
3,llama3:8b,Clock the flair but What are the judges on? I ...,Classify the emotions expressed in the followi...,"[amusement, relief, surprise]",Here are the emotions I detected:\n\n* **Surpr...,"amusement, relief, surprise"
4,llama3:8b,Who's the top three craft brewers in the area??,Classify the emotions expressed in the followi...,[curiosity],"Based on the sentence, I detect the presence o...",curiosity
...,...,...,...,...,...,...
2995,llama3:8b,"Not sure how you could put [NAME] there, he's ...",Identify all the emotions present in the follo...,"[anger, disapproval]",The emotions present in this sentence are:\n\n...,"anger, disapproval"
2996,llama3:8b,"No, the teens were not being assholes. The ori...",Classify the emotions expressed in the followi...,"[anger, disapproval]",The emotions present in this text are:\n\n* **...,"anger, disapproval"
2997,llama3:8b,"Uh oh. Ummm, I need to take care of something.",Identify the emotional tone present in the giv...,[],"Based on the text, I identify the following em...",The sentence expresses: ''.
2998,llama3:8b,"Looking at the thread now, it seems that this ...",Identify all the emotions present in the follo...,[disappointment],The emotions present in the sentence are:\n\n*...,disappointment


In [None]:

# Chat prompt used to paraphrase existing prompt templates. It is made of a
# fixed system instruction, the accumulated chat history, and then the human
# request passed in as `input`.
prompt_generation_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """\
You are a prompting assistant, please paraphrase the provided prompt, while retaining the meaning and the exact placeholders in curly braquets.
Only generate the requested part(input_template, output_template) of the prompt.
While you may paraphrase the prompt, please do not change the meaning of the prompt, and the placeholders.
Remain concise and to the point. Do not include any additional information such as examples or explanations.
DO not add additional constraints or requirements. DO not change any of the placeholders in the prompt(input_template, output_template).
The new prompt should be in the same format as the original prompt and a single sentence.
Do not add the task name or any other information to the prompt.
If you succeed in paraphrasing the prompt, you will get a 1000 points. If you fail, you will get 0 points.
Your mission is critical, take a deep braeth and focus on the task at hand. Good luck!\
 """,
        ),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ]
)

# NOTE(review): this rebinds the global `generator` that process_task() looks
# up at call time; after this cell runs, process_task() would invoke the
# paraphrasing chain instead of the detection chain.
generator =  prompt_generation_prompt | chat
history_for_chain = ChatMessageHistory()
# The history factory ignores `session_id` and returns whatever the global
# `history_for_chain` is bound to at the moment the lambda is called, so
# rebinding that name later swaps the history the chain sees.
# NOTE(review): presumably the factory is called on each invoke — confirm
# against the RunnableWithMessageHistory documentation.
chain_with_message_history = RunnableWithMessageHistory(
    generator,
    lambda session_id: history_for_chain,
    input_messages_key="input",
    history_messages_key="chat_history",
)

In [114]:
history_for_chain = ChatMessageHistory()

In [115]:
history_for_chain.messages

[]

In [116]:

# Ask the model for paraphrases of every prompt template, keeping only those
# whose placeholders match the original template exactly.
newprompts = {}
for task in task_prompts:
    print(task)
    newprompts[task] = []
    # Begin each task with an empty conversation.
    history_for_chain = ChatMessageHistory()
    for prompt in task_prompts[task]:
        input_placeholders = find_placeholders(prompt["input_template"])
        output_placeholders = find_placeholders(prompt["output_template"])
        expected_input = set(input_placeholders)
        expected_output = set(output_placeholders)

        # Five paraphrase attempts per original prompt.
        for _ in range(5):
            # Step 1: paraphrase the input template; only the first line of the reply is kept.
            response = chain_with_message_history.invoke(
                {"input": "Generate a prompt template paraphrasing : " + prompt["input_template"]},
                {"configurable": {"session_id": "unused"}},
            )
            new_input = response.content.strip().split("\n")[0].strip()
            new_input_placeholders = find_placeholders(new_input)
            if set(new_input_placeholders) != expected_input:
                print("input placeholders mismatch")
                continue

            # Step 2: paraphrase the matching output template in the same conversation.
            response = chain_with_message_history.invoke(
                {"input": "Now generate the output_template, paraphrasing : " + prompt["output_template"]},
                {"configurable": {"session_id": "unused"}},
            )
            new_output = response.content.strip().split("\n")[0].strip()
            new_output_placeholders = find_placeholders(new_output)
            if set(new_output_placeholders) != expected_output:
                print("output placeholders mismatch")
                continue

            paraphrase = {"input_template": new_input, "output_template": new_output}
            newprompts[task].append(paraphrase)
            print(new_input, new_output, "", sep="\n")

    # Rewrite the output file after every task so finished tasks are saved
    # even if a later one fails.
    with open('newprompts.json', "w", encoding="utf-8") as json_file:
        json.dump(newprompts, json_file, indent=4)

Parent run de892c14-7c04-4b37-9aa1-3eb1960cbbb2 not found for run 0d1c6883-68b2-4ee7-b98f-a07a2ae17822. Treating as a root run.


emotion_detection_and_classification


Parent run 986878b6-bd3a-4bf3-8a2a-4613ae122f31 not found for run 5e4ab6b4-20c4-4227-b422-bdf9a727ea5b. Treating as a root run.
Parent run 8a82bc14-1e06-446c-8310-56268c69a52e not found for run 510a8e02-6a5f-4472-b02b-3a2126d93d76. Treating as a root run.


Determine the sentiment conveyed by the given phrase: '{sentence}'.
{emotions}



Parent run b800faa7-b844-4b64-ae1f-da5782c3524c not found for run ddf4e43a-9c05-47bc-a8ff-78c085c33ffd. Treating as a root run.
Parent run 92f29293-5794-4f18-9fb4-f8c917538b44 not found for run ee240945-2f08-499a-ae0b-8d64ac209f31. Treating as a root run.


Identify the emotional tone of the given statement: '{sentence}'.
{emotions}



Parent run 73295dd6-2cd2-4e35-a944-9f837b10785d not found for run 3754c908-4673-4ba7-a370-f9b3fa74769d. Treating as a root run.
Parent run 610e44ec-d2ee-4406-acc8-028832689d81 not found for run 90562f3f-e350-429e-bce8-a59ea45b2063. Treating as a root run.


Analyze the emotional sentiment of the phrase: '{sentence}'.
{emotions}



In [11]:
generated

{'model': 'phi3',
 'created_at': '2024-06-17T00:45:09.0916345Z',
 'response': ' Categorize the feelings conveyed in the given statement: "{sentence}"',
 'done': True,
 'done_reason': 'stop',
 'context': [32006,
  887,
  526,
  263,
  9508,
  292,
  20255,
  29892,
  3113,
  337,
  1742,
  278,
  4944,
  9508,
  29892,
  1550,
  11551,
  292,
  278,
  6593,
  322,
  278,
  2058,
  8948,
  414,
  297,
  3151,
  368,
  4105,
  339,
  1691,
  29889,
  32007,
  32010,
  4134,
  1598,
  278,
  23023,
  1080,
  13384,
  297,
  278,
  1494,
  10541,
  29901,
  22372,
  18616,
  663,
  10162,
  32007,
  32001,
  315,
  20440,
  675,
  278,
  21737,
  27769,
  287,
  297,
  278,
  2183,
  3229,
  29901,
  29850,
  18616,
  663,
  5038,
  32007],
 'total_duration': 503039400,
 'load_duration': 1516500,
 'prompt_eval_count': 51,
 'prompt_eval_duration': 349708000,
 'eval_count': 17,
 'eval_duration': 149952000}

In [53]:
import pandas as pd

# Load the single train shard of the telecom support-chat dataset directly
# from its Hugging Face `hf://` URL.
# NOTE(review): reading `hf://` paths presumably needs `huggingface_hub`
# installed plus network access — confirm, and consider caching locally.
df = pd.read_parquet("hf://datasets/akshayjambhulkar/telecom-conversational-support-chat-pre-processed-with-agent/data/train-00000-of-00001.parquet")


In [66]:
# Break the transcript into turns: surround each speaker tag with newlines so
# every "agent:" / "client:" starts its own paragraph when rendered.
df["conv"] = df["text"].apply(
    lambda raw_text: str(raw_text)
    .replace("agent:", "\n\nagent:\n")
    .replace("client:", "\n\nclient:\n")
)

In [82]:
from IPython.display import display, Markdown
display(Markdown(df["conv"].iloc[15206]))



agent:
 Good afternoon, thank you for calling Union Mobile. My name is Marcy, how can I assist you today? 

client:
 Hi Marcy, I'm calling to inquire about the cost of your mobile cloud plans. I'm tired of my current plan and want to switch to a better one. 

agent:
 Of course, I'd be happy to help you with that. Can you please verify your identity by providing me with your account PIN or the last four digits of the credit card associated with your account? 

client:
 Sure, my account PIN is 1234. 

agent:
 Great, thank you for providing that. Now, let me check on available mobile cloud plans and their corresponding costs. We have three different plans that you can choose from, depending on your needs. 

client:
 Okay, what are they? 

agent:
 Our first plan is the " Plan, which costs $5 per month and includes 5GB of data, 500 minutes of talk time, and unlimited text messages. The second plan is the Pro Plan, which costs $15 per month and includes 20GB of data, 1000 minutes of talk time, and unlimited text messages. Finally, we have the Premium Plan, which costs $30 per month and includes 50GB of data, 2000 minutes of talk time, and unlimited text messages. 

client:
 That sounds good. But can you tell me more about the data spe and how it works? 

agent:
 Absolutely. Our mobile cloud plans come with different data limitsances, which means you you can use up to a certain amount of data per month without incurring additional charges. If you exceed your data allowance, you'll be charged an additional fee per gigabyte used. However, we also offer data rollover, which means that any unused data from the previous month will be carried over to the next month. 

client:
 That sounds sense. What if I want to upgrade or downgrade my plan? 

agent:
 You can upgrade or downgrade your plan at any time by contacting our customer service team. Keep in mind that if you upgrade your plan, you'll need to pay the difference between the old and new plan prices. If you downgrade your plan, you'll receive a prorated credit for the remaining days in your billing cycle. 

client:
 Okay, I think I'll go with the Pro Plan. How do I sign up? 

agent:
 Great choice! I can take you with that. Can you please confirm your emaililling information and provide me with your email address so I can send you a copy of your new plan details? 

client:
 Sure, my email address is [marquis@email.com](mailto:marquis@email.com). And my billing information is... (provides billing information) 

agent:
 Thank you, Marquis. I've updated your account with your new plan selection. You'll receive a confirmation email with all the details shortly. Is there anything else I can assist you with today? 

client:
 No, that's all. Thanks for your help, Marcy. 

agent:
 You're welcome, Marquis. It was my pleasure to assist you. Have a great day!