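# Synthetic Dataset Generator

Builds an instruction-tuning dataset by filling the prompt templates from `prompts.json` with the sentences and emotion labels in `go_emotions_train.parquet`, then pushes the result to the Hugging Face Hub.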

In [22]:
import json
import random
# Read the task -> list-of-prompt-templates mapping from the JSON file that
# sits next to this notebook.
with open("prompts.json", mode="r", encoding="utf-8") as prompts_file:
    task_prompts = json.loads(prompts_file.read())

# Show the task names (iterating a dict yields its keys in file order).
list(task_prompts)

['emotion_detection',
 'emotion_generation',
 'emotion_analysis',
 'conversational_context',
 'emotional_summarization',
 'emotion_comparison',
 'emotion_explanation',
 'emotion_detection_in_social_media',
 'emotion_engagement',
 'emotion_based_recommendations',
 'emotion_and_context',
 'cross_domain_emotion_detection',
 'creative_writing_with_emotions',
 'combining_emotions_with_other_data',
 'multi_modal_emotion_analysis',
 'emotion_based_querying',
 'complex_emotion_tasks',
 'integrating_emotions_in_machine_learning_workflows']

In [35]:
import re 
def find_placeholders(prompt):
    """Return the replacement-field names used in a ``str.format`` template.

    The template is parsed with ``string.Formatter`` rather than a regex, so
    the result follows the same rules ``str.format`` applies when the template
    is filled in:

    * escaped braces (``{{`` / ``}}``) are literal text, not placeholders;
    * conversions and format specs are stripped, e.g. ``{sentence!r}`` and
      ``{sentence:>10}`` both report ``sentence``.

    Args:
        prompt: Template text, e.g. ``"Detect the emotions in '{sentence}'"``.

    Returns:
        A list of field names in order of appearance; repeated fields are
        repeated in the list. An auto-numbered field ``{}`` is reported as
        ``''``. Fields nested inside a format spec are not reported.

    Raises:
        ValueError: If ``prompt`` is not a well-formed format string (for
            example, it contains a single unmatched ``{`` or ``}``).
            ``str.format`` would reject such a template as well.
    """
    # Imported here so the function does not rely on another cell's imports.
    from string import Formatter

    # parse() yields (literal_text, field_name, format_spec, conversion);
    # field_name is None for chunks that are pure literal text.
    return [
        field_name
        for _, field_name, _, _ in Formatter().parse(prompt)
        if field_name is not None
    ]

# Sanity-check every prompt template and gather the placeholder names in use.
placeholders = set()
for task, task_templates in task_prompts.items():
    print(task, len(task_templates))
    for template in task_templates:
        # Both template keys have to be present ...
        for required_key in ("input_template", "output_template"):
            assert required_key in template, f"{required_key} missing for {task}"
        # ... and no other key is allowed.
        assert template.keys() == {"input_template", "output_template"}, f"unknown keys in prompt for {task}"

        input_fields = find_placeholders(template["input_template"])
        output_fields = find_placeholders(template["output_template"])
        # Accumulate into the notebook-wide set, input fields first.
        placeholders.update(input_fields, output_fields)
        # Placeholders used by this particular prompt.
        print(set(input_fields + output_fields))

print(placeholders)

emotion_detection 5
{'sentence', 'emotions'}
{'sentence', 'emotions'}
{'sentence', 'emotions'}
{'sentence', 'emotions'}
{'sentence', 'emotions'}
emotion_generation 5
{'emotions', 'sentence'}
{'emotions', 'sentence'}
{'emotions', 'sentence'}
{'emotions', 'sentence'}
{'emotions', 'sentence'}
emotion_analysis 13
{'sentence', 'emotions'}
{'sentence', 'emotions'}
{'sentence', 'emotions'}
{'sentence', 'emotions'}
{'sentence', 'emotions'}
{'sentence', 'emotions'}
{'sentence', 'emotions'}
{'sentence', 'emotions'}
{'sentence', 'emotions'}
{'sentence', 'emotions'}
{'sentence', 'emotions'}
{'sentence', 'emotions'}
{'sentence', 'emotions'}
conversational_context 2
{'second_sentence', 'sentence', 'emotions'}
{'response_style', 'sentence', 'response'}
emotional_summarization 1
{'sentence', 'emotions'}
emotion_comparison 1
{'second_sentence', 'sentence', 'emotions'}
emotion_explanation 1
{'emotions', 'response', 'sentence'}
emotion_detection_in_social_media 2
{'sentence', 'emotions'}
{'sentence', 'em

In [36]:
import pandas as pd
# Load the labelled sentences used as source material for the prompts.
emotions_df = pd.read_parquet("go_emotions_train.parquet")

# Columns 1..28 are the per-emotion indicator columns; the Index printed below
# lists the 28 names. NOTE(review): this is a positional slice, so it silently
# selects the wrong columns if the parquet schema changes -- confirm column 0
# is the only non-label column in front of them.
emotion_columns = emotions_df.columns[1:29]
print(emotion_columns)

# Turn the indicator matrix into one list of emotion names per row. A single
# pass over the boolean matrix replaces a row-wise DataFrame.apply, which
# builds a pandas Series for every row.
emotion_names = emotion_columns.to_numpy()
emotion_flags = (emotions_df[emotion_columns] == 1).to_numpy()
emotions_df["emotions"] = pd.Series(
    [emotion_names[row_flags].tolist() for row_flags in emotion_flags],
    index=emotions_df.index,
)
# Comma-separated form that is substituted into the prompt templates.
emotions_df["emotions_text"] = emotions_df["emotions"].map(", ".join)
emotions_df["emotions_text"].head()

Index(['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
       'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
       'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
       'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
       'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')


36949                   disapproval
13467    approval, desire, optimism
12483      caring, desire, optimism
3633         curiosity, realization
23998                     curiosity
Name: emotions_text, dtype: object

In [37]:
# Build the fine-tuning examples: for every targeted task and every source
# row, fill `gen_per_prompt` randomly chosen prompt templates with the row's
# sentence and emotion labels.
targeted_prompts = ["emotion_detection", "emotion_generation"]
gen_per_prompt = 3  # examples generated per (task, source row) pair

# Dedicated, seeded generator so Restart & Run All reproduces the same
# dataset without touching the global `random` state.
prompt_rng = random.Random(42)

finetune = []
for task in targeted_prompts:
    task_templates = task_prompts[task]
    print(task, len(task_templates))
    # The instruction only depends on the task, so build it once per task.
    instruction = f"You are an {task.replace('_', ' ')} specialist."
    # Iterate the two needed columns directly; iterrows() would construct a
    # full Series per row just to read these two values.
    for emotions, sentence in zip(emotions_df["emotions_text"], emotions_df["text"]):
        for _ in range(gen_per_prompt):
            # NOTE(review): choice() samples with replacement, so the same
            # template can be picked more than once for a row and produce
            # identical examples -- confirm that is intended.
            selected_prompt = prompt_rng.choice(task_templates)
            # Fill the templates inline instead of binding locals named
            # `input`/`output`, which would shadow the builtin input() for
            # the rest of the kernel session.
            finetune.append({
                "task": task,
                "instruction": instruction,
                "input": selected_prompt["input_template"].format(sentence=sentence, emotions=emotions),
                "output": selected_prompt["output_template"].format(sentence=sentence, emotions=emotions),
                "emotions": emotions,
                "sentence": sentence,
            })

finetune_df = pd.DataFrame(finetune)
finetune_df

emotion_detection 5
emotion_generation 5


Unnamed: 0,task,instruction,input,output,emotions,sentence
0,emotion_detection,You are an emotion detection specialist.,Identify all the emotions present in the follo...,The emotions are : disapproval,disapproval,Stop opening the pictures. Stop giving her the...
1,emotion_detection,You are an emotion detection specialist.,Classify the emotions and their intensity in t...,This text exposes disapproval as emotions,disapproval,Stop opening the pictures. Stop giving her the...
2,emotion_detection,You are an emotion detection specialist.,Classify the emotions expressed in the followi...,disapproval,disapproval,Stop opening the pictures. Stop giving her the...
3,emotion_detection,You are an emotion detection specialist.,Detect the emotions in the following text: 'Hu...,"approval, desire, optimism","approval, desire, optimism","Huh. As a man, I have support from friends and..."
4,emotion_detection,You are an emotion detection specialist.,Identify all the emotions present in the follo...,"The emotions are : approval, desire, optimism","approval, desire, optimism","Huh. As a man, I have support from friends and..."
...,...,...,...,...,...,...
309085,emotion_generation,You are an emotion generation specialist.,"Create a statement that conveys admiration, am...","The expression: Afro samurai lol, that's prett...","admiration, amusement, disapproval","Afro samurai lol, that's pretty interesting ne..."
309086,emotion_generation,You are an emotion generation specialist.,"Create a statement that conveys admiration, am...","The expression: Afro samurai lol, that's prett...","admiration, amusement, disapproval","Afro samurai lol, that's pretty interesting ne..."
309087,emotion_generation,You are an emotion generation specialist.,Create a phrase that delivers the desired ange...,Presenting the conveyed emotion: > he might no...,"anger, annoyance, disappointment",> he might not have seen the truth by the time...
309088,emotion_generation,You are an emotion generation specialist.,"Produce a statement that communicates: anger, ...",Here is the expressed sentiment: > he might no...,"anger, annoyance, disappointment",> he might not have seen the truth by the time...


In [38]:
from datasets import Dataset

# Wrap the generated examples in a Hugging Face dataset and publish it.
# Pushing needs Hub credentials with write access to the target repository.
hub_repo_id = "sebdg/fine-tune-emotions"
dataset = Dataset.from_pandas(finetune_df)
dataset.push_to_hub(hub_repo_id)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/310 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/464 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/sebdg/fine-tune-emotions/commit/b917f2c5662f62294e02f0b8d1ea7398311337af', commit_message='Upload dataset', commit_description='', oid='b917f2c5662f62294e02f0b8d1ea7398311337af', pr_url=None, pr_revision=None, pr_num=None)