In [1]:
from rich import print
import json
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv
import os
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
import json
from datasets import Dataset, Value, ClassLabel, Features, DatasetDict, load_dataset, load_from_disk

In [2]:
# Notebook-level settings. Neither name is read by any cell shown in this notebook.
# NOTE(review): delete_prefix presumably pairs with the commented-out remove_prefix helper — confirm before removing.
delete_prefix = False
random_state = 42

In [3]:
# Load the variables from the .env file located by find_dotenv() into the process environment.
_ = load_dotenv(find_dotenv())
# Raises KeyError right away if the key is not set; the value itself is not passed to the client below.
api_key = os.environ['OPENAI_API_KEY']

# base_url = os.environ['OPENAI_BASE_URL']
# NOTE(review): OpenAI() is built with no arguments, so it presumably picks up its credentials from the environment — confirm against the SDK docs.
client = OpenAI()

In [4]:
# System prompt: sent as the "system" message of every batch request built in this notebook.
system_message = "You are a helpful assistant designed to generate synthetic data."

In [5]:
# User prompt template, filled in with str.format().
# The lone `{}` between the `###` markers receives the source example; the doubled
# braces `{{` / `}}` are format escapes that render as the literal braces of the
# requested JSON object.
user_template = """
I am a data scientist working on a text classification task with imbalanced data.
The dataset consists of responses to open-ended questions from a university survey about education.
I have selected an example from one of the underrepresented classes.
Your task is to generate five additional examples that are similar to the given example but distinct from it and from each other.


## Guidelines:

- Maintain the overall meaning of the provided example.
- Ensure the semantic meaning is preserved, as these examples will be used in various classification problems.
- You can modify some words, use synonyms, or rephrase the entire example, while keeping its original meaning.
- Ensure that the generated examples are diverse and distinguishable from one another while preserving the original example's semantic meaning.


Please provide the output in the following JSON format:
{{
"example_1": "<example_1>",
"example_2": "<example_2>",
"example_3": "<example_3>",
"example_4": "<example_4>",
"example_5": "<example_5>"
}}


Here is the example:
###
{}
###

Output:
"""

In [6]:
# def remove_prefix(x, return_prefix=False):
#     prefix_list = [
#         "أكثر ما أعجبني:",
#         "أكثر ما لم يعجبني:",
#         "اقتراحاتي للتحسين:",
#         "اقتراحاتي للإضافة:"
#     ]
    
#     x_new = x
#     prefix = ''
#     for p in prefix_list:
#         x_new = x_new.replace(p, '').strip()
#         if x != x_new:
#             prefix = p
    
#     if return_prefix:
#         return x_new.strip(), prefix
    
#     return x_new.strip()

In [7]:
# Load the labelled responses to augment. The path is relative, so it resolves
# against the kernel's working directory.
data = pd.read_csv('../../data/result/train_with_sentiment_df_majority_vote_gpt4o_preferred.csv')
# Prefix stripping is disabled, so `text` keeps its leading prefix (see the preview below).
# data['text'] = data['text'].apply(remove_prefix)

In [8]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,column,text,topic_id,topic_name,sentiment,majority_vote,tie
0,like,أكثر ما أعجبني: التطبيق العملي المحدث والمشابه...,3,موازنة الجزء العملي مع الجزء النظري,Positive,Positive,False
1,improve_course,اقتراحاتي للتحسين: التطبيق والبعد عن التدريس ا...,3,موازنة الجزء العملي مع الجزء النظري,Neutral,Neutral,False
2,like,أكثر ما أعجبني: the content,0,محتوى ومعلومات المقرر,Positive,Positive,False
3,improve_course,اقتراحاتي للتحسين: provide more new books,3,موازنة الجزء العملي مع الجزء النظري,Neutral,Neutral,False
4,improve_course,اقتراحاتي للتحسين: زيادة ساعات العملي,8,الوقت و الجدول,Neutral,Neutral,False


In [9]:
# Row count; one batch request is generated per row further down.
len(data)

951

In [17]:
# Reference only: the shape of a single batch request (one such JSON object per
# line of the input file). The literal is displayed, not stored in a variable.
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", 
 "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},
                                                      {"role": "user", "content": "Hello world!"}],
          "max_tokens": 2000}}

{'custom_id': 'request-1',
 'method': 'POST',
 'url': '/v1/chat/completions',
 'body': {'model': 'gpt-3.5-turbo-0125',
  'messages': [{'role': 'system', 'content': 'You are a helpful assistant.'},
   {'role': 'user', 'content': 'Hello world!'}],
  'max_tokens': 2000}}

In [10]:
# Model named in the body of every batch request.
model_name = 'gpt-4o'

In [11]:
# Sample Arabic response; not referenced by any other cell shown in this notebook.
example1 = "المحتوى أكثر من رائع"

In [22]:
# Build one Batch API request per row of `data`.
# custom_id is the DataFrame index label as a string, so that each result can be
# matched back to its source row later.
jsonl = []

for i, row in data.iterrows():
    # Only the response text is interpolated. topic_name is deliberately not
    # passed: the template has a single `{}` placeholder and str.format would
    # silently ignore any extra positional argument.
    user_message = user_template.format(row['text'])
    jsonl.append(
        {
            "custom_id": str(i),
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model_name,
                # Ask for a JSON object so the reply can be parsed with json.loads.
                "response_format": {"type": "json_object"},
                "messages": [
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message},
                ],
                "max_tokens": 3000,
            },
        }
    )

In [40]:
# jsonl_path = '../../data/data_augmentation/gpt_batch_full_data.jsonl'

# Location of the batch input file (one JSON request per line).
# NOTE(review): this cell's execution count (40) is higher than that of the cells
# that write and upload the file (24, 25) — confirm which path was active when
# the batch was submitted.
jsonl_path = '../../data/data_augmentation/final_gpt_batch_full_data.jsonl'

In [24]:
# Serialise every request dict as one JSON document per line (JSONL).
with open(jsonl_path, 'w') as batch_file:
    batch_file.writelines(json.dumps(request) + '\n' for request in jsonl)

In [25]:
# Upload the JSONL request file with purpose "batch".
# The handle is opened in a `with` block so it is closed as soon as the upload
# returns instead of staying open for the lifetime of the kernel.
with open(jsonl_path, "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch",
    )

In [26]:
# Keep the uploaded file's id; it is passed as input_file_id when the batch is created.
batch_input_file_id = batch_input_file.id
batch_input_file_id

'file-IS8Q5P15qMNEguOJ5vpApDDN'

In [27]:
# Submit the batch job. The returned Batch object is bound to `batch` so that its
# id stays available to later cells (e.g. for client.batches.retrieve) instead of
# having to be copied by hand from the printed repr.
batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": "augment full data final"
    },
)
# Last expression of the cell: still displays the Batch object as before.
batch

Batch(id='batch_NNLNo6WHjJucrq8tDvAHlJQc', completion_window='24h', created_at=1722802932, endpoint='/v1/chat/completions', input_file_id='file-IS8Q5P15qMNEguOJ5vpApDDN', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722889332, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'augment full data final'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [32]:
# Fetch the current state of the batch job and print it.
batch_status = client.batches.retrieve('batch_NNLNo6WHjJucrq8tDvAHlJQc')
print(batch_status)

In [33]:
# Download the content of a file stored on the API side.
# NOTE(review): the file id is hard-coded and does not appear in any output shown
# in this notebook; presumably it is the output_file_id of batch
# 'batch_NNLNo6WHjJucrq8tDvAHlJQc' — confirm.
file_response = client.files.content('file-C9xtiIFSqPQfQ1oUM5QTZxtQ')

In [35]:
# Persist the raw batch output to disk, never overwriting an existing file.
save_path = '../../data/data_augmentation/gpt_augmentation/final_full_data_5_examples.jsonl'

if os.path.exists(save_path):
    print('Path already exists!')

else:
    print(f'Saving to {save_path}')
    # Write as UTF-8 explicitly: the generated examples are Arabic text, and the
    # platform default encoding is not guaranteed to be able to represent it.
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(file_response.text)

# **Augmentation**

In [36]:
# Reference only: the shape of one line of a batch output file. The generated text
# sits under response -> body -> choices[0] -> message -> content.
# In this hand-written sample "logprobs" and "error" hold the string "null".
print({"id": "batch_req_123", "custom_id": "request-2", "response": {"status_code": 200, "request_id": "req_123", "body": {"id": "chatcmpl-123", "object": "chat.completion", "created": 1711652795, "model": "gpt-3.5-turbo-0125", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Hello."}, "logprobs": "null", "finish_reason": "stop"}], "usage": {"prompt_tokens": 22, "completion_tokens": 2, "total_tokens": 24}, "system_fingerprint": "fp_123"}}, "error": "null"})

In [None]:
# ../../data/data_augmentation/gpt_augmentation/full_data_5_examples.jsonl

In [37]:
# Parse the saved batch output: map each request's custom_id to the dict of
# generated examples decoded from the assistant message.
augmentation_data = {}
# custom_id -> reason, for requests that did not yield a usable JSON payload
# (request-level error, missing/non-200 response, empty or truncated content).
failed_requests = {}

# Read as UTF-8 explicitly so decoding does not depend on the platform default.
with open(save_path, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue

        json_object = json.loads(line)
        custom_id = json_object['custom_id']
        response = json_object.get('response')

        # A request can fail individually; in that case there is no completion to read.
        # A missing status_code is treated as success so such records parse as before.
        if json_object.get('error') or not response or response.get('status_code', 200) != 200:
            failed_requests[custom_id] = json_object.get('error') or response
            continue

        content = response['body']['choices'][0]['message']['content']
        try:
            augmentation_data[custom_id] = json.loads(content)
        except (TypeError, json.JSONDecodeError) as exc:
            # content is None, or the JSON is malformed (e.g. cut off by max_tokens).
            failed_requests[custom_id] = f'invalid JSON content: {exc}'

if failed_requests:
    print(f'{len(failed_requests)} request(s) produced no usable output; inspect failed_requests')

In [39]:
# Spot-check the generated examples for the request whose custom_id is '5'.
augmentation_data['5']

{'example_1': 'أكثر شيء استمتعت به: الشرح الرائع للبروفيسور والمعرفة العميقة بالمادة',
 'example_2': 'ما أحببته أكثر هو: قدرة البروفيسور على الشرح الممتاز وفهمه الواسع للمادة',
 'example_3': 'الجزء الذي أعجبني كثيرًا كان: شرح البروفيسور الممتاز وخبرته الواسعة بالمادة',
 'example_4': 'أكثر عنصر نال إعجابي: توضيحات البروفيسور المتميزة وإلمامه الشامل بالموضوع',
 'example_5': 'أكثر شيء أحببته: البروفيسور يشرح بطريقة ممتازة ومعرفته بالمادة شاملة'}

In [41]:
# Map every custom_id found in the batch input file to the source text of the
# matching row (custom_id is the DataFrame index label as a string).
with open(jsonl_path) as requests_file:
    request_ids = [json.loads(raw_line)['custom_id'] for raw_line in requests_file]

original_data = {request_id: data.loc[int(request_id), 'text'] for request_id in request_ids}

In [42]:
# Spot-check: the source text behind custom_id '5'.
original_data['5']

'أكثر ما أعجبني: شرح البروفيسور ممتاز ومعرفة شاملة عن المادة'

In [43]:
# Re-key the generated examples by source text instead of custom_id.
# Rows are kept only when the row's text equals the text recorded for its id.
# Because the text is the dict key, identical texts collapse into one entry
# (the last row wins).
augmentation_data_text = {}
for row_label, source_row in data.iterrows():
    request_id = str(row_label)
    source_text = original_data[request_id]
    if source_row['text'] == source_text:
        augmentation_data_text[source_text] = augmentation_data[request_id]

In [44]:
# Number of distinct source texts that received generated examples.
len(augmentation_data_text)

951

In [45]:
# Spot-check: look up the generated examples directly by source text.
augmentation_data_text['أكثر ما أعجبني: شرح البروفيسور ممتاز ومعرفة شاملة عن المادة']

{'example_1': 'أكثر شيء استمتعت به: الشرح الرائع للبروفيسور والمعرفة العميقة بالمادة',
 'example_2': 'ما أحببته أكثر هو: قدرة البروفيسور على الشرح الممتاز وفهمه الواسع للمادة',
 'example_3': 'الجزء الذي أعجبني كثيرًا كان: شرح البروفيسور الممتاز وخبرته الواسعة بالمادة',
 'example_4': 'أكثر عنصر نال إعجابي: توضيحات البروفيسور المتميزة وإلمامه الشامل بالموضوع',
 'example_5': 'أكثر شيء أحببته: البروفيسور يشرح بطريقة ممتازة ومعرفته بالمادة شاملة'}

In [47]:
# final_save_path = '../../data/data_augmentation/gpt_augmentation/augmentation_full_data_5_examples.json'
final_save_path = '../../data/data_augmentation/gpt_augmentation/final_augmentation_full_data_5_examples.json'

# Dump the text -> generated-examples mapping as JSON, but only when the target
# file does not exist yet; an existing file is left untouched.
target_missing = not os.path.exists(final_save_path)
if target_missing:
    print(f'Saving to "{final_save_path}"...')
    with open(final_save_path, 'w') as out_file:
        json.dump(augmentation_data_text, out_file)
else:
    print('Path already exists!')