In [1]:
import sys
# Extend the module search path with the sibling `utilities` directory so that local
# helper modules stored there can be imported by later cells
# (presumably this is where the `mmd` module lives — TODO confirm).
sys.path.append('../utilities/')

In [2]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sentence_transformers import SentenceTransformer
from joblib import dump
from openai import OpenAI
from tqdm import tqdm
from mmd import MMD
import torch
import json
import re



In [None]:
import os
from dotenv import load_dotenv

# **Tools Setup**

In [None]:
# Load variables from a local .env file into the process environment, then build the
# OpenAI client from the OPENAI_API_KEY entry so the key never appears in the notebook.
load_dotenv()
API_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=API_KEY)

In [4]:
# Instantiate the 'all-mpnet-base-v2' SentenceTransformer checkpoint once at notebook level
# (its use is not visible in this part of the notebook).
sentence_transformer = SentenceTransformer('all-mpnet-base-v2')

# **DataFrame Reading**

In [6]:
# Read the Reddit comments dataset from a path relative to the notebook's directory.
# The preview of this data shows a `text` column and a `label` column holding 1 / -1.
df = pd.read_csv('../data/initial_datasets/reddit_binary_labels.csv')

In [7]:
# Draw a fixed-size random subset of the dataset.
# `random_state` pins the draw so that Restart & Run All reproduces the same 1000 rows;
# without it every execution of this cell yields a different sample.
sample_df = df.sample(n=1000, random_state=42)

In [8]:
# Show the sampled frame as the cell's rich output.
sample_df

Unnamed: 0,text,label
5997,"Thank you OP, I love you (no homo if you’re a ...",1
26923,I’m convinced WB is more of a pain than it’s w...,1
29783,Me too. It’s my favorite. His cadence in sayin...,1
19077,Ive gone through very similar things...i compl...,-1
37812,Feels weird when something that wasn't bad but...,-1
...,...,...
36605,Nice man!! Wish you guys the best. Here I am F...,1
34102,I think I could watch an entire show with just...,1
22308,The beautiful moment when Fridays and Sundays ...,1
28812,I only remember her getting flamed by Thooorin...,-1


# **Control Chat Generation**

In [5]:
# Instructions for the generator model. Each list entry is one line of the final prompt;
# the entries are joined with newlines and there is no trailing newline.
instruction = "\n".join([
    "You are a data generator tasked with creating realistic Reddit comments. "
    "These comments should be labeled according to their sentiment: positive or negative.",
    "Base the style on typical Reddit comments — include informal internet language, typos, abbreviations, and emojis.",
    "Use [NAME] as a placeholder anytime a person's name would appear.",
    "Generate exactly 10 realistic Reddit comments, one per line.",
    "Each line should follow this format: the comment in double quotes, followed by a space and then the label (1 for positive, -1 for negative).",
    "No extra formatting — just plain text output, one line per comment.",
    "Here is the format:",
    '"I love pizza" 1',
    '"I hate baseball" -1',
])

# Request text passed as the `input` argument of each generation call.
# NOTE: this name shadows the builtin `input()`; it is kept because other cells reference it.
input = "Generate the 10 new comments below:"

In [16]:
# Number of API calls to make. The instructions ask for 10 comments per call, so the
# expected total is N_REQUESTS * 10 generated comments.
N_REQUESTS = 50

# Collect the raw text of every response; the text is parsed into (comment, label)
# pairs in a separate step. The tqdm wrapper shows progress for this slow, sequential loop.
res = []
for _ in tqdm(range(N_REQUESTS), desc="Generating comments"):
    response = client.responses.create(
        model="gpt-4o",
        instructions=instruction,
        input=input  # notebook-level prompt string (shadows the builtin `input`)
    )
    res.append(response.output_text)

In [17]:
# Parse the raw model responses into two parallel lists: comment texts and integer labels.
# Each useful line is expected to look like:  "<comment>" <label>
labels = []
sentences = []
# Iterate over the responses that actually exist instead of a hard-coded count, so a
# shorter (or longer) `res` neither raises IndexError nor gets silently truncated.
for raw_response in res:
    for line in raw_response.split("\n"):
        # Strip first: `re.match` anchors at the start of the string, so leading
        # whitespace (or a trailing "\r") would otherwise make a valid line fail.
        match = re.match(r'"(.*?)"\s*(-?\d+)', line.strip())
        if not match:
            continue  # blank lines, headers, or anything not in the requested format
        label = int(match.group(2))
        # The prompt defines only 1 (positive) and -1 (negative); discard any other
        # integer that happens to follow a quoted string.
        if label not in (1, -1):
            continue
        sentences.append(match.group(1))
        labels.append(label)

In [18]:
# Assemble the parsed comments and their labels into a two-column frame
# (column order: 'sentences', then 'labels').
generated_df = pd.DataFrame(data={'sentences': sentences, 'labels': labels})

In [19]:
# Show the generated comments and labels as the cell's rich output.
generated_df

Unnamed: 0,sentences,labels
0,"Wow, this game is freaking amazing! 😍",1
1,"Ugh, can't stand this band anymore, so overrated.",-1
2,Just got a new puppy and I'm soooo in love 🐶,1
3,Why do people even like this movie? It's trash.,-1
4,Had the best day with friends at the park toda...,1
...,...,...
495,"Honestly, this movie was a huge letdown 🤦‍♂️",-1
496,"Found the cutest puppy today, made my year! 🐶",1
497,Service at this restaurant was awful 👎,-1
498,This book is an absolute masterpiece 📚,1
