In [1]:
import sys
sys.path.append('../utilities/')

In [2]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sentence_transformers import SentenceTransformer
from joblib import dump
from openai import OpenAI
from tqdm import tqdm
from mmd import MMD
import torch



In [None]:
import os
from dotenv import load_dotenv

# **Tools Setup**

In [None]:
load_dotenv()
API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=API_KEY)

In [4]:
sentence_transformer = SentenceTransformer('all-mpnet-base-v2')

# **Data Processing**

## **First Attempt**

- Getting rid of datapoint with multiple classes or no label at all
- Getting rid of datapoints with example very unclear

In [41]:
reddit_df = pd.read_csv('../data/initial_datasets/goemotions_1.csv')

In [42]:
cols = reddit_df.columns
start_idx = cols.get_loc('admiration')
classes = reddit_df.iloc[:, start_idx:]
reddit_df['all'] = reddit_df.iloc[:, start_idx:].sum(axis=1)

In [194]:
reddit_df['all']

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,all
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1.548381e+09,1,False,0,...,0,0,0,0,0,0,1,0,0,1
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1.546428e+09,37,False,0,...,0,0,0,0,0,0,0,0,1,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1.547965e+09,18,False,0,...,0,0,0,0,0,0,0,0,0,1
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1.546669e+09,2,False,0,...,0,0,0,0,0,0,0,0,1,1
5,Right? Considering it’s such an important docu...,eespn2i,ImperialBoss,TrueReddit,t3_aizyuz,t1_eesoak0,1.548280e+09,61,False,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,"It's about fucking time, hope this is real.",eeuoyeh,DudeImMacGyver,worldnews,t3_ajcbgq,t3_ajcbgq,1.548346e+09,19,False,0,...,0,0,0,0,0,0,0,0,0,2
69996,This is great! Can anyone make a request with ...,eer0igi,Dirkus777,gay,t3_aiqhx1,t3_aiqhx1,1.548223e+09,55,False,1,...,0,0,0,0,0,0,0,0,0,1
69997,I’m sorry. Can you please explain what are the...,eebxspf,menjav,DebateAnAtheist,t3_ah451r,t1_eeblb39,1.547787e+09,5,False,0,...,0,0,0,0,0,0,0,0,0,1
69998,No but it should be,edjf4v2,heputmystuffinjello,DunderMifflin,t3_adpkeq,t1_edjasoe,1.546922e+09,51,False,0,...,0,0,0,0,0,0,0,0,1,1


In [43]:
reddit_df = reddit_df[reddit_df['example_very_unclear'] == False]
cleaned_data = reddit_df[reddit_df['all'] == 1].reset_index(drop=True)

# **Content Generation**

In [81]:
sentiment = 'admiration'

In [82]:
chat_data = cleaned_data[cleaned_data[sentiment] == 1].reset_index(drop=True)

In [83]:
X_train, X_test = train_test_split(chat_data['text'], test_size=0.3)

In [None]:
X_train

In [163]:
X_train.sample(n=13).to_list()

['AW YEAH HUGE WIN FOR FEMINISM AND WOMEN AMIRIGHT GUYS /s',
 'Good kid.',
 'As it should be. Great.',
 'I like you both.',
 'such a deep and powerful tweet',
 'This thread was actually inspired by one of your comments',
 'Great. Now some girl in Utah will appropriate it for her prom this year because she "appreciates" the culture.',
 'She looks really pretty',
 'She does have a great ass I’ve always thought so',
 'Nice of you to jump in for no reason, very cool.',
 "[NAME] is my favorite qb prospect and I've been watching football for 11 years",
 "[NAME], that's pretty damn good for a state like Virginia, which, although not a swing state anymore, was red not so long ago. ",
 'This is amazing']

In [84]:
text_list = X_train.to_list()
text = "\n".join(text_list)

In [190]:
instruction = (
    f"You are a data generator tasked with creating realistic Reddit comments. "
    "Each comment mimics the style of real Reddit users — including informal grammar, emojis, abbreviations, or typos — and should use [NAME] instead of any real person’s name. "
    "Follow the examples provided. After the examples, generate 10 new comments with the same style and sentiment. Each comment should be on its own line, with no numbering or formatting."
)

few_shot_examples = """
Input examples:
1. Your amazing too!
2. I also love this decline in bch prices. Cheers.
3. I've been there. It's one of the most diverse places on Earth.

Generated samples:
'I like you both.',
'such a deep and powerful tweet',
'This thread was actually inspired by one of your comments',
'Great. Now some girl in Utah will appropriate it for her prom this year because she "appreciates" the culture.',
'She looks really pretty',
'She does have a great ass I’ve always thought so',
'Nice of you to jump in for no reason, very cool.',
"[NAME] is my favorite qb prospect and I've been watching football for 11 years",
"[NAME], that's pretty damn good for a state like Virginia, which, although not a swing state anymore, was red not so long ago. ",
'This is amazing'
"""

input_block = f"Now, generate 10 new comments in the same format:"


In [191]:
input = few_shot_examples + "\n\n" + input_block

In [None]:
instruction = (
    f"You are a data generator tasked with creating realistic Reddit comments, these comments should have a sentiment of {sentiment}. "
    "Base the style loosely on the examples provided below, which contain informal internet language including typos, abbreviations, and emojis. "
    "Mimic that style at roughly the same rate, keeping similar sentiment, but ensure all generated comments are original. "
    "Use [NAME] as a placeholder anytime a person's name would appear. "
    "Generate exactly 10 new and realistic Reddit comments, one per line. "
    "Do not number or label the comments. Separate each comment with a newline only. No extra formatting, just plain text."
)
input = (
    "Here are some example comments:\n\n"
    f"{text}\n\n"
    "Now, generate the 10 new comments below:"
)

In [192]:
res = []

In [193]:
for i in tqdm(range(50)):
    response = client.responses.create(
        model="gpt-4o",
        instructions=instruction,
        input=input
    )
    res.append(response.output_text)

  0%|          | 0/50 [00:17<?, ?it/s]


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [150]:
res[0].split("\n\n")

['Wow, [NAME] did an incredible job on this project. Truly inspiring! 💪',
 'Those designs are absolutely stunning. Kudos to [NAME] for such amazing creativity! 🎨✨',
 "I’m so blown away by [NAME]'s dedication. Such a hard worker and it really shows!",
 'Shoutout to [NAME] for being a total boss and leading the team to success. 🔥🙌',
 'The way [NAME] handled that situation was pure class. Major respect! 👏',
 'Absolutely LOVE how [NAME] just crushed that performance. Truly a legend!',
 'That was one epic adventure, thanks to [NAME] for the unforgettable memories! 🚀🌟',
 "I just have to admire [NAME]'s commitment to their craft. Such talent!",
 'Seeing [NAME] grow over the years has been an absolute privilege. Total legend! 👑',
 "[NAME]'s positive energy is infectious! Love having them around. 🌟"]

In [None]:
temp = []
for i in range(50):
    temp.append(res[i].split("\n\n"))

In [None]:
temp = np.concatenate(temp)

In [182]:
temp

array(['Whoa, that twist was insane! Did *not* see that coming 😂  ',
       'I’ve been playing this game all day, send help hahaha 🙃  ',
       'Glad to see them doing well, totally deserved! 🎉  ',
       'Just watched the latest ep—so many feels 💔  ',
       'That’s a wild fact! Definitely didn’t know that til now 🤯  ',
       'She’s such a queen, love everything about her style 👑  ',
       'Cannot stop listening to this song, it’s fire 🔥  ',
       'This looks like the coziest spot ever, need to visit asap ❤️  ',
       'Ate there last week, and everything was delicious! Highly recommend 🌮  ',
       'Anyone else feel personally attacked by that plot twist?! 😅',
       "What a plot twist, didn't see that coming 😂",
       "I'm honestly amazed by how chill everyone is in that sub!",
       "Saw this and literally LOL'd, my coworker gave me side-eye 🤣",
       "If it's not too personal, how'd you get started? ",
       'Man, I wish I had that kind of energy in the mornings!',
       '

In [189]:
pd.DataFrame(temp, columns=['admiration']).to_csv('../data/generated/reddit/admiration/no_sentiment.csv')

In [184]:
X_train.to_list()

['She had the glory of being a virgin martyr. Eternal glory is hers.',
 'That’s a pretty cool TIL',
 'What a rollercoaster',
 'Absolute queens! ❤🏳️\u200d🌈🏳️\u200d🌈🏳️\u200d🌈',
 'Homing pigeons make the best SOs',
 "This is an awesome idea. I'm in with my coins when this is ready to roll.",
 'Your girlfriend looks so Irish! You guys are cute as fuck.',
 'Sonofabitch that makes it so much better [NAME] Edit: you talking about this?',
 'I would love to get microblading but its so iffy about if it’ll be good or not. I’ve seen a lot of terrible turn outs',
 'Great, petty revenge instead of making your GF happy by removing [NAME] from her life, nice job!',
 'This looks really cool. r/twobestfriendsplay would love this.',
 'As it should be. Great.',
 'Ive been playing it since this post got loads of upvotes ahahha its great',
 'I workout to "the grid" a lot. It\'s a 20 minute piece that continually builds. Its wonderful.',
 "I'm honestly so pissed off for you. That whole situation is insane. G

In [154]:
X = np.array(sentence_transformer.encode(X_train.to_list()))

In [None]:
Y = np.array(sentence_transformer.encode(temp))

In [None]:
def report_MMD(X, Y, normalize=False):
    
    tensorX = torch.tensor(X)
    tensorY = torch.tensor(Y)
    rbf_mmd = MMD(tensorX, tensorY, "rbf")
    scale_mmd = MMD(tensorX, tensorY, "multiscale")


    if normalize:
        return (rbf_mmd.item() / np.sqrt((1.0 / X.shape[0]) + (1.0 / Y.shape[0]))), (scale_mmd.item() / np.sqrt(1.0 / X.shape[0] + 1.0 / Y.shape[0]))
    else:
        return rbf_mmd.item(), scale_mmd.item()

In [158]:
def p_test(X, Y, obs_mmd, n_obs=1000):
    n = X.shape[0]
    combined = np.vstack([X, Y])
    permuted_mmds = []

    for _ in tqdm(range(n_obs)):
        np.random.shuffle(combined)  
        X_perm = combined[:n]  
        Y_perm = combined[n:]
        # print(X_perm.shape)
        # print(Y_perm.shape)
        permuted_mmd = report_MMD(X_perm, Y_perm, normalize=True)[0]
        # print(permuted_mmd)
        permuted_mmds.append(permuted_mmd)
    
    permuted_mmds = np.array(permuted_mmds)
    mean_mmd = np.mean(permuted_mmds)
    std_mmd = np.std(permuted_mmd)
    p_value = np.mean(permuted_mmds >= obs_mmd)

    return p_value, mean_mmd, std_mmd, permuted_mmds

In [None]:
obs_mmd = report_MMD(X, Y, normalize=True)[0]

In [None]:
obs_mmd

0.38874855095073874

In [None]:
p, xbar, std, mmds = p_test(X, Y, obs_mmd)
print(p)
print(f'{xbar} ± {std}')

100%|██████████| 1000/1000 [02:07<00:00,  7.81it/s]

0.0
0.010219430905214521 ± 0.0



