In [1]:
import sys
sys.path.append('../utilities/')

In [2]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import torch
from sentence_transformers import SentenceTransformer
from joblib import dump
from openai import OpenAI
from tqdm import tqdm
from mmd import MMD
import re
from sklearn.feature_extraction.text import CountVectorizer



# **Tools Setup**

In [3]:
API_KEY = "***REMOVED***proj-0UjcP502wYIPoNWqF8jfT3BlbkFJIxTanFmlBGh0Om851d4z"
client = OpenAI(api_key=API_KEY)

In [4]:
sentence_transformer = SentenceTransformer('all-mpnet-base-v2')

# **DataFrame Reading**

In [5]:
df = pd.read_csv('../data/initial_datasets/reddit_train.csv')

In [6]:
sample_df = df.sample(n=1000)

In [7]:
sample_df

Unnamed: 0,text,label
19514,Thank you! I will!,1
11292,Im desperately hoping that that is either sarc...,1
16166,"That was a very well put together video, enter...",1
40564,I would try to find someone who wasn’t married...,1
6347,Which makes zero sense lol,1
...,...,...
1794,"Their mere presence is all it takes, but being...",1
31518,"We love him, duh. He's our NT captain and our ...",1
1034,Thank you for your advice! I really do appreci...,1
23796,>YOUR STATEMENT I'm not that person.,1


# **Aggregate Stats**

In [8]:
sample_df['phrase_len'] = sample_df['text'].str.len()

In [9]:
avg_len = sample_df['phrase_len'].mean()
std_len = sample_df['phrase_len'].std()

## **Whole data Frequency**

In [10]:
sample_df['text'] = sample_df['text'].astype(str)

In [11]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sample_df['text'])
counts = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
total_counts = counts.sum().sort_values(ascending=False)

In [12]:
total_counts = total_counts.reset_index()
total_counts.columns = ['text', 'count']
total_counts['freq'] = total_counts['count'] / total_counts['count'].sum()

## **Postive data Frequency**

In [13]:
positive_df = sample_df[sample_df['label'] == 1]
X = vectorizer.fit_transform(positive_df['text'])
counts = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
positive_counts = counts.sum().sort_values(ascending=False)

## **Negative data Frequency**

In [14]:
negative_df = sample_df[sample_df['label'] == -1]
X = vectorizer.fit_transform(negative_df['text'])
counts = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
negative_counts = counts.sum().sort_values(ascending=False)

## **Compare Across Both**

In [15]:
positive_counts = positive_counts.reset_index()
negative_counts = negative_counts.reset_index()

In [16]:
positive_counts.columns = ['text', 'count']
negative_counts.columns = ['text', 'count']

In [17]:
positive_counts['freq'] = positive_counts['count'] / positive_counts['count'].sum()
negative_counts['freq'] = negative_counts['count'] / negative_counts['count'].sum()

In [18]:
merged_df = pd.merge(positive_counts, negative_counts, on=['text'], how='outer').sort_values(by='freq_y', ascending=False)

In [19]:
top_100 = total_counts.iloc[:100].merge(merged_df, on=['text'], how='left')

In [20]:
top_100 = top_100.drop(columns=['count', 'count_x', 'count_y'])

In [21]:
top_100.rename(columns={'freq_x': 'positive_freq', 'freq_y': 'negative_freq'}, inplace=True)

In [22]:
top_100 = top_100.drop(columns=['freq'])

In [23]:
top_100

Unnamed: 0,text,positive_freq,negative_freq
0,the,0.032715,0.034248
1,to,0.024809,0.024034
2,it,0.021947,0.020028
3,and,0.019357,0.021029
4,you,0.021538,0.017424
...,...,...,...
95,made,0.001499,0.001602
96,their,0.001363,0.001803
97,who,0.001908,0.001001
98,want,0.001636,0.001402


# **Chat Generation**

## **Instruction Based Prompting**

In [24]:
common_words = "\n".join([
    f"{row['text']}: {row['positive_freq']} {row['negative_freq']}"
    for _, row in top_100.iterrows()
])

In [25]:
instruction = (
    "You are a data generator tasked with creating realistic Reddit comments. "
    "These comments should be labeled according to their sentiment: positive or negative.\n"
    "Base the style on typical Reddit comments — include informal internet language, typos, abbreviations, and emojis.\n"
    "You will be given statistics about the distribution, including average comment length, standard deviation, and most common words associated with each label.\n"
    "Use [NAME] as a placeholder anytime a person's name would appear.\n"
    "Generate exactly 10 realistic Reddit comments, one per line.\n"
    "Each line should follow this format: the comment in double quotes, followed by a space and then the label (1 for positive, -1 for negative).\n"
    "No extra formatting — just plain text output, one line per comment.\n"
    "Here is the format:\n"
    "\"I love pizza\" 1\n"
    "\"I hate baseball\" -1"
)
input = (
    f"Here is the average length of all the comments: {avg_len}. "
    f"Here is the standard deviation of the length for all the comments: {std_len}.\n"
    f"Here are the most frequent words (format: word: frequency (sentiment)):\n\n{common_words}\n"
    f"Now, generate the 10 new comments below:"
)

In [134]:
print(instruction)

You are a data generator tasked with creating realistic Reddit comments, these comments should be labeled according to the sentiment, positive or negative.
Base the style on what Reddit comments should look like, which contain informal internet language including typos, abbreviations, and emojis.
You will also be given some statistics about the distribution, such as the average length and the standard deviation of the comments as well as the frequency of the most common words in positive and negative labelsUse [NAME] as a placeholder anytime a person's name would appear. Generate exactly 10 new and realistic Reddit comments, one per line. Label the comments with a 1 for positive or -1 for negative, attach a space at the end of the comment followed by the label. Separate each comment with a newline only. No extra formatting, just plain text.Here is the format:'I love pizza! 1''I hate baseball -1'


In [136]:
response = client.responses.create(
    model="gpt-4o",
    instructions=instruction,
    input=input
)
print(response.output_text)

"I'm really loving the new update, it's so smooth! 😄" 1  
"I can't believe how buggy this app has become, it's the worst!" -1  
"The movie was amazing, I had such a great time with [NAME]!" 1  
"Ugh, the service at that restaurant was terrible, never going back." -1  
"I'm grateful for the support from everyone, it means a lot to me." 1  
"Wish they would fix the issues, the app crashes every time." -1  
"[NAME] is the best, always makes the experience so much fun!" 1  
"This product was not worth the money, very disappointed." -1  
"Had an awesome weekend with friends, can't wait to do it again!" 1  
"The package arrived damaged, I'm so annoyed with their service." -1


In [38]:
res = []
for i in range(50):
    response = client.responses.create(
        model="gpt-4o",
        instructions=instruction,
        input=input
    )
    res.append(response.output_text)

In [141]:
print(res[0].split("\n"))

['"I\'m so happy for [NAME], they really deserve the promotion! 🎉" 1  ', '"Had the worst experience at the restaurant yesterday, not going back. 😒" -1  ', '"Can\'t believe how awesome the concert was last night, still buzzing! 🎶" 1  ', '"[NAME] really messed up this time, can\'t trust them anymore. 🙄" -1  ', '"Just finished the book you recommended, it was amazing, thank you!" 1  ', '"Their service was terrible, waited over an hour for cold food. 😠" -1  ', '"Honestly, that movie blew my mind, can\'t stop thinking about it." 1  ', '"I don\'t understand why people hype that game, it\'s so boring." -1  ', '"Finally met [NAME] today and they were just as cool IRL!" 1  ', '"That meeting was a waste of time, could have been an email. 🙄" -1']


In [143]:
temp = res[0].split("\n")[0]
match = re.match(r'"(.*?)"\s*(-?\d+)', temp)

if match:
    quoted = match.group(1)      # 'I love pizza'
    label = match.group(2)       # '1'
    print("Quoted text:", quoted)
    print("Label:", label)

Quoted text: I'm so happy for [NAME], they really deserve the promotion! 🎉
Label: 1


In [39]:
labels = []
sentences = []
for i in range(50):
    for word in res[i].split("\n"):
        match = re.match(r'"(.*?)"\s*(-?\d+)', word)
        if match:
            quoted = match.group(1)      
            label = match.group(2)       
            sentences.append(quoted)
            labels.append(int(label))

In [40]:
generated_df = pd.DataFrame({
    'sentences': sentences,
    'labels': labels
})

In [36]:
first_df = generated_df

In [42]:
pd.concat([first_df, generated_df]).to_csv('../data/generated/reddit/privacy_protected/new_reddit_agg_stats_gen.csv', index=False)

In [33]:
generated_df.to_csv('../data/generated/reddit/privacy_protected/new_reddit_agg_stats_gen.csv', index=False)

## **Metrics for Generated Data**

In [30]:
generated_df['type'] = 'generated'
sample_df['type'] = 'original'

In [31]:
generated_df

Unnamed: 0,sentences,labels,type
0,I'm so ready for the new season of [NAME]'s sh...,1,generated
1,"Ugh, I can't believe how bad the movie was, de...",-1,generated
2,"Just finished the book and omg it was amazing,...",1,generated
3,"The weather today has been just awful, can't w...",-1,generated
4,Played the game with friends and we had such a...,1,generated
...,...,...,...
495,"Heard that new album, and honestly, it was not...",-1,generated
496,"Finally got around to cleaning my room, feels ...",1,generated
497,"Netflix removed my favorite show again, so fru...",-1,generated
498,Big shoutout to [NAME] for helping me out toda...,1,generated


In [34]:
sample_df = sample_df.rename(columns={'text': 'sentences', 'label': 'labels'})

In [35]:
tot_df = pd.concat([sample_df[['sentences', 'labels', 'type']], generated_df], axis=0).reset_index(drop=True)

In [196]:
sample_orig = sample_df.sample(n=700)

In [197]:
gen_sample = tot_df[tot_df['type'] == 'generated'].sample(n=100)

In [198]:
train = pd.concat([sample_orig[['sentences', 'labels', 'type']], gen_sample], axis=0).reset_index(drop=True)

In [199]:
train = train.sample(frac=1)

In [201]:
test_orig = sample_df.drop(sample_orig.index)

In [208]:
test = test_orig[['sentences', 'labels']]

In [207]:
X_train = train['sentences'].to_list()
y_train = train['labels'].to_list()

In [209]:
X_test = test['sentences'].to_list()
y_test = test['labels'].to_list()

In [210]:
X_train = np.array(sentence_transformer.encode(X_train))

In [211]:
X_test = np.array(sentence_transformer.encode(X_test))

In [222]:
model = svm.SVC(C=1, degree=2, gamma='scale', kernel='linear', probability=True)

In [219]:
test_orig['type'].value_counts()

type
original    300
Name: count, dtype: int64

In [223]:
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
train_acc = accuracy_score(y_pred, y_train)
print(f'Train acc: {train_acc}')

y_pred = model.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
print(f'Test acc: {test_acc}')

Train acc: 0.88375
Test acc: 0.77


In [220]:
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

0.8465608465608465
0.8


In [227]:
y_prob = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_prob))

0.8372999999999999
