In [1]:
import sys
sys.path.append('../utilities/')
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import torch
from sentence_transformers import SentenceTransformer
from joblib import dump
from openai import OpenAI
from tqdm import tqdm
from mmd import MMD
import re
from sklearn.feature_extraction.text import CountVectorizer



In [2]:
import os
from dotenv import load_dotenv

# **Tools Setup**

In [3]:
load_dotenv()
API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=API_KEY)

In [4]:
sentence_transformer = SentenceTransformer('all-mpnet-base-v2')

# **DataFrame Reading**

In [5]:
speeches_df = pd.read_csv('../data/initial_datasets/speeches/speeches_train.csv')

# **Aggregate Stats**

In [6]:
speeches_df['phrase_len'] = speeches_df['text'].str.len()

In [7]:
avg_len = speeches_df['phrase_len'].mean()
std_len = speeches_df['phrase_len'].std()

# **Word Frequencies**

In [8]:
speeches_df['text'] = speeches_df['text'].astype(str)

In [9]:
speeches_df['label'] = speeches_df['label'].replace({'Republican': -1, 'Democrat': 1})

  speeches_df['label'] = speeches_df['label'].replace({'Republican': -1, 'Democrat': 1})


# **Republican Counts**

In [10]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(speeches_df[speeches_df['label'] == -1]['text'])
counts = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
rep_counts = counts.sum().sort_values(ascending=False)

In [11]:
rep_counts = rep_counts.reset_index()
rep_counts.columns = ['text', 'count']
rep_counts['freq'] = rep_counts['count'] / rep_counts['count'].sum()

In [12]:
rep_counts

Unnamed: 0,text,count,freq
0,the,32634,0.045412
1,and,25244,0.035128
2,to,19377,0.026964
3,of,16493,0.022951
4,we,16302,0.022685
...,...,...,...
14864,pakistani,1,0.000001
14865,pajamas,1,0.000001
14866,pair,1,0.000001
14867,painted,1,0.000001


# **Democratic Counts**

In [13]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(speeches_df[speeches_df['label'] == 1]['text'])
counts = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
dem_counts = counts.sum().sort_values(ascending=False)

In [14]:
dem_counts = dem_counts.reset_index()
dem_counts.columns = ['text', 'count']
dem_counts['freq'] = dem_counts['count'] / dem_counts['count'].sum()

In [15]:
dem_counts

Unnamed: 0,text,count,freq
0,the,10096,0.050073
1,to,6954,0.034490
2,and,6587,0.032669
3,of,5031,0.024952
4,we,4615,0.022889
...,...,...,...
8818,interior,1,0.000005
8819,interfered,1,0.000005
8820,interdisciplinary,1,0.000005
8821,intercommarble,1,0.000005


# **Compare Across Both**

In [16]:
rep_counts['rank'] = rep_counts.index+1
dem_counts['rank'] = dem_counts.index+1

In [17]:
merged_df = pd.merge(dem_counts, rep_counts, how='outer', on='rank')
merged_df = merged_df.drop(columns=['rank'])
merged_df = merged_df.rename(columns={'text_x': 'dem_text', 'freq_x': 'dem_freq', 'text_y': 'rep_text', 'freq_y': 'rep_freq'})

In [18]:
merged_df = merged_df.iloc[:100]

# **Chat Generation**

## **Instruction Based Prompting**

In [19]:
common_words = "\n".join([
    f"{row['dem_text']}: {row['dem_freq']};  {row['rep_text']}: {row['rep_freq']}"
    for _, row in merged_df.iterrows()
])

In [20]:
instruction = (
    "You are a data generator tasked with creating realistic Political speeches. "
    "These speeches should be labeled according to their political affiliation: Republican or Democrat.\n"
    "Base the style on typical political speeches — include political jargons as well as themes that would be discussed in these types of speeches\n"
    "You will be given statistics about the distribution, including average speech length, standard deviation of the lengths of the speeches, and most common words associated with each affiliation and their frequency.\n"
    "Generate exactly 10 realistic speeches, one per line.\n"
    "Each line should follow this format: the speech in double quotes, followed by a space and then the label (-1 for Republican, 1 for Democrat).\n"
    "No extra formatting — just plain text output, one line per comment.\n"
    "Here is the format:\n"
    "\"<speech>\" <label>\n"
)
input = (
    f"Here is the average length of all the speeches: {avg_len} characters. "
    f"Here is the standard deviation of the length for all the speeches: {std_len} characters.\n"
    f"Here are the most frequent words (format: word: frequency (sentiment)):\n\n{common_words}\n"
    f"Now, generate the 10 new speeches below:"
)

In [21]:
res = []
for i in range(100):
    response = client.responses.create(
        model="gpt-4o",
        instructions=instruction,
        input=input
    )
    res.append(response.output_text)

In [22]:
labels = []
sentences = []
for i in range(100):
    for word in res[i].split("\n"):
        match = re.match(r'"(.*?)"\s*(-?\d+)', word)
        if match:
            quoted = match.group(1)      
            label = match.group(2)       
            sentences.append(quoted)
            labels.append(int(label))

In [23]:
generated_df = pd.DataFrame({
    'sentences': sentences,
    'labels': labels
})

In [25]:
generated_df.to_csv('../data/generated/speeches/privacy_preserving_synthetic.csv', index=False)