In [1]:
import sys
sys.path.append('../utilities/')
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import torch
from sentence_transformers import SentenceTransformer
from joblib import dump
from openai import OpenAI
from tqdm import tqdm
from mmd import MMD
import re
from sklearn.feature_extraction.text import CountVectorizer



In [2]:
import os
from dotenv import load_dotenv

# **Tools Setup**

In [3]:
# Pull settings from a local .env file into the process environment, then
# build the OpenAI client from the key stored under OPENAI_API_KEY.
# API_KEY is None when that variable is not set.
load_dotenv()
API_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=API_KEY)

In [4]:
# Sentence-embedding model, loaded by its published checkpoint name.
sentence_transformer = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')

# **Control Generations**

In [5]:
# Instructions sent with every generation request. They describe the task,
# ask for exactly 10 speeches per response, and fix the output format to one
# line per speech: the speech in double quotes, a space, then the label
# (1 for Democrat, -1 for Republican).
instruction = (
    "You are a data generator tasked with creating realistic political speeches. "
    "These speeches should be labeled according to their political parties: Republican or Democrat.\n"
    "Base the style on typical political speeches — include typical political jargon and themes that candidates would discuss.\n"
    "Generate exactly 10 realistic speeches, one per line.\n"
    "Each line should follow this format: the speech in double quotes, followed by a space and then the label (1 for Democrat, -1 for Republican).\n"
    # Wording fixed: the items being generated are speeches, not comments.
    "No extra formatting — just plain text output, one line per speech.\n"
    "Here is the format:\n"
    "\"<speech>\" 1\n"
    "\"<speech>\" -1"
)
# Request input paired with the instructions above.
# NOTE: this name shadows the built-in input(); it is kept unchanged because
# the generation loop passes it to the API under this name.
input = (
    "Generate the 10 new speeches below:"
)

In [6]:
# Send the same prompt 100 times and collect the raw text of each response.
# Each response is asked (via `instruction`) to contain 10 labelled speeches.
# Results are appended one at a time, so responses gathered before an
# interruption remain available in `res`.
res = []
# tqdm shows progress for this long-running sequence of network calls.
for _ in tqdm(range(100), desc="Generating control speeches"):
    response = client.responses.create(
        model="gpt-4o",
        instructions=instruction,
        input=input
    )
    # Keep only the plain-text output; it is parsed into rows later.
    res.append(response.output_text)

In [7]:
# Parse the raw responses into two parallel lists: speech texts and integer labels.
# A line is accepted when it starts with a double-quoted span followed by optional
# whitespace and an (optionally negative) integer; any other line is skipped.
line_pattern = re.compile(r'"(.*?)"\s*(-?\d+)')

labels = []
sentences = []
# Iterate over the responses actually collected instead of a fixed count of 100,
# so a shorter `res` does not raise IndexError and a longer one is not truncated.
for response_text in res:
    for line in response_text.split("\n"):
        match = line_pattern.match(line)
        if match:
            sentences.append(match.group(1))    # text between the quotes
            labels.append(int(match.group(2)))  # numeric label as int

In [8]:
# Combine the parsed speeches and their labels into a two-column table,
# one row per parsed line.
generated_df = pd.DataFrame(
    dict(sentences=sentences, labels=labels)
)

In [11]:
# Write the generated dataset to CSV without the index column.
# to_csv does not create missing directories, so ensure the target folder exists first.
output_path = '../data/generated/speeches/control_synthetic_data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
generated_df.to_csv(output_path, index=False)