## Imports

In [None]:
import dspy
import random
import numpy as np
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer

## Configure LM and Read-in Topics

In [None]:
# Language-model setup: create the OpenAI-backed client and hand it to
# dspy.settings so the DSPy modules below use it.
lm = dspy.OpenAI(
    model='gpt-4-0125-preview',
    api_key="API_KEY",
)
dspy.settings.configure(lm=lm)

# Topic-model results; the generation loop further down reads the
# "Sentiment", "Name" and "Document" columns of this frame.
df = pd.read_csv("topic-data.csv")

## Functions 

In [None]:
# Rank the entries of one TF-IDF vector and keep the strongest ones
def top_tfidf_feats(row, features, top_n):
    """Return the ``top_n`` highest-scoring features of a TF-IDF vector.

    Args:
        row: 1-D array of TF-IDF scores, indexable by integer position.
        features: Feature names aligned position-by-position with ``row``.
        top_n: Maximum number of features to keep.

    Returns:
        A DataFrame with columns ``feature`` and ``tfidf``, ordered from the
        highest score to the lowest. It has fewer than ``top_n`` rows when
        ``row`` is shorter than ``top_n``.
    """
    # argsort is ascending, so reverse it before taking the leading slice.
    descending_order = np.argsort(row)[::-1]
    best_positions = descending_order[:top_n]
    scored_features = [(features[pos], row[pos]) for pos in best_positions]
    return pd.DataFrame(scored_features, columns=['feature', 'tfidf'])
# Pick a single topic at random from the supplied candidates
def generate_topic(topic_list):
    """Return one element of ``topic_list`` chosen with ``random.choice``.

    Args:
        topic_list: Non-empty sequence of candidate topics.

    Returns:
        The selected topic.
    """
    chosen_topic = random.choice(topic_list)
    return chosen_topic

# Function to generate a randomized list of business keywords
def generate_keywords(all_keywords):
    """Return a random selection of distinct keywords from ``all_keywords``.

    A target size between 1 and 10 (inclusive) is drawn first, then that many
    keywords are sampled without replacement. When the pool holds fewer
    keywords than the drawn size, the whole pool is returned in random order
    instead of raising.

    Args:
        all_keywords: Sequence of candidate keywords.

    Returns:
        A list of sampled keywords; empty when ``all_keywords`` is empty.
    """
    requested_size = random.randint(1, 10)
    # random.sample raises ValueError when asked for more items than the
    # population contains, so cap the request at the pool size.
    sample_size = min(requested_size, len(all_keywords))
    return random.sample(all_keywords, sample_size)

# Pick the target length for one generated text
def generate_length(lengths):
    """Return one element of ``lengths`` chosen with ``random.choice``.

    Args:
        lengths: Non-empty sequence of candidate length values.

    Returns:
        The selected length.
    """
    chosen_length = random.choice(lengths)
    return chosen_length

# Pass-through for the sentiment label used in an article request
def generate_sentiment(sentiment_value):
    """Return ``sentiment_value`` unchanged.

    No randomization happens here: the caller decides the sentiment and this
    function simply hands it back.

    Args:
        sentiment_value: Sentiment label to use.

    Returns:
        The same ``sentiment_value`` object that was passed in.
    """
    return sentiment_value

# Assemble one randomized request (topic, keywords, length, sentiment)
def generate_article_input():
    """Build the inputs for a single synthetic-article request.

    Reads the module-level names ``topic_list``, ``all_keywords``, ``lengths``
    and ``sentiment_value``; all four must be defined before this is called.

    Returns:
        A dict with the keys ``topic``, ``keywords``, ``length`` and
        ``sentiment``.
    """
    # The draws happen in this fixed order (topic, keywords, length), which
    # keeps the consumption of the random stream stable.
    picked_topic = generate_topic(topic_list)
    picked_keywords = generate_keywords(all_keywords)
    picked_length = generate_length(lengths)
    picked_sentiment = generate_sentiment(sentiment_value)

    article_input = {
        "topic": picked_topic,
        "keywords": picked_keywords,
        "length": picked_length,
        "sentiment": picked_sentiment,
    }
    return article_input

## DSPy Generator 

In [None]:
# DSPy generator
# Signature declaring four text inputs (topic, keywords, count_range,
# sentiment) and a single text output (answer).
# NOTE(review): per the DSPy docs, a Signature's class docstring and the field
# `desc` strings are sent to the model as prompt text, so editing any of them
# changes generation behaviour — treat them as runtime strings.
class GenerateArticle(dspy.Signature):
    """Generate a short textual article"""
    # NOTE(review): this description mentions keywords, word-count range and
    # sentiment, but the caller passes only the topic here; the other values
    # arrive through their own fields below. Confirm whether that is intended.
    topic = dspy.InputField(desc="Contains the desired topic, keywords, word-count range, and sentiment")
    # The caller passes str() of a Python list of keywords.
    keywords = dspy.InputField(desc="A list of keywords that should be incorporated")
    # The caller passes str() of a single value drawn from `lengths`, not a
    # two-sided range.
    count_range = dspy.InputField(desc="Word count range of the output")
    sentiment = dspy.InputField(desc="The textual sentiment that should be expressed in output")
    # The only output field; it holds the generated article text.
    answer = dspy.OutputField(desc="Generate a randomized high quality financial-news text from the inputs")
# Predictor built from the GenerateArticle signature. temperature=0.7 and n=1
# are handed to dspy.Predict as configuration — presumably the sampling
# temperature and one completion per call; confirm against the DSPy version in use.
generate_answer = dspy.Predict(GenerateArticle, temperature=0.7, n=1)

## Synthetic News Article Inputs

In [None]:
# Inputs for synthetic news articles
# `numbers` and `sentiments` are zipped together by the generation loop, so
# position i of one pairs with position i of the other
# (0 -> negative, 1 -> neutral, 2 -> positive).
numbers = list(range(3))
sentiments = ['negative', 'neutral', 'positive']
# Candidate length values handed to the generator: 10, 15, ..., 50.
lengths = list(range(10, 51, 5))

## Generating Synthetic News Articles

In [None]:
# Wrapper: for every sentiment class, derive the topic and keyword pools from
# the topic-model data, generate synthetic articles with the DSPy predictor,
# and write them to one CSV file per sentiment.

# Number of synthetic articles generated for each sentiment class.
ARTICLES_PER_SENTIMENT = 1292

for sentiment_id, sentiment_label in zip(numbers, sentiments):
    # Rows of the topic-model output that belong to this sentiment class.
    text = df[df["Sentiment"] == sentiment_id]
    # generate_article_input() reads sentiment_value, topic_list and
    # all_keywords as module-level globals, so they must be (re)assigned here
    # before the generator loop runs.
    sentiment_value = sentiment_label

    # TFIDF extractor
    # The token pattern only matches words made of English letters.
    vectorizer = TfidfVectorizer(
        stop_words='english',
        max_features=1000,
        token_pattern=r'(?u)\b[A-Za-z]+\b',
    )
    topic_list = list(set(text["Name"]))
    tfidf_matrix = vectorizer.fit_transform(text["Document"])
    feature_names = vectorizer.get_feature_names_out()
    # Transposed so that rows are terms and columns are documents.
    df_tfidf = pd.DataFrame(tfidf_matrix.T.toarray(), index=feature_names)
    # NOTE(review): column 0 is the TF-IDF vector of the FIRST document in
    # this sentiment subset only, so the keyword pool reflects that single
    # document rather than the whole subset — confirm this is intended.
    top_tfidf = top_tfidf_feats(df_tfidf[0].values, feature_names, top_n=100)
    all_keywords = list(top_tfidf["feature"])

    # Generator
    record = []
    # `_` keeps the counter from shadowing the outer loop variables.
    for _ in range(ARTICLES_PER_SENTIMENT):
        article_input = generate_article_input()
        pred = generate_answer(
            topic=str(article_input["topic"]),
            keywords=str(article_input["keywords"]),
            count_range=str(article_input["length"]),
            sentiment=str(article_input["sentiment"]),
            lm=lm,
        )
        # Read the output by its declared field name instead of relying on
        # the positional order of pred.values().
        record.append((pred.answer, article_input['sentiment']))

    sentiment_data = pd.DataFrame(record, columns=["Text", "Sentiment"])
    # File names are unchanged, e.g. "synthetic-trainingnegative.csv".
    sentiment_data.to_csv(f"synthetic-training{sentiment_value}.csv", index=False)