In [32]:
# Topic lexicon. When these words outnumber the Dovish words in a sentence,
# sentiment_score returns +1 for a net-Positive sentence and -1 for a
# net-Negative one.
Hawkish = [
    "business", "businesses", "demand", "economy", "employment", "energy",
    "equity", "expansion", "financial", "housing", "income", "indicators",
    "inflationary", "investment", "investments", "manufacturing", "outlook",
    # "output" was previously misspelled "outhut" and so never matched a token.
    "inflation", "prices", "output", "labor", "securities", "slack",
    "recovery", "toll", "wage", "resource"
]

# Topic lexicon. When these words outnumber the Hawkish words in a sentence,
# sentiment_score returns -1 for a net-Positive sentence and +1 for a
# net-Negative one.
Dovish = [
    "accommodation",
    "devastation",
    "downturn",
    "recession",
    "unemployment",
]

# Direction lexicon: words counted towards the "positive" tally in
# sentiment_score.
Positive = [
    "abating", "accelerated", "add", "advance",
    "advanced", "augmented", "balanced", "better",
    "bolsters", "boom", "booming", "boost",
    "boosted", "eased", "elevated", "elevating",
    "expand", "expanding", "expansionary", "extend",
    "extended", "fast", "faster", "firmer",
    "gains", "growing", "heightened", "high",
    "higher", "improved", "improvement", "improving",
    "increase", "increased", "increases", "increasing",
    "more", "raise", "rapid", "rebounded",
    "recovering", "rise", "risen", "rising",
    "robust", "rose", "significant", "solid",
    "sooner", "spike", "spikes", "spiking",
    "stable", "strength", "strengthen", "strengthened",
    "strengthens", "strong", "stronger", "supportive",
    "up", "upside", "upswing", "uptick",
]

# Direction lexicon: words counted towards the "negative" tally in
# sentiment_score.
Negative = [
    "adverse", "back", "below", "constrained",
    "contract", "contracting", "contraction", "cooling",
    "correction", "dampen", "damping", "decelerated",
    "decline", "declined", "declines", "declining",
    "decrease", "decreases", "decreasing", "deepening",
    "depressed", "deteriorated", "deterioration", "diminished",
    "disappointing", "dislocation", "disruptions", "down",
    "downbeat", "downside", "drop", "dropping",
    "ebbed", "erosion", "fade", "faded",
    "fading", "fall", "fallen", "falling",
    "fell", "insufficient", "less", "limit",
    "low", "lower", "moderated", "moderating",
    "moderation", "reduce", "reduced", "reduction",
    "reluctant", "removed", "restrain", "restrained",
    "restraining", "restraint", "resumption", "reversed",
    "slack", "slow", "slowed", "slower",
    "slowing", "slowly", "sluggish", "sluggishness",
    "slumped", "soft", "softened", "softening",
    "stimulate", "strained", "strains", "stress",
    "subdued", "tragic", "turmoil", "underutilization",
    "volatile", "vulnerable", "wary", "weak",
    "weakened", "weaker", "weakness",
]

In [34]:
import pandas as pd
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Matches any single ASCII punctuation character. re.escape keeps ']', '\',
# '^' and '-' literal inside the brackets; without it the backslash is read as
# an escape for ']' and is therefore never removed.
_PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')

# NLTK English stop words, filled in on first use and reused afterwards.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


# Tokenize and clean text
def clean_and_tokenize(text):
    """Lowercase `text`, strip punctuation, and return its content words.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase, purely alphabetic tokens with NLTK English stop
        words removed, in their original order.

    Note:
        Punctuation is deleted rather than replaced by a space, so hyphenated
        compounds collapse into a single token ("year-over-year" becomes
        "yearoveryear").
    """
    text = _PUNCTUATION_RE.sub('', text.lower())
    stop_words = _english_stop_words()

    # The text is already lowercase, so tokens need no further case folding.
    # isalpha() drops numbers and any leftover symbol tokens.
    return [
        token
        for token in word_tokenize(text)
        if token.isalpha() and token not in stop_words
    ]

# Calculate sentiment score for a sentence
def sentiment_score(sentence, Hawkish, Dovish, Positive, Negative):
    """Score one sentence as +1, -1 or 0 from lexicon word counts.

    The sentence is tokenized with clean_and_tokenize, and every token is
    counted against each of the four word collections.

    Args:
        sentence: Text of the sentence to score.
        Hawkish, Dovish: Topic lexicons; whichever has more hits sets the
            sentence's topic side.
        Positive, Negative: Direction lexicons; whichever has more hits sets
            the sentence's direction.

    Returns:
        0 if the topic counts tie or the direction counts tie; otherwise
        +1 for Hawkish+Positive or Dovish+Negative, and
        -1 for Hawkish+Negative or Dovish+Positive.
    """
    words = clean_and_tokenize(sentence)

    def hits(lexicon):
        # Number of tokens (repeats included) that appear in the lexicon.
        return sum(1 for word in words if word in lexicon)

    topic_balance = hits(Hawkish) - hits(Dovish)
    tone_balance = hits(Positive) - hits(Negative)

    # A tie on either axis means there is no signal.
    if topic_balance == 0 or tone_balance == 0:
        return 0

    # Matching signs (hawkish & positive, or dovish & negative) score +1.
    signs_agree = (topic_balance > 0) == (tone_balance > 0)
    return 1 if signs_agree else -1

# Calculate average sentiment score for a document
def document_sentiment(document, Hawkish, Dovish, Positive, Negative):
    """Return the mean sentence-level sentiment score of a document.

    The document is split on '.' and each non-blank fragment is scored with
    sentiment_score.

    Args:
        document: Full text of the document.
        Hawkish, Dovish, Positive, Negative: Lexicons forwarded unchanged to
            sentiment_score.

    Returns:
        The average of the per-fragment scores as a float in [-1, 1], or 0.0
        when the document has no non-blank fragment.
    """
    # str.split('.') yields an empty fragment after a trailing period (and
    # between consecutive periods). Those would score 0 and inflate the
    # denominator, so they are skipped.
    sentences = [part for part in document.split('.') if part.strip()]
    if not sentences:
        return 0.0

    scores = [
        sentiment_score(sentence, Hawkish, Dovish, Positive, Negative)
        for sentence in sentences
    ]
    return sum(scores) / len(scores)

# Load the documents to score; a 'text' column is read below.
df = pd.read_csv('data.csv')

# Score every document and keep the result in a new 'sentiment_score' column.
# The four lexicons are forwarded to document_sentiment as extra positional
# arguments after each text value.
df['sentiment_score'] = df['text'].apply(
    document_sentiment,
    args=(Hawkish, Dovish, Positive, Negative),
)

In [64]:
# One row per '.'-delimited fragment of every document in df['text'], paired
# with that fragment's sentence-level score.
sentiment_training_data = pd.DataFrame([
    {
        'text': fragment,
        'label': sentiment_score(fragment, Hawkish, Dovish, Positive, Negative),
    }
    for document in df['text']
    for fragment in document.split('.')
])

In [5]:
# Reload the scrape, keep only the two columns used here, and retain the rows
# with Type == 0 (presumably statements, judging by the variable name; confirm
# against the scraper).
df = pd.read_csv('Fed_Scrape-2015-2023.csv')[["Type", "Text"]]
df = df.loc[df["Type"] == 0]

# One row per '.'-delimited fragment of each retained document, paired with
# that fragment's sentence-level score.
stmts_data = pd.DataFrame([
    {
        'text': fragment,
        'label': sentiment_score(fragment, Hawkish, Dovish, Positive, Negative),
    }
    for document in df['Text']
    for fragment in document.split('.')
])

In [29]:
# Reload the scrape, keep only the two columns used here, and retain the rows
# with Type == 1 (presumably minutes, judging by the variable name; confirm
# against the scraper).
df = pd.read_csv('Fed_Scrape-2015-2023.csv')[["Type", "Text"]]
df = df.loc[df["Type"] == 1]

# One row per '.'-delimited fragment of each retained document, paired with
# that fragment's sentence-level score.
mn_data = pd.DataFrame([
    {
        'text': fragment,
        'label': sentiment_score(fragment, Hawkish, Dovish, Positive, Negative),
    }
    for document in df['Text']
    for fragment in document.split('.')
])

In [11]:
# Write only the rows with a non-zero label, without the index column.
stmts_data.loc[stmts_data["label"].ne(0)].to_csv('stmts_data.csv', index=False)

In [30]:
# Write only the rows with a non-zero label, without the index column.
mn_data.loc[mn_data["label"].ne(0)].to_csv('mn_data.csv', index=False)

In [67]:
# Write only the rows with a non-zero label, without the index column.
sentiment_training_data.loc[sentiment_training_data['label'].ne(0)].to_csv('sentiment_data.csv', index=False)