In [226]:
from llama2_assets import prompt_llama2
import transformers
import pandas as pd

In [44]:
df = pd.read_csv("data/results.csv", names = ['index','number','president','speech_title','website','speech'])

In [45]:
from bs4 import BeautifulSoup

def remove_html_tags(input_string):
    soup = BeautifulSoup(input_string, "html.parser")
    cleaned_string = soup.get_text()
    if cleaned_string[-2:] == ", ":
        return cleaned_string[:-2]
    else:
        return cleaned_string


In [89]:
df['speech'][5500]

'[<p>PRESIDENT LOPEZ PORTILLO. Mr. President of the United States of America, James Carter, Mrs. Carter, ladies and gentlemen:</p>, <p>It has been 2 years now since we met for the first time. Since then, a great deal of water has flowed under the bridges of the Rio Grande. A great deal has also happened within our countries and between our countries, as it has in the world and to the world.</p>, <p>United by geography and the borders that scar it, and immersed in the conflicts of a sometimes bitter and invariably complex history, we have set out to order our conduct as neighbors according to the precepts of law and, even more important, on the favorable basis of mutual friendship. This implies good will, expressed in terms of respectful, fair, and worthy treatment.</p>, <p>Today, 2 years later, it is only fitting that we evaluate our objectives and face the facts that confront us. We know better now what each expects of the other, but I believe we also know that we have not yet put our

In [90]:
import re
def ensure_correct_speaker(cleaned_string):
    matches = re.findall(r'[A-Z ]', cleaned_string)
    if matches:
        if len(matches) == 1:
            matches = matches[0]
            if "PRESIDENT:" in matches:
                return cleaned_string.replace(matches+" ","")
    else:
        return cleaned_string


In [173]:
def ensure_no_other_speakers(speech):
    matches = re.findall(r'\b[A-Z\s]+\b[:.]', speech)
    if len(matches) <= 1:
        if matches: 
            speaker = matches[0]
            if "PRESIDENT" in speaker: 
                return speech.replace(speaker,'')[1:] 
        else:
            return speech


In [174]:
ensure_no_other_speakers("PRESIDENT: Thanks")

'Thanks'

In [175]:
ensure_no_other_speakers("MARCY. Thank you.")

In [176]:
def parse_dictation(dictation):
    dictation = remove_html_tags(dictation)
    return ensure_no_other_speakers(dictation)

In [227]:
df = df[['president','speech_title','speech']]

samples = []
for index, row in df.iterrows():
    speech = [x for x in row['speech'].replace("[","").replace("[","").split("<p>") if x != '']
    name = row['president']
    for dictation in speech:
        parsed_dictation = parse_dictation(dictation)
        if parsed_dictation:
            samples.append({"speaker":name, "text":parsed_dictation})

pd.DataFrame(samples).to_csv('data/cleaned.csv')

KeyboardInterrupt: 

In [251]:
def assemble_prompt(text, speaker):
    prompt = f"""You are a helpful assistant that helps analyze the sentiment of US presidents. Below is an excerpt from a speech by a US president.\n\nSpeaker: {speaker}\n\nExcerpt:\n{text}\n\nIf the sentiment of the excerpt is negative, reply with NEG. Otherwise reply with NONE."""
    return prompt

In [252]:
from numpy import random

data = []
for i in range(2):
    text = random.choice(samples)
    excerpt = text["text"]
    speaker = text["speaker"]
    prompt = assemble_prompt(excerpt, speaker)
    print(prompt)
    response = prompt_llama2(prompt)
    data.append({"text":excerpt, "speaker":speaker,"sentiment":response})


You are a helpful assistant that helps analyze the sentiment of US presidents. Below is an excerpt from a speech by a US president.

Speaker: Donald J. Trump

Excerpt:
But oil is getting to a point where, I mean, there are some areas—some people would say the water is more valuable than the oil. You never thought you're going to see that. You never thought having covered—I know you cover it—you never thought you'd be seeing oil at $20 a barrel, but how about $10 a barrel? That you never thought you'd see.


If the sentiment of the excerpt is negative, reply with NEG. Otherwise reply with NONE.
You are a helpful assistant that helps analyze the sentiment of US presidents. Below is an excerpt from a speech by a US president.

Speaker: Mike Pence

Excerpt:
It's amazing.


If the sentiment of the excerpt is negative, reply with NEG. Otherwise reply with NONE.


In [253]:
data

[{'text': "But oil is getting to a point where, I mean, there are some areas—some people would say the water is more valuable than the oil. You never thought you're going to see that. You never thought having covered—I know you cover it—you never thought you'd be seeing oil at $20 a barrel, but how about $10 a barrel? That you never thought you'd see.\n",
  'speaker': 'Donald J. Trump',
  'sentiment': 'The sentiment of the excerpt is NONE because it is neutral. It simply states a fact about the price of oil and does not express any emotion or opinion.'},
 {'text': "It's amazing.\n", 'speaker': 'Mike Pence', 'sentiment': 'NONE'}]

In [81]:
prompt_llama2(prompt+"This country is a mess")

'NEG'

In [10]:
df

Unnamed: 0.1,Unnamed: 0,speaker,text
0,0,Joseph R. Biden,My name is Joe Biden. I am Jill Biden's husban...
1,1,Joseph R. Biden,"Thanks to the Interior Secretary, Deb Haaland,..."
2,2,Joseph R. Biden,Also thanks to the National Parks Service and ...
3,3,Joseph R. Biden,But tonight's world-renowned performers were j...
4,4,Joseph R. Biden,"Over the course of the past 99 years, Presiden..."
...,...,...,...
222589,222589,George Washington,"Such being the impressions under which I have,..."
222590,222590,George Washington,By the article establishing the executive depa...
222591,222591,George Washington,Besides the ordinary objects submitted to your...
222592,222592,George Washington,To the foregoing observations I have one to ad...


In [11]:
from transformers import pipeline

model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)




Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [50]:
test = pd.read_csv('data/labeled.csv')

In [60]:
df.head(730)['text']

0      My name is Joe Biden. I am Jill Biden's husban...
1      Thanks to the Interior Secretary, Deb Haaland,...
2      Also thanks to the National Parks Service and ...
3      But tonight's world-renowned performers were j...
4      Over the course of the past 99 years, Presiden...
                             ...                        
725    So, you know, they also serve who only "stand ...
726    And you do so much, and your families give so ...
727    And you're the finest -- you're the finest mil...
728    And it's hard for me to even say it, but I'm -...
729    And so, I just -- we came because we wanted to...
Name: text, Length: 730, dtype: object

In [83]:
set(df.head(730)['text']).difference(set(test['excerpt']))

{"And so, I just -- we came because we wanted to thank you and tell you how much we care. And we wanted you to hear the engine of Air Force One, so you couldn't hear anything. But -- (laughter) -- that was the main reason we did that."}

(array([   729,    730,    731, ..., 222591, 222592, 222593]),)

In [115]:
current


Unnamed: 0,speaker,excerpt,label,score
0,Joseph R. Biden,My name is Joe Biden. I am Jill Biden's husban...,positive,0.7807849049568176
1,Joseph R. Biden,"Thanks to the Interior Secretary, Deb Haaland,...",positive,0.9607234001159668
2,Joseph R. Biden,Also thanks to the National Parks Service and ...,positive,0.9605549573898315
3,Joseph R. Biden,But tonight's world-renowned performers were j...,positive,0.9793055653572083
4,Joseph R. Biden,"Over the course of the past 99 years, Presiden...",positive,0.8555593490600586
...,...,...,...,...
28781,Donald J. Trump,"So they don't want anything. Now, they're gett...",negative,0.8917644619941711
28782,Donald J. Trump,Crowd Member. Nice work!,positive,0.9758639931678772
28783,Donald J. Trump,The President. You're not gonna win too many p...,positive,0.4009448289871216
28784,Donald J. Trump,"They said, ""No, don't do that, favored nations...",neutral,0.536845326423645


In [120]:
def get_remaining(labeled, cleaned):
    # Convert the 'excerpt' column of the labeled DataFrame to a set for faster lookup
    labeled_set = set(labeled['excerpt'])
    
    # Use the 'isin' method to check if each element in 'text' column is in the 'labeled_set'
    missing_mask = ~cleaned['text'].isin(labeled_set)
    
    # Use the boolean mask to filter out the missing rows
    remaining_rows = cleaned[missing_mask]
    
    return remaining_rows

In [131]:
import os
import sys


import pandas as pd
df = pd.read_csv("data/cleaned.csv")

if 'labeled.csv' in os.listdir('data'):
    current = pd.read_csv('data/labeled.csv')


import pandas as pd



df = get_remaining(current, df)

print(f'{len(df)} entries remaining')


results = []

for index, row in df.iterrows():
    try:
        speaker = row['speaker']
        excerpt = row['text']
        out = sentiment_task(excerpt)[0]
        label = out['label']
        score = out['score']
        results.append({"speaker":speaker, "excerpt":excerpt, "label":label, "score":score})
    except KeyboardInterrupt as e:
        constructed = pd.DataFrame(results)
        if 'labeled.csv' in os.listdir('data'):
            to_save = pd.concat([current, constructed])
            to_save.to_csv('data/labeled.csv', index = False)
        else:
            constructed.to_csv('data/labeled.csv', index = False)
        sys.exit()
    except Exception as ex:
        print(ex)
        results.append({"speaker":speaker, "excerpt":excerpt, "label":ex, "score":ex})

51155 entries remaining
The expanded size of the tensor (549) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 549].  Tensor sizes: [1, 514]
The expanded size of the tensor (680) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 680].  Tensor sizes: [1, 514]
The expanded size of the tensor (687) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 687].  Tensor sizes: [1, 514]
The expanded size of the tensor (648) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 648].  Tensor sizes: [1, 514]
The expanded size of the tensor (691) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 691].  Tensor sizes: [1, 514]
The expanded size of the tensor (541) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 541].  Tensor sizes: [1, 514]
The expanded size of the tensor (566) must match the existing si

In [132]:
constructed = pd.DataFrame(results)
if 'labeled.csv' in os.listdir('data'):
    to_save = pd.concat([current, constructed])
    to_save.to_csv('data/labeled.csv', index = False)