In [226]:
from llama2_assets import prompt_llama2
import transformers
import pandas as pd

In [44]:
# Read data/results.csv, labelling the six columns explicitly via `names`.
# NOTE(review): with `names` given and no `header=`, pandas treats the first
# line of the file as data rather than as a header row — confirm the CSV has no header.
df = pd.read_csv(
    "data/results.csv",
    names=[
        'index',
        'number',
        'president',
        'speech_title',
        'website',
        'speech',
    ],
)

In [45]:
from bs4 import BeautifulSoup

def remove_html_tags(input_string):
    """Return the text content of ``input_string`` with HTML markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` and only the text
    is kept. If that text ends with the two characters ``", "`` they are
    dropped; otherwise the text is returned as is.
    """
    text_only = BeautifulSoup(input_string, "html.parser").get_text()
    return text_only[:-2] if text_only.endswith(", ") else text_only


In [89]:
# Peek at one raw `speech` cell (label 5500) to see the markup that needs cleaning.
df['speech'][5500]

'[<p>PRESIDENT LOPEZ PORTILLO. Mr. President of the United States of America, James Carter, Mrs. Carter, ladies and gentlemen:</p>, <p>It has been 2 years now since we met for the first time. Since then, a great deal of water has flowed under the bridges of the Rio Grande. A great deal has also happened within our countries and between our countries, as it has in the world and to the world.</p>, <p>United by geography and the borders that scar it, and immersed in the conflicts of a sometimes bitter and invariably complex history, we have set out to order our conduct as neighbors according to the precepts of law and, even more important, on the favorable basis of mutual friendship. This implies good will, expressed in terms of respectful, fair, and worthy treatment.</p>, <p>Today, 2 years later, it is only fitting that we evaluate our objectives and face the facts that confront us. We know better now what each expects of the other, but I believe we also know that we have not yet put our

In [90]:
import re
def ensure_correct_speaker(cleaned_string):
    """Keep a passage only if it is unlabelled or labelled as the President's.

    A speaker label is taken to be a capital letter followed by any run of
    capital letters and spaces, ending in a colon (e.g. ``"THE PRESIDENT:"``).

    Args:
        cleaned_string: Passage text to inspect.

    Returns:
        * ``cleaned_string`` unchanged when it contains no speaker label;
        * ``cleaned_string`` with the label and the single space after it
          removed when there is exactly one label and it contains
          ``"PRESIDENT:"``;
        * ``None`` when the only label belongs to someone else, or when the
          passage contains more than one label.
    """
    # Match whole labels. A bare character class (one character per match)
    # could never contain the substring "PRESIDENT:".
    labels = re.findall(r'[A-Z][A-Z ]*:', cleaned_string)
    if not labels:
        return cleaned_string
    if len(labels) == 1 and "PRESIDENT:" in labels[0]:
        return cleaned_string.replace(labels[0] + " ", "")
    # Another speaker, or several speakers: reject the passage.
    return None


In [173]:
def ensure_no_other_speakers(speech):
    """Return the passage only when nobody but the President is speaking.

    A speaker label is a run of capital letters (optionally separated by
    whitespace) that starts and ends on a word boundary and is followed by
    ``:`` or ``.``, e.g. ``"PRESIDENT:"`` or ``"MARCY."``.

    Args:
        speech: Passage text to inspect.

    Returns:
        * ``speech`` unchanged when it contains no speaker label;
        * ``speech`` with the label and the whitespace directly after it
          removed when there is exactly one label and it contains
          ``"PRESIDENT"``;
        * ``None`` when the single label names someone else, or when more than
          one label is present.

    Note:
        The pattern is a heuristic: an all-caps word that ends a sentence
        (``"... the USA."``) also counts as a label and causes the passage to
        be rejected.
    """
    # Requiring the label to start with a capital letter keeps preceding
    # whitespace out of the match, so removing the label cannot glue words.
    labels = re.findall(r'\b[A-Z][A-Z\s]*\b[:.]', speech)
    if not labels:
        return speech
    if len(labels) > 1:
        return None
    label = labels[0]
    if "PRESIDENT" not in label:
        return None
    # Cut out the label wherever it sits, then drop only the whitespace that
    # followed it. Slicing off a fixed first character would eat real text
    # whenever the label is not at position 0 or has no space after it.
    before, _, after = speech.partition(label)
    return before + after.lstrip()


In [174]:
# Sanity check: a lone "PRESIDENT:" label is stripped, leaving only the spoken text.
ensure_no_other_speakers("PRESIDENT: Thanks")

'Thanks'

In [175]:
# Sanity check: a passage labelled with a non-President speaker yields None, so the cell displays nothing.
ensure_no_other_speakers("MARCY. Thank you.")

In [176]:
def parse_dictation(dictation):
    """Clean one raw passage and apply the speaker filter.

    The passage is first passed through ``remove_html_tags``; the resulting
    text is then handed to ``ensure_no_other_speakers``, whose return value
    is returned as is.
    """
    return ensure_no_other_speakers(remove_html_tags(dictation))

In [227]:
# Keep only the columns used downstream.
df = df[['president','speech_title','speech']]

# Split every speech into paragraphs, clean each one, and keep those that
# survive the speaker filter as {"speaker", "text"} records.
samples = []
for name, raw_speech in zip(df['president'], df['speech']):
    # Missing speeches come back from pandas as NaN (a float); skip them
    # instead of failing on `.replace`.
    if not isinstance(raw_speech, str):
        continue
    # The cell looks like the repr of a list of <p> tags ("[<p>…</p>, <p>…</p>, …"),
    # so drop both square brackets before splitting on the opening tag.
    # (Previously "[" was removed twice and "]" never.)
    speech = [x for x in raw_speech.replace("[","").replace("]","").split("<p>") if x != '']
    for dictation in speech:
        parsed_dictation = parse_dictation(dictation)
        # parse_dictation returns None (or an empty string) for rejected passages.
        if parsed_dictation:
            samples.append({"speaker":name, "text":parsed_dictation})

pd.DataFrame(samples).to_csv('data/cleaned.csv')

KeyboardInterrupt: 

In [251]:
def assemble_prompt(text, speaker):
    """Build the sentiment-classification prompt for one excerpt.

    Args:
        text: The excerpt to classify; inserted under the ``Excerpt:`` line.
        speaker: Name inserted after ``Speaker:``.

    Returns:
        The prompt string: a fixed introduction, the speaker, the excerpt, and
        a closing instruction asking for ``NEG`` or ``NONE``, separated by
        blank lines.
    """
    introduction = (
        "You are a helpful assistant that helps analyze the sentiment of US presidents. "
        "Below is an excerpt from a speech by a US president."
    )
    closing_instruction = (
        "If the sentiment of the excerpt is negative, reply with NEG. "
        "Otherwise reply with NONE."
    )
    return f"{introduction}\n\nSpeaker: {speaker}\n\nExcerpt:\n{text}\n\n{closing_instruction}"

In [252]:
from numpy import random

# Spot-check the prompt on a couple of randomly drawn excerpts.
# A seeded generator makes a Restart & Run All pick the same excerpts again.
rng = random.default_rng(42)

data = []
for i in range(2):
    # Draw an index rather than calling random.choice(samples): choice would
    # first convert the whole list of dicts into an array on every draw.
    sample = samples[int(rng.integers(len(samples)))]
    excerpt = sample["text"]
    speaker = sample["speaker"]
    prompt = assemble_prompt(excerpt, speaker)
    print(prompt)
    response = prompt_llama2(prompt)
    data.append({"text":excerpt, "speaker":speaker,"sentiment":response})


You are a helpful assistant that helps analyze the sentiment of US presidents. Below is an excerpt from a speech by a US president.

Speaker: Donald J. Trump

Excerpt:
But oil is getting to a point where, I mean, there are some areas—some people would say the water is more valuable than the oil. You never thought you're going to see that. You never thought having covered—I know you cover it—you never thought you'd be seeing oil at $20 a barrel, but how about $10 a barrel? That you never thought you'd see.


If the sentiment of the excerpt is negative, reply with NEG. Otherwise reply with NONE.
You are a helpful assistant that helps analyze the sentiment of US presidents. Below is an excerpt from a speech by a US president.

Speaker: Mike Pence

Excerpt:
It's amazing.


If the sentiment of the excerpt is negative, reply with NEG. Otherwise reply with NONE.


In [253]:
# Show the collected records (excerpt, speaker, and the model's raw reply).
data

[{'text': "But oil is getting to a point where, I mean, there are some areas—some people would say the water is more valuable than the oil. You never thought you're going to see that. You never thought having covered—I know you cover it—you never thought you'd be seeing oil at $20 a barrel, but how about $10 a barrel? That you never thought you'd see.\n",
  'speaker': 'Donald J. Trump',
  'sentiment': 'The sentiment of the excerpt is NONE because it is neutral. It simply states a fact about the price of oil and does not express any emotion or opinion.'},
 {'text': "It's amazing.\n", 'speaker': 'Mike Pence', 'sentiment': 'NONE'}]

In [81]:
# Probe: append an obviously negative sentence to whatever `prompt` currently holds and check the reply.
# NOTE(review): the sentence is appended to the end of the prompt string, not placed inside the excerpt — confirm this is intended.
prompt_llama2(prompt+"This country is a mess")

'NEG'

In [1]:
from transformers import pipeline

# Checkpoint identifier, used for both the model and the tokenizer.
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Build the sentiment-analysis pipeline, then run it on one sentence; the
# result of that call is the cell's displayed value.
sentiment_task = pipeline(
    task="sentiment-analysis",
    model=model_path,
    tokenizer=model_path,
)
sentiment_task("Covid cases are increasing fast!")


  from .autonotebook import tqdm as notebook_tqdm
Downloading pytorch_model.bin: 100%|██████████| 501M/501M [00:25<00:00, 19.8MB/s]
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 11.2MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [

[{'label': 'negative', 'score': 0.7235761880874634}]