In [22]:
import requests
import pandas as pd
import time
import os
import re
from dotenv import load_dotenv

In [11]:
# Pull variables from a local .env file (if one exists) into the process
# environment before reading the token; variables that are already set in the
# environment are left untouched by load_dotenv's default behaviour.
load_dotenv()
token = os.getenv('HUGGINGFACE_TOKEN')
if not token:
    # Fail here rather than sending "Bearer hf_None" to every endpoint later on.
    raise RuntimeError(
        "HUGGINGFACE_TOKEN is not set; export it or add it to a .env file."
    )

In [7]:
# Working directory of the kernel, and the directory that contains it.
cwd = os.getcwd()
par_dir, _ = os.path.split(cwd)

In [8]:
# Read the cleaned segment table that lives under the parent directory's data folder,
# then preview its first rows.
segments_csv = os.path.join(par_dir, 'data/final_cleaned_parquet.csv')
df = pd.read_csv(segments_csv)
df.head()

Unnamed: 0,podcast_name_cleaned,segment_id,segment,trump_mention,biden_mention
0,Bill OReillys No Spin News and Analysis,0,['samsung' 'tonight' 'focusing' 'zoom' 'galaxy...,0,1
1,Bill OReillys No Spin News and Analysis,1,['biden' 'caused' 'single' 'handedly' 'issuing...,1,1
2,Bill OReillys No Spin News and Analysis,2,['march' '17th' 'total' 'black' 'blackball' 'c...,1,1
3,Bill OReillys No Spin News and Analysis,4,['commonality' 'money' 'charged' 'whatever' 'e...,1,1
4,Bill OReillys No Spin News and Analysis,5,['final' 'question' 'since' 'donald' 'trump' '...,1,1


In [9]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(878, 5)

In [25]:
def clean_text(text):
    """Reduce a segment to space-separated runs of ASCII letters.

    Every character that is neither an ASCII letter nor whitespace is removed
    (digits and punctuation included), then whitespace runs are collapsed to a
    single space and leading/trailing whitespace is dropped.

    Parameters
    ----------
    text : str
        Raw segment text. Non-string values (for example the float NaN that
        pandas uses for an empty CSV cell, or None) are treated as empty.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives or when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; an empty prompt is the
        # closest equivalent of "no text".
        return ''
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    # split() with no argument both trims and collapses whitespace runs.
    return ' '.join(letters_only.split())

# Clean each raw segment, then collect the cleaned strings (in row order) as
# the list of prompts to send to the model.
df['cleaned_segment'] = df['segment'].apply(clean_text)
prompt_list = df['cleaned_segment'].tolist()

# Inference Endpoint Method Deployed on AWS

In [10]:
# HTTP headers sent with every endpoint request: JSON in, JSON out, bearer auth.
# NOTE(review): the "hf_" prefix is added here, so this presumably expects the
# stored token to omit it — confirm against the value in the environment.
headers = {
    "Accept": "application/json",
    "Authorization": f"Bearer hf_{token}",
    "Content-Type": "application/json",
}

In [11]:
# Display name -> inference endpoint URL. The comment above each entry is the
# model repository link recorded for it.
model_dict = {
    # https://huggingface.co/meta-llama/Llama-2-7b-hf
    "pretrained Llama": "https://elmtyqbmlx704v13.us-east-1.aws.endpoints.huggingface.cloud",
    # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
    "pretrained Mistral": "https://b15auwvx0xu3uymo.us-east-1.aws.endpoints.huggingface.cloud",
    # https://huggingface.co/beraht/llama-2-7b_qlora_falcon_417
    "Llama QLORA": "https://hoeh5xdq1cy2pg01.us-east-1.aws.endpoints.huggingface.cloud",
    # https://huggingface.co/beraht/Llama2_Falcon_RAFT_50e_10s/tree/main
    "Llama RAFT": "https://qazfnvfi7y7n1vok.us-east-1.aws.endpoints.huggingface.cloud",
    # https://huggingface.co/sherrys/mistral-2-7b_qlora_falcon_426/tree/main
    "Mistral QLORA": "https://i32y3wwlqdt9257k.us-east-1.aws.endpoints.huggingface.cloud",
    # https://huggingface.co/sherrys/426_mistral_RAFT_50e_10s
    "Mistral RAFT": "https://ddhgi892zzbiynte.us-east-1.aws.endpoints.huggingface.cloud",
}

In [26]:
# Key into `model_dict` that selects which endpoint this run queries.
# Other available keys: 'Llama QLORA', 'pretrained Llama' (flagged "issue??"),
# 'Llama RAFT', 'Mistral QLORA', 'Mistral RAFT' (flagged "next").
current_model = "pretrained Mistral"

In [27]:
# Endpoint URL of the model selected by `current_model`.
API_URL = model_dict[current_model]


def query(payload, timeout=300):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON body.

    The HTTP status is deliberately not checked: the inference loop inspects
    the decoded body for an ``'error'`` key to decide whether to retry.

    Parameters
    ----------
    payload : dict
        JSON-serialisable request body.
    timeout : float or tuple, optional
        Passed to ``requests.post``. Without it a stalled endpoint would block
        the notebook indefinitely. Defaults to 300 seconds.

    Returns
    -------
    object
        Whatever ``response.json()`` decodes from the reply.

    Raises
    ------
    requests.exceptions.Timeout
        If the endpoint does not answer within ``timeout``.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()

In [36]:
# Instruction text that is placed ahead of each segment when building a prompt.
custom_prompt = (
    'Read the segment and determine the sentiment towards Trump or Biden with positive\n'
    'sentiment as 1, and negative sentiment 0. Use a continuous scale depending on the strength of the sentiment,\n'
    'so neutral would be 0.5. If Trump and Biden are both mentioned, only provide the rating for the stronger sentiment,\n'
    'and note for whom you are giving the rating. Put your rating first, then who the rating is for, then explanation.'
)

In [37]:
# Run every cleaned segment through the selected endpoint.
answers = []      # one reply per prompt; '' when the endpoint never returned a usable reply
answer_time = []  # wall-clock seconds per prompt, retries included
count = 0         # stays 0 only when prompt_list is empty

for count, question in enumerate(prompt_list, start=1):
    prompt = f"""{custom_prompt}
        Segment: {question}"""
    # The instruction-wrapped prompt is both what is sent and what is stripped
    # back out of the generated text, so build it once.
    wrapped_prompt = f"<s>[INST] {prompt} [/INST]"
    input_len = len(prompt.split())
    max_token_len = input_len + 100 + 300  # word count + 100 buffer + 300

    start_time = time.time()
    while True:
        answer = query({'inputs': wrapped_prompt,
                        'parameters': {"max_new_tokens": max_token_len}})
        if 'error' not in answer:
            break  # usable reply
        print(f"Failed to process prompt with token length: {max_token_len}")
        # Shrink the request and retry; the budget must go DOWN so that the
        # give-up check below is reachable and the loop always terminates.
        # NOTE(review): presumably the retried error is the endpoint's token
        # limit — confirm against the error bodies it returns.
        max_token_len -= 100
        if max_token_len <= 0:
            break
    duration = time.time() - start_time

    try:
        generated = answer[0]['generated_text']
    except (KeyError, IndexError, TypeError):
        # Every retry failed (answer is still the error payload) or the reply
        # has an unexpected shape: record an empty answer and keep going so a
        # single bad segment does not abort the whole run.
        print(f"Giving up on prompt {count}: {answer}")
        answer = ''
    else:
        # Drop the echoed prompt, then tidy " . " artefacts.
        answer = generated.replace(wrapped_prompt, "")
        answer = answer.replace(" . ", ". ").strip()

    answers.append(answer)
    answer_time.append(duration)
    print(count)
    print('\n')
    print(answer)
    print('\n-----------------------------\n')

1


Neutral (0.5) for segment, as it does not contain any clear sentiment towards Trump or Biden. The segment primarily discusses Samsung's new Galaxy Ultra phone, Chumba Casino, and some general political commentary. There is no mention of either Trump or Biden with enough clarity or sentiment to assign a rating.

-----------------------------

2


Rating: 0.1, Biden

Explanation: The segment expresses strong negative sentiment towards Biden's handling of various issues such as crime, inflation, and immigration. The speaker also expresses frustration with the Biden administration and the Democratic party. There is no mention of Trump with a positive sentiment in this segment.

-----------------------------

3


Rating: 0.8, Trump

Explanation: The segment expresses a strong negative sentiment towards Trump's supporters and NBC's hiring of Ronna McDaniel. The text mentions Trump's victory over Hillary Clinton and his supporters' reactions to NBC's hiring of McDaniel, who is described a

In [39]:
# Attach the replies and their timings under model-specific column names,
# then write the whole frame to a model-specific CSV in the working directory.
output_col = f'{current_model} Output'
timing_col = f'{current_model} Output Time'
csv_name = f'{current_model} golden answers output.csv'

df[output_col] = answers
df[timing_col] = answer_time
df.to_csv(csv_name)

In [40]:
# Copy the current model's replies into a model-independent 'text' column.
df['text']=df[f'{current_model} Output']

In [41]:
# List the frame's column labels.
df.columns

Index(['podcast_name_cleaned', 'segment_id', 'segment', 'trump_mention',
       'biden_mention', 'cleaned_segment', 'pretrained Mistral Output',
       'pretrained Mistral Output Time', 'text'],
      dtype='object')

In [42]:
# First "digit.digits" number in a reply, and first occurrence of one of the
# target/neutral keywords.
_RATING_RE = re.compile(r'(\d\.\d+)')
_TARGET_RE = re.compile(r'(Biden|Trump|Neutral|both|neutral)')


def _first_group(pattern, text):
    """Return group 1 of the first match of ``pattern`` in ``text``, or None if there is no match."""
    found = pattern.search(text)
    return found.group(1) if found else None


# Both columns hold the matched substring as-is (the rating stays a string).
df['rating'] = df['text'].apply(lambda reply: _first_group(_RATING_RE, reply))
df['sentiment'] = df['text'].apply(lambda reply: _first_group(_TARGET_RE, reply))

In [43]:
# Display only the two extracted columns to inspect the parsing results.
df[['rating', 'sentiment']]

Unnamed: 0,rating,sentiment
0,0.5,Neutral
1,0.1,Biden
2,0.8,Trump
3,0.3,Trump
4,0.3,Trump
...,...,...
873,0.5,Neutral
874,0.9,Biden
875,0.6,Trump
876,,Neutral


In [13]:
# Write the labelled frame to the working directory.
# NOTE: this cell needs `df` from the cells above; the recorded NameError is
# what happens when it is run in a kernel where `df` has not been defined.
df.to_csv('cleaned_with_mistral_labels.csv')

NameError: name 'df' is not defined