## Text analysis

1. Determine bias of the input article
2. Determine direction (left/right)





BIAS DETECTION:
    - Detect biased words and compute a bias index
    - Get the sentiment towards various topics on which left- and right-leaning views are known to differ (sentiment analysis on paragraphs)

In [81]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [82]:
# Read in the sample article; the context manager closes the file handle
# as soon as the text has been read.
with open('sample_article.txt', 'r') as article_file:
    article = article_file.read()

# Replace every apostrophe / single quote with a space
article = article.replace('\'', ' ')

In [83]:
# Display the loaded article text as the cell output
article

'Virginia mom says activists threatened to  curb stomp  her for speaking out against school curriculum\n This is murder,  Loudoun County parent Alisha Brand told  Fox & Friends \n\nParents in Loudoun County, Virginia, say they have become targets of violent threats on social media for speaking out against the school district s LGBTQ agenda.\n\n"They said that they wanted to curb stomp me," Loudoun County mom and Army of Parents co-founder Alisha Brand told "Fox & Friends" Friday of threats made on the "Loudoun Love Warriors" Facebook page.\n\nBrand said her organization, which she described as a 501(c)(4) that advocates for excellence in education, school safety and parental rights, has made activists online "very angry," leading to "threats of death."\n\n"I m not sure if your audience is aware of the violent nature of curb stomping, but what it does entail is grabbing somebody by the back of the head, forcing their mouth open, pushing them down to the ground with their face on the cem

### Preprocess text

In [84]:
# Download required NLTK data (one-time setup: add `import nltk`, then uncomment the lines below)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [92]:
def preprocess_text(text):
    """Tokenize, clean and lemmatize *text*.

    The text is tokenized with NLTK, every character that is neither a word
    character nor whitespace is removed, English stopwords and empty tokens
    are dropped, and the remaining lower-cased tokens are lemmatized with
    WordNet.

    Args:
        text: Raw input text.

    Returns:
        A list of lower-cased, lemmatized tokens in their original order.
    """
    # Tokenize, then strip punctuation from the re-joined token stream.
    # Tokens that consisted only of punctuation become empty strings and are
    # filtered out below.
    joined = " ".join(word_tokenize(text))
    words = re.sub(r'[^\w\s]', '', joined).split(" ")

    # A set gives O(1) membership tests instead of scanning the stopword
    # list once per token.
    stop_words = set(stopwords.words('english'))
    lowered = (word.lower() for word in words if word)
    tokens = [word for word in lowered if word not in stop_words]

    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

In [93]:
# Preprocess the whole article once; the resulting token list is reused by
# the sentiment and bias-word cells.
lemmatized_tokens = preprocess_text(article)

#### Test sentiment analysis with the OpenAI completion API

In [87]:
import openai

In [88]:
# Read the API key from a local file (keeps the secret out of the notebook);
# the context manager closes the file handle after reading.
with open("openai_api_key", "r") as key_file:
    openapi_key = key_file.read().strip()
openai.api_key = openapi_key

In [95]:
def get_sentiment(topic: str, text: str) -> object:
    """Ask the completion model for the sentiment of *text* regarding *topic*.

    Args:
        topic: Subject the sentiment should be judged against.
        text: Text to classify.

    Returns:
        The raw response object returned by ``openai.Completion.create``
        (not a plain string). The predicted label is found at
        ``response["choices"][0]["text"]``.
    """
    # NOTE(review): this relies on the legacy ``openai.Completion`` interface
    # and the "text-davinci-003" model - confirm both are still available in
    # the installed client / account before relying on this cell.
    return openai.Completion.create(
        model="text-davinci-003",
        # The text is wrapped in a balanced pair of quotes; previously only a
        # dangling closing quote was emitted after it.
        prompt=(
            "Decide whether text's sentiment is positive, neutral, or negative "
            f"based on {topic}.\n\n text: \"{text}\"\nSentiment:"
        ),
        temperature=0,  # deterministic sampling
        max_tokens=100,
        top_p=1.0,
        frequency_penalty=0.5,
        presence_penalty=0.0,
    )

In [96]:
# Display the preprocessed tokens as the cell output
lemmatized_tokens

['virginia',
 'mom',
 'say',
 'activist',
 'threatened',
 'curb',
 'stomp',
 'speaking',
 'school',
 'curriculum',
 'murder',
 'loudoun',
 'county',
 'parent',
 'alisha',
 'brand',
 'told',
 'fox',
 'friend',
 'parent',
 'loudoun',
 'county',
 'virginia',
 'say',
 'become',
 'target',
 'violent',
 'threat',
 'social',
 'medium',
 'speaking',
 'school',
 'district',
 'lgbtq',
 'agenda',
 'said',
 'wanted',
 'curb',
 'stomp',
 'loudoun',
 'county',
 'mom',
 'army',
 'parent',
 'cofounder',
 'alisha',
 'brand',
 'told',
 'fox',
 'friend',
 'friday',
 'threat',
 'made',
 'loudoun',
 'love',
 'warrior',
 'facebook',
 'page',
 'brand',
 'said',
 'organization',
 'described',
 '501',
 'c',
 '4',
 'advocate',
 'excellence',
 'education',
 'school',
 'safety',
 'parental',
 'right',
 'made',
 'activist',
 'online',
 'angry',
 'leading',
 'threat',
 'death',
 'sure',
 'audience',
 'aware',
 'violent',
 'nature',
 'curb',
 'stomping',
 'entail',
 'grabbing',
 'somebody',
 'back',
 'head',
 'forci

In [105]:
# Classify the sentiment of the whole preprocessed article (tokens re-joined
# with single spaces) with respect to the given topic.
sentiment = get_sentiment("right wing agenda", " ".join(lemmatized_tokens))

In [106]:
# Display the raw API response as the cell output
sentiment

<OpenAIObject text_completion id=cmpl-7FjbB4m09ldDxnDPOj93bvxEiKrLg at 0x1d6f6124220> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": " Negative"
    }
  ],
  "created": 1683984557,
  "id": "cmpl-7FjbB4m09ldDxnDPOj93bvxEiKrLg",
  "model": "text-davinci-003",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 1,
    "prompt_tokens": 427,
    "total_tokens": 428
  }
}

In [102]:
 " ".join(lemmatized_tokens[:50])

'virginia mom say activist threatened curb stomp speaking school curriculum murder loudoun county parent alisha brand told fox friend parent loudoun county virginia say become target violent threat social medium speaking school district lgbtq agenda said wanted curb stomp loudoun county mom army parent cofounder alisha brand told fox friend'

#### Flag text based on words

In [133]:
# Load the bias lexicon: one lower-cased entry per line of the file.
with open("biasDataset.csv", "r") as bias_file:
    bias_words = [line.lower() for line in bias_file.read().split("\n")]

# Drop every blank or whitespace-only entry. `list.remove('')` only removed
# the first empty string and raised ValueError when the file had none;
# any blank entry left behind matches every text in a substring search.
bias_words = [word for word in bias_words if word.strip()]

In [147]:
# Search for occurrences of the bias words in the original article and in
# its preprocessed (lemmatized) form.

# Build both haystacks once instead of once per bias word.
article_text = article.lower().strip()
lemmatized_text = " ".join(lemmatized_tokens)

# NOTE(review): `preprocessed` is not used by the matching below; it is kept
# so the name stays defined for any later cell that reads it.
preprocessed = preprocess_text(" ".join(bias_words))

# Candidate terms: stripped, without blank entries (an empty string is a
# substring of every text), de-duplicated while keeping lexicon order.
bias_terms = list(dict.fromkeys(
    word.strip() for word in bias_words if word.strip()
))

# Match whole terms only: the lookarounds require that the term is not
# embedded in a longer word, so a term no longer matches as a fragment of
# another word. Multi-word terms are still supported.
term_patterns = {
    term: re.compile(rf"(?<!\w){re.escape(term)}(?!\w)") for term in bias_terms
}

bias_words_in_article = [
    term for term in bias_terms if term_patterns[term].search(article_text)
]
bias_words_in_article_processed = [
    term for term in bias_terms if term_patterns[term].search(lemmatized_text)
]

# Create union (order-preserving, so the result is stable between runs)
bias_words_in_article = list(
    dict.fromkeys(bias_words_in_article + bias_words_in_article_processed)
)

In [148]:
# Display the bias words that were found in the article
bias_words_in_article

['forcing', 'raged', 'investigation']

In [123]:
# Display the matched bias words again (duplicate of the previous cell)
bias_words_in_article

['raged', 'forcing', 'investigation ', '']

In [195]:
import requests

# Embeddings and chat-completions endpoints.
url_embedded = "https://openai-api.meetings.bio/api/openai/embeddings"
url_completion = "https://openai-api.meetings.bio/api/openai/chat/completions"

# Model to request; use "text-embedding-ada-002" together with the
# embeddings endpoint.
model = "gpt-3.5-turbo"

# Load the bearer token from a local file instead of hard-coding it in the
# notebook, where it ends up in version control and shared exports.
# NOTE(review): the token that used to be written inline here should be
# treated as leaked and rotated.
with open("gpt4_token", "r") as token_file:
    token = token_file.read().strip()

class GPT4:
    """Small HTTP client for a chat-completions or embeddings endpoint.

    Attributes:
        model: Model name sent with every request.
        url: Endpoint the requests are posted to.
        token: Bearer token used for the ``Authorization`` header.
    """

    def __init__(self, url, token, model="gpt-3.5-turbo"):
        self.model = model
        self.url = url
        self.token = token

    def post_request(self, prompt, role="user"):
        """POST *prompt* to the configured endpoint and return the response.

        For the "gpt-3.5-turbo" model the prompt is sent as a single chat
        message with the given *role*; for any other model it is sent as the
        ``input`` field.

        Args:
            prompt: Text to send.
            role: Role of the chat message (only used for the chat model).

        Returns:
            The ``requests`` response object, unmodified.
        """
        # Use the instance's model (not a module-level global) so that the
        # value passed to the constructor is the one actually requested.
        if self.model == "gpt-3.5-turbo":
            payload = {
                "model": self.model,
                "messages": [{"role": role, "content": prompt}],
            }
        else:
            payload = {
                "model": self.model,
                "input": prompt,
            }

        return requests.post(
            self.url,
            headers={"Authorization": f"Bearer {self.token}"},
            json=payload,
        )

    def print_response(self, response):
        """Print the content of *response*.

        On success, prints the first choice's message content for the chat
        model, or the full decoded JSON for any other model. On failure,
        prints the response object followed by its body.

        Args:
            response: Object returned by :meth:`post_request`.
        """
        if not response.ok:
            print(response)
            # The body of an error response usually carries the reason for
            # the failure, which the bare status line does not show.
            print(response.text)
            return

        if self.model == "gpt-3.5-turbo":
            print(response.json()["choices"][0]["message"]["content"])
        else:
            print(response.json())


In [196]:
# Build a client for the chat-completions endpoint, ask for a political
# bias score of the preprocessed article, and print the reply.
gpt = GPT4(url=url_completion, token=token, model=model)
res = gpt.post_request(
    "Determine the political bias of the following text - "
    "return a score on the interval [-1, 1], "
    "where -1 means strong leftism and 1 means strong rightism: "
    f"{' '.join(lemmatized_tokens)}"
)
gpt.print_response(res)

<Response [400]>


In [181]:
# Preview the first 50 preprocessed tokens, separated by single spaces
print(*lemmatized_tokens[:50])

virginia mom say activist threatened curb stomp speaking school curriculum murder loudoun county parent alisha brand told fox friend parent loudoun county virginia say become target violent threat social medium speaking school district lgbtq agenda said wanted curb stomp loudoun county mom army parent cofounder alisha brand told fox friend
