In [1]:
# Suppress all Python warnings so library noise does not clutter the cell outputs
import warnings
warnings.filterwarnings('ignore')

In [2]:
import requests
from bs4 import BeautifulSoup
import re

def _split_authors(byline):
    """Split a byline such as 'A, B and C' into a list of individual names."""
    # Split on commas and on the standalone word 'and' only, so names that merely
    # contain the letters 'and' (Sandra, Alexander, ...) are left intact.
    parts = re.split(r'\s*(?:,|\band\b)\s*', byline.strip())
    return [name for name in parts if name]


def scrape_site(url, timeout=10):
    """Download an article page and extract its headline, paragraphs and authors.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before `requests` gives up (default 10).
        Without a timeout a stalled connection would block the notebook forever.

    Returns
    -------
    tuple
        ``(header, content, author_lst)`` where ``header`` is the stripped text of
        the first ``<h1>`` (``''`` when the page has none), ``content`` is a list
        with the stripped text of every ``<p>`` (non-breaking spaces replaced by
        plain spaces) and ``author_lst`` is a list of author names, or
        ``['Author not found']`` when no byline could be located.
        ``(None, None, None)`` is returned when the status code is not 200.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None, None, None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract header; find() returns None when the page has no <h1>.
    h1_tag = soup.find('h1')
    header = h1_tag.get_text().strip() if h1_tag is not None else ''

    # Extract content
    content = [tag.get_text().strip().replace('\xa0', ' ') for tag in soup.find_all('p')]

    # Find the keyword 'By' to extract the author's name.
    # NOTE(review): the character class includes whitespace, so the match can run
    # past the end of the byline line, and any capitalised 'By' in the page text
    # qualifies; results should be treated as a heuristic.
    match = re.search(r'\bBy\s+([A-Za-z\s.,]+)', soup.get_text())
    author_lst = _split_authors(match.group(1)) if match else []
    if not author_lst:
        author_lst = ['Author not found']
    return header, content, author_lst

# Scrape one example article. `header`, `content` and `authors` become
# notebook-level globals; later cells (political_bias_score, sentiment_score)
# read `header` from this global scope.
url = "https://www.cnn.com/2024/01/17/politics/biden-ukraine-white-house-meeting/index.html"
header, content, authors = scrape_site(url)

In [3]:
import context_veracity
def context_veracity_score(content):
    """Return the contextual-drift score of an article.

    The paragraphs in ``content`` are joined with single spaces, the joined text
    is passed to the sentiment, topic and NER shift measures of the
    ``context_veracity`` module, and the three results are combined by
    ``context_veracity.calculate_contextual_drift``.
    """
    article_text = " ".join(content)
    shift_measures = (
        context_veracity.sentiment_shift,
        context_veracity.topic_shift,
        context_veracity.ner_shift,
    )
    # Order matters: calculate_contextual_drift receives (sentiment, topic, ner).
    shift_scores = [measure(article_text) for measure in shift_measures]
    return context_veracity.calculate_contextual_drift(*shift_scores)
# print(f"The context veracity score of the article is {context_veracity_score(content)}")

2024-02-20 07:07:25.544374: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
import political_bias
import numpy as np
def political_bias_score(content, title=None):
    """Return the mean predicted political-bias label over an article's paragraphs.

    Parameters
    ----------
    content : list of str
        Paragraphs of the article; each one is scored separately.
    title : str, optional
        Headline passed to ``political_bias.preprocess_article`` together with
        each paragraph. When omitted, the notebook-global ``header`` is used,
        which is what this function always did before the parameter existed.
        Pass it explicitly when scoring text that does not belong to the
        article currently held in ``header``.

    Returns
    -------
    float
        ``np.mean`` of the first element of every predicted label
        (``nan`` when ``content`` is empty).
    """
    if title is None:
        # Backward-compatible fallback to the hidden global dependency.
        title = header
    political_bias.download_pretrained_model()
    pred_labels = [
        political_bias.predict_label(political_bias.preprocess_article(title, paragraph))[0]
        for paragraph in content
    ]
    return np.mean(pred_labels)
# print("The political bias score of the article is ", political_bias_score(content))

[nltk_data] Downloading package stopwords to /home/zhj003/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/zhj003/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/zhj003/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
from sentiment import SentimentModel
import pickle
import numpy as np
def sentiment_score(content, title=None):
    """Return the mean predicted sentiment label over an article's paragraphs.

    Parameters
    ----------
    content : list of str
        Paragraphs of the article; each one is scored separately.
    title : str, optional
        Headline passed to the model's ``predict_article`` together with each
        paragraph. When omitted, the notebook-global ``header`` is used, which
        is what this function always did before the parameter existed. Pass it
        explicitly when scoring text unrelated to the article in ``header``.

    Returns
    -------
    float
        ``np.mean`` of the first element of every prediction
        (``nan`` when ``content`` is empty).
    """
    if title is None:
        # Backward-compatible fallback to the hidden global dependency.
        title = header
    # NOTE: pickle.load can execute arbitrary code; only load model files that
    # come from a trusted source.
    with open('models/sentiM.pkl', 'rb') as f:
        sentM = pickle.load(f)
    pred_labels = [sentM.predict_article(title, paragraph)[0] for paragraph in content]
    return np.mean(pred_labels)
# print("The sentiment score of the article is ", sentiment_score(content))

In [6]:
import html
import re
import credibility
import numpy as np
import pandas as pd
import pickle
def credibility_score(authors):
    """Score the credibility of an article's authors.

    For every author a Wikipedia search is issued through
    ``credibility.search_wikipedia`` (15 results requested), the results are
    embedded with ``credibility.text_embedding`` (only the first 50 embedding
    columns are kept) and fed to the pickled credibility model. Each author's
    score is printed.

    Returns the single score when the model yields exactly one prediction,
    otherwise the mean of all predictions.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
    with open('models/credibility_model.pkl', 'rb') as model_file:
        cred_model = pickle.load(model_file)

    search_results = [
        credibility.search_wikipedia(author, num_results=15) for author in authors
    ]

    search_pd = pd.DataFrame(search_results, columns=['text'])
    embedded_result = credibility.text_embedding(search_pd['text'])[:, :50]
    cred_scores = cred_model.predict(embedded_result)

    if len(cred_scores) != 1:
        for position, author in enumerate(authors):
            print(f'The score of {author} is {cred_scores[position]}')
        return np.mean(cred_scores)

    print(f'The score of {authors[0]} is {cred_scores[0]}')
    return cred_scores[0]
# print(f'The credibility score is {credibility_score(authors)}')

In [14]:
import text_manipulation
import numpy as np
def style_score(content):
    """Return the mean text-manipulation (style) prediction over the paragraphs.

    ``text_manipulation.download_pretrained_model`` is invoked first, then every
    paragraph in ``content`` is passed to ``text_manipulation.predict`` and the
    predictions are averaged with ``np.mean``.
    """
    text_manipulation.download_pretrained_model()
    paragraph_labels = [text_manipulation.predict(paragraph) for paragraph in content]
    return np.mean(paragraph_labels)
# print("The text manipulation (style) score of the article is ", style_score(content))

In [8]:
from spam import SpamModel
import pickle
def spam_score(header):
    """Return 1 when the pickled spam model flags the headline, otherwise 0.

    The model is loaded from ``models/spamM.pkl`` on every call and the first
    element of ``predict_article(header)`` is interpreted by its truthiness.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
    with open('models/spamM.pkl', 'rb') as model_file:
        spam_model = pickle.load(model_file)
    is_spam = spam_model.predict_article(header)[0]
    return 1 if is_spam else 0
# print("The spam score of the article headline is ", spam_score(header))

In [9]:
import pandas as pd
# Load the headerless TSV, discard the first column (position 0) and give the
# remaining columns 1..15 descriptive names, all in one chained expression.
df = (
    pd.read_csv('original_files/test2.tsv', delimiter='\t', header=None)
    .drop(columns=[0])
    .rename(columns=dict(enumerate(
        [
            'id', 'label', 'statement', 'subject', 'speaker', 'job-title',
            'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts',
            'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context',
            'justification',
        ],
        start=1,
    )))
)

In [10]:
# Integer encoding of the six truthfulness labels: 'true' -> 0 ... 'pants-fire' -> 5.
# NOTE(review): 'half-true' (3) is encoded as less truthful than 'barely-true' (2);
# confirm this ordering is intended before treating the codes as ordinal.
label_map = dict(zip(
    ['pants-fire', 'false', 'half-true', 'barely-true', 'mostly-true', 'true'],
    range(5, -1, -1),
))

# Labels that are not keys of label_map are left unchanged by replace().
df['label'] = df['label'].replace(label_map)

In [18]:
import warnings
warnings.filterwarnings('ignore')
from IPython.display import clear_output

def _score_or_none(score_fn, argument, label, row_index, errors):
    """Run one scorer; on failure record the error and return None instead of raising."""
    try:
        return score_fn(argument)
    except Exception as exc:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops this long-running loop.
        errors.append((row_index, label, repr(exc)))
        return None


# One [political bias, sentiment, credibility, style] row per row of `df`.
scores = []
# (row index, scorer name, error repr) for every scorer call that failed, so
# missing values in `scores` can be traced back instead of vanishing silently.
score_errors = []

# NOTE(review): political_bias_score and sentiment_score read the notebook-global
# `header`, which is not updated per row in this loop - confirm that is intended.
for index, row in df.iterrows():
    content = [row['statement']]
    authors = [row['speaker']]

    pb_score = _score_or_none(political_bias_score, content, 'political_bias', index, score_errors)
    sent_score = _score_or_none(sentiment_score, content, 'sentiment', index, score_errors)
    cred_score = _score_or_none(credibility_score, authors, 'credibility', index, score_errors)
    # The result is bound to `sty_score`; the function `style_score` itself must
    # never be reassigned, otherwise every later row fails to score.
    sty_score = _score_or_none(style_score, content, 'style', index, score_errors)

    scores.append([pb_score, sent_score, cred_score, sty_score])

    if index % 50 == 0:
        clear_output(wait=True)
        print(f"Running at iteration {index}")

Running at iteration 1250
File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of mike-pence is 5.150200000000001
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of rand-paul is 5.494199999999999
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of don-balfour is 5.2904
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of hugh-fitzsimons is 5.374599999999999
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of donald-trump is 5.273400000000001
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of barbara-buono is 5.717200000000002
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of chain-email is 5.1048
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of barack-obama is 5.6758000000000015
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of hillary-clinton is 5.1206
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of rick-santorum is 4.9544
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of menendez-facts is 5.186200000000001
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of rick-scott is 5.626
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of jay-nixon is 5.4934
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of mackubin-thomas-owens is 5.2986
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of john-kasich is 5.356400000000001
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of john-burzichelli is 4.803000000000001
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Display the collected [political bias, sentiment, credibility, style] rows.
scores

[[1.0, 0.0, 5.0855, 1.0],
 [1.0, 0.0, 5.239599999999999, 1.0],
 [1.0, 0.0, 5.19, 1.0],
 [1.0, 0.0, 5.3351999999999995, 1.0],
 [2.0, 0.0, 4.8916, 1.0],
 [1.0, 0.0, 5.9112, 1.0],
 [1.0, 0.0, 5.323670588235293, 1.0],
 [2.0, 0.0, 5.044799999999999, 1.0],
 [2.0, 0.0, 5.016400000000001, 1.0],
 [2.0, 0.0, 5.4174000000000015, 0.0],
 [2.0, 0.0, 5.0168, 1.0],
 [2.0, 0.0, 5.0718, 1.0],
 [1.0, 0.0, 4.743400000000001, 1.0],
 [1.0, 0.0, 5.2276, 1.0],
 [1.0, 0.0, 5.610400000000001, 1.0],
 [1.0, 0.0, 4.9958, 1.0],
 [1.0, 0.0, 5.044999999999999, 1.0],
 [1.0, 0.0, 6.104000000000001, 1.0],
 [1.0, 0.0, 5.1206, 1.0],
 [2.0, 0.0, 5.540000000000001, 1.0],
 [1.0, 0.0, 5.0367999999999995, 1.0],
 [2.0, 0.0, 5.666400000000001, 1.0],
 [2.0, 0.0, 5.1188, 1.0],
 [1.0, 0.0, 4.601600000000001, 1.0],
 [1.0, 0.0, 4.9932, 1.0],
 [2.0, 0.0, 5.436199999999999, 1.0],
 [1.0, 0.0, 5.1008, 1.0],
 [1.0, 0.0, 5.0636, 1.0],
 [1.0, 0.0, 5.279000000000001, 1.0],
 [2.0, 0.0, 5.156400000000001, 1.0],
 [2.0, 0.0, 5.554800000000003, 1

In [20]:
SCORE_COLUMNS = ['Political_Bias', 'Sentiment', 'Credibility', 'Style']

# Fail loudly rather than letting concat pad a short score list with NaN rows.
if len(scores) != len(df):
    raise ValueError(
        f"Expected one score row per statement, got {len(scores)} scores for {len(df)} rows"
    )

# Reuse df's index so the scores line up with their rows even if df is not
# indexed 0..n-1.
scores_df = pd.DataFrame(scores, columns=SCORE_COLUMNS, index=df.index)

# Drop score columns left over from a previous run of this cell so re-running
# it replaces them instead of appending duplicate columns.
df = pd.concat([df.drop(columns=SCORE_COLUMNS, errors='ignore'), scores_df], axis=1)

In [21]:
# Write the frame (including any score columns added above) as a tab-separated
# file without the index column.
df.to_csv('original_files/test2_score.tsv', sep='\t', index=False)