In [6]:
# Silence every Python warning so library noise does not clutter cell outputs.
# NOTE(review): this also hides pandas/transformers deprecation warnings.
import warnings
warnings.filterwarnings('ignore')

In [7]:
import requests
from bs4 import BeautifulSoup
import re

def scrape_site(url, timeout=10):
    """Download a web page and pull out its headline, paragraphs and byline authors.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        A ``(header, content, author_lst)`` tuple:
        * ``header`` - stripped text of the first ``<h1>``, or ``''`` if the
          page has none.
        * ``content`` - list with the stripped text of every ``<p>`` tag,
          non-breaking spaces replaced by plain spaces.
        * ``author_lst`` - names found after the first standalone "By" in the
          page text, or ``['Author not found']`` when no byline is detected.
        ``(None, None, None)`` is returned when the status code is not 200.

    Raises:
        requests.exceptions.RequestException: propagated from ``requests.get``
            (connection failure, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None, None, None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract header; a page without <h1> yields an empty headline instead of
    # crashing on None.get_text().
    headline_tag = soup.find(['h1'])
    header = headline_tag.get_text().strip() if headline_tag is not None else ''

    # Extract content
    content_tags = soup.find_all(['p'])
    content = [tag.get_text().strip().replace('\xa0', ' ') for tag in content_tags]

    # Find the keyword 'By' to extract the author's name
    page_text = soup.get_text()
    match = re.search(r'\bBy\s+([A-Za-z\s.,]+)', page_text)
    if not match:
        return header, content, ['Author not found']

    # Split on commas and on the standalone word "and" only; a plain
    # str.replace('and', ',') would cut names such as "Sandra" or "Alexander".
    raw_names = re.split(r',|\band\b', match.group(1))
    author_lst = [name.strip() for name in raw_names if name.strip()]
    return header, content, author_lst or ['Author not found']

# Scrape one sample article. The resulting module-level `header`, `content`
# and `authors` are notebook globals: `political_bias_score` reads `header`
# directly, and the batch-scoring loop later rebinds `content` and `authors`.
url = "https://www.cnn.com/2024/01/17/politics/biden-ukraine-white-house-meeting/index.html"
header, content, authors = scrape_site(url)

In [8]:
import context_veracity
def context_veracity_score(content):
    """Compute the contextual-drift score for an article.

    The paragraphs in ``content`` are joined with single spaces, the joined
    text is passed to ``context_veracity.sentiment_shift``, ``topic_shift``
    and ``ner_shift`` (in that order), and the three results are handed to
    ``context_veracity.calculate_contextual_drift``.

    Args:
        content: Iterable of paragraph strings.

    Returns:
        Whatever ``calculate_contextual_drift`` returns for the three shifts.
    """
    full_text = " ".join(content)
    shifts = (
        context_veracity.sentiment_shift(full_text),
        context_veracity.topic_shift(full_text),
        context_veracity.ner_shift(full_text),
    )
    return context_veracity.calculate_contextual_drift(*shifts)
# print(f"The context veracity score of the article is {context_veracity_score(content)}")

2024-03-05 17:58:03.345235: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
import political_bias
import numpy as np
def political_bias_score(content, article_header=None):
    """Average the predicted political-bias label over an article's paragraphs.

    Args:
        content: Iterable of paragraph strings.
        article_header: Headline paired with every paragraph during
            preprocessing. When omitted, the notebook-global ``header`` is
            used, which is the original behaviour. Pass it explicitly when
            scoring anything other than the article that global was scraped
            from; otherwise every paragraph is paired with an unrelated
            headline.

    Returns:
        ``np.mean`` of the first element of each ``political_bias.predict_label``
        result (``nan`` for empty ``content``).
    """
    if article_header is None:
        # Backward-compatible fallback to the module-level headline.
        article_header = header

    political_bias.download_pretrained_model()
    pred_labels = []
    for paragraph in content:
        processed_article = political_bias.preprocess_article(article_header, paragraph)
        label = political_bias.predict_label(processed_article)
        pred_labels.append(label[0])
    return np.mean(pred_labels)
# print("The political bias score of the article is ", political_bias_score(content))

[nltk_data] Downloading package stopwords to /home/zhj003/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/zhj003/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/zhj003/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [10]:
# from sentiment import SentimentModel
# import pickle
# import numpy as np
# def sentiment_score(content):
#     with open('models/sentiM.pkl', 'rb') as f:
#         sentM = pickle.load(f)
#     pred_labels = []
#     for paragraph in content:
#         label = sentM.predict_article(header, paragraph)[0]
#         pred_labels.append(label)
#     sent_score = np.mean(pred_labels)
#     return sent_score
# # print("The sentiment score of the article is ", sentiment_score(content))

In [11]:
import torch
from transformers import pipeline
def sentiment_score(content):
    """Average a 0/1/2 sentiment code over an article's paragraphs.

    Each paragraph is classified with the
    ``lxyuan/distilbert-base-multilingual-cased-sentiments-student`` pipeline;
    the top label is mapped to ``positive -> 0``, ``negative -> 2`` and
    anything else -> 1.

    The pipeline is built on the first call and cached on the function object,
    so calling this once per row in a loop no longer reloads the model each
    time.

    Args:
        content: Iterable of paragraph strings.

    Returns:
        ``np.mean`` of the per-paragraph codes (``nan`` for empty ``content``).
    """
    classifier = getattr(sentiment_score, "_classifier", None)
    if classifier is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        classifier = pipeline(
            model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
            return_all_scores=False,
            device=device
        )
        sentiment_score._classifier = classifier

    label_codes = {'positive': 0, 'negative': 2}
    pred_labels = [
        # Any label other than positive/negative is scored as 1.
        label_codes.get(classifier(paragraph)[0]['label'], 1)
        for paragraph in content
    ]
    return np.mean(pred_labels)

In [12]:
import html
import re
import credibility
import numpy as np
import pandas as pd
import pickle
def credibility_score(authors):
    """Score author credibility from Wikipedia search results.

    For every author, ``credibility.search_wikipedia`` is queried with
    ``num_results=15``. The results are embedded with
    ``credibility.text_embedding``, truncated to the first 50 columns, and fed
    to the pickled model in ``models/credibility_model.pkl``. Each author's
    score is printed.

    Args:
        authors: Sequence of author names.

    Returns:
        The single prediction when exactly one score is produced, otherwise
        ``np.mean`` of all predictions.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
    with open('models/credibility_model.pkl', 'rb') as model_file:
        cred_model = pickle.load(model_file)

    search_results = [
        credibility.search_wikipedia(author, num_results=15) for author in authors
    ]
    search_pd = pd.DataFrame(search_results, columns=['text'])
    embedded_result = credibility.text_embedding(search_pd['text'])[:, :50]
    cred_scores = cred_model.predict(embedded_result)

    if len(cred_scores) != 1:
        for position, author in enumerate(authors):
            print(f'The score of {author} is {cred_scores[position]}')
        return np.mean(cred_scores)

    print(f'The score of {authors[0]} is {cred_scores[0]}')
    return cred_scores[0]
# print(f'The credibility score is {credibility_score(authors)}')

In [13]:
import text_manipulation
import numpy as np
def style_score(content):
    """Average the text-manipulation prediction over an article's paragraphs.

    Calls ``text_manipulation.download_pretrained_model()`` first, then
    ``text_manipulation.predict`` on every paragraph.

    Args:
        content: Iterable of paragraph strings.

    Returns:
        ``np.mean`` of the per-paragraph predictions.
    """
    text_manipulation.download_pretrained_model()
    paragraph_labels = [text_manipulation.predict(paragraph) for paragraph in content]
    return np.mean(paragraph_labels)
# print("The text manipulation (style) score of the article is ", style_score(content))

In [14]:
from spam import SpamModel
import pickle
def spam_score(header):
    """Return 1 if the pickled spam model flags the headline, otherwise 0.

    Loads the model from ``models/spamM.pkl`` and uses the first element of
    ``predict_article(header)``; any truthy value maps to 1.

    Args:
        header: Headline text to classify.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
    with open('models/spamM.pkl', 'rb') as model_file:
        spam_model = pickle.load(model_file)
    flagged = spam_model.predict_article(header)[0]
    return 1 if flagged else 0
# print("The spam score of the article headline is ", spam_score(header))

In [15]:
def source_reliability_score(content):
    """Average the source-reliability label over an article's paragraphs.

    Loads the model from ``models/srcM.pkl`` and takes the first element of
    ``predict_text(paragraph)`` for every paragraph.

    Args:
        content: Iterable of paragraph strings.

    Returns:
        ``np.mean`` of the per-paragraph labels.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
    with open('models/srcM.pkl', 'rb') as model_file:
        source_model = pickle.load(model_file)
    paragraph_labels = [source_model.predict_text(paragraph)[0] for paragraph in content]
    return np.mean(paragraph_labels)

In [16]:
def clickbait_score(header):
    """Return the clickbait label the pickled model predicts for a headline.

    Loads the model from ``models/clickM.pkl``; ``predict_text`` returns a
    ``(label, proba)`` pair, of which only ``label[0]`` is used.

    Args:
        header: Headline text to classify.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
    with open('models/clickM.pkl', 'rb') as model_file:
        click_model = pickle.load(model_file)
    prediction = click_model.predict_text(header)
    label, _proba = prediction  # probability is not needed here
    return label[0]

In [29]:
import pandas as pd
# Load the combined PolitiFact CSV. Later cells read its `label`,
# `documented_time`, `content` and `speaker` columns.
df = pd.read_csv('original_files/politifact_data_combined.csv')

# df = df.drop(columns = [0])
# df.rename({1: 'id', 2: 'label', 3: 'statement', 4: 'subject', 5: 'speaker', 6: 'job-title',
#            7: 'state_info', 8: 'party_affiliation', 9: 'barely_true_counts', 10: 'false_counts',
#            11: 'half_true_counts', 12: 'mostly_true_counts', 13: 'pants_on_fire_counts', 14: 'context',
#            15: 'justification'
#           }, axis = 1, inplace = True)

In [18]:
# Numeric code for each truth rating: 0 = true ... 5 = pants-fire.
label_map = {
    'pants-fire': 5,
    'false': 4,
    'half-true': 3,
    'barely-true': 2,
    'mostly-true': 1,
    'true': 0,
}
# Labels that are not truth ratings and are therefore dropped.
flip_labels = {'full-flop', 'half-flip', 'no-flip'}

df['documented_time'] = pd.to_datetime(df['documented_time'])

is_truth_rating = ~df['label'].isin(flip_labels)
df = df[is_truth_rating]
df['label'] = df['label'].replace(label_map)

# Keep only statements documented in 2022 or later.
documented_recently = df['documented_time'].dt.year >= 2022
df = df[documented_recently]

In [19]:
import warnings
warnings.filterwarnings('ignore')
from IPython.display import clear_output

def _safe_score(score_fn, argument):
    """Return ``score_fn(argument)``, or ``None`` if the scorer raises.

    Only ``Exception`` is caught, so Ctrl-C / kernel interrupt still stops the
    loop instead of being swallowed.
    """
    try:
        return score_fn(argument)
    except Exception:
        return None


# Score every row; a scorer that fails contributes None for that row.
scores = []
for index, row in df.iterrows():
    content = [row['content']]
    authors = [row['speaker']]
    pb_score = _safe_score(political_bias_score, content)
    sent_score = _safe_score(sentiment_score, content)
    cred_score = _safe_score(credibility_score, authors)
    # The failure value goes to sty_score; assigning to `style_score` here
    # would replace the scorer function and break every later row.
    sty_score = _safe_score(style_score, content)
    reliability_score = _safe_score(source_reliability_score, content)
    scores.append([pb_score, sent_score, cred_score, sty_score, reliability_score])
    # `index` is the DataFrame label, not a running counter.
    if index % 50 == 0:
        clear_output(wait=True)
        print(f"Running at iteration {index}")
    

Running at iteration 3050
File 'models/poli_bias_bert.pkl' already exists. No download needed.
The score of Tom Kertscher is 4.742999999999999
File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# `scores` holds one entry per row of `df`, in iteration order. Reuse df's own
# index so the column-wise concat lines up row for row: df was filtered
# without a reset, so a default 0..n-1 index would misalign the scores and
# add NaN-filled rows.
scores_df = pd.DataFrame(
    scores,
    columns=['Political_Bias', 'Sentiment', 'Credibility', 'Style', 'Reliability'],
    index=df.index,
)
df = pd.concat([df, scores_df], axis=1)

In [21]:
# Write the scored frame to CSV; the index is not written to the file.
df.to_csv('original_files/politifact_data_2022_score.csv', index=False)

In [24]:
min_val, max_val = 4, 6

def normalization(score):
    """Min-max scale ``score`` from ``[min_val, max_val]`` into ``[0, 1]``.

    Values above ``max_val`` are clamped to 1 and values below ``min_val`` are
    clamped to 0 (previously they produced negative results). A degenerate
    range (``max_val == min_val``) yields 0.

    Args:
        score: Numeric value to rescale.

    Returns:
        The rescaled value within ``[0, 1]``.
    """
    if max_val - min_val == 0:  # Check for zero division
        return 0
    if score > max_val:
        return 1
    if score < min_val:
        return 0
    return (score - min_val) / (max_val - min_val)
normalization(5)

0.5