In [4]:
# Silence every Python warning from here on (presumably to keep the cell
# outputs below readable).
# NOTE(review): this also hides warnings that may point at real problems
# (deprecations, numerical issues) -- confirm that blanket suppression is wanted.
import warnings
warnings.filterwarnings('ignore')

In [5]:
import requests
from bs4 import BeautifulSoup
import re

def scrape_site(url, timeout=10):
    """Download an article page and extract its headline, paragraphs and authors.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up. Without a
        timeout ``requests.get`` can block forever on an unresponsive host.

    Returns
    -------
    tuple
        ``(header, content, author_lst)`` where

        * ``header`` is the stripped text of the first ``<h1>`` (``''`` when
          the page has no ``<h1>``),
        * ``content`` is a list with the stripped text of every ``<p>``
          (non-breaking spaces replaced by plain spaces),
        * ``author_lst`` is a list of author names taken from the first
          ``By ...`` byline in the page text, or ``['Author not found']``
          when no byline can be found.

        ``(None, None, None)`` is returned when the HTTP status is not 200.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None, None, None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract header; a page without <h1> must not crash the scraper.
    h1_tag = soup.find(['h1'])
    header = h1_tag.get_text().strip() if h1_tag is not None else ''

    # Extract content
    content = [tag.get_text().strip().replace('\xa0', ' ') for tag in soup.find_all(['p'])]

    # Find the keyword 'By' to extract the author's name.
    # Note: ``\s`` in the character class also matches newlines, so the capture
    # can run past the end of the byline line.
    match = re.search(r'\bBy\s+([A-Za-z\s.,]+)', soup.get_text())
    if not match:
        return header, content, ['Author not found']

    # Split on commas and on the standalone word "and" only. A plain
    # str.replace('and', ',') would also cut names such as "Sandy" or
    # "Alexander" into pieces.
    raw_names = re.split(r'\s*(?:,|\band\b)\s*', match.group(1))
    # Collapse internal whitespace runs and drop empty fragments.
    author_lst = [' '.join(name.split()) for name in raw_names]
    author_lst = [name for name in author_lst if name]
    if not author_lst:
        author_lst = ['Author not found']
    return header, content, author_lst

# Scrape one article and keep the results as notebook-level names: later cells
# pass `header`, `content` and `authors` to the scoring functions, and some
# scoring functions read `header` directly as a global.
# scrape_site returns (None, None, None) on a non-200 response, in which case
# all three names are None.
url = "https://www.cnn.com/2024/01/17/politics/biden-ukraine-white-house-meeting/index.html"
header, content, authors = scrape_site(url)

In [3]:
import context_veracity
def context_veracity_score(content):
    """Compute the contextual-drift score of an article.

    The paragraphs in ``content`` are joined with single spaces; the joined
    text is handed to the sentiment, topic and NER shift helpers of the
    ``context_veracity`` module, and the three results are combined by
    ``context_veracity.calculate_contextual_drift``, whose return value is
    passed back unchanged.
    """
    article_text = " ".join(content)
    sentiment = context_veracity.sentiment_shift(article_text)
    topic = context_veracity.topic_shift(article_text)
    entities = context_veracity.ner_shift(article_text)
    return context_veracity.calculate_contextual_drift(sentiment, topic, entities)
# Score the paragraphs held in the notebook-level `content` (assigned by the scraping cell).
print(f"The context veracity score of the article is {context_veracity_score(content)}")

2024-02-20 04:20:24.786200: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


config.json:   0%|          | 0.00/759 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

[nltk_data] Downloading package words to /home/zhj003/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/zhj003/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/zhj003/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /home/zhj003/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/zhj003/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/zhj003/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/zhj003/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_

The context veracity score of the article is 1.4000000000000001


In [5]:
import political_bias
import numpy as np
def political_bias_score(content, headline=None):
    """Average the per-paragraph political-bias labels of an article.

    Parameters
    ----------
    content : iterable of str
        Article paragraphs; each one is classified separately.
    headline : str, optional
        Article headline passed to ``political_bias.preprocess_article``
        together with every paragraph. When omitted, the notebook-level
        ``header`` variable is used, which is what this function always did
        before the parameter existed.

    Returns
    -------
    The ``np.mean`` of the first element of every
    ``political_bias.predict_label`` result.
    """
    if headline is None:
        # Backward-compatible fallback to the global set by the scraping cell.
        headline = header
    political_bias.download_pretrained_model()
    pred_labels = []
    for paragraph in content:
        processed_article = political_bias.preprocess_article(headline, paragraph)
        # predict_label returns an indexable result; only its first item is used.
        pred_labels.append(political_bias.predict_label(processed_article)[0])
    return np.mean(pred_labels)
# Score the paragraphs in the notebook-level `content`; the function also reads the global `header`.
print("The political bias score of the article is ", political_bias_score(content))

[nltk_data] Downloading package stopwords to /home/zhj003/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/zhj003/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/zhj003/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Downloading...
From (original): https://drive.google.com/uc?id=1PX2zVyPMfs0v7wxRzx7w2h-yW1h1Ti7B
From (redirected): https://drive.google.com/uc?id=1PX2zVyPMfs0v7wxRzx7w2h-yW1h1Ti7B&confirm=t&uuid=330d8daf-ed8b-4ec7-b940-0e10200d6d89
To: /home/zhj003/private/DSC-180B-pipeline/models/poli_bias_bert.pkl
100%|██████████| 438M/438M [00:08<00:00, 49.7MB/s] 


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

The political bias score of the article is  1.0


In [6]:
import pickle
import numpy as np
def sentiment_score(content, headline=None):
    """Average the per-paragraph sentiment labels of an article.

    Parameters
    ----------
    content : iterable of str
        Article paragraphs; each one is classified separately.
    headline : str, optional
        Article headline passed to the model's ``predict_article`` together
        with every paragraph. When omitted, the notebook-level ``header``
        variable is used, which is what this function always did before the
        parameter existed.

    Returns
    -------
    The ``np.mean`` of the first element of every ``predict_article`` result.
    """
    if headline is None:
        # Backward-compatible fallback to the global set by the scraping cell.
        headline = header
    # SECURITY: pickle.load can execute arbitrary code -- only load model
    # files that come from a trusted source.
    with open('models/sentiM.pkl', 'rb') as model_file:
        sentiment_model = pickle.load(model_file)
    pred_labels = [
        sentiment_model.predict_article(headline, paragraph)[0]
        for paragraph in content
    ]
    return np.mean(pred_labels)
# Score the paragraphs in the notebook-level `content`; the function also reads the global `header`.
print("The sentiment score of the article is ", sentiment_score(content))

The sentiment score of the article is  0.0


In [7]:
import html
import re
import credibility
import numpy as np
import pandas as pd
import pickle
def credibility_score(authors):
    """Score the credibility of an article's authors.

    For every entry of ``authors`` a Wikipedia search is run through
    ``credibility.search_wikipedia`` (15 results requested). The search
    results are embedded with ``credibility.text_embedding``, the first 50
    embedding columns are kept, and the pickled credibility model predicts
    one score per row. Each author's score is printed.

    Returns the single score when the model yields exactly one prediction,
    otherwise the ``np.mean`` of all predictions.
    """
    # pickle.load can execute arbitrary code; the model file must be trusted.
    with open('models/credibility_model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    wiki_hits = [credibility.search_wikipedia(name, num_results=15) for name in authors]

    hits_frame = pd.DataFrame(wiki_hits, columns=['text'])
    # Only the first 50 embedding dimensions are fed to the model.
    features = credibility.text_embedding(hits_frame['text'])[:, :50]
    cred_scores = model.predict(features)

    # Single prediction: report it and return it as-is (no averaging).
    if len(cred_scores) == 1:
        print(f'The score of {authors[0]} is {cred_scores[0]}')
        return cred_scores[0]

    for idx in range(len(authors)):
        print(f'The score of {authors[idx]} is {cred_scores[idx]}')
    return np.mean(cred_scores)
# Score the notebook-level `authors` list (assigned by the scraping cell);
# credibility_score also prints one line per author before this summary line.
print(f'The credibility score is {credibility_score(authors)}')

The score of MJ Lee is 5.8026
The score of Michael Williams is 5.769399999999999
The score of Ted  Barrett is 5.301070588235294
The score of Donald Judd is 5.2204
The score of CNN is 4.928
The average score is 5.4042941176470585


In [4]:
import text_manipulation
import numpy as np
def style_score(content):
    """Average the per-paragraph text-manipulation predictions of an article.

    ``text_manipulation.download_pretrained_model`` is called first, then
    every paragraph in ``content`` is passed to ``text_manipulation.predict``
    and the ``np.mean`` of the predictions is returned.
    """
    text_manipulation.download_pretrained_model()
    paragraph_labels = [text_manipulation.predict(paragraph) for paragraph in content]
    return np.mean(paragraph_labels)
# Score the paragraphs held in the notebook-level `content` (assigned by the scraping cell).
print("The text manipulation (style) score of the article is ", style_score(content))

File 'models/txt_manipulation_model.pt' already exists. No download needed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['robert

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The text manipulation (style) score of the article is  0.8666666666666667


In [8]:
from spam import SpamModel
import pickle
def spam_score(header):
    """Return 1 when the pickled spam model flags ``header``, otherwise 0.

    The model is loaded from ``models/spamM.pkl`` on every call; the first
    element of ``predict_article(header)`` is interpreted by truthiness.
    """
    # pickle.load can execute arbitrary code; the model file must be trusted.
    with open('models/spamM.pkl', 'rb') as model_file:
        spam_model = pickle.load(model_file)
    is_spam = spam_model.predict_article(header)[0]
    return 1 if is_spam else 0
# Classify the notebook-level `header` (the headline assigned by the scraping cell).
print("The spam score of the article headline is ", spam_score(header))

The spam score of the article headline is  1


In [23]:
import pickle

def source_reliability_score(content):
    """Average the per-paragraph source-reliability labels of an article.

    The model is loaded from ``models/srcM.pkl``; for every paragraph in
    ``content`` the first element of ``predict_text(paragraph)`` is collected
    and the ``np.mean`` of those values is returned.

    Note: this cell imports only ``pickle``; ``np`` must already have been
    imported by an earlier cell.
    """
    # pickle.load can execute arbitrary code; the model file must be trusted.
    with open('models/srcM.pkl', 'rb') as model_file:
        source_model = pickle.load(model_file)
    paragraph_labels = [source_model.predict_text(paragraph)[0] for paragraph in content]
    return np.mean(paragraph_labels)
# Score the paragraphs held in the notebook-level `content` (assigned by the scraping cell).
print("The source reliability score of the article is ", source_reliability_score(content))

The source reliability score of the article is  2.6


In [24]:
import pickle

def clickbait_score(header):
    """Return the clickbait label the pickled model assigns to ``header``.

    The model is loaded from ``models/clickM.pkl``; ``predict_text`` must
    return exactly two values, of which only the first element of the first
    value is returned. The second value is ignored.
    """
    # pickle.load can execute arbitrary code; the model file must be trusted.
    with open('models/clickM.pkl', 'rb') as model_file:
        click_model = pickle.load(model_file)
    labels, _probabilities = click_model.predict_text(header)
    return labels[0]
# Classify the notebook-level `header` (the headline assigned by the scraping cell).
print("The clickbait score of the article is ", clickbait_score(header))

The clickbait score of the article is  1
