In [None]:
# Silence every Python warning for the rest of the kernel session so that
# cell output stays readable. Note that this also hides deprecation and
# runtime warnings raised by the libraries used in later cells.
import warnings
warnings.filterwarnings('ignore')

In [7]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np

def scrape_site(url, timeout=10):
    """Download a web page and pull out its headline, paragraphs and byline.

    Parameters
    ----------
    url : str
        Address of the article to fetch.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout``. Without a
        timeout a stalled server would block the notebook indefinitely.

    Returns
    -------
    tuple
        ``(header, content, author_lst)`` where

        * ``header`` is the stripped text of the first ``<h1>`` element, or
          ``''`` when the page has no ``<h1>``;
        * ``content`` is a list with the stripped text of every ``<p>``
          element (non-breaking spaces replaced by plain spaces);
        * ``author_lst`` is the list of names found after the first
          ``By`` keyword, or ``['Author not found']`` when no byline is found.

        ``(None, None, None)`` is returned when the HTTP status is not 200.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None, None, None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract header; a page without <h1> yields an empty header instead of
    # raising AttributeError on None.
    header_tag = soup.find(['h1'])
    header = header_tag.get_text().strip() if header_tag is not None else ''

    # Extract content
    content = [
        tag.get_text().strip().replace('\xa0', ' ')
        for tag in soup.find_all(['p'])
    ]

    # Find the keyword 'By' to extract the author's name
    page_text = soup.get_text()
    match = re.search(r'\bBy\s+([A-Za-z\s.,]+)', page_text)
    if match is None:
        return header, content, ['Author not found']

    # Split on commas and on the standalone word "and". The word boundaries
    # keep names such as "Sandra" or "Alexander" intact, which a plain
    # str.replace('and', ',') would cut apart.
    fragments = re.split(r',|\band\b', match.group(1))
    # Collapse internal whitespace/newlines and drop empty fragments.
    author_lst = [' '.join(part.split()) for part in fragments]
    author_lst = [name for name in author_lst if name]
    return header, content, author_lst or ['Author not found']

# Article to analyse. The unpacked names `header`, `content` and `authors`
# are module-level and are read by the scoring cells further down, so this
# cell must run before them. scrape_site returns (None, None, None) when the
# HTTP status is not 200.
url = "https://www.cnn.com/2024/01/17/politics/biden-ukraine-white-house-meeting/index.html"
header, content, authors = scrape_site(url)

In [11]:
import ssl
# Replace the ssl module's default HTTPS context factory with the unverified
# one, presumably so that the nltk downloads below work on machines whose
# certificate store is not configured.
# NOTE(review): this switches off TLS certificate verification for everything
# in this kernel that uses ssl._create_default_https_context, not only nltk.
# Confirm that this is acceptable.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl module has no _create_unverified_context; keep the default.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
import nltk
# Fetch the 'wordnet' and 'punkt' nltk data packages (no-op if up to date).
nltk.download('wordnet')
nltk.download('punkt')
import context_veracity
def context_veracity_score(content):
    """Return the contextual-drift score for an article.

    The paragraphs in ``content`` are joined with single spaces, the joined
    text is passed to the sentiment, topic and NER shift functions of the
    ``context_veracity`` module, and the three results are combined by
    ``context_veracity.calculate_contextual_drift``.

    Parameters
    ----------
    content : iterable of str
        Paragraph texts of the article.

    Returns
    -------
    Whatever ``calculate_contextual_drift`` returns for the three shifts.
    """
    article_text = " ".join(content)
    # Evaluated in the order sentiment -> topic -> NER.
    shifts = (
        context_veracity.sentiment_shift(article_text),
        context_veracity.topic_shift(article_text),
        context_veracity.ner_shift(article_text),
    )
    return context_veracity.calculate_contextual_drift(*shifts)
# Score the scraped article; `content` is the module-level value set by the scraping cell.
print(f"The context veracity score of the article is {context_veracity_score(content)}")

[nltk_data] Downloading package wordnet to /Users/seanj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/seanj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /Users/seanj/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/seanj/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/seanj/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /Users/seanj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


The context veracity score of the article is 1.4000000000000001


[nltk_data] Downloading package words to /Users/seanj/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/seanj/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/seanj/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /Users/seanj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /Users/seanj/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/seanj/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/seanj/nltk_data...
[nltk_data]   Package averaged_

In [None]:
import political_bias
def political_bias_score(content):
    """Return the mean predicted political-bias label over all paragraphs.

    ``political_bias.download_pretrained_model()`` is called once per
    invocation, then every paragraph is preprocessed together with the
    article headline and classified; the first element of each prediction is
    averaged with ``np.mean``.

    Note: the headline is read from the module-level name ``header`` (set by
    the scraping cell), not from a parameter.

    Parameters
    ----------
    content : iterable of str
        Paragraph texts of the article.

    Returns
    -------
    The ``np.mean`` of the per-paragraph labels.
    """
    political_bias.download_pretrained_model()
    paragraph_labels = [
        political_bias.predict_label(
            political_bias.preprocess_article(header, paragraph)
        )[0]
        for paragraph in content
    ]
    return np.mean(paragraph_labels)
# Relies on the module-level `content` (and, inside the function, `header`) from the scraping cell.
print("The political bias score of the article is ", political_bias_score(content))

In [None]:
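# NOTE: earlier pickle-based sentiment scorer, left commented out for reference;
# the next cell defines the transformer-based sentiment_score that is used instead.
# import pickle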
# def sentiment_score(content):
#     with open('models/sentiM.pkl', 'rb') as f:
#         sentM = pickle.load(f)
#     pred_labels = []
#     for paragraph in content:
#         label = sentM.predict_article(header, paragraph)[0]
#         pred_labels.append(label)
#     sent_score = np.mean(pred_labels)
#     return sent_score
# print("The sentiment score of the article is ", sentiment_score(content))

In [None]:
from transformers import pipeline
import torch
def sentiment_score(content):
    """Return the mean sentiment code over all paragraphs.

    Each paragraph is classified with the
    ``lxyuan/distilbert-base-multilingual-cased-sentiments-student`` pipeline
    and the predicted label is mapped to a number: ``'positive'`` -> 0,
    ``'negative'`` -> 2, anything else -> 1. The pipeline is built on every
    call and placed on CUDA when ``torch.cuda.is_available()`` is true,
    otherwise on the CPU.

    Parameters
    ----------
    content : iterable of str
        Paragraph texts of the article.

    Returns
    -------
    The ``np.mean`` of the per-paragraph codes.
    """
    # Labels not listed here fall back to the middle value 1.
    label_to_code = {'positive': 0, 'negative': 2}

    target = "cuda" if torch.cuda.is_available() else "cpu"
    # NOTE(review): return_all_scores appears to be deprecated in recent
    # transformers releases in favour of top_k; confirm before upgrading.
    classifier = pipeline(
        model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
        return_all_scores=False,
        device=torch.device(target),
    )

    codes = [
        label_to_code.get(classifier(paragraph)[0]['label'], 1)
        for paragraph in content
    ]
    return np.mean(codes)
# Relies on the module-level `content` set by the scraping cell.
print("The sentiment score of the article is ", sentiment_score(content))            

In [6]:
import html
import re
import credibility
import pandas as pd
import pickle
def credibility_score(authors):
    """Predict a credibility score for each author and return the average.

    For every author a Wikipedia search (15 results) is run through
    ``credibility.search_wikipedia``; the results are embedded with
    ``credibility.text_embedding``, only the first 50 embedding columns are
    kept, and the pickled model in ``models/credibility_model.pkl`` predicts
    one score per row. Each author's score is printed.

    Parameters
    ----------
    authors : list of str
        Author names to look up.

    Returns
    -------
    The single predicted score when there is exactly one prediction,
    otherwise the ``np.mean`` of all predictions.
    """
    # NOTE: pickle.load can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open('models/credibility_model.pkl', 'rb') as model_file:
        cred_model = pickle.load(model_file)

    search_results = [
        credibility.search_wikipedia(author, num_results=15)
        for author in authors
    ]

    search_pd = pd.DataFrame(search_results, columns=['text'])
    # Keep only the first 50 embedding dimensions before predicting.
    features = credibility.text_embedding(search_pd['text'])[:, :50]
    cred_scores = cred_model.predict(features)

    if len(cred_scores) != 1:
        for idx, author in enumerate(authors):
            print(f'The score of {author} is {cred_scores[idx]}')
        return np.mean(cred_scores)

    print(f'The score of {authors[0]} is {cred_scores[0]}')
    return cred_scores[0]
# Relies on the module-level `authors` list set by the scraping cell.
print(f'The credibility score is {credibility_score(authors)}')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


The score of MJ Lee is 5.607
The score of Michael Williams is 5.877399999999999
The score of Ted  Barrett is 5.3252
The score of Donald Judd is 5.363199999999999
The score of CNN is 4.842
The credibility score is 5.402959999999999


In [None]:
import text_manipulation
def style_score(content):
    """Return the mean text-manipulation label over all paragraphs.

    ``text_manipulation.download_pretrained_model()`` is called once per
    invocation, then ``text_manipulation.predict`` is applied to every
    paragraph and the results are averaged with ``np.mean``.

    Parameters
    ----------
    content : iterable of str
        Paragraph texts of the article.

    Returns
    -------
    The ``np.mean`` of the per-paragraph predictions.
    """
    text_manipulation.download_pretrained_model()
    paragraph_labels = [text_manipulation.predict(paragraph) for paragraph in content]
    return np.mean(paragraph_labels)
# Relies on the module-level `content` set by the scraping cell.
print("The text manipulation (style) score of the article is ", style_score(content))

In [None]:
from spam import SpamModel
import pickle
def spam_score(header):
    """Return 1 when the pickled spam model flags the headline, else 0.

    The model is unpickled from ``models/spamM.pkl`` on every call and its
    ``predict_article`` method is applied to ``header``; the truthiness of
    the first element of the prediction decides the result.

    Parameters
    ----------
    header : str
        Article headline.

    Returns
    -------
    int
        ``1`` if the first prediction is truthy, otherwise ``0``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open('models/spamM.pkl', 'rb') as model_file:
        spam_model = pickle.load(model_file)
    flagged = spam_model.predict_article(header)[0]
    return 1 if flagged else 0
# Relies on the module-level `header` set by the scraping cell.
print("The spam score of the article headline is ", spam_score(header))

In [None]:
import pickle

def source_reliability_score(content):
    """Return the mean source-reliability label over all paragraphs.

    The model is unpickled from ``models/srcM.pkl`` on every call; its
    ``predict_text`` method is applied to each paragraph and the first
    element of each prediction is averaged with ``np.mean``.

    Parameters
    ----------
    content : iterable of str
        Paragraph texts of the article.

    Returns
    -------
    The ``np.mean`` of the per-paragraph labels.
    """
    # NOTE: pickle.load can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open('models/srcM.pkl', 'rb') as model_file:
        source_model = pickle.load(model_file)
    paragraph_labels = [source_model.predict_text(paragraph)[0] for paragraph in content]
    return np.mean(paragraph_labels)
# Relies on the module-level `content` set by the scraping cell.
print("The source reliability score of the article is ", source_reliability_score(content))

In [None]:
import pickle

def clickbait_score(header):
    """Return the clickbait label the pickled model predicts for a headline.

    The model is unpickled from ``models/clickM.pkl`` on every call. Its
    ``predict_text`` method is expected to return a ``(label, proba)`` pair;
    only the first element of ``label`` is returned, the probabilities are
    discarded.

    Parameters
    ----------
    header : str
        Article headline.

    Returns
    -------
    The first element of the predicted label sequence.
    """
    # NOTE: pickle.load can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open('models/clickM.pkl', 'rb') as model_file:
        click_model = pickle.load(model_file)
    labels, _unused_proba = click_model.predict_text(header)
    return labels[0]
# Relies on the module-level `header` set by the scraping cell.
print("The clickbait score of the article is ", clickbait_score(header))