In [None]:
# Suppress every Python warning from here on so that library warnings do not
# clutter the notebook output.
# NOTE(review): this also hides warnings that may point at real problems.
import warnings
warnings.filterwarnings('ignore')

In [None]:
import requests
from bs4 import BeautifulSoup
import re

def _split_authors(byline):
    """Split a captured byline such as 'A, B and C' into a list of clean names.

    Only the standalone word 'and' is treated as a separator, so names that
    merely contain those letters (e.g. 'Alexander', 'Sandoval') stay intact.
    Runs of whitespace inside a name are collapsed to one space and empty
    entries (e.g. from ', and') are dropped.
    """
    separated = re.sub(r'\band\b', ',', byline)
    names = (" ".join(part.split()) for part in separated.split(','))
    return [name for name in names if name]


def scrape_site(url, timeout=10):
    """Download an article page and extract its headline, paragraphs and authors.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout the request can block indefinitely.

    Returns:
        A ``(header, content, author_lst)`` tuple:
          * header: stripped text of the first ``<h1>``, or ``''`` when the
            page has no ``<h1>``.
          * content: list with the stripped text of every ``<p>`` tag, with
            non-breaking spaces replaced by regular spaces.
          * author_lst: names found after the first standalone 'By' in the
            page text, or ``['Author not found']`` when none is found.
        ``(None, None, None)`` is returned when the response status is not 200.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None, None, None

    soup = BeautifulSoup(response.content, 'html.parser')

    # find() returns None when the page has no <h1>; fall back to an empty
    # header instead of failing with AttributeError.
    header_tag = soup.find(['h1'])
    header = header_tag.get_text().strip() if header_tag is not None else ''

    # Extract content
    content = [tag.get_text().strip().replace('\xa0', ' ') for tag in soup.find_all(['p'])]

    # Find the keyword 'By' to extract the author's name
    match = re.search(r'\bBy\s+([A-Za-z\s.,]+)', soup.get_text())
    author_lst = _split_authors(match.group(1)) if match else []
    if not author_lst:
        author_lst = ['Author not found']

    return header, content, author_lst

# Article to analyse. scrape_site returns (None, None, None) when the request
# does not come back with HTTP 200, so all three names may be None afterwards.
url = "https://www.cnn.com/2024/01/17/politics/biden-ukraine-white-house-meeting/index.html"
header, content, authors = scrape_site(url)

In [None]:
import context_veracity
# Join the scraped paragraphs into one string; `content` is the paragraph list
# produced by the scraping cell (a None value here would make join() fail).
content_concat = " ".join(content)
# Compute three separate scores from the full article text.
sent_score = context_veracity.sentiment_shift(content_concat)
topic_score = context_veracity.topic_shift(content_concat)
ner_score = context_veracity.ner_shift(content_concat)
# Combine the three scores into the single value that is reported.
context_veracity_score = context_veracity.calculate_contextual_drift(sent_score, topic_score, ner_score)
print(f"The context veracity score of the article is {context_veracity_score}")

In [None]:
import political_bias
import numpy as np
# Score each paragraph together with the headline and keep the first element
# of every prediction; the reported value is the mean over all paragraphs.
pred_labels = [
    political_bias.predict_label(
        political_bias.preprocess_article(header, paragraph)
    )[0]
    for paragraph in content
]
print("The political bias score of the article is ", np.mean(pred_labels))

In [None]:
import pickle

# np.mean is used below; import numpy here so this cell does not depend on an
# earlier cell having been run first.
import numpy as np

# NOTE(review): SentimentModel is not referenced directly in this cell;
# presumably it must be importable so pickle can rebuild the saved object —
# confirm before removing.
from sentiment import SentimentModel

# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# load model files that come from a trusted source.
with open('models/sentiM.pkl', 'rb') as f:
    sentM = pickle.load(f)

# Predict per paragraph (paired with the headline), keep the first element of
# each prediction, and report the mean over all paragraphs.
pred_labels = []
for paragraph in content:
    label = sentM.predict_article(header, paragraph)[0]
    pred_labels.append(label)
print("The sentiment score of the article is ", np.mean(pred_labels))

In [None]:
import html
import re
from credibility import search_wikipedia, text_embedding, preprocess_text
import numpy as np
import pandas as pd
import pickle
# Number of leading embedding columns handed to the credibility model.
# NOTE(review): presumably the feature count the model was trained on — confirm.
N_EMBEDDING_FEATURES = 50

# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# load model files that come from a trusted source.
with open('models/credibility_model.pkl', 'rb') as f:
    cred_model = pickle.load(f)

# One search_wikipedia return value per author, in the same order as `authors`.
search_results = [search_wikipedia(author, num_results=15) for author in authors]

search_pd = pd.DataFrame(search_results, columns=['text'])
embedded_result = text_embedding(search_pd['text'])[:, :N_EMBEDDING_FEATURES]
cred_scores = cred_model.predict(embedded_result)

# Pair each author with their score; zip avoids indexing past the shorter of
# the two sequences if their lengths ever differ.
for author, score in zip(authors, cred_scores):
    print(f'The score of {author} is {score}')
# An average is only informative when there is more than one score.
if len(cred_scores) > 1:
    print(f'The average score is {np.mean(cred_scores)}')