# Truth Hunter

This Jupyter notebook contains Python code that fact-checks an article by comparing it against similar articles on the same topic. The code uses Python libraries such as NLTK, TensorFlow Hub, and others for text processing, web scraping, and machine learning tasks. The steps involved are implemented in the cells below:

In [35]:

# Import required libraries
import requests
import time
from bs4 import BeautifulSoup, NavigableString, Comment
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from urllib.parse import urlparse
import numpy as np
import re  # Added for better text sanitization
import tensorflow_hub as hub


In [36]:

# Cache dictionary to store titles and content for URLs
# url_cache maps a URL to its cleaned page title (written by get_title_from_url).
# content_cache maps a URL to its extracted article text (written by get_content_from_url).
# Nothing in this file evicts entries, so both grow for as long as the kernel lives.
url_cache = {}
content_cache = {}

# Load Sentence Embedding Model
# NOTE(review): this is an absolute path on one developer's machine; the load
# will fail elsewhere unless the model is placed at the same location.
# Presumably a local copy of the Universal Sentence Encoder -- confirm.
embed = hub.load("/Users/salmanshanavas/universal-sentence-encoder")


In [37]:
# Embeds a list of sentences with the sentence-embedding model loaded above.
def get_universal_sentence_embeddings(sentences):
    """Return the embeddings of ``sentences`` as a NumPy array.

    The sentences are passed to the module-level ``embed`` model and the
    result is converted with ``.numpy()``.
    """
    embedded = embed(sentences)
    return embedded.numpy()

In [38]:
# Downloads the complete HTML of the page at a URL and returns it as text.
def get_full_html_from_url(url):
    """Fetch ``url`` and return the response body as a string.

    The request carries a browser-like User-Agent and a 10 second timeout.
    Any ``requests.RequestException`` (including the HTTP error raised by
    ``raise_for_status``) is turned into the string
    ``"Error fetching URL: <error>"`` instead of propagating. The function
    always sleeps for one second before it returns, on success and failure.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        page = requests.get(url, headers=browser_headers, timeout=10)
        page.raise_for_status()
        outcome = page.text
    except requests.RequestException as exc:
        outcome = f"Error fetching URL: {exc}"
    finally:
        # Pause after every attempt so repeated calls are spaced out.
        time.sleep(1)
    return outcome

In [39]:
# This function generates HTML code to highlight a sentence within an article that may be misleading.
def highlight_sentence(sentence, soup_element, similar_article_links=None):
    """Build the highlighted ``<span>`` fragment for one sentence.

    Args:
        sentence: Plain text of the sentence to highlight. It is HTML-escaped
            so characters such as ``<`` or ``&`` cannot inject markup.
        soup_element: Unused; kept so existing callers keep working.
        similar_article_links: Optional iterable of URLs listed in the popup.

    Returns:
        A ``BeautifulSoup`` fragment containing a single
        ``<span class="highlighted">`` whose ``data-popup-content`` attribute
        holds the popup HTML.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from html import escape

    links = similar_article_links or ()
    # Escape each link for its own href/text position inside the popup HTML.
    link_markup = '<br>'.join(
        f'<a href="{escape(link, quote=True)}">{escape(link, quote=False)}</a>'
        for link in links
    )
    popup_content = f"<strong>Misleading Info</strong><br>Please visit:<br>{link_markup}"

    highlight_style = "background-color: yellow;"
    # The popup HTML is escaped a second time because it is stored inside a
    # double-quoted attribute; the browser's getAttribute() undoes this layer.
    highlighted_text = (
        f'<span class="highlighted" style="{highlight_style}" '
        f'data-popup-content="{escape(popup_content, quote=True)}">'
        f'{escape(sentence, quote=False)}</span>'
    )
    return BeautifulSoup(highlighted_text, 'html.parser')

In [40]:
# This function takes the HTML of an article and a list of discrepancies. It returns the path of an HTML file with the discrepancies highlighted and a popup containing similar articles.
def highlight_discrepancies_in_html(original_html, discrepancies, similar_articles):
    """Highlight discrepant sentences in an article and save the result.

    Args:
        original_html: HTML source of the article.
        discrepancies: Sentences to highlight. Only sentences that occur
            entirely inside a single text node are highlighted.
        similar_articles: List of dicts with a ``'url'`` key; the first three
            URLs are shown in each popup.

    Returns:
        The path of the written file (``'highlighted_article.html'``).
    """
    # Parse the original HTML content
    soup = BeautifulSoup(original_html, 'html.parser')

    # Add CSS styles for the hover-over popup
    css_style = """
    .highlighted:hover .popup {
        display: block;
    }
    .popup {
        display: none;
        position: absolute;
        background-color: #f9f9f9;
        border: 1px solid #ccc;
        border-radius: 4px;
        box-shadow: 0 8px 16px 0 rgba(0,0,0,0.2);
        z-index: 1;
        padding: 10px;
        font-size: small;
    }
    """
    style_tag = soup.new_tag("style", type="text/css")
    style_tag.string = css_style
    # Fragments and malformed pages may lack <head>; fall back to the root.
    style_host = soup.head if soup.head is not None else soup
    style_host.append(style_tag)

    # Add JavaScript for the popup
    js_script = """
    document.addEventListener("DOMContentLoaded", function() {
        var highlights = document.querySelectorAll(".highlighted");
        highlights.forEach(function(highlight) {
            var popupContent = highlight.getAttribute("data-popup-content");
            var popup = document.createElement("div");
            popup.className = "popup";
            popup.innerHTML = popupContent;
            highlight.appendChild(popup);
        });
    });
    """
    script_tag = soup.new_tag("script", type="text/javascript")
    script_tag.string = js_script
    # Same fallback when the document has no <body>.
    script_host = soup.body if soup.body is not None else soup
    script_host.append(script_tag)

    # Highlight discrepancies in the article
    similar_article_links = [article['url'] for article in similar_articles[:3]]
    # De-duplicate, drop empty strings, and try longer sentences first so a
    # sentence that contains another one is matched as a whole.
    targets = sorted(dict.fromkeys(s for s in discrepancies if s), key=len, reverse=True)
    if targets:
        target_set = set(targets)
        splitter = re.compile("(" + "|".join(re.escape(s) for s in targets) + ")")
        non_visible_parents = ('script', 'style', 'title')

        # find_all returns a list built up front, so nodes inserted below are
        # never revisited.
        for element in soup.find_all(string=True):
            if isinstance(element, Comment):
                continue
            parent = element.parent
            if parent is None or parent.name in non_visible_parents:
                continue

            # The capturing group makes split() keep the matched sentences.
            pieces = [piece for piece in splitter.split(str(element)) if piece]
            if not any(piece in target_set for piece in pieces):
                continue

            replacements = []
            for piece in pieces:
                node = None
                if piece in target_set:
                    fragment = highlight_sentence(piece, element, similar_article_links)
                    span = fragment.find('span') if fragment is not None else None
                    if span is not None:
                        # Detach the span from its temporary fragment before
                        # moving it into the article tree.
                        node = span.extract()
                if node is None:
                    # Unmatched text is kept verbatim next to the highlight.
                    node = NavigableString(piece)
                replacements.append(node)

            # Swap the text node for the first piece, then chain the rest
            # after it so the original order of the text is preserved.
            anchor = replacements[0]
            element.replace_with(anchor)
            for node in replacements[1:]:
                anchor.insert_after(node)
                anchor = node

    # Save the modified HTML content
    html_path = 'highlighted_article.html'
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(str(soup))

    return html_path

In [41]:
# Checks that a URL has both a scheme and a network location.
def is_valid_url(url):
    """Return True when ``url`` parses with a non-empty scheme and netloc.

    A ``ValueError`` raised by ``urlparse`` is treated as an invalid URL and
    yields False.
    """
    try:
        parts = urlparse(url)
    except ValueError:
        return False
    return bool(parts.scheme) and bool(parts.netloc)

In [42]:
# Fetches and returns the title of the article located at a given URL.
def get_title_from_url(url):
    """Return the cleaned ``<title>`` of the page at ``url``.

    Successful results are memoised in ``url_cache``; a cache hit returns
    immediately without a request or delay. The title is cut at the first
    ``|`` and stripped of surrounding whitespace.

    Returns:
        The cleaned title, or one of the sentinel strings
        ``"Content is not textual"``, ``"No title found"`` or
        ``"Error fetching URL: <error>"``. Sentinels are not cached.
    """
    if url in url_cache:
        return url_cache[url]

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # The header may be absent; treat that like a non-text response
        # instead of failing on `'text' not in None`.
        content_type = response.headers.get('Content-Type') or ''
        if 'text' not in content_type:
            return "Content is not textual"

        soup = BeautifulSoup(response.content, 'html.parser')
        title = soup.title
        # get_text() copes with empty titles and titles containing nested
        # markup, where `.string` is None.
        raw_title = title.get_text() if title is not None else ''
        cleaned_title = raw_title.split("|")[0].strip()
        if not cleaned_title:
            return "No title found"

        url_cache[url] = cleaned_title
        return cleaned_title

    except requests.RequestException as e:
        return f"Error fetching URL: {e}"

    finally:
        # Pause after every network attempt so repeated calls are spaced out.
        time.sleep(1)

In [43]:

# Fetches and returns the main content of the article at a given URL.
def get_content_from_url(url):
    """Return the largest block of text found on the page at ``url``.

    Scripts, styles, ``noscript``, header, footer, aside elements and HTML
    comments are removed first. The text of every ``article``, ``div``, ``p``
    and ``section`` element is then compared by word count and the longest
    one is returned. Successful results are memoised in ``content_cache``; a
    cache hit returns immediately without a request or delay.

    Returns:
        The extracted text, or one of the sentinel strings
        ``"Content is not textual"``,
        ``"Error: Unable to extract article content."`` or
        ``"Error fetching URL: <error>"``. Sentinels are not cached.
    """
    if url in content_cache:
        return content_cache[url]

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # The header may be absent; treat that like a non-text response
        # instead of failing on `'text' not in None`.
        content_type = response.headers.get('Content-Type') or ''
        if 'text' not in content_type:
            return "Content is not textual"

        soup = BeautifulSoup(response.content, 'html.parser')
        for element in soup(['script', 'style', 'noscript', 'header', 'footer', 'aside']):
            element.extract()
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Keep the candidate with the most words; on a tie the earliest one
        # (in tag-name order, then document order) wins.
        main_content = ''
        best_word_count = 0
        for tag_name in ('article', 'div', 'p', 'section'):
            for tag in soup.find_all(tag_name):
                tag_text = tag.text.strip()
                word_count = len(tag_text.split())
                if word_count > best_word_count:
                    main_content = tag_text
                    best_word_count = word_count

        if not main_content:
            return "Error: Unable to extract article content."

        content_cache[url] = main_content
        return main_content

    except requests.RequestException as e:
        return f"Error fetching URL: {e}"

    finally:
        # Pause after every network attempt so repeated calls are spaced out.
        time.sleep(1)

In [44]:

# Takes an article title and refines it to form a query string that can be used to find similar articles.
def refine_title_v6(title, exclude_words=None):
    """Reduce an article title to a ``+``-joined keyword query.

    Args:
        title: Raw article title.
        exclude_words: Optional collection of (lower-case) words that must
            not appear in the query.

    Returns:
        The remaining keywords joined with ``+``; an empty string when no
        keyword survives the filtering.
    """
    if exclude_words is None:
        exclude_words = ()

    # Text sanitization for better refinement: keep ASCII letters and
    # whitespace only, then lower-case.
    title = re.sub(r'[^a-zA-Z\s]', '', title).lower()

    # Tokenize and POS tag
    tokens = word_tokenize(title)
    tagged_tokens = pos_tag(tokens)

    # Named Entity Recognition: keep the first word of every labelled chunk.
    # NOTE(review): the title is lower-cased before chunking; if the chunker
    # relies on capitalisation the fallback below is the usual path -- confirm.
    named_entities = [chunk[0][0] for chunk in ne_chunk(tagged_tokens) if hasattr(chunk, 'label')]

    # Fall back to nouns and verbs if not enough named entities
    if len(named_entities) < 2:
        named_entities = [word for word, pos in tagged_tokens if pos[:2] in ('NN', 'VB')]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))

    # Uninformative phrases or words (publisher names and similar)
    uninformative_phrases = {'york', 'times', 'bbc', 'cnn', 'fox', 'news', 'daily', 'post', 'guardian'}

    # Filtering tokens
    filtered_tokens = []
    for word in named_entities:
        lowered = word.lower()
        if lowered in stop_words or lowered in uninformative_phrases:
            continue
        if lowered in exclude_words or not word.isalnum():
            continue
        filtered_tokens.append(word)

    # Create query string
    return '+'.join(filtered_tokens)


In [45]:
# Repeatedly refines the title, dropping the last keyword on every retry.
def find_similar_articles(original_title):
    """Print successively shorter refined queries for ``original_title``.

    Each round refines the title with ``refine_title_v6``, prints it, and
    excludes its last ``+``-separated word from the next round. The loop ends
    when the refined title contains no ``+``. No search is performed here and
    nothing is returned.
    """
    excluded = []

    while True:
        candidate = refine_title_v6(original_title, excluded)
        print(f"Trying with refined title: {candidate}")

        if '+' not in candidate:
            print("No more words to exclude. Exiting.")
            break

        # Drop the trailing keyword for the next attempt.
        excluded.append(candidate.split('+')[-1])


In [46]:
# Fetches similar articles from the News API.
def get_similar_articles_from_api(user_title, api_key):
    """Query the News API for articles matching ``user_title``.

    The title is searched in article titles first (``qInTitle``); when that
    yields no articles the search is repeated over the full text (``q``).

    Args:
        user_title: Query string.
        api_key: News API key.

    Returns:
        A list of ``{"title", "url", "content"}`` dicts. Articles without
        content or URL, or marked ``"[Removed]"``, are skipped. An empty list
        is returned when the request fails or the API reports an error.
    """
    base_url = "https://newsapi.org/v2/everything"

    def fetch_articles(search_field):
        """Run one query; return the article list, or None on failure."""
        params = {
            search_field: user_title,
            "apiKey": api_key,
            "sortBy": "relevancy",
            "pageSize": 10
        }
        try:
            # Same timeout as the other fetchers so a stalled API cannot hang.
            response = requests.get(base_url, params=params, timeout=10)
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"API Error: {e}")
            return None

        if not isinstance(data, dict) or data.get('status') != 'ok':
            message = data.get('message', 'Unknown error') if isinstance(data, dict) else 'Unknown error'
            print(f"API Error: {message}")
            return None
        return data.get('articles') or []

    articles = fetch_articles("qInTitle")
    if articles is None:
        return []
    if not articles:
        articles = fetch_articles("q")
        if articles is None:
            return []

    # Filter out articles that have "[Removed]" in their title or content
    similar_articles = []
    for article in articles:
        # Any field may be missing or null; normalise before testing.
        title = article.get('title') or ''
        content = article.get('content') or ''
        url = article.get('url')
        if not content or not url:
            continue
        if "[Removed]" in title or "[Removed]" in content:
            continue
        similar_articles.append({"title": title, "url": url, "content": content})

    return similar_articles

In [47]:
# Breaks a text into sentences (used by the classify and highlight functions).
def split_content_into_sentences(content):
    """Return the list of sentences produced by ``sent_tokenize(content)``."""
    sentences = sent_tokenize(content)
    return sentences


In [48]:
# It takes the user's article and a list of similar articles, then classifies the user's article as "true" or "misleading". It also identifies discrepancies and potentially biased sentences.
def classify_and_highlight_article(user_content, similar_articles, classification_threshold=0.75, discrepancy_threshold=0.50, early_stop_threshold=5):
    """Classify an article by sentence similarity to reference articles.

    Every sentence of the user's article is embedded and scored against each
    similar article by the maximum dot product with that article's sentence
    embeddings. The per-sentence scores are averaged over the articles.

    Args:
        user_content: Text of the article being checked.
        similar_articles: List of dicts with a ``'content'`` entry.
        classification_threshold: Minimum mean score for ``"true"``.
        discrepancy_threshold: Sentences scoring below this are reported.
        early_stop_threshold: When at least this many discrepancies exist,
            only that many are returned and the bias scan is skipped.

    Returns:
        ``(classification, discrepancies, biased_sentences)`` where
        ``classification`` is ``"true"``, ``"misleading"`` or
        ``"undetermined"`` (nothing usable to compare).
    """
    bias_keywords = ['fake', 'conspiracy', 'hoax', 'untrustworthy']

    if not similar_articles:
        print("No similar articles found for comparison.")
        return "undetermined", [], []

    # Without article text there is nothing to embed or average.
    if not user_content or not user_content.strip():
        print("No article content to classify.")
        return "undetermined", [], []

    user_sentences = split_content_into_sentences(user_content)
    if not user_sentences:
        print("No article content to classify.")
        return "undetermined", [], []
    user_sentence_embeddings = get_universal_sentence_embeddings(user_sentences)

    overall_similarities = []

    for article in similar_articles:
        # Content may be missing or null; treat that as an empty article.
        similar_sentences = split_content_into_sentences(article.get('content') or '')

        if not similar_sentences:
            print("Skipping an article with no content.")
            continue

        similar_sentence_embeddings = get_universal_sentence_embeddings(similar_sentences)

        # Best match in this article for each user sentence.
        sentence_similarities = np.dot(user_sentence_embeddings, similar_sentence_embeddings.T).max(axis=1)

        overall_similarities.append(sentence_similarities)

    if not overall_similarities:
        print("No similarities could be computed.")
        return "undetermined", [], []

    # One row per article, one column per user sentence; average over articles.
    avg_similarities = np.mean(np.array(overall_similarities), axis=0)

    if np.mean(avg_similarities) >= classification_threshold:
        classification = "true"
    else:
        classification = "misleading"

    discrepancies = [user_sentences[i] for i, sim in enumerate(avg_similarities) if sim < discrepancy_threshold]

    # Early stopping: cap the discrepancy list and skip the bias scan.
    if len(discrepancies) >= early_stop_threshold:
        return classification, discrepancies[:early_stop_threshold], []

    biased_sentences = [sentence for sentence in user_sentences if any(keyword in sentence.lower() for keyword in bias_keywords)]

    return classification, discrepancies, biased_sentences

In [49]:

def _is_fetch_failure(text):
    """Return True when ``text`` is a sentinel string returned by the fetch helpers."""
    sentinels = (
        "Content is not textual",
        "No title found",
        "Error: Unable to extract article content.",
    )
    return text in sentinels or text.startswith("Error fetching URL:")


def main():
    """Interactively fact-check the article at a user-supplied URL.

    Reads a URL from standard input, looks up similar articles through the
    News API, classifies the article, prints the findings and, when
    discrepancies exist, writes a highlighted copy of the page to disk.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    import os

    url = input("Enter the URL of the article: ")
    if not is_valid_url(url):
        print("Invalid URL format.")
        return

    title = get_title_from_url(url)
    print(f"Extracted Title: {title}")
    # The fetch helpers report problems as plain strings; do not search for
    # an error message as if it were the article title.
    if _is_fetch_failure(title):
        print("Could not determine the article title.")
        return

    refined_title = refine_title_v6(title)
    print(f"Refined Title: {refined_title}")
    if not refined_title:
        print("No usable keywords in the title.")
        return

    # SECURITY NOTE(review): an API key is committed in this source. Prefer
    # the NEWS_API_KEY environment variable; the literal fallback only keeps
    # existing runs working and should be revoked and removed.
    api_key = os.environ.get("NEWS_API_KEY", "332ff33479434583874bc9021b68a14b")
    similar_articles = get_similar_articles_from_api(refined_title, api_key)

    if not similar_articles:
        print("No similar articles found.")
        return

    print("\nSimilar Articles:")
    for article in similar_articles:
        print(f"Title: {article['title']}")
        print(f"URL: {article['url']}")
        print(f"Content: {article['content']}")
        print("-" * 50)

    # Get full HTML instead of just content
    full_html = get_full_html_from_url(url)

    user_content = get_content_from_url(url)
    if _is_fetch_failure(user_content):
        print(f"Could not read the article content: {user_content}")
        return

    classification, discrepancies, biased_sentences = classify_and_highlight_article(user_content, similar_articles)
    print(f"\nThe user's article is classified as: {classification}")

    if biased_sentences:
        print(f"\nSentences containing potential bias: {biased_sentences}")

    # Highlight discrepancies if any
    if discrepancies:
        if _is_fetch_failure(full_html):
            # Nothing to annotate when the page itself could not be fetched.
            print(f"Could not highlight the article: {full_html}")
        else:
            html_path = highlight_discrepancies_in_html(full_html, discrepancies, similar_articles)
            print(f"The highlighted article is saved at: {html_path}")


if __name__ == "__main__":
    main()


Extracted Title: Live worm found in Australian woman’s brain in world-first discovery
Refined Title: worm+found+womans+brain+discovery

Similar Articles:
Title: Live parasitic worm found in Australian woman’s brain in world first
URL: https://www.aljazeera.com/news/2023/8/29/live-parasitic-worm-found-in-australian-womans-brain-in-world-first
Content: A live parasitic worm has been found inside the brain of a 64-year-old Australian woman, marking the first case of the infection in humans.
The discovery was made by doctors and researchers at the A… [+3425 chars]
--------------------------------------------------
Title: Live Worm Found In Woman’s Brain Is Rare—But Common Deadly Parasites Already Plague Billions Of People
URL: https://www.forbes.com/sites/roberthart/2023/09/01/live-worm-found-in-womans-brain-is-rare-but-common-deadly-parasites-already-plague-billions-of-people/
Content: Neurosurgeons in Australia this week reported pulling a live, three-inch worm from a womans brain, a sho