In [32]:
!pip3 install requests beautifulsoup4 tqdm urllib3 newspaper3k



In [33]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
import time
import logging
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from newspaper import Article

# Setup logging
logging.basicConfig(
    filename='fact_check_scraper.log',
    filemode='a',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

import re

def extract_urls(text):
    """Extract all URLs from the given text using regex."""
    # Define the regex pattern to match URLs
    url_pattern = r'(https?://[^\s]+)'
    
    # Find all matching URLs
    urls = re.findall(url_pattern, text)
    logging.info(f"Extracted {len(urls)} URLs from the text.")
    
    return urls

def setup_session():
    """Configure a session with retry strategy for HTTP requests."""
    session = requests.Session()
    retry = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    logging.info("HTTP session configured with retry strategy.")
    return session

session = setup_session()

def access_url(url):
    """Fetch data from the given URL and return the parsed JSON response."""
    url = f'https://r.jina.ai/{url}'
    headers = {
        'Accept': 'application/json',
        'X-With-Images-Summary': 'true',
        'X-With-Links-Summary': 'true',
        "X-Return-Format": "html"
    }
    try:
        response = session.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json().get('data', {})
        logging.info(f"Successfully accessed URL: {url}")
        return data
    except requests.RequestException as e:
        logging.error(f"RequestException for URL {url}: {e}")
    except ValueError as e:
        logging.error(f"ValueError for URL {url}: {e}")
    return {}

# Initialize an empty list to store article links
article_links = []

def collect_article_links(target_count=500):
    """Collect a target number of article links from Snopes."""
    page_num = 1
    while len(article_links) < target_count:
        url = f'https://www.snopes.com/fact-check/?pagenum={page_num}'
        logging.info(f"Processing page for links: {url}")
        page = access_url(url)
        if not page:
            logging.warning(f"No data found for page {page_num}. Ending link collection.")
            break
        new_links = [
            link for link in page.get('links', {}).values()
            if 'fact-check' in link and link.endswith('/')
        ][1:]
        if not new_links:
            logging.warning(f"No new links found on page {page_num}. Ending link collection.")
            break
        for link in new_links:
            if len(article_links) >= target_count:
                break
            if link not in article_links:
                article_links.append(link)
        logging.info(f"Collected {len(article_links)} links so far.")
        page_num += 1
        time.sleep(1)  # Respectful delay between page requests
    logging.info(f"Finished collecting links. Total links collected: {len(article_links)}")

def extract_article_data(soup, url):
    """Extract relevant data from the BeautifulSoup object."""
    try:
        label = soup.find('div', class_='rating_title_wrap').get_text(strip=True).replace('About this rating', '')
        logging.info(f"Extracted label: {label}")
    except AttributeError:
        label = 'N/A'
        logging.warning("Label not found, set to 'N/A'.")

    try:
        context = soup.find('p', class_='outer_fact_check_context').get_text(strip=True)
        logging.info(f"Extracted context: {context}")
    except AttributeError:
        context = ''
        logging.warning("Context not found, set to empty string.")

    try:
        claim = soup.find('div', class_='claim_wrapper').div.get_text(strip=True)
        logging.info(f"Extracted claim: {claim}")
    except AttributeError:
        claim = 'N/A'
        logging.warning("Claim not found, set to 'N/A'.")

    # Extract all URLs from the entire article text
    try:
        article_text = soup.get_text()
        links = extract_urls(article_text)
        logging.info(f"Extracted {len(links)} links from the article text.")
    except Exception as e:
        logging.error(f"Error extracting links from article: {e}")
        links = []

    return {'claim': claim, 'label': label, 'context': context, 'links': links, 'url': url}

def process_article_links():
    """Process each article link to extract data."""
    for a in tqdm(article_links, desc="Processing Articles"):
        article = Article(a)
        article.download()
        article.parse()
        html = article.html
        soup = BeautifulSoup(html, 'html.parser')
        article_data = extract_article_data(soup, a)
        articles_data.append(article_data)
        logging.info(f"Processed article: {article_data['claim']}")
        # time.sleep(1)  # Respectful delay between article requests

# Initialize an empty list to store article data
articles_data = []

collect_article_links(target_count=50)
process_article_links()
df = pd.DataFrame(articles_data)
df.to_json('df.json', orient='records', lines=True)
df.head()

Processing Articles: 100%|██████████| 50/50 [00:20<00:00,  2.42it/s]


Unnamed: 0,claim,label,context,links,url
0,"British writer Virginia Woolf said: ""There is ...",Unproven,,[https://www.virginiawoolfsociety.org.uk/resou...,https://www.snopes.com/fact-check/virginia-woo...
1,American flags were not visible at a rally sup...,True,,[https://6abc.com/post/bruce-springsteen-john-...,https://www.snopes.com/fact-check/flags-harris...
2,"Elon Musk's brother, Kimbal Musk, accurately r...",Unproven,,[https://www.youtube.com/watch?v=CgV2KzyWKx0&l...,https://www.snopes.com/fact-check/was-elon-mus...
3,Entertainment icon Oprah Winfrey is trying to ...,Labeled Satire,,[https://www.cbsnews.com/news/hurricane-helene...,https://www.snopes.com/fact-check/oprah-buying...
4,Whoopi Goldberg and Joy Behar are giving up th...,Labeled Satire,,[],https://www.snopes.com/fact-check/the-view-sal...


In [1]:
import pandas as pd
df = pd.read_json('df.json', orient='records', lines=True)

In [38]:
df

Unnamed: 0,claim,label,context,links,url
0,"British writer Virginia Woolf said: ""There is ...",Unproven,,[https://www.virginiawoolfsociety.org.uk/resou...,https://www.snopes.com/fact-check/virginia-woo...
1,American flags were not visible at a rally sup...,True,,[https://6abc.com/post/bruce-springsteen-john-...,https://www.snopes.com/fact-check/flags-harris...
2,"Elon Musk's brother, Kimbal Musk, accurately r...",Unproven,,[https://www.youtube.com/watch?v=CgV2KzyWKx0&l...,https://www.snopes.com/fact-check/was-elon-mus...
3,Entertainment icon Oprah Winfrey is trying to ...,Labeled Satire,,[https://www.cbsnews.com/news/hurricane-helene...,https://www.snopes.com/fact-check/oprah-buying...
4,Whoopi Goldberg and Joy Behar are giving up th...,Labeled Satire,,[],https://www.snopes.com/fact-check/the-view-sal...
5,A picture authentically shows a fully restored...,Fake,,[https://www.merriam-webster.com/dictionary/ch...,https://www.snopes.com/fact-check/1875-chuckwa...
6,Photographs show NYC’s sanitation department i...,Miscaptioned,,[https://www.nytimes.com/2024/10/28/us/politic...,https://www.snopes.com/fact-check/keep-nyc-tra...
7,Adolf Hitler was nominated for the Nobel Peace...,True,,[https://www.nobelprize.org/prizes/facts/facts...,https://www.snopes.com/fact-check/adolf-hitler...
8,A map shared to social media in mid-October 20...,Mostly True,,[https://doc.arcgis.com/en/arcgis-online/get-s...,https://www.snopes.com/fact-check/texas-map-eu...
9,The last remaining Blockbuster Video store. lo...,False,,[https://www.centraloregondaily.com/news/local...,https://www.snopes.com/fact-check/last-blockbu...


In [2]:
df.head()

Unnamed: 0,claim,label,context,links,url
0,"British writer Virginia Woolf said: ""There is ...",Unproven,,[https://www.virginiawoolfsociety.org.uk/resou...,https://www.snopes.com/fact-check/virginia-woo...
1,American flags were not visible at a rally sup...,True,,[https://6abc.com/post/bruce-springsteen-john-...,https://www.snopes.com/fact-check/flags-harris...
2,"Elon Musk's brother, Kimbal Musk, accurately r...",Unproven,,[https://www.youtube.com/watch?v=CgV2KzyWKx0&l...,https://www.snopes.com/fact-check/was-elon-mus...
3,Entertainment icon Oprah Winfrey is trying to ...,Labeled Satire,,[https://www.cbsnews.com/news/hurricane-helene...,https://www.snopes.com/fact-check/oprah-buying...
4,Whoopi Goldberg and Joy Behar are giving up th...,Labeled Satire,,[],https://www.snopes.com/fact-check/the-view-sal...


# Langfuse integration

In [3]:
!pip3 install langfuse


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [31]:
from langfuse import Langfuse

# Initialize Langfuse client
langfuse = Langfuse(
  secret_key="sk-lf-afabb036-f29f-4963-8de2-be91e2f7b05f",
  public_key="pk-lf-224dc436-a6e0-44e8-869b-3352b126557b",
  host="https://cloud.langfuse.com"
)

# Create a dataset for fact-checking
dataset = langfuse.create_dataset(name="fact_check_dataset")

In [32]:
import pandas as pd


local_items = df.apply(
    lambda row: {
        "input": {
            "claim": row["claim"],
            "context": row["context"],
            "links": row["links"]
        },
        "expected_output": row["label"]
    },
    axis=1
).tolist()

for item in local_items:
    langfuse.create_dataset_item(
        dataset_name="fact_check_dataset",
        input=item["input"],
        expected_output=item["expected_output"]
    )

# logging.info(f"Created {len(local_items)} local_items from DataFrame.")

In [33]:
# we use a very simple eval here, you can use any eval library
# see https://langfuse.com/docs/scores/model-based-evals for details
def simple_evaluation(output, expected_output):
  return output == expected_output

In [37]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage
from langfuse import Langfuse, observe

def run_my_langchain_llm_app(input, system_message, callback_handler):
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                system_message,
            ),
            MessagesPlaceholder(variable_name="messages"),
        ]
    )
    # Configure ChatOpenAI with controlled randomness
    chat = ChatOpenAI(model="gpt-4o-mini", temperature=0.7, top_p=0.9)
    chain = prompt | chat

    res = chain.invoke(
        { "messages": [HumanMessage(content=input)] },
        config={"callbacks":[callback_handler]}
    )
    
    return res

In [34]:
def run_langchain_experiment(experiment_name, system_message):
    dataset = langfuse.get_dataset("fact_check_dataset")

    for item in dataset.items:
        handler = item.get_langchain_handler(run_name=experiment_name)

        completion = run_my_langchain_llm_app(item.input["claim"], system_message, handler).content
        print(completion)
        handler.trace.score(
            name="exact_match",
            value=completion
        )

In [35]:
import os
os.environ["OPENAI_API_KEY"] = "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"


In [36]:
run_langchain_experiment(
    'Truth',
    "Provide a score between 0 and 1 based on the truthfulness of the claim just as a float number."
)

0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.2
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.2
0.0
0.9
0.0
0.0
0.0
1.0
0.0
0.8
0.0
0.0
0.0
0.8
0.9
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.9
0.0
0.0
0.8
0.0
0.0


# Wiki integration

In [1]:
import requests
import pandas as pd
from tqdm import tqdm
import logging

# Ensure tqdm works with pandas apply
tqdm.pandas()

# Function to check Wikipedia support and retrieve correlated article links using Wikipedia's API
def check_wikipedia_support_and_links(claim, top_n=5):
    """
    Check if the claim is supported by Wikipedia and retrieve correlated article links.
    
    Args:
        claim (str): The claim to verify.
        top_n (int): Number of top correlated Wikipedia articles to return.
        
    Returns:
        tuple:
            - bool: True if support is found, False otherwise.
            - list: List of top_n Wikipedia article URLs supporting the claim.
    """
    wikipedia_api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'list': 'search',
        'srsearch': claim,
        'srlimit': top_n,
        'srnamespace': 0,  # Search only in articles
        'srsort': 'relevance',
        'format': 'json'
    }
    try:
        response = requests.get(wikipedia_api_url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        search_results = data.get('query', {}).get('search', [])
        
        if search_results:
            # Sort search results by score in descending order to ensure high relevance
            sorted_results = sorted(search_results, key=lambda x: x.get('score', 0), reverse=True)
            # Construct full Wikipedia URLs from the top search results
            links = [f"https://en.wikipedia.org/wiki/{result['title'].replace(' ', '_')}" for result in sorted_results[:top_n]]
            logging.info(f"Wikipedia support found for claim: {claim}")
            return True, links
        else:
            logging.info(f"No Wikipedia support for claim: {claim}")
            return False, []
    except requests.RequestException as e:
        logging.error(f"Error querying Wikipedia for claim '{claim}': {e}")
        return False, []

df_ = df.head(50)
# Apply the Wikipedia support check to the DataFrame with improved correlation
tqdm.pandas(desc="Checking Wikipedia Support")
df_[['wikipedia_support', 'wikipedia_links']] = df_['claim'].progress_apply(
    lambda x: pd.Series(check_wikipedia_support_and_links(x))
)

# Expand the links into separate columns
links_df = df_['wikipedia_links'].apply(pd.Series)
links_df.columns = [f'wikipedia_link_{i+1}' for i in range(links_df.shape[1])]
df_ = pd.concat([df_, links_df], axis=1)

# Filter the DataFrame to include only supported claims
df_supported = df_[df_['wikipedia_support']].reset_index(drop=True)

# Display the filtered DataFrame
df_supported

# Save the filtered DataFrame to a new JSON file
# df_supported.to_json('df_wikipedia_supported.json', orient='records', lines=True)
# logging.info(f"Filtered DataFrame saved to df_wikipedia_supported.json with {len(df_supported)} entries.")

NameError: name 'df' is not defined

In [None]:
df_supported

In [5]:
df_supported

Unnamed: 0,claim,label,context,links,url,wikipedia_support,wikipedia_links
0,McDonald’s stock went up 200% following a visi...,Labeled Satire,,[https://www.newsweek.com/did-mcdonalds-stock-...,https://www.snopes.com/fact-check/mcdonalds-st...,True,[https://en.wikipedia.org/wiki/List_of_people_...
1,The Quaker Oats Co. announced that Aunt Jemima...,Labeled Satire,,[https://www.snopes.com//fact-check/aunt-jemim...,https://www.snopes.com/fact-check/quaker-oats-...,True,[https://en.wikipedia.org/wiki/Pepsi]
2,"In the 1980s, U.S. Vice President Kamala Harri...",Unproven,,[https://www.washingtonpost.com/food/2024/08/1...,https://www.snopes.com/fact-check/harris-job-m...,True,"[https://en.wikipedia.org/wiki/Jimmy_Carter, h..."
3,A video shared online in mid-October 2024 auth...,Fake,,[https://www.snopes.com//fact-check/giants-sou...,https://www.snopes.com/fact-check/egypt-pyrami...,True,[https://en.wikipedia.org/wiki/Muslim_Brotherh...
4,U.S. Vice President Kamala Harris expressed su...,True,,[https://assets.aclu.org/live/uploads/2024/08/...,https://www.snopes.com/fact-check/harris-gende...,True,[https://en.wikipedia.org/wiki/Political_posit...
5,An image authentically showed a memo McDonald’...,Research In Progress,,[https://www.washingtonpost.com/politics/2024/...,https://www.snopes.com/fact-check/mcdonalds-me...,True,[https://en.wikipedia.org/wiki/List_of_common_...
6,A video authentically shows a woman holding a ...,Fake,,"[https://translate.google.com/.""Jyo, https://w...",https://www.snopes.com/fact-check/giant-white-...,True,"[https://en.wikipedia.org/wiki/Blackface, http..."
7,"A sequence in the horror movie ""Poltergeist"" s...",True,,[https://bloodygoodhorror.com/bgh/interviews/b...,https://www.snopes.com/fact-check/poltergeist-...,True,[https://en.wikipedia.org/wiki/It_(miniseries)...
8,Photographs shared online in September 2024 sh...,Mixture,,"[https://www.youtube.com/watch?v=PgVXPEORuA0.,...",https://www.snopes.com/fact-check/china-apartm...,True,[https://en.wikipedia.org/wiki/Timeline_of_the...
9,"Research shows, for pet owners, the deaths of ...",True,,"[https://www.aplb.org/., https://doi.org/10.10...",https://www.snopes.com/fact-check/losing-pet-h...,True,[https://en.wikipedia.org/wiki/Cruelty_to_anim...
