In [None]:
!pip install wikipedia-api newspaper3k pandas nltk transformers torch scikit-learn tqdm regex

In [None]:
import os
import requests
import pandas as pd
from newspaper import Article
import wikipediaapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline
from tqdm import tqdm
import torch
import re

# Download the sentence-tokenizer data that sent_tokenize needs.
# Older NLTK releases load the 'punkt' resource; newer releases look up
# 'punkt_tab' instead, so fetch both to work with whichever version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
def clean_sentence(sentence):
    """
    Normalise a raw sentence for use as a claim.

    Drops numeric footnote markers such as [1] or [23], collapses every run of
    whitespace (including newlines) into a single space and trims both ends.

    Returns the normalised text, or an empty string when fewer than 20
    characters remain.
    """
    without_footnotes = re.sub(r'\[\d+\]', '', sentence)
    normalised = re.sub(r'\s+', ' ', without_footnotes).strip()
    # Anything shorter than 20 characters is treated as too short to be a claim.
    return normalised if len(normalised) >= 20 else ''

In [None]:
# Select where the models run: device index 0 (first CUDA GPU) when CUDA is
# available, otherwise -1, which the pipelines treat as CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Claim detection: zero-shot classification, used by WebScraper.is_factual to
# score a sentence against the candidate labels it supplies.
claim_detection_pipeline = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device,
)

# Fact verification: DeBERTa model fine-tuned on FEVER.
# return_all_scores=True is kept on purpose: FactChecker.verify_statement reads
# result[0] as the list of per-label score dicts for the single input.
fact_verification_pipeline = pipeline(
    task="text-classification",
    model="learn3r/deberta-v3-base-finetuned-fever",
    return_all_scores=True,
    device=device,
)

In [None]:
# Configuration
# Wikipedia categories to scrape. Each entry is looked up as 'Category:<name>'
# by WikipediaScraper.get_science_articles. Only uncommented entries are active;
# uncomment a line to include that category.
SCIENCE_CATEGORIES = [
    # "Physics",
    # "Chemistry",
    # "Biology",
    # "Astronomy",
    # "Earth science",
    # "Computer science",
    # "Mathematics",
    # "Engineering",
    "Environmental science",
    # "Medicine"
]
# Default Wikipedia language edition used by WikipediaScraper.
WIKI_LANGUAGE = 'en'
# Upper bound on the number of main-namespace articles taken from each category.
MAX_ARTICLES_PER_CATEGORY = 5
# Sentences longer than this are skipped by WebScraper.is_factual.
MAX_SENTENCE_LENGTH = 200  # Maximum characters per claim

# Domain Keywords
# Maps a domain name to lowercase keywords. DomainClassifier assigns a statement
# to the first domain having a keyword that occurs as a substring of the
# lowercased statement. Keep the active keys in step with SCIENCE_CATEGORIES.
DOMAIN_KEYWORDS = {
    # 'Physics': ['physics', 'quantum', 'relativity', 'energy'],
    # 'Chemistry': ['chemistry', 'molecule', 'reaction', 'compound'],
    # 'Biology': ['biology', 'cell', 'gene', 'ecosystem'],
    # 'Astronomy': ['astronomy', 'galaxy', 'star', 'planet'],
    # 'Earth science': ['earth', 'geology', 'climate', 'soil'],
    # 'Computer science': ['computer', 'algorithm', 'software', 'hardware'],
    # 'Mathematics': ['mathematics', 'calculus', 'algebra', 'geometry'],
    # 'Engineering': ['engineering', 'mechanical', 'electrical', 'civil'],
    'Environmental science': ['environment', 'sustainability', 'conservation', 'pollution'],
    # 'Medicine': ['medicine', 'health', 'disease', 'therapy']
}

In [None]:
class WikipediaScraper:
    """Collects article URLs from Wikipedia category pages through wikipediaapi."""

    def __init__(self, language=WIKI_LANGUAGE, user_agent='MyFactCheckTool/1.0'):
        """
        Build the wikipediaapi client.

        language: Wikipedia language edition to query.
        user_agent: User-Agent string passed to the client.
        """
        self.wiki = wikipediaapi.Wikipedia(
            user_agent=user_agent,
            language=language,
            extract_format=wikipediaapi.ExtractFormat.WIKI,
        )

    def get_science_articles(self, categories, max_articles=10):
        """
        Return the full URLs of up to `max_articles` articles per category.

        Categories whose page does not exist are reported on stdout and skipped.
        """
        collected_urls = []
        for category_name in tqdm(categories, desc="Fetching Categories"):
            category_page = self.wiki.page(f'Category:{category_name}')
            if category_page.exists():
                members = self._get_articles_from_category(category_page, max_articles)
                collected_urls.extend(member.fullurl for member in members)
            else:
                print(f"Category '{category_name}' does not exist.")
        print(f"Fetched a total of {len(collected_urls)} articles from Wikipedia.")
        return collected_urls

    def _get_articles_from_category(self, category_page, max_articles):
        """
        Return the first `max_articles` members of `category_page` that live in
        the main namespace (sub-categories and other namespaces are ignored).
        """
        selected = []
        for member in category_page.categorymembers.values():
            # Stop as soon as the quota is filled; later members cannot be added.
            if len(selected) >= max_articles:
                break
            if member.ns == wikipediaapi.Namespace.MAIN:
                selected.append(member)
        return selected

In [None]:
class WebScraper:
    """Downloads articles and extracts cleaned sentences classified as factual."""

    def __init__(self, urls):
        """urls: iterable of article URLs to scrape."""
        self.urls = urls

    def extract_statements(self):
        """
        Extracts factual statements from the list of URLs.

        Returns a list of dicts with keys 'Statement' (cleaned sentence) and
        'Source' (the URL it came from). A URL that fails to download or parse
        is reported on stdout and skipped; statements already collected from
        other URLs are kept.
        """
        statements = []
        for url in tqdm(self.urls, desc="Scraping Articles"):
            try:
                article = Article(url)
                article.download()
                article.parse()
                # Split into real paragraphs first. Sentence-tokenizing the whole
                # text at once does not necessarily break at line breaks, so a
                # heading could be fused with the sentence that follows it, and
                # every sentence would then be tokenized a second time below.
                paragraphs = self._split_paragraphs(article.text)
                for claim in self.extract_factual_claims(paragraphs):
                    cleaned_claim = clean_sentence(claim)
                    if cleaned_claim:  # Ensure the claim is not empty after cleaning
                        statements.append({'Statement': cleaned_claim, 'Source': url})
            except Exception as e:
                # Deliberate best-effort: one bad article must not abort the run.
                print(f"Error processing {url}: {e}")
        print(f"Extracted a total of {len(statements)} factual statements from articles.")
        return statements

    @staticmethod
    def _split_paragraphs(text):
        """Split text on line breaks into stripped, non-empty paragraphs."""
        return [line.strip() for line in (text or '').splitlines() if line.strip()]

    def extract_factual_claims(self, paragraphs):
        """
        Identifies factual claims within paragraphs.

        Each paragraph is split into sentences, and every sentence accepted by
        is_factual is returned, in document order.
        """
        factual_claims = []
        for paragraph in paragraphs:
            for sentence in sent_tokenize(paragraph):
                if self.is_factual(sentence):
                    factual_claims.append(sentence)
        return factual_claims

    def is_factual(self, sentence):
        """
        Determines if a sentence is a factual claim using the claim detection pipeline.

        Returns False for sentences longer than MAX_SENTENCE_LENGTH and when the
        pipeline raises (the error is printed).
        """
        if len(sentence) > MAX_SENTENCE_LENGTH:
            return False  # Skip sentences that are too long
        candidate_labels = ['factual', 'opinion']
        try:
            result = claim_detection_pipeline(sentence, candidate_labels)
            # Check if the highest scored label is 'factual' and score > 0.5
            return result['labels'][0] == 'factual' and result['scores'][0] > 0.5
        except Exception as e:
            print(f"Error in claim detection: {e}")
            return False

In [None]:
class FactChecker:
    """Scores statements with a text-classification pipeline."""

    def __init__(self, pipeline):
        """
        Initializes the FactChecker with a Hugging Face fact verification pipeline.

        pipeline: callable taking a statement string and returning a list whose
        first item is a list of {'label': ..., 'score': ...} dicts.
        """
        self.pipeline = pipeline

    def verify_statement(self, statement):
        """
        Scores a statement with the fact verification pipeline.

        Returns the list of per-label scores, in the order the pipeline reports
        the labels. Returns the string 'Unknown' when the pipeline yields no
        result or raises (the error is printed).
        """
        try:
            result = self.pipeline(statement)
            if not result:
                return 'Unknown'

            # result[0] holds the score dicts for the single input statement.
            # NOTE(review): the notebook assumes the order LABEL_0 = NOT ENOUGH INFO,
            # LABEL_1 = SUPPORTS, LABEL_2 = REFUTES; confirm against the model's
            # id2label mapping before relying on score positions.
            labels = result[0]
            if not labels:
                return 'Unknown'
            return [entry['score'] for entry in labels]
        except Exception as e:
            print(f"Error in fact verification: {e}")
            return 'Unknown'

In [None]:
class DomainClassifier:
    """Assigns a domain to a statement by case-insensitive keyword lookup."""

    def __init__(self, keywords=DOMAIN_KEYWORDS):
        """keywords: mapping of domain name -> list of lowercase keywords."""
        self.keywords = keywords

    def classify_domain(self, statement):
        """
        Return the first domain (in mapping order) with a keyword that occurs
        as a substring of the lowercased statement, or 'Unknown' if none does.
        """
        text = statement.lower()
        matching_domains = (
            domain
            for domain, terms in self.keywords.items()
            if any(term in text for term in terms)
        )
        return next(matching_domains, 'Unknown')

In [None]:
# Stage 1: collect article URLs for every configured Wikipedia category.
wiki_scraper = WikipediaScraper()
article_urls = wiki_scraper.get_science_articles(
    categories=SCIENCE_CATEGORIES,
    max_articles=MAX_ARTICLES_PER_CATEGORY,
)

# Stage 2: download each article and keep the sentences classified as factual.
web_scraper = WebScraper(urls=article_urls)
statements = web_scraper.extract_statements()



In [None]:
def get_label(probabilities, min_first=0.7, max_second=0.3):
    """
    Turn the per-label scores from FactChecker.verify_statement into a boolean label.

    probabilities: sequence of scores, in the order the pipeline reported them.
    min_first: the first score must be strictly greater than this.
    max_second: the second score must be strictly less than this.

    Returns True when both conditions hold and False otherwise. Returns None
    when no usable scores are available, e.g. the 'Unknown' sentinel that
    verify_statement returns on failure, or a sequence with fewer than two entries.

    NOTE(review): which class each position stands for depends on the model's
    label order; confirm it before interpreting the resulting label.
    """
    # Indexing the 'Unknown' string would compare a character with a float and
    # raise TypeError, so reject strings and missing values up front.
    if probabilities is None or isinstance(probabilities, str):
        return None
    try:
        first, second = probabilities[0], probabilities[1]
    except (IndexError, KeyError, TypeError):
        return None
    return bool(first > min_first and second < max_second)

In [None]:
# Fixed seed so the shuffled row order is reproducible across runs.
SHUFFLE_SEED = 42

fact_checker = FactChecker(fact_verification_pipeline)
# Keyword-based classifier driven by DOMAIN_KEYWORDS; there is no model to train.
domain_classifier = DomainClassifier()

# Process Statements: attach the fact-check scores and the domain to each
# statement dict in place.
for stmt in tqdm(statements, desc="Processing Statements"):
    statement_text = stmt['Statement']
    stmt['Label'] = fact_checker.verify_statement(statement_text)
    stmt['Domain'] = domain_classifier.classify_domain(statement_text)

# Create DataFrame
df = pd.DataFrame(statements, columns=['Statement', 'Label', 'Domain', 'Source'])
# Collapse the per-label scores into a single label.
df['Label'] = df['Label'].apply(get_label)
# Drop statements that matched no domain keyword.
df = df[df['Domain'] != 'Unknown']
# Keep only claims that start with a capital letter.
df = df[df['Statement'].str[0].str.isupper()]
# Shuffle the rows.
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Save to Excel (pandas needs an Excel writer engine such as openpyxl for .xlsx).
df.to_excel('statements.xlsx', index=False)

# Display the DataFrame. This must be the last expression of the cell to be
# rendered; an expression in the middle of a cell produces no output.
df.head(50)