# Part 1: Webscrapping Benchmarking

The news website I have chosen for this project contains a number of articles and it will help me calculate the execution times for web scrapping using two different libraries 'BeautifulSoup' and 'Selenium'. First, let's see how we can scrape data using beautifulSoup.

## Webscrapping using beautifulSoup

In [1]:
import urllib.request,sys,time
from bs4 import BeautifulSoup
import requests
import pandas as pd
import sys
import time
import numpy as np

In [2]:
pagesToGet = 1
upperframe = []

# Start measuring time
start_time = time.time()

for page in range(1, pagesToGet + 1):
    print('Processing page:', page)
    url = 'https://www.politifact.com/factchecks/list/?page=' + str(page)
    print(url)

    try:
        page = requests.get(url)  # Request the webpage
    except Exception as e:
        error_type, error_obj, error_info = sys.exc_info()
        print('ERROR FOR LINK:', url)
        print(error_type, 'Line:', error_info.tb_lineno)
        continue

    time.sleep(2)  # Pause to avoid being blocked
    soup = BeautifulSoup(page.text, 'html.parser')
    frame = []
    links = soup.find_all('li', attrs={'class': 'o-listicle__item'})
    num_articles = len(links)
    print(f'Number of articles found on page: {num_articles}')

    filename = "NEWS_beautifulSoup.csv"
    with open(filename, "w", encoding="utf-8") as f:
        headers = "Statement,Link,Source\n"
        f.write(headers)

        for j in links:
            Statement = j.find("div", attrs={'class': 'm-statement__quote'}).text.strip()
            Link = "https://www.politifact.com" + j.find("div", attrs={'class': 'm-statement__quote'}).find('a')['href'].strip()
            Source = j.find('div', attrs={'class': 'm-statement__meta'}).find('a').text.strip()
            
            frame.append((Statement, Link, Source))
            f.write(Statement.replace(",", "^") + "," + Link + "," +  "," + Source.replace(",", "^") + "," + "\n")

    upperframe.extend(frame)

# End measuring time
end_time = time.time()
execution_time = end_time - start_time


Processing page: 1
https://www.politifact.com/factchecks/list/?page=1
Number of articles found on page: 30


In [3]:

print(f"\nTotal Execution Time: {execution_time:.2f} seconds")

# Convert data to a Pandas DataFrame
data = pd.DataFrame(upperframe, columns=['Statement', 'Link',  'Source'])
data.head()

# Save the DataFrame to a CSV file
data.to_csv('NEWS_beautifulSoup.csv', index=False)



Total Execution Time: 3.43 seconds


In [4]:
data.head()

Unnamed: 0,Statement,Link,Source
0,“Hillary Clinton was sending classified docume...,https://www.politifact.com/factchecks/2025/mar...,Markwayne Mullin
1,Image shows a “person dressed up as Pikachu to...,https://www.politifact.com/factchecks/2025/mar...,Social Media
2,North Carolina “always had nonpartisan judicia...,https://www.politifact.com/factchecks/2025/mar...,Sydney Batch
3,Videos show tornadoes hitting Seattle on March...,https://www.politifact.com/factchecks/2025/mar...,TikTok posts
4,“We have automobile plants being built at leve...,https://www.politifact.com/factchecks/2025/mar...,Donald Trump


## Webscrapping using Selenium

let's use the same website to scrape data using library 'Selenium'

In [5]:
pip install selenium pandas webdriver-manager





In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager


# Set up Selenium WebDriver options (headless mode for faster execution)
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in the background (no GUI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Initialize WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Start timing the execution
start_time = time.time()

# Define the number of pages to scrape
pages_to_get = 1  
base_url = "https://www.politifact.com/factchecks/list/?page="

# Store scraped data
scraped_data = []

for page in range(1, pages_to_get + 1):
    url = base_url + str(page)
    driver.get(url)
    time.sleep(2)  # Allow time for JavaScript to load

    # Locate elements
    articles = driver.find_elements(By.CSS_SELECTOR, "li.o-listicle__item")

    for article in articles:
        try:
            statement = article.find_element(By.CSS_SELECTOR, "div.m-statement__quote").text.strip()
        except:
            statement = "No statement found"
        
        try:
            link = article.find_element(By.CSS_SELECTOR, "div.m-statement__quote a").get_attribute("href")
        except:
            link = "No link found"

        try:
            source = article.find_element(By.CSS_SELECTOR, "div.m-statement__meta a").text.strip()
        except:
            source = "No source found"

        scraped_data.append((statement, link, source))

# Stop timing the execution
end_time = time.time()
execution_time = end_time - start_time

# Convert to Pandas DataFrame
df = pd.DataFrame(scraped_data, columns=["Statement", "URL", "Source"])

# Print execution time
print(f"Execution Time: {execution_time:.6f} seconds")
print(f"Number of articles: {len(articles)}")

# Save results
df.to_csv("NEWS_Selenium.csv", index=False)

# Close the browser
driver.quit()




Execution Time: 5.457042 seconds
Number of articles: 30


In [7]:
df.head()

Unnamed: 0,Statement,URL,Source
0,“Hillary Clinton was sending classified docume...,https://www.politifact.com/factchecks/2025/mar...,Markwayne Mullin
1,Image shows a “person dressed up as Pikachu to...,https://www.politifact.com/factchecks/2025/mar...,Social Media
2,North Carolina “always had nonpartisan judicia...,https://www.politifact.com/factchecks/2025/mar...,Sydney Batch
3,Videos show tornadoes hitting Seattle on March...,https://www.politifact.com/factchecks/2025/mar...,TikTok posts
4,“We have automobile plants being built at leve...,https://www.politifact.com/factchecks/2025/mar...,Donald Trump


Analysing the total execution time of BeautifulSoup library and Selenium library, it is found that BeautifulSoup execution time is faster than Selenium execution time. Hence we can move forward with BeautifulSoup library to scrape 100 or more articles from the same website we are using. 

In [8]:
# Number of articles needed
articles_needed = 100
collected_articles = 0

# Start measuring time
start_time = time.time()

# Storage for articles
article_list = []

# Start scraping
page = 1

while collected_articles < articles_needed:
    print(f'Processing page {page}...')
    url = f'https://www.politifact.com/factchecks/list/?page={page}'
    
    try:
        page_response = requests.get(url, timeout=10)  # Request with timeout
    except Exception as e:
        error_type, error_obj, error_info = sys.exc_info()
        print(f'ERROR FOR LINK: {url}')
        print(error_type, 'Line:', error_info.tb_lineno)
        continue

    time.sleep(2)  # Pause to avoid being blocked
    soup = BeautifulSoup(page_response.text, 'html.parser')
    links = soup.find_all('li', attrs={'class': 'o-listicle__item'})

    # Extract articles from the page
    for j in links:
        if collected_articles >= articles_needed:
            break  # Stop once we reach 100 articles
        
        try:
            Statement = j.find("div", attrs={'class': 'm-statement__quote'}).text.strip()
            Link = "https://www.politifact.com" + j.find("div", attrs={'class': 'm-statement__quote'}).find('a')['href'].strip()
            Source = j.find('div', attrs={'class': 'm-statement__meta'}).find('a').text.strip()
            
            article_list.append((Statement, Link, Source))
            collected_articles += 1

        except AttributeError:
            continue  # Skip if any element is missing

    print(f"Total articles collected so far: {collected_articles}")

    page += 1  # Move to the next page

# Save to CSV
data_100 = pd.DataFrame(article_list, columns=['Statement', 'Link', 'Source'])
data_100.to_csv("NEWS_100_articles.csv", index=False, encoding="utf-8")

# End measuring time
end_time = time.time()
execution_time = end_time - start_time

print(f"\nTotal Articles Scraped: {len(data_100)}")
print(f"Execution Time: {execution_time:.2f} seconds")
print("Shape: ", data_100.shape)
print("\nData saved to NEWS_100_articles.csv")

Processing page 1...
Total articles collected so far: 30
Processing page 2...
Total articles collected so far: 60
Processing page 3...
Total articles collected so far: 90
Processing page 4...
Total articles collected so far: 100

Total Articles Scraped: 100
Execution Time: 9.28 seconds
Shape:  (100, 3)

Data saved to NEWS_100_articles.csv


In the above code, I have scrapped 4 pages to scrape 100 articles using BeautifulSoup. The total execution time for the 100 articles is calculated above.

# Part 2: Text Analysis

In [9]:
!pip install vaderSentiment




In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# TF-IDF and VADER

In [11]:
# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
X = vectorizer.fit_transform(data_100['Statement'])

# Get top words contributing to importance
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = np.mean(X.toarray(), axis=0)
important_words = dict(zip(feature_names, tfidf_scores))

# Initialize Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

def analyze_text(text):
    sentiment = analyzer.polarity_scores(text)
    importance_score = sum([important_words.get(word, 0) for word in text.split()])  # Sum TF-IDF scores
    direction = "Positive" if sentiment['compound'] > 0 else "Negative"
    return importance_score, direction

# Apply analysis to each article
data_100[['Importance Score', 'Direction']] = data_100['Statement'].apply(lambda x: pd.Series(analyze_text(x)))

# Display results
data_100[['Statement', 'Importance Score', 'Direction']].head(10)

Unnamed: 0,Statement,Importance Score,Direction
0,“Hillary Clinton was sending classified docume...,0.0,Negative
1,Image shows a “person dressed up as Pikachu to...,0.045012,Negative
2,North Carolina “always had nonpartisan judicia...,0.031774,Negative
3,Videos show tornadoes hitting Seattle on March...,0.0,Negative
4,“We have automobile plants being built at leve...,0.011346,Negative
5,Phoenix’s increased wildfire risk is linked to...,0.0,Negative
6,“Canada makes bold decision to shut down Tesla...,0.0,Positive
7,“Elon Musk exposes hidden $20K roof grant. Con...,0.0,Positive
8,"""Brad Schimel said that he wanted to be part o...",0.016785,Positive
9,Video shows the Bitcoin whitepaper “was spotte...,0.045012,Negative


In [12]:
import nltk
from nltk.tokenize import sent_tokenize

# Ensure NLTK sentence tokenizer is downloaded
nltk.download('punkt')


# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
X = vectorizer.fit_transform(df['Statement'])

# Get top words contributing to importance
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = np.mean(X.toarray(), axis=0)
important_words = dict(zip(feature_names, tfidf_scores))

# Initialize Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

def analyze_text(text):
    # Compute sentiment scores
    sentiment = analyzer.polarity_scores(text)
    
    # Compute importance score based on TF-IDF word weights
    importance_score = sum([important_words.get(word, 0) for word in text.split()])
    
    # Determine direction
    direction = "Positive" if sentiment['compound'] > 0 else "Negative"

    # Extract top 2 sentences with the highest sentiment polarity
    sentences = sent_tokenize(text)
    sentences_scored = [(sent, analyzer.polarity_scores(sent)['compound']) for sent in sentences]
    
    # Sort sentences by absolute polarity score and pick top 2
    top_sentences = sorted(sentences_scored, key=lambda x: abs(x[1]), reverse=True)[:2]
    summary = " ".join([s[0] for s in top_sentences])

    return summary, importance_score, direction

# Apply analysis to each article
data_100[['Summary', 'Importance Score', 'Direction']] = data_100['Statement'].apply(lambda x: pd.Series(analyze_text(x)))

# Reorder columns
data_100 = data_100[['Statement', 'Summary', 'Importance Score', 'Direction', 'Link', 'Source']]

# Display first few results
data_100.head()

# Save results to CSV
data_100.to_csv("NEWS_text_analysis_results.csv", index=False)
print("\nAnalysis completed. Data saved to NEWS_text_analysis_results.csv")



Analysis completed. Data saved to NEWS_text_analysis_results.csv


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sanda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
