In [3]:
# Scrape 100 headlines from The Washington Post's search results for the query "campus protest"
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import time
import pandas as pd
from datetime import datetime

def scrape_headlines(driver, selector='h2.wpds-c-gYjOKE', timeout=20):
    """Return the text of every headline currently rendered on the page.

    Blocks until at least one element matching ``selector`` is present in the
    DOM, then parses a snapshot of ``driver.page_source`` with BeautifulSoup.

    Args:
        driver: Selenium WebDriver that has already navigated to the results page.
        selector: CSS selector identifying headline elements. The same selector
            is used for the wait and for the parse so the two cannot drift apart.
            NOTE(review): the default class name looks auto-generated and may
            change when the site is redeployed - confirm before relying on it.
        timeout: Maximum number of seconds to wait for the first match.

    Returns:
        list[str]: Whitespace-stripped headline texts, in the order
        BeautifulSoup returns the matching elements.

    Raises:
        selenium.common.exceptions.TimeoutException: if no element matching
            ``selector`` appears within ``timeout`` seconds.
    """
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, selector))
    )
    # Parse a static snapshot instead of querying live elements one by one.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return [headline.text.strip() for headline in soup.select(selector)]

def click_load_more(driver, wait_seconds=5):
    """Click the "Load more results" button if it is on the page.

    Args:
        driver: Selenium WebDriver showing the search results page.
        wait_seconds: Seconds to sleep after the click so the next batch of
            results has time to load. Defaults to the original fixed 5 seconds.

    Returns:
        bool: True if the button was found and clicked, False if no such
        button exists on the page (a message is printed in that case).
    """
    try:
        load_more_button = driver.find_element(
            By.XPATH, "//button[contains(text(), 'Load more results')]"
        )
    except NoSuchElementException:
        print("No 'Load more results' button found.")
        return False

    # Click through JavaScript rather than WebElement.click();
    # presumably this avoids failures when the button is off-screen or covered - TODO confirm.
    driver.execute_script("arguments[0].click();", load_more_button)
    time.sleep(wait_seconds)  # Wait for new results to load
    return True

# Set up the Selenium WebDriver: headless Chrome with a desktop user agent and window size
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36')
options.add_argument('--window-size=1920,1080')
driver = webdriver.Chrome(options=options)

url = "https://www.washingtonpost.com/search/?query=campus%20protest&sort=Relevance&datefilter=All%20Since%202005"
target_headline_count = 100  # stop once this many headlines are on the page
max_load_attempts = 10       # consecutive polls without progress before giving up

all_headlines = []

try:
    driver.get(url)
    load_attempts = 0

    while len(all_headlines) < target_headline_count and load_attempts < max_load_attempts:
        # Each scrape returns every headline currently on the page, so a longer
        # list means a new batch has finished loading.
        new_headlines = scrape_headlines(driver)

        if len(new_headlines) > len(all_headlines):
            all_headlines = new_headlines
            print(f"Found {len(all_headlines)} headlines")

            # Target reached: skip the extra click and the waits that follow it.
            if len(all_headlines) >= target_headline_count:
                break

            if click_load_more(driver):
                load_attempts = 0
            else:
                load_attempts += 1
        else:
            load_attempts += 1
            print(f"No new headlines found. Attempt {load_attempts}/{max_load_attempts}")

        time.sleep(2)  # Wait a bit before next attempt

except Exception as e:
    # Best effort: report the failure and keep whatever was collected so far.
    print(f"An error occurred: {str(e)}")

finally:
    # Always release the browser process, even after an error.
    driver.quit()

# Results arrive in batches, so the last batch can overshoot the target; trim to it.
all_headlines = all_headlines[:target_headline_count]

# Take one timestamp so the date column and the file name always agree.
scraped_at = datetime.now()

# Create a DataFrame
df = pd.DataFrame({
    'Headline': all_headlines,
    'Date_Scraped': scraped_at.strftime("%Y-%m-%d")
})

# Display the first few rows of the DataFrame
print(df.head())

# Save the DataFrame to a CSV file
wa_po = f"wapo_headlines_{scraped_at.strftime('%Y%m%d_%H%M%S')}.csv"
df.to_csv(wa_po, index=False)
print(f"Data saved to {wa_po}")

# Print some basic statistics
print(f"\nTotal headlines scraped: {len(df)}")

Found 10 headlines
No new headlines found. Attempt 1/10
No new headlines found. Attempt 2/10
No new headlines found. Attempt 3/10
No new headlines found. Attempt 4/10
No new headlines found. Attempt 5/10
No new headlines found. Attempt 6/10
No new headlines found. Attempt 7/10
Found 20 headlines
No new headlines found. Attempt 1/10
No new headlines found. Attempt 2/10
No new headlines found. Attempt 3/10
No new headlines found. Attempt 4/10
No new headlines found. Attempt 5/10
No new headlines found. Attempt 6/10
Found 28 headlines
No new headlines found. Attempt 1/10
No new headlines found. Attempt 2/10
No new headlines found. Attempt 3/10
No new headlines found. Attempt 4/10
No new headlines found. Attempt 5/10
No new headlines found. Attempt 6/10
No new headlines found. Attempt 7/10
Found 35 headlines
No new headlines found. Attempt 1/10
No new headlines found. Attempt 2/10
No new headlines found. Attempt 3/10
No new headlines found. Attempt 4/10
No new headlines found. Attempt 5/10

In [4]:
df

Unnamed: 0,Headline,Date_Scraped
0,Opinion|Amnesty is the one demand campus prote...,2024-08-04
1,Opinion|The crackdown on campus protests has g...,2024-08-04
2,U.S. campus protests spread to the Middle East...,2024-08-04
3,College students are protesting schools’ ties ...,2024-08-04
4,‘Bunker mentality’ at Columbia lit protest spa...,2024-08-04
...,...,...
95,Opinion|‘Professor told he’s not safe on campu...,2024-08-04
96,U. Missouri leaders uphold decision to fire pr...,2024-08-04
97,Opinion|How civil rights law distorts the anti...,2024-08-04
98,Professor: If people can protest the national ...,2024-08-04


In [5]:
df.tail()

Unnamed: 0,Headline,Date_Scraped
95,Opinion|‘Professor told he’s not safe on campu...,2024-08-04
96,U. Missouri leaders uphold decision to fire pr...,2024-08-04
97,Opinion|How civil rights law distorts the anti...,2024-08-04
98,Professor: If people can protest the national ...,2024-08-04
99,UCLA ordered by judge to craft plan in support...,2024-08-04


In [8]:
# Run the sentiment analysis on the headlines
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from io import StringIO  # Add this import

# Download the VADER lexicon only when it is not already installed locally, so
# re-running this cell does not need network access.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Score a single text with the VADER analyzer created above
def analyze_sentiment(df):
    """Return the VADER polarity scores for one piece of text.

    Despite its name, ``df`` is a single text value, not a DataFrame: the
    function is applied element-wise to the 'Headline' column.

    Returns:
        The result of ``sia.polarity_scores(df)``; the calling code reads its
        'compound' entry.
    """
    scores = sia.polarity_scores(df)
    return scores

# Score every entry of the 'Headline' column; each 'sentiment' value is the
# result returned by analyze_sentiment for that headline
df['sentiment'] = df['Headline'].map(analyze_sentiment)

# Keep only the 'compound' score from each result in its own column
df['compound_sentiment'] = df['sentiment'].map(lambda scores: scores['compound'])

# Mean compound score across all headlines
average_sentiment = df['compound_sentiment'].mean()

print("Sentiment analysis results:")
print(f"Average sentiment score: {average_sentiment:.4f}")

# Per-headline listing: headline text followed by its compound score
print("\nIndividual headline sentiments:")
for headline, compound in zip(df['Headline'], df['compound_sentiment']):
    print(f"Headline: {headline}")
    print(f"Sentiment: {compound:.4f}")
    print()

Sentiment analysis results:
Average sentiment score: -0.2453

Individual headline sentiments:
Headline: Opinion|Amnesty is the one demand campus protesters should drop
Sentiment: -0.5423

Headline: Opinion|The crackdown on campus protests has gone way too far
Sentiment: -0.2263

Headline: U.S. campus protests spread to the Middle East and Europe
Sentiment: -0.2263

Headline: College students are protesting schools’ ties to Israel. Here’s why.
Sentiment: -0.4215

Headline: ‘Bunker mentality’ at Columbia lit protest spark that spread nationwide
Sentiment: -0.0258

Headline: Opinion|Call the campus protests what they are
Sentiment: -0.2263

Headline: Analysis | Americans are more likely to oppose than support campus protests
Sentiment: 0.2023

Headline: What college protests could look like across the U.S. as summer begins
Sentiment: 0.1531

Headline: Police clear University of Chicago camp in latest move to quell protests
Sentiment: 0.1779

Headline: In campus protests, students are wary

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/sofiaahmed/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
