In [22]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [52]:
#Scraping 100 headlines from a Fox News search query for "campus protests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from bs4 import BeautifulSoup
import time

# Set up the Selenium WebDriver
driver = webdriver.Chrome()  # or webdriver.Firefox(), etc.

# Navigate to the page
url = "https://www.foxnews.com/search-results/search?q=campus%20protest"
driver.get(url)

# Function to click the "Load More" button
def click_load_more(driver):
    try:
        load_more_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//div[contains(@class, 'button load-more')]/a/span[contains(text(), 'Load More')]"))
        )
        driver.execute_script("arguments[0].click();", load_more_button)
        time.sleep(5)  # Wait for new results to load
        return True
    except (NoSuchElementException, TimeoutException):
        print("No 'Load more results' button found or timeout occurred.")
        return False

# Initialize an empty set to store unique headlines
all_headlines = set()

# Loop to click "Load More" until we have at least 100 headlines
while len(all_headlines) < 100:
    # Get the page source after JavaScript has run
    html_content = driver.page_source
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find the headlines (you'll need to inspect the loaded page to find the correct selectors)
    headlines = soup.select('h2.title')  # Replace with the correct selector
    # Extract text from each headline and add to the set
    for headline in headlines:
        all_headlines.add(headline.text.strip())
    # Check if we have enough headlines
    if len(all_headlines) >= 100:
        break
    # Click the "Load More" button
    if not click_load_more(driver):
        # Print the page source for debugging
        with open('debug.html', 'w', encoding='utf-8') as f:
            f.write(driver.page_source)
        break

# Convert the set to a list and print the first 100 headlines
all_headlines = list(all_headlines)
for headline in all_headlines[:100]:
    print(headline)

# Don't forget to close the browser
driver.quit()

Ivy League anti-Israel agitators' protests spiral into 'actual terror organization,' professor warns
Anti-Israel agitators at MIT take down barrier, retake campus encampment after police cleared it
Emory University rips anti-Israel 'activists' disrupting campus; police use tear gas, zip-ties during arrests
Minnesota police clear out anti-Israel protest in the heart of Ilhan Omar's congressional district
Columbia University locks down campus buildings following overnight mutiny: 'Effective immediately'
College protests reveal alarming terrorist support. And jihadis cheer them on
Universities crack down on anti-Israel agitators as protesters call for 'amnesty'
More wild anti-Israel protesters descend on Columbia University lawn vowing to 'hold this line'
NYPD removes Palestinian flag from CCNY campus, reraises American flag after anti-Israel protest
White House condemns ‘blatantly antisemitic’ protests as agitators engulf Columbia University
Ilhan Omar daughter barred from campus housing

In [78]:
#Putting the scraped headlines into a dataframe
df = pd.DataFrame(all_headlines, columns= ['Headline'])
df.tail()

Unnamed: 0,Headline
97,Police arrest thousands at colleges across the...
98,Police make arrests at GWU anti-Israel encampm...
99,UCLA anti-Israel protesters ask supporters for...
100,"UT Austin protests descend into chaos, anti-Is..."
101,UNC Chapel Hill board votes to dismantle DEI p...


In [79]:
#Cleaning the dataframe
pattern = r'^(By Content|Date Range|to)$'
df_clean = df[~df['Headline'].str.match(pattern)]
df_clean.head(20)

Unnamed: 0,Headline
0,Ivy League anti-Israel agitators' protests spi...
1,Anti-Israel agitators at MIT take down barrier...
2,Emory University rips anti-Israel 'activists' ...
3,Minnesota police clear out anti-Israel protest...
4,Columbia University locks down campus building...
5,College protests reveal alarming terrorist sup...
6,Universities crack down on anti-Israel agitato...
7,More wild anti-Israel protesters descend on Co...
8,NYPD removes Palestinian flag from CCNY campus...
9,White House condemns ‘blatantly antisemitic’ p...


In [81]:
#Saving the dataframe as a csv file
df_clean.to_csv('fox_headline_final.csv', index=False)

In [83]:
#Now we will run the textual analysis
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from io import StringIO  # Add this import

# Download necessary NLTK data
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to perform sentiment analysis on a text
def analyze_sentiment(df_clean):
    return sia.polarity_scores(df_clean)

# Apply sentiment analysis to the 'headline' column
df_clean['sentiment'] = df_clean['Headline'].apply(analyze_sentiment)

# Extract compound sentiment score
df_clean['compound_sentiment'] = df_clean['sentiment'].apply(lambda x: x['compound'])

# Calculate average sentiment
average_sentiment = df_clean['compound_sentiment'].mean()

print("Sentiment analysis results:")
print(f"Average sentiment score: {average_sentiment:.4f}")

# Display individual headline sentiments
print("\nIndividual headline sentiments:")
for index, row in df_clean.iterrows():
    print(f"Headline: {row['Headline']}")
    print(f"Sentiment: {row['compound_sentiment']:.4f}")
    print()

Sentiment analysis results:
Average sentiment score: -0.3176

Individual headline sentiments:
Headline: Ivy League anti-Israel agitators' protests spiral into 'actual terror organization,' professor warns
Sentiment: -0.8316

Headline: Anti-Israel agitators at MIT take down barrier, retake campus encampment after police cleared it
Sentiment: -0.4939

Headline: Emory University rips anti-Israel 'activists' disrupting campus; police use tear gas, zip-ties during arrests
Sentiment: -0.4404

Headline: Minnesota police clear out anti-Israel protest in the heart of Ilhan Omar's congressional district
Sentiment: 0.1531

Headline: Columbia University locks down campus buildings following overnight mutiny: 'Effective immediately'
Sentiment: 0.4767

Headline: College protests reveal alarming terrorist support. And jihadis cheer them on
Sentiment: -0.2732

Headline: Universities crack down on anti-Israel agitators as protesters call for 'amnesty'
Sentiment: -0.6124

Headline: More wild anti-Israel

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/sofiaahmed/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['sentiment'] = df_clean['Headline'].apply(analyze_sentiment)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['compound_sentiment'] = df_clean['sentiment'].apply(lambda x: x['compound'])
