In [7]:
#Webscraping 100 headlines from CNN search results for "campus protest"
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd


def scrape_page(driver, url):
    driver.get(url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.container__headline-text')))
    html_content = driver.page_source
    soup = BeautifulSoup(html_content, 'html.parser')
    headlines = soup.select('span.container__headline-text')
    return [headline.text.strip() for headline in headlines]

# Set up the Selenium WebDriver
driver = webdriver.Chrome()  # or webdriver.Firefox(), etc.

base_url = "https://www.cnn.com/search?q=campus+protest&from={}&size=10&page=1&sort=relevance&types=all&section="
num_pages = 10  # Number of pages you want to scrape

all_headlines = []
page_numbers = []

try:
    for page in range(num_pages):
        from_param = page * 10
        url = base_url.format(from_param)
        print(f"Scraping page {page + 1}...")
        page_headlines = scrape_page(driver, url)
        all_headlines.extend(page_headlines)
        page_numbers.extend([page + 1] * len(page_headlines))
        print(f"Found {len(page_headlines)} headlines on page {page + 1}")


finally:
    driver.quit()

# Create a DataFrame
df = pd.DataFrame({
    'Headline': all_headlines,
})

# Display the first few rows of the DataFrame
print(df.head())

# Save the DataFrame to a CSV file
cnn_headlines = f"cnn_headlines_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
df.to_csv(cnn_headlines, index=False)
print(f"Data saved to {cnn_headlines}")


Scraping page 1...
Found 10 headlines on page 1
Scraping page 2...
Found 10 headlines on page 2
Scraping page 3...
Found 10 headlines on page 3
Scraping page 4...
Found 10 headlines on page 4
Scraping page 5...
Found 10 headlines on page 5
Scraping page 6...
Found 10 headlines on page 6
Scraping page 7...
Found 10 headlines on page 7
Scraping page 8...
Found 10 headlines on page 8
Scraping page 9...
Found 10 headlines on page 9
Scraping page 10...
Found 10 headlines on page 10
                                            Headline
0  What we know about the protests erupting on co...
1                     Campus protests: now, and then
2  In pictures: A lookback at student protest mov...
3  Clashes escalate at campus protests nationwide...
4         Arrests at U.S. university campus protests


NameError: name 'datetime' is not defined

In [9]:
df.head(20)

Unnamed: 0,Headline
0,What we know about the protests erupting on co...
1,"Campus protests: now, and then"
2,In pictures: A lookback at student protest mov...
3,Clashes escalate at campus protests nationwide...
4,Arrests at U.S. university campus protests
5,Smerconish: Campus protests shouldn't upend cl...
6,"At the student protest at UPenn, passions are ..."
7,Outsiders left UCLA protesters beaten and bloody
8,Video shows protests at UCLA as violent confro...
9,Opinion: Student protests are what created the...


In [38]:
df.shape[0]

100

In [1]:
df.head(20)

NameError: name 'df' is not defined

In [39]:
len(df)

100

In [40]:
#Now we will run the textual analysis
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from io import StringIO  # Add this import

# Download necessary NLTK data
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to perform sentiment analysis on a text
def analyze_sentiment(df):
    return sia.polarity_scores(df)

# Apply sentiment analysis to the 'headline' column
df['sentiment'] = df['Headline'].apply(analyze_sentiment)

# Extract compound sentiment score
df['compound_sentiment'] = df['sentiment'].apply(lambda x: x['compound'])

# Calculate average sentiment
average_sentiment = df['compound_sentiment'].mean()

print("Sentiment analysis results:")
print(f"Average sentiment score: {average_sentiment:.4f}")

# Display individual headline sentiments
print("\nIndividual headline sentiments:")
for index, row in df.iterrows():
    print(f"Headline: {row['Headline']}")
    print(f"Sentiment: {row['compound_sentiment']:.4f}")
    print()

Sentiment analysis results:
Average sentiment score: -0.2825

Individual headline sentiments:
Headline: What we know about the protests erupting on college campuses across America
Sentiment: -0.2263

Headline: Campus protests: now, and then
Sentiment: -0.2263

Headline: In pictures: A lookback at student protest movements in the US
Sentiment: -0.2500

Headline: Clashes escalate at campus protests nationwide as law enforcement makes mass arrests
Sentiment: -0.5859

Headline: Arrests at U.S. university campus protests
Sentiment: -0.5859

Headline: Smerconish: Campus protests shouldn't upend classes or graduation
Sentiment: -0.2263

Headline: At the student protest at UPenn, passions are clear and dialogue is lacking
Sentiment: 0.5859

Headline: Outsiders left UCLA protesters beaten and bloody
Sentiment: -0.7650

Headline: Video shows protests at UCLA as violent confrontation breaks out
Sentiment: -0.7964

Headline: Opinion: Student protests are what created the university as we know it
S

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/sofiaahmed/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
