In [None]:
    !pip install requests beautifulsoup4 vaderSentiment



In [None]:
import requests
from bs4 import BeautifulSoup
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Function to scrape headlines from The Indian Express website using the updated CSS selector
def scrape_headlines():
    """Fetch the Indian Express home page and return its latest-news headlines.

    Returns:
        A list of non-empty headline strings in page order. The list is empty
        when the page cannot be fetched, the server answers with a non-200
        status, or no element matches the CSS selector.
    """
    url = "https://indianexpress.com/"  # You can replace this with any news website

    try:
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        # DNS failures, refused connections and timeouts follow the same
        # "report and return nothing" convention as a bad status code.
        print(f"Failed to retrieve the webpage: {e}")
        return []

    if response.status_code != 200:
        print("Failed to retrieve the webpage")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Scrape the headlines using the updated CSS path
    headlines = []
    try:
        # Use the provided CSS selector for selecting the headline section
        headline_elements = soup.select('div#HP_LATEST_NEWS.lead-stories.event-track-class.single_latest_news div.left-part div.other-article')  # Adjusting the selector

        for element in headline_elements:
            headline = element.get_text(strip=True)
            if headline:  # skip elements that contain no visible text
                headlines.append(headline)
    except Exception as e:
        # Best effort: keep whatever was collected before the failure.
        print(f"Error while scraping: {e}")

    return headlines

# Function to analyze sentiment of the headlines
def analyze_sentiment(headlines):
    """Classify each headline with VADER and summarise the overall sentiment.

    Args:
        headlines: List of headline strings.

    Returns:
        The string "No headlines to analyze" when ``headlines`` is empty.
        Otherwise a dict with the keys:
            "sentiment_analysis": counts per class ("positive", "negative",
                "neutral"),
            "overall_sentiment": "Overall Positive", "Overall Negative" or
                "Neutral" (the latter when positive and negative tie),
            "positive_percentage", "negative_percentage",
            "neutral_percentage": share of each class, in percent.
    """
    # Bail out before building the analyzer: nothing to score.
    total_headlines = len(headlines)
    if total_headlines == 0:
        return "No headlines to analyze"

    analyzer = SentimentIntensityAnalyzer()
    sentiments = {"positive": 0, "negative": 0, "neutral": 0}

    for headline in headlines:
        sentiment_score = analyzer.polarity_scores(headline)['compound']

        # Thresholds follow the VADER README: the boundaries +/-0.05 are
        # inclusive for the positive/negative classes.
        if sentiment_score >= 0.05:
            sentiments["positive"] += 1
        elif sentiment_score <= -0.05:
            sentiments["negative"] += 1
        else:
            sentiments["neutral"] += 1

    positive_percentage = (sentiments["positive"] / total_headlines) * 100
    negative_percentage = (sentiments["negative"] / total_headlines) * 100
    neutral_percentage = (sentiments["neutral"] / total_headlines) * 100

    # Assess overall positivity: only positive vs. negative shares are compared;
    # the neutral share does not influence the verdict.
    if positive_percentage > negative_percentage:
        overall_sentiment = "Overall Positive"
    elif negative_percentage > positive_percentage:
        overall_sentiment = "Overall Negative"
    else:
        overall_sentiment = "Neutral"

    return {
        "sentiment_analysis": sentiments,
        "overall_sentiment": overall_sentiment,
        "positive_percentage": positive_percentage,
        "negative_percentage": negative_percentage,
        "neutral_percentage": neutral_percentage
    }

# Main script: run the scraping and then the sentiment analysis

# Step 1: Scrape headlines from The Indian Express
headlines = scrape_headlines()

if not headlines:
    # analyze_sentiment returns a plain string for empty input, so indexing its
    # result like a dict would raise TypeError; stop here instead.
    print("No headlines found.")
else:
    print(f"Scraped {len(headlines)} headlines:")
    for idx, headline in enumerate(headlines, 1):
        print(f"{idx}. {headline}")

    # Step 2: Perform sentiment analysis on the headlines
    analysis_result = analyze_sentiment(headlines)

    # Display the sentiment analysis results
    print("\nSentiment Analysis Results:")
    print(f"Positive Headlines: {analysis_result['sentiment_analysis']['positive']}")
    print(f"Negative Headlines: {analysis_result['sentiment_analysis']['negative']}")
    print(f"Neutral Headlines: {analysis_result['sentiment_analysis']['neutral']}")
    print(f"Overall Sentiment: {analysis_result['overall_sentiment']}")
    print(f"Positive Headlines Percentage: {analysis_result['positive_percentage']:.2f}%")
    print(f"Negative Headlines Percentage: {analysis_result['negative_percentage']:.2f}%")
    print(f"Neutral Headlines Percentage: {analysis_result['neutral_percentage']:.2f}%")



Scraped 15 headlines:
1. Express ResearchHow Jawaharlal Nehru wrote the history of India
2. ExplainedWhy Bangladesh is rewriting textbooks on 1971 Liberation WarSign In to read
3. SC to hear on February 17 Owaisi’s plea seeking implementation of Places of Worship Act
4. Express OpinionPicture of a Gurugram farmer beside London Bridge shows aspiration of our timesSubscriber Only
5. Watch videoMoment when Tesla Cybertruck burst into flames outside Las Vegas hotel
6. 337 mt of toxic waste: The challenge as work begins to clear Bhopal gas tragedy site
7. Who’s going to keep their VIP security? VK Singh, Gautam Gambhir among those facing MHA review
8. Long ReadsHow Kashmir is curbing heart attack deathsSubscriber Only
9. Maharashtra polls done and dusted, calls from both NCP factions for 'reunion'
10. Border-Gavaskar TrophyAfter omitting him for Melbourne, India need Gill in must-win 5th Test
11. The story of Binodini Dasi, after whom Star Theatre in Kolkata has now been namedSubscriber Onl

In [None]:
# Category labels, one per headline, paired with the headlines by position.
# This list holds exactly 15 entries; adjust it whenever the number of
# headlines changes.
categories = [
    "Technology",
    "Politics",
    "Sports",
    "Business",
    "Entertainment",
    "Politics",
    "Lifestyle",
    "Sports",
    "Politics",
    "Technology",
    "Health",
    "Business",
    "Lifestyle",
    "Technology",
    "Entertainment",
]


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [None]:

# Fail fast: headlines and labels are paired by position, so their counts must
# match before any preprocessing is done.
if len(headlines) != len(categories):
    raise ValueError(
        f"Got {len(headlines)} headlines but {len(categories)} category labels; "
        "both must have the same length."
    )

# Tokenize the headlines
tokenizer = Tokenizer(num_words=10000)  # Top 10,000 words
tokenizer.fit_on_texts(headlines)
X = tokenizer.texts_to_sequences(headlines)

# Pad the sequences to ensure all have the same length
X_padded = pad_sequences(X, padding='post', maxlen=50)

# Label encoding for the categories
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(categories)  # Convert labels to integers

# Train-test split (20% held out; fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)
