In [1]:
# Advanced Current Affairs Web Scraper for Google Colab
# Optimized for comprehensive news aggregation from global sources

# ============================================================================
# INSTALLATION AND IMPORTS
# ============================================================================

# Install required libraries
!pip install requests beautifulsoup4 lxml[html_clean] pandas selenium newspaper3k feedparser aiohttp asyncio python-dateutil textstat

# For Selenium in Colab
!apt update
!apt install chromium-chromedriver

import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
import asyncio
import aiohttp
from datetime import datetime, timedelta
from urllib.parse import urljoin, urlparse
import re
import feedparser
import newspaper
from newspaper import Article
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from concurrent.futures import ThreadPoolExecutor, as_completed
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# COMPREHENSIVE NEWS SOURCES DATABASE
# ============================================================================

# Layout: region -> source name -> feed label -> RSS URL.
# The feed label is stored as each article's `category` by the scraper.
NEWS_SOURCES = {
    "global_english": {
        "BBC": {
            "rss": "https://feeds.bbci.co.uk/news/rss.xml",
            "world": "https://feeds.bbci.co.uk/news/world/rss.xml",
            "politics": "https://feeds.bbci.co.uk/news/politics/rss.xml",
            "business": "https://feeds.bbci.co.uk/news/business/rss.xml"
        },
        "CNN": {
            "top": "http://rss.cnn.com/rss/edition.rss",
            "world": "http://rss.cnn.com/rss/edition_world.rss",
            "politics": "http://rss.cnn.com/rss/edition_politics.rss",
            "business": "http://rss.cnn.com/rss/money_latest.rss"
        },
        # NOTE(review): the feeds.reuters.com endpoints are reported to have
        # been retired by Reuters -- confirm they still return entries.
        "Reuters": {
            "world": "http://feeds.reuters.com/Reuters/worldNews",
            "politics": "http://feeds.reuters.com/Reuters/PoliticsNews",
            "business": "http://feeds.reuters.com/reuters/businessNews",
            "top": "http://feeds.reuters.com/reuters/topNews"
        },
        "Guardian": {
            "world": "https://www.theguardian.com/world/rss",
            "politics": "https://www.theguardian.com/politics/rss",
            "business": "https://www.theguardian.com/business/rss"
        },
        "Al Jazeera": {
            "main": "https://www.aljazeera.com/xml/rss/all.xml"
        },
        "Associated Press": {
            "top": "https://apnews.com/rss"
        },
        "NPR": {
            "news": "https://feeds.npr.org/1001/rss.xml"
        }
    },
    "indian_english": {
        "Times of India": {
            "top": "https://timesofindia.indiatimes.com/rssfeedstopstories.cms",
            # The India feed previously repeated the World feed id (296589292),
            # so world news was fetched twice and labelled as India news.
            "india": "https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms",
            "world": "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms",
            "business": "https://timesofindia.indiatimes.com/rssfeeds/1898055.cms"
        },
        "Hindu": {
            "national": "https://www.thehindu.com/news/national/?service=rss",
            "international": "https://www.thehindu.com/news/international/?service=rss",
            "business": "https://www.thehindu.com/business/?service=rss"
        },
        "Indian Express": {
            "main": "https://indianexpress.com/feed/"
        },
        "NDTV": {
            "main": "https://feeds.feedburner.com/NDTV-LatestNews"
        },
        "Firstpost": {
            "main": "https://www.firstpost.com/commonfeeds/v1/eng/rss/India.xml"
        }
    },
    "us_sources": {
        "NY Times": {
            "home": "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",
            "world": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
            "politics": "https://rss.nytimes.com/services/xml/rss/nyt/Politics.xml"
        },
        "Washington Post": {
            "politics": "http://feeds.washingtonpost.com/rss/politics"
        },
        "Politico": {
            "main": "https://www.politico.com/rss/politics08.xml"
        }
    },
    "regional": {
        "France24": {
            "main": "https://www.france24.com/en/rss"
        },
        "DW": {
            "main": "https://rss.dw.com/rdf/rss-en-all"
        }
    }
}

# Lower-case terms used to decide whether an article counts as current affairs.
# They are matched as plain substrings of the lower-cased article text.
CURRENT_AFFAIRS_KEYWORDS = [
    # Politics and government
    "politics", "government", "election", "policy",
    "parliament", "congress", "senate",
    "minister", "president", "prime minister",
    "cabinet", "legislation", "law",
    # Economy
    "economy", "gdp", "inflation", "budget",
    "tax", "finance", "market", "trade",
    # International affairs
    "international", "diplomacy", "foreign policy",
    "summit", "treaty", "war",
    # Unrest and accountability
    "conflict", "crisis", "protest", "strike",
    "reform", "scandal", "investigation",
    # Courts
    "court", "justice", "ruling", "verdict",
    "supreme court", "high court",
    # Society and environment
    "social", "education", "healthcare",
    "environment", "climate", "energy",
]

# ============================================================================
# ADVANCED CURRENT AFFAIRS SCRAPER CLASS
# ============================================================================

class CurrentAffairsScraper:
    """
    Collect current-affairs articles from the RSS feeds in ``NEWS_SOURCES``.

    Typical flow: ``scrape_all_sources()`` fills ``self.all_articles`` with
    article dicts, ``deduplicate_articles()`` and ``categorize_articles()``
    refine that list in place, and ``generate_summary_text()`` /
    ``save_results()`` render or export it.

    Each article dict has the keys ``source``, ``category``, ``title``,
    ``description``, ``link``, ``published`` and ``scraped_at``;
    ``categorize_articles()`` adds ``topic_category``.
    """

    # Seconds to wait for a feed before giving up on it.
    DEFAULT_REQUEST_TIMEOUT = 15

    # Output formats understood by save_results().
    SUPPORTED_OUTPUT_FORMATS = ('text', 'json', 'csv')

    # A single hit on any of these is enough to treat an article as relevant.
    HIGH_VALUE_KEYWORDS = ('government', 'politics', 'election', 'policy',
                           'parliament', 'minister', 'president')

    # Topic -> substrings that vote for that topic in categorize_articles().
    TOPIC_KEYWORDS = {
        'politics': ['politic', 'government', 'parliament', 'election', 'minister', 'cabinet'],
        'economy': ['economy', 'economic', 'gdp', 'inflation', 'budget', 'finance', 'market'],
        'international': ['international', 'foreign', 'diplomacy', 'summit', 'treaty', 'global'],
        'social': ['social', 'education', 'healthcare', 'society', 'culture'],
        'legal': ['court', 'justice', 'legal', 'law', 'ruling', 'verdict'],
        'environment': ['environment', 'climate', 'energy', 'pollution', 'green']
    }

    def __init__(self, max_articles_per_source=20, days_back=7,
                 request_timeout=DEFAULT_REQUEST_TIMEOUT):
        """
        Args:
            max_articles_per_source (int): Maximum number of entries read
                from each individual RSS feed.
            days_back (int): Entries published more than this many days ago
                are skipped.
            request_timeout (float): Seconds to wait for each feed download.
        """
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.max_articles_per_source = max_articles_per_source
        self.days_back = days_back
        self.request_timeout = request_timeout
        # Naive UTC, so it is comparable with the naive UTC dates built from
        # feedparser's *_parsed tuples in _build_article().
        self.cutoff_date = self._utc_now() - timedelta(days=days_back)
        self.all_articles = []
        self.processed_articles = []

    @staticmethod
    def _utc_now():
        """Return the current time as a naive datetime expressed in UTC."""
        return datetime(*time.gmtime()[:6])

    def is_current_affairs_relevant(self, title, description="", content=""):
        """
        Decide whether an article looks like current affairs.

        The three texts are joined and lower-cased; the article is relevant
        when it contains any high-value keyword, or at least two entries of
        ``CURRENT_AFFAIRS_KEYWORDS``. Matching is by plain substring.

        Returns:
            bool: True when the article should be kept.
        """
        haystack = f"{title} {description} {content}".lower()

        if any(keyword in haystack for keyword in self.HIGH_VALUE_KEYWORDS):
            return True

        keyword_hits = sum(1 for keyword in CURRENT_AFFAIRS_KEYWORDS if keyword in haystack)
        return keyword_hits >= 2

    def _build_article(self, entry, source_name, category):
        """Turn one feed entry into an article dict, or None if it is filtered out."""
        # Prefer the publication date; fall back to the last-updated date.
        pub_date = None
        for date_attr in ('published_parsed', 'updated_parsed'):
            parsed = getattr(entry, date_attr, None)
            if parsed:
                pub_date = datetime(*parsed[:6])
                break

        # Entries without any date are kept; dated ones must be recent enough.
        if pub_date and pub_date < self.cutoff_date:
            return None

        title = entry.get('title', '')
        raw_description = entry.get('description', '') or entry.get('summary', '')
        link = entry.get('link', '')

        # Strip markup before filtering so that tag names, attributes and
        # embedded URLs cannot trigger keyword matches.
        description = BeautifulSoup(raw_description, 'html.parser').get_text()

        if not self.is_current_affairs_relevant(title, description):
            return None

        return {
            'source': source_name,
            'category': category,
            'title': title,
            'description': description,
            'link': link,
            # Undated entries are stamped with the current UTC time so that
            # every 'published' value shares one clock when sorted.
            'published': pub_date.isoformat() if pub_date else self._utc_now().isoformat(),
            'scraped_at': datetime.now().isoformat()
        }

    def parse_rss_feed(self, rss_url, source_name, category="general"):
        """
        Download one RSS feed and return its relevant, recent articles.

        The feed is fetched through ``self.session`` with a timeout, so a
        stalled server cannot hang the run, and the configured User-Agent is
        sent. Download and parse failures are printed and produce an empty
        list; a failure on a single entry is printed and that entry skipped.

        Args:
            rss_url (str): Feed URL.
            source_name (str): Stored as each article's ``source``.
            category (str): Stored as each article's ``category``.

        Returns:
            list[dict]: Article dicts, at most ``max_articles_per_source``.
        """
        articles = []
        try:
            response = self.session.get(rss_url, timeout=self.request_timeout)
            response.raise_for_status()
            feed = feedparser.parse(response.content)

            for entry in feed.entries[:self.max_articles_per_source]:
                try:
                    article = self._build_article(entry, source_name, category)
                except Exception as e:
                    print(f"Error parsing entry from {source_name}: {e}")
                    continue

                if article is not None:
                    articles.append(article)

        except Exception as e:
            print(f"Error parsing RSS feed {rss_url}: {e}")

        return articles

    def extract_full_article_content(self, url):
        """
        Download an article page and extract its content with newspaper3k.

        ``keywords`` and ``summary`` are only populated by ``Article.nlp()``,
        so it is attempted after parsing; if it fails the full text and
        authors are still returned and those two fields stay empty.

        Returns:
            dict: ``full_text``, ``authors``, ``keywords`` and ``summary``,
            or an empty dict when the page could not be downloaded or parsed.
        """
        try:
            article = Article(url)
            article.download()
            article.parse()
        except Exception as e:
            print(f"Error extracting content from {url}: {e}")
            return {}

        try:
            article.nlp()
        except Exception as e:
            # Best effort: the text itself is still usable without NLP output.
            print(f"Keyword/summary extraction unavailable for {url}: {e}")

        return {
            'full_text': article.text,
            'authors': article.authors,
            'keywords': article.keywords,
            'summary': getattr(article, 'summary', '')
        }

    def _scrape_source_feeds(self, source_data, source_name):
        """Parse every feed of one source and return the combined article list."""
        source_articles = []
        for category, rss_url in source_data.items():
            source_articles.extend(self.parse_rss_feed(rss_url, source_name, category))
        return source_articles

    async def scrape_source_async(self, source_data, source_name):
        """
        Coroutine wrapper around the per-source scrape.

        The feeds are still fetched synchronously inside the coroutine, so
        awaiting it blocks the event loop until the source is finished.

        Returns:
            list[dict]: Articles from all feeds of the source.
        """
        return self._scrape_source_feeds(source_data, source_name)

    def scrape_all_sources(self):
        """
        Scrape every source in ``NEWS_SOURCES`` and store the result.

        Progress is printed per region and per source. A source that raises
        is reported and skipped. The collected list replaces
        ``self.all_articles``.

        Returns:
            list[dict]: All collected articles.
        """
        print("🚀 Starting comprehensive current affairs scraping...")
        start_time = time.time()

        all_articles = []
        # Count sources (not individual feeds) so the figure is comparable
        # with processed_sources, which is incremented once per source.
        total_sources = sum(len(region_sources) for region_sources in NEWS_SOURCES.values())
        processed_sources = 0

        # Process each region
        for region, sources in NEWS_SOURCES.items():
            print(f"\n📰 Scraping {region.replace('_', ' ').title()} sources...")

            for source_name, source_data in sources.items():
                try:
                    print(f"  → Processing {source_name}...")

                    source_articles = self._scrape_source_feeds(source_data, source_name)
                    all_articles.extend(source_articles)
                    processed_sources += 1

                    print(f"    ✓ Found {len(source_articles)} current affairs articles")

                    # Add delay to be respectful
                    time.sleep(0.5)

                except Exception as e:
                    print(f"    ✗ Error processing {source_name}: {e}")
                    continue

        self.all_articles = all_articles

        elapsed_time = time.time() - start_time
        print(f"\n✅ Scraping completed in {elapsed_time:.2f} seconds")
        print(f"📊 Total articles collected: {len(all_articles)}")
        print(f"📈 Sources processed: {processed_sources}/{total_sources}")

        return all_articles

    def deduplicate_articles(self, similarity_threshold=0.8):
        """
        Drop articles whose title closely matches an earlier article's title.

        Titles are lower-cased and reduced to ASCII letters, digits and
        whitespace. Two titles are duplicates when the Jaccard similarity of
        their word sets exceeds ``similarity_threshold``. The first
        occurrence is kept. Titles with no words left after normalisation
        are never treated as duplicates.

        Args:
            similarity_threshold (float): Similarity above which a title is
                considered a duplicate.

        Returns:
            list[dict]: The surviving articles (also stored in
            ``self.all_articles``).
        """
        unique_articles = []
        seen_word_sets = []

        for article in self.all_articles:
            normalized_title = re.sub(r'[^a-zA-Z0-9\s]', '', (article.get('title') or '').lower())
            # Built once per article instead of once per comparison.
            title_words = frozenset(normalized_title.split())

            is_duplicate = bool(title_words) and any(
                len(title_words & seen_words) / len(title_words | seen_words) > similarity_threshold
                for seen_words in seen_word_sets
            )

            if not is_duplicate:
                unique_articles.append(article)
                if title_words:
                    seen_word_sets.append(title_words)

        print(f"🔄 Removed {len(self.all_articles) - len(unique_articles)} duplicate articles")
        self.all_articles = unique_articles
        return unique_articles

    def categorize_articles(self):
        """
        Add a ``topic_category`` key to every article in ``self.all_articles``.

        The category whose keyword list has the most substring hits in the
        lower-cased title and description wins; on a tie the category listed
        first in ``TOPIC_KEYWORDS`` wins, and ``'general'`` is used when
        nothing matches.
        """
        for article in self.all_articles:
            text_to_analyze = f"{article['title']} {article['description']}".lower()

            best_hits = 0
            primary_category = 'general'
            for category, keywords in self.TOPIC_KEYWORDS.items():
                hits = sum(1 for keyword in keywords if keyword in text_to_analyze)
                # Strict '>' keeps the earliest category on ties.
                if hits > best_hits:
                    best_hits = hits
                    primary_category = category

            article['topic_category'] = primary_category

    def generate_summary_text(self):
        """
        Render ``self.all_articles`` as a plain-text report.

        The report has a header, one section per topic category (newest
        articles first, at most ten per category, descriptions cut to 200
        characters) and a per-source article count.

        Returns:
            str: The report, or a fixed message when there are no articles.
        """
        if not self.all_articles:
            return "No current affairs articles found."

        # ISO-8601 strings sort chronologically, newest first here.
        sorted_articles = sorted(self.all_articles,
                                 key=lambda x: x['published'], reverse=True)

        summary_parts = [
            "COMPREHENSIVE CURRENT AFFAIRS SUMMARY",
            f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"Period covered: Last {self.days_back} days",
            f"Total articles: {len(sorted_articles)}",
            "=" * 80,
        ]

        # Group by category, in order of first appearance.
        by_category = {}
        for article in sorted_articles:
            by_category.setdefault(article.get('topic_category', 'general'), []).append(article)

        for category, articles in by_category.items():
            summary_parts.append(f"\n{category.upper()} ({len(articles)} articles)")
            summary_parts.append("-" * 50)

            for i, article in enumerate(articles[:10], 1):  # Top 10 per category
                summary_parts.append(f"{i}. {article['title']}")
                summary_parts.append(f"   Source: {article['source']} | Published: {article['published'][:10]}")
                description = article['description']
                if description:
                    if len(description) > 200:
                        description = description[:200] + "..."
                    summary_parts.append(f"   {description}")
                summary_parts.append(f"   URL: {article['link']}")
                summary_parts.append("")

        # Add source statistics
        summary_parts.append("\nSOURCE STATISTICS")
        summary_parts.append("-" * 50)
        source_counts = {}
        for article in sorted_articles:
            source = article['source']
            source_counts[source] = source_counts.get(source, 0) + 1

        for source, count in sorted(source_counts.items(), key=lambda x: x[1], reverse=True):
            summary_parts.append(f"{source}: {count} articles")

        return "\n".join(summary_parts)

    def save_results(self, output_format='text'):
        """
        Write the collected articles to a timestamped file in the working directory.

        Args:
            output_format (str): ``'text'`` (summary report), ``'json'`` or
                ``'csv'`` (raw article records).

        Returns:
            str: Name of the file that was written.

        Raises:
            ValueError: If ``output_format`` is not one of the supported formats.
        """
        # Reject unknown formats up front instead of failing on an unbound
        # filename at the end of the method.
        if output_format not in self.SUPPORTED_OUTPUT_FORMATS:
            raise ValueError(
                f"Unsupported output_format {output_format!r}; "
                f"expected one of {self.SUPPORTED_OUTPUT_FORMATS}"
            )

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        if output_format == 'text':
            filename = f'current_affairs_{timestamp}.txt'
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(self.generate_summary_text())
            print(f"📄 Text summary saved to: {filename}")

        elif output_format == 'json':
            filename = f'current_affairs_{timestamp}.json'
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(self.all_articles, f, ensure_ascii=False, indent=2)
            print(f"📄 JSON data saved to: {filename}")

        else:  # 'csv'
            filename = f'current_affairs_{timestamp}.csv'
            df = pd.DataFrame(self.all_articles)
            df.to_csv(filename, index=False, encoding='utf-8')
            print(f"📄 CSV data saved to: {filename}")

        return filename

# ============================================================================
# MAIN EXECUTION FUNCTIONS
# ============================================================================

def scrape_comprehensive_current_affairs(days_back=7, max_articles_per_source=15):
    """
    Run the full pipeline: scrape, de-duplicate, categorize, summarise, export.

    The summary is also written to disk as text, and the article records as
    JSON and CSV, via ``CurrentAffairsScraper.save_results``.

    Args:
        days_back (int): How many days of history to include.
        max_articles_per_source (int): Entry limit applied to each RSS feed.

    Returns:
        str: The formatted summary, or an explanatory message when no
        articles were collected (in which case nothing is saved).
    """
    print("🌍 COMPREHENSIVE CURRENT AFFAIRS SCRAPER")
    print("=" * 50)

    scraper = CurrentAffairsScraper(
        max_articles_per_source=max_articles_per_source,
        days_back=days_back,
    )

    # Nothing collected: report it and skip processing and export.
    if not scraper.scrape_all_sources():
        return "No current affairs articles found. Please check your internet connection and try again."

    print("\n🔄 Processing articles...")
    scraper.deduplicate_articles()
    scraper.categorize_articles()

    # Build the summary before exporting, then write one file per format.
    summary_text = scraper.generate_summary_text()
    for export_format in ('text', 'json', 'csv'):
        scraper.save_results(export_format)

    return summary_text

def quick_current_affairs_update():
    """Run the full pipeline with a short window: 3 days, 10 entries per feed."""
    quick_settings = {'days_back': 3, 'max_articles_per_source': 10}
    return scrape_comprehensive_current_affairs(**quick_settings)

def deep_current_affairs_analysis():
    """Run the full pipeline with a wide window: 14 days, 25 entries per feed."""
    deep_settings = {'days_back': 14, 'max_articles_per_source': 25}
    return scrape_comprehensive_current_affairs(**deep_settings)

# ============================================================================
# EXAMPLE USAGE
# ============================================================================

if __name__ == "__main__":
    print("🚀 Starting Comprehensive Current Affairs Scraping...")

    # Get comprehensive current affairs summary
    summary = scrape_comprehensive_current_affairs()

    # Show at most the first 2000 characters. The continuation marker is only
    # added when text was actually cut off, so a short summary (or the
    # "no articles" message, for which no files are saved) is not mislabelled.
    preview = summary[:2000]
    if len(summary) > 2000:
        preview += "\n[... Summary continues in saved files ...]"

    print("\n" + "="*80)
    print(preview)
    print("="*80)

# ============================================================================
# QUICK START EXAMPLES FOR COLAB
# ============================================================================

"""
QUICK START GUIDE FOR CURRENT AFFAIRS SCRAPING:

1. BASIC USAGE:
   summary = scrape_comprehensive_current_affairs()
   print(summary)

2. QUICK UPDATE (Last 3 days):
   summary = quick_current_affairs_update()

3. DEEP ANALYSIS (Last 14 days, up to 25 articles per feed):
   summary = deep_current_affairs_analysis()

4. CUSTOM PARAMETERS:
   summary = scrape_comprehensive_current_affairs(days_back=5, max_articles_per_source=20)

5. ACCESS RAW DATA:
   scraper = CurrentAffairsScraper()
   articles = scraper.scrape_all_sources()
   scraper.save_results('json')  # Save as JSON
   scraper.save_results('csv')   # Save as CSV

FEATURES:
✓ 17 global news sources across 35 RSS feeds (BBC, CNN, Reuters, Times of India, etc.)
✓ Intelligent current affairs filtering
✓ Automatic categorization (Politics, Economy, International, etc.)
✓ Duplicate removal
✓ Multiple output formats (Text, JSON, CSV)
✓ Date-based filtering
✓ Source statistics and analytics
"""

Collecting selenium
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting feedparser
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting asyncio
  Downloading asyncio-4.0.0-py3-none-any.whl.metadata (994 bytes)
Collecting textstat
  Downloading textstat-0.7.10-py3-none-any.whl.metadata (15 kB)
Collecting lxml_html_clean (from lxml[html_clean])
  Downloading lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing-extensions>=4.0.0 (from beautifulsoup4)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.wh

"\nQUICK START GUIDE FOR CURRENT AFFAIRS SCRAPING:\n\n1. BASIC USAGE:\n   summary = scrape_comprehensive_current_affairs()\n   print(summary)\n\n2. QUICK UPDATE (Last 3 days):\n   summary = quick_current_affairs_update()\n\n3. DEEP ANALYSIS (Last 14 days):\n   summary = scrape_comprehensive_current_affairs(days_back=14)\n\n4. CUSTOM PARAMETERS:\n   summary = scrape_comprehensive_current_affairs(days_back=5, max_articles_per_source=20)\n\n5. ACCESS RAW DATA:\n   scraper = CurrentAffairsScraper()\n   articles = scraper.scrape_all_sources()\n   scraper.save_results('json')  # Save as JSON\n   scraper.save_results('csv')   # Save as CSV\n\nFEATURES:\n✓ 25+ Global news sources (BBC, CNN, Reuters, Times of India, etc.)\n✓ Intelligent current affairs filtering\n✓ Automatic categorization (Politics, Economy, International, etc.)\n✓ Duplicate removal\n✓ Multiple output formats (Text, JSON, CSV)\n✓ Date-based filtering\n✓ Source statistics and analytics\n"

# New Section

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# `google.colab` is imported here, so this cell is intended for Colab.
from google.colab import drive
drive.mount('/content/drive')