In [1]:
!pip install requests beautifulsoup4 feedparser pymongo rapidfuzz selenium transformers torch

import requests
import json
from bs4 import BeautifulSoup
import feedparser
from pymongo import MongoClient
from datetime import datetime
import pytz
from rapidfuzz import fuzz
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from datetime import datetime, timedelta
from selenium.webdriver.common.action_chains import ActionChains
from transformers import pipeline
import re



In [2]:
# List of RSS feed URLs
rss_feeds = {
    "BBC": "http://feeds.bbci.co.uk/news/rss.xml",
    "CNN": "http://rss.cnn.com/rss/edition.rss",
    "Reuters": [
        "https://www.reutersagency.com/feed/?taxonomy=best-topics&post_type=best",
        "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best",
        "https://www.reutersagency.com/feed/?best-topics=deals&post_type=best",
        "https://www.reutersagency.com/feed/?best-topics=political-general&post_type=best",
        "https://www.reutersagency.com/feed/?best-topics=environment&post_type=best",
        "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best",
        "https://www.reutersagency.com/feed/?best-topics=health&post_type=best",
        "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best",
        "https://www.reutersagency.com/feed/?best-topics=lifestyle-entertainment&post_type=best",
        "https://www.reutersagency.com/feed/?best-topics=human-interest&post_type=best",
        "https://www.reutersagency.com/feed/?best-topics=journalist-spotlight&post_type=best",
    ],
    
    "TOI": "https://timesofindia.indiatimes.com/rssfeedstopstories.cms", #The New York Times
    #"TNYT": "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml", #The New York Times
    #"TG": "https://www.theguardian.com/uk/rss", #The Guardian
    #"AJ": "https://www.aljazeera.com/xml/rss/all.xml" #Al Jazeera
}


In [3]:
# MongoDB Setup
client = MongoClient('mongodb://localhost:27017/')  # Update with your MongoDB connection URI
db = client['news_database']
collection = db['news_articles']

In [4]:
# Load a summarization model and sentiment analysis model
summarizer = pipeline('summarization')
sentiment_analyzer = pipeline('sentiment-analysis')

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [5]:
def setup_driver():
    options = Options()
    options.add_argument('--headless')  # Runs Chrome in headless mode
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    service = Service('/opt/homebrew/bin/chromedriver')  # Update this path if necessary
    driver = webdriver.Chrome(service=service, options=options)
    return driver

In [6]:
def convert_relative_time_to_iso(relative_time):
    """
    Convert a relative time string like '21 hours ago' to an ISO timestamp.
    """
    # Get the current time
    current_time = datetime.utcnow()

    # Use regular expressions to extract time units and values
    match = re.match(r"(\d+)\s*(\w+)\s*ago", relative_time)
    if not match:
        return None  # Return None if the format is not recognized

    amount, unit = int(match.group(1)), match.group(2)

    # Convert the time unit into a timedelta object
    if "hour" in unit:
        time_delta = timedelta(hours=amount)
    elif "minute" in unit:
        time_delta = timedelta(minutes=amount)
    elif "second" in unit:
        time_delta = timedelta(seconds=amount)
    elif "day" in unit:
        time_delta = timedelta(days=amount)
    else:
        return None  # If it's a unit we don't handle, return None

    # Subtract the timedelta from the current time
    calculated_time = current_time - time_delta

    # Convert to ISO format
    return calculated_time.isoformat() + "Z"  # Adding 'Z' to denote UTC time


In [7]:
def convert_cnn_time(cnn_time):
    # Step 1: Remove "Updated" text
    cnn_time = cnn_time.replace("Updated", "").strip()

    # Step 2: Parse the time (assuming it's in Eastern Daylight Time - EDT)
    # We'll remove the EDT part and parse the rest of the time
    time_format = "%I:%M %p %Z, %a %B %d, %Y"
    cnn_time = cnn_time.replace("EDT", "Eastern")  # Handle EDT as "Eastern" for parsing
    dt = datetime.strptime(cnn_time, time_format)
    
    # Convert to UTC or desired timezone (for example, GMT)
    eastern = pytz.timezone('US/Eastern')
    dt_eastern = eastern.localize(dt)  # Localize the datetime to Eastern Time
    dt_utc = dt_eastern.astimezone(pytz.utc)  # Convert to UTC

    # Step 3: Convert to desired format (similar to BBC format)
    return dt_utc.strftime('%d %b %Y, %H:%M GMT')

# Test with your input example
#cnn_published_time = "Updated  4:09 PM EDT, Thu April 6, 2023"
#formatted_time = convert_cnn_time(cnn_published_time)
#print(formatted_time)

In [8]:
# Fetch full news content and additional details from the news article page
def fetch_full_article_bbc(url, driver):
    try:
        driver.get(url)
        #time.sleep(3)  # Wait for the page to load

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Initialize fields
        full_text = ""
        published_time = ""
        published_time_raw = ""
        author_name = ""
        author_designation = ""
        reporting_location = ""
        images = []

        # Extract the article content
        article = soup.find('article')
        if article:
            # Extract paragraphs
            paragraphs = article.find_all('p')
            full_text = ' '.join([p.get_text() for p in paragraphs])
            #print(f"Full text: {full_text}")

            # Extract published time
            time_tag = article.find('time')
            if time_tag:
                published_time_raw = time_tag.get_text()                
                print(f"Time: {published_time_raw}")
                published_time = convert_relative_time_to_iso(published_time_raw)


            # Extract author name and designation
            byline_block = article.find('div', {'data-component': 'byline-block'})
            if byline_block:
                author_name_tag = byline_block.find('span', class_='bZCrck')
                if author_name_tag:
                    author_name = author_name_tag.get_text()

                author_designation_tag = byline_block.find('div', class_='hEbjLr')
                if author_designation_tag:
                    author_designation = author_designation_tag.get_text()

                # Extract reporting location
                reporting_location_tag = byline_block.find('span', string=lambda x: x and "Reporting from" in x)
                if reporting_location_tag:
                    reporting_location = reporting_location_tag.get_text().replace("Reporting from", "").strip()
        
        # Extract images within <figure> tags
        figures = soup.find_all('figure')
        for fig in figures:
            img = fig.find('img')
            if img and img.get('src'):
                # Some images might have relative URLs
                img_url = img['src']
                if img_url.startswith('//'):
                    img_url = 'https:' + img_url
                elif img_url.startswith('/'):
                    img_url = 'https://www.bbc.com' + img_url
                images.append(img_url)
        
        return {
            "description": full_text if full_text else "Full content not available",
            "published_time": published_time,
            "author_name": author_name,
            "author_designation": author_designation,
            "reporting_location": reporting_location,
            "images": images
        }
    
    except Exception as e:
        print(f"Failed to fetch full content from {url}: {e}")
        return {
            "description": "Failed to fetch full content",
            "published_time": "",
            "author_name": "",
            "author_designation": "",
            "reporting_location": "",
            "images": []
        }

In [9]:
# Function to fetch CNN article data
def fetch_full_article_cnn(url, driver):
    try:
        driver.get(url)
        #response = requests.get(url)
        #soup = BeautifulSoup(response.content, 'html.parser')
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # 1. Extract the headline
        headline_tag = soup.find('h1', {'class': 'headline__text'})
        headline = headline_tag.get_text(strip=True) if headline_tag else 'Headline not found'
        print(f"headline_tag: {headline_tag}")
        print(f"headline: {headline}")

        # 2. Extract the author name
        author_tag = soup.find('div', {'class': 'byline__names'})
        author_name = author_tag.find('span', {'class': 'byline__name'}).get_text(strip=True) if author_tag else 'Author not found'
        print(f"author_tag: {author_tag}")
        print(f"author_name: {author_name}")
        
        # 3. Extract the published time
        timestamp_tag = soup.find('div', {'class': 'timestamp'})
        published_time_raw = timestamp_tag.get_text(strip=True) if timestamp_tag else 'Published time not found'        
        formatted_time = convert_cnn_time(published_time_raw)
        
        print(f"timestamp_tag: {timestamp_tag}")
        print(f"published_time: {published_time}")

        # 4. Extract images (multiple)
        image_tags = soup.find_all('div', {'class': 'image__lede article__lede-wrapper'})
        images = []
        for img_tag in image_tags:
            img = img_tag.find('img')
            if img and img.get('src'):
                images.append(img['src'])

        # 5. Extract description/content
        description_tag = soup.find('div', {'class': 'article__content-container'})
        description = description_tag.get_text(strip=True) if description_tag else 'Description not found'

        # Return the extracted data as a dictionary
        return {
            'headline': headline,
            'author_name': author_name,
            'published_time': published_time,
            'images': images,
            'description': description
        }
    
    except Exception as e:
        print(f"Failed to fetch full content from {url}: {e}")
        return {
            "description": "Failed to fetch full content",
            "published_time": "",
            "author_name": "",
            "author_designation": "",
            "reporting_location": "",
            "images": []
        }


In [10]:
def fetch_times_of_india_news():
    base_url = "https://timesofindia.indiatimes.com/india"
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    news_snippets = soup.find_all('div', {'class': 'iN5CR'})

    # Loop through each snippet to extract individual news links
    for snippet in news_snippets:
        link_tag = snippet.find('a', href=True)
        if link_tag:
            article_url = link_tag['href']
            print(f"Fetching article from: {article_url}")

            # Fetch full article details
            article_data = fetch_full_article_times_of_india(article_url)
            if article_data:
                save_articles_to_mongo(article_data)

            time.sleep(1)  # Rate limiting

In [11]:
# Function to fetch the full Reuters article data
def fetch_full_article_reuters(rss_article_url, driver):
    try:
        # Step 1: Navigate to the RSS article page
        # Simulate mouse movement
        action = ActionChains(driver)
        element = driver.find_element_by_xpath("//button[@class='submit']")
        action.move_to_element(element).perform()

        # Randomize delay
        time.sleep(2 + random.random() * 3)  # Pause 2 to 5 seconds

        # Click the button to trigger a click event manually
        element.click()

        driver.get(rss_article_url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Step 2: Find the actual article link from the RSS article page
        article_link_tag = soup.find('a', href=True, text="exclusively reported")
        
        if not article_link_tag:
            print(f"Actual article link not found on {rss_article_url}")
            return {"description": "Failed to fetch full content", "author_name": "", "category": "", "headline": "Headline not found"}

        article_url = article_link_tag['href']
        print(f"Found article link: {article_url}")
        
        # Step 3: Navigate to the actual article page
        driver.get(article_url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Step 4: Extract title
        title_tag = soup.find('h1', {'data-testid': 'Heading'})
        headline = title_tag.get_text(strip=True) if title_tag else 'Headline not found'
        print(f"Title: {headline}")
        
        # Step 5: Extract author
        author_tag = soup.find('a', rel='author')
        author_name = author_tag.get_text(strip=True) if author_tag else 'Author not found'
        print(f"Author: {author_name}")
        
        # Step 6: Extract category
        category_tag = soup.find('ul', {'class': 'tags-with-tooltip__list__37vkr'})
        category = category_tag.get_text(strip=True) if category_tag else 'Category not found'
        print(f"Category: {category}")
        
        # Step 7: Extract description/content
        description_tag = soup.find('div', {'class': 'article-body__content__17Yit'})
        description = description_tag.get_text(strip=True) if description_tag else 'Description not found'
        print(f"Description: {description}")

        # Return the extracted data as a dictionary
        return {
            'headline': headline,
            'author_name': author_name,
            'category': category,
            'description': description,
            'published_time': datetime.now().isoformat(),  # You can replace this with actual fetched timestamp logic
        }

    except Exception as e:
        print(f"Failed to fetch full content from {rss_article_url}: {e}")
        return {
            "description": "Failed to fetch full content",
            "author_name": "",
            "category": "",
            "headline": "Headline not found"
        }

In [12]:
def fetch_full_article_times_of_india(url):
    """
    Fetches the full article details from a Times of India news page.
    """
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        title_tag = soup.find('h1', {'class': 'HNMDR'})
        title = title_tag.get_text(strip=True) if title_tag else 'Title not found'

        summary_tag = soup.find('div', {'class': 'M1rHh'})
        summary = summary_tag.get_text(strip=True) if summary_tag else 'Summary not found'

        byline_tag = soup.find('div', {'class': 'xf8Pm byline'})
        if byline_tag:
            byline_text = byline_tag.get_text(strip=True)
            parts = byline_text.split('|')
            author = parts[0].strip() if len(parts) > 0 else "Author not found"
            published_time = parts[1].strip() if len(parts) > 1 else "Date not found"
        else:
            author = "Author not found"
            published_time = "Date not found"

        try:
            published_time = datetime.strptime(published_time, '%b %d, %Y, %I:%M %p').isoformat()
        except ValueError:
            pass  # Keep original string if parsing fails

        content_tag = soup.find('div', {'data-articlebody': '1'})
        content = content_tag.get_text(strip=True) if content_tag else 'Content not found'

        return {
            'title': title,
            'summary': summary,
            'author': author,
            'published_time': published_time,
            'content': content,
            'url': url,
            'source': 'Times of India'  # Adding source field
        }

    except Exception as e:
        print(f"Error fetching article from {url}: {e}")
        return None

In [13]:
def assign_tag(entry):
    """Assign a specific tag to an article based on custom logic"""
    if 'breaking' in entry.title.lower():
        return 1  # Tag for breaking news
    elif 'politics' in entry.title.lower():
        return 2  # Tag for COVID-related news
    elif 'business' in entry.title.lower():
        return 3  # Tag for election news
    elif 'science' in entry.title.lower():
        return 4  # Tag for election news
    elif 'sports' in entry.title.lower():
        return 5  # Tag for election news
    elif 'arts' in entry.title.lower():
        return 6  # Tag for election news
    elif 'entertainment' in entry.title.lower():
        return 7  # Tag for election news
    elif 'travel' in entry.title.lower():
        return 8  # Tag for election news
    elif 'weather' in entry.title.lower():
        return 9  # Tag for election news
    elif 'earth' in entry.title.lower():
        return 10  # Tag for election news
    elif 'local' in entry.title.lower():
        return 11  # Tag for election news
    else:
        return 0  # Default tag for other news

In [14]:
def is_duplicate_article_other(new_article, existing_articles):
    """Check if the new article is a duplicate based on title similarity using rapidfuzz."""
    
    for article_item in existing_articles:
        if 'headline' in article_item:
            similarity = fuzz.token_set_ratio(new_article['headline'], article_item['headline'])
            if similarity > 85:  # Threshold for considering articles as duplicates
                print(f"Duplicate title found: {article_item['headline']}")
                return True
            else:
                return False
        else:
            return False

In [15]:
def is_duplicate_article(new_article, existing_articles):
    """
    Check if the new article is a duplicate based on title similarity and source.
    If the title matches exactly and the source is the same, it's considered a duplicate.
    """
    for article_item in existing_articles:
        # Ensure the existing article has a headline and source
        if 'headline' in article_item and 'source' in article_item:
            if article_item['headline'] == new_article['headline'] and article_item['source'] == new_article['source']:
                print(f"Duplicate title found in the same source: {article_item['headline']} (Source: {article_item['source']})")
                return True
        else:
            # If either headline or source is missing, we can't reliably check for duplicates
            continue
            
    return False


In [16]:
def mark_and_save_duplicate_articles():
    """Mark duplicate articles with the same tagId based on title similarity and save in the DB."""
    
    # Fetch all articles from the collection
    articles = list(collection.find())
    
    current_tag_id = 1  # Starting tagId

    # Track tagIds for already tagged groups of duplicates
    assigned_tag_ids = {}

    for i, new_article in enumerate(articles):
        if 'tagId' in new_article and new_article['tagId'] is not None:
            continue  # Skip already tagged articles

        # If the article is not tagged, assign a new tagId
        new_article_tag_id = current_tag_id

        # Update the current article with the tagId
        new_article['_id']  # Get the ID of the new article
        collection.update_one({'_id': new_article['_id']}, {'$set': {'tagId': new_article_tag_id}})

        # Compare the current article with the rest to find duplicates
        for j in range(i + 1, len(articles)):
            existing_article = articles[j]

            if 'headline' in existing_article:
                similarity = fuzz.token_set_ratio(new_article['headline'], existing_article['headline'])

                if similarity > 85:
                    print(f"Duplicate title found: {existing_article['headline']}")

                    # Check if the existing article already has a tagId
                    if 'tagId' in existing_article and existing_article['tagId'] is not None:
                        # Use the existing article's tagId for the current article group
                        new_article_tag_id = existing_article['tagId']
                    else:
                        # Assign the current tagId to the existing duplicate article
                        existing_article['tagId'] = new_article_tag_id
                        collection.update_one({'_id': existing_article['_id']}, {'$set': {'tagId': new_article_tag_id}})

        # Increment the tagId only if it was a unique article (no duplicates found)
        if new_article_tag_id == current_tag_id:
            current_tag_id += 1


In [17]:
def save_articles_to_mongo(articles):
    """Save articles to MongoDB, avoiding duplicates based on title similarity across sources."""
    # Fetch all existing articles from the database for comparison
    existing_articles = list(collection.find({}, {'headline': 1}))  # Only fetch titles for comparison

    for article in articles:
        # Check for duplicate articles based on title similarity
        if not is_duplicate_article(article, existing_articles):
            collection.insert_one(article)
            print(f"Saved article: {article['headline']}")
        else:
            print(f"Duplicate article skipped: {article['headline']}")


In [18]:
def clean_and_summarize_text(text, max_len=50, min_len=20):
    """
    Clean sensationalism and summarize the input text (title or description).
    """
    # Summarize the input text (if needed)
    summary = summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']
    
    # Bolden important points in the summary
    cleaned_summary = bolden_important_points(summary)
    
    return cleaned_summary

In [19]:
def bolden_important_points(text):
    """
    Bolden important points in the article text (e.g., first sentence, named entities).
    """
    # Split the summary into sentences
    sentences = text.split('. ')
    
    # Bolden first sentence or key information
    if sentences:
        sentences[0] = f"<b>{sentences[0]}</b>"
    
    # Join sentences back into a single string
    boldened_text = '. '.join(sentences)
    
    return boldened_text

In [20]:
# Retrieve all articles from the collection
#articles = []
#articles = list(collection.find())
#mark_and_save_duplicate_articles();

def clean_sensationalism(article_text):
    """
    Cleans sensationalism by detecting overly emotional or sensational language 
    and rephrasing the text to focus on the facts.
    """
    
    # Break the text into sentences
    if not article_text:  # Check if article_text is None or empty
        return "Content not available or cannot be cleaned."
    
    sentences = article_text.split('. ')
    factual_sentences = []
    print(f"Sentences: {sentences}")

    # Analyze each sentence
    for sentence in sentences:
        sentiment = sentiment_analyzer(sentence)
        if sentiment[0]['label'] in ['NEGATIVE', 'POSITIVE'] and sentiment[0]['score'] > 0.7:
            # Skip overly emotional sentences or rewrite them
            print(f"Skipping sensational sentence: {sentence}")
            continue
        factual_sentences.append(sentence)

    # Join the factual sentences
    clean_text = '. '.join(factual_sentences)
    
    # Use summarization to condense the cleaned text
    summary = summarizer(clean_text, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
    
    return summary

# Example usage with article data
article_text = """
    This shocking event has left the whole world in disbelief! You won't believe what happened next. 
    After years of struggle, the company has finally decided to change its strategy. 
    Experts say that this could lead to a market crash of unprecedented proportions. 
    The fact is, the company announced its new plans in a statement released yesterday.
"""



In [None]:
for article in articles:
# Clean the article of sensationalism and rephrase
    if article.get('description') != 'Full content not available':
        cleaned_headline = clean_sensationalism(article.get('headline'))
        cleaned_description = clean_sensationalism(article.get('description'))
        #print("Cleaned and concise article: ")
        print(f"{cleaned_headline}\n")        
        print(f"{cleaned_description}\n")
        
        # Update the document with the new fields
        collection.update_one(
            {'_id': article['_id']},
            {'$set': {
                'cleaned_summary': cleaned_description,
                'cleaned_headline': cleaned_headline
            }}
        )
        


In [21]:
def determine_category(article_content):
    """
    Determines the category of the article based on its content.
    """
    # Define keywords for each category
    categories = {
        "Breaking": ["breaking", "urgent", "just in"],
        "Politics": ["government", "policy", "election", "political"],
        "Business": ["business", "market", "stocks", "finance", "economy"],
        "Science": ["research", "scientists", "study", "laboratory", "experiment"],
        "Sports": ["game", "match", "tournament", "league", "athlete"],
        "Art & Culture": ["art", "museum", "culture", "festival", "heritage"],
        "Entertainment": ["celebrity", "movie", "music", "show", "award"],
        "Travel": ["travel", "tourism", "destination", "flight", "hotel"],
        "Weather": ["weather", "storm", "temperature", "forecast"],
        "Earth": ["climate", "environment", "sustainability", "nature"],
        "Local": ["local", "community", "neighborhood", "town"]
    }

    # Default category
    assigned_category = "Top Stories"

    # Check for category keywords
    for category, keywords in categories.items():
        if any(keyword in article_content.lower() for keyword in keywords):
            assigned_category = category
            break
    print(f"Assigned category: {assigned_category}")
    return assigned_category

# Example usage with an article
#article_content = """The government has announced a new policy that will affect the election process."""
#category = determine_category(article_content)
#print(f"Assigned category: {category}")  # Should print "Politics"


In [31]:
def parse_rss_feed(rss_url, driver, portal):
    feed = feedparser.parse(rss_url)
    
    news_data = []
    # Fetch existing titles from MongoDB for duplicate checking
    existing_titles = list(collection.find({}, {'headline': 1}))
    #print(f"Skipping duplicate titles: {existing_titles}")
    existing_titles = [item['headline'] for item in existing_titles]

    for entry in feed.entries:
        headline = entry.title
        url = entry.link
        article_details = ''

        # Check for duplicates
        if is_duplicate_article({"headline": headline}, existing_titles):
            print(f"Duplicate article skipped: {headline}")
            continue
            
        
        print(f"Portal: {portal}")

        # Fetch additional details from the article page
        if portal == 'bbc':
            article_details = fetch_full_article_bbc(url, driver)
        elif portal == 'cnn':
            article_details = fetch_full_article_cnn(url, driver)
        elif portal == 'reuters':
            article_details = fetch_full_article_reuters(url, driver)
        elif portal == 'toi':
            article_details = fetch_full_article_times_of_india(url, driver)
            
        if article_details.get("description", "") != "":
            category = determine_category(article_details.get("description", ""))

            print(f"Headline: {headline}")  # Should print the Headline
            

            # Use .get() to avoid KeyError for missing fields
            news_item = {
                "headline": headline,
                "url": url,
                "description": article_details.get("description", ""),
                "published_time": article_details.get("published_time", ""),
                "author_name": article_details.get("author_name", ""),
                "author_designation": article_details.get("author_designation", ""),  # Optional field
                "reporting_location": article_details.get("reporting_location", ""),  # Optional field
                "images": article_details.get("images", []),  # Optional field, default to empty list
                "source": portal,
                "category": category,
                "fetched_at": datetime.now()
            }
            #print(news_item)  # Print news details to the console
            news_data.append(news_item)
    
    return news_data

In [23]:
def crawl_news():
    """Crawl all news outlets and save to MongoDB"""    
    driver = setup_driver()
    
    try:
        # Loop through RSS feeds
        for portal, rss_url in rss_feeds.items():
            #print(f"Fetching RSS URL {rss_url}")
            
            # Check if the portal is "Reuters"
            if portal == "Reuters":
                # rss_url is a list, so loop through it directly
                for rss_sub_url in rss_url:
                    print(f"Fetching news from {rss_sub_url}")
                    # Call your parsing function for each sub-URL
                    news_data = parse_rss_feed(rss_sub_url, driver, portal.lower().replace(' ', '_'))
                    save_articles_to_mongo(news_data)
            else:
                # Handle other portals with single URL feeds
                news_data = parse_rss_feed(rss_url, driver, portal.lower().replace(' ', '_'))
                save_articles_to_mongo(news_data)
    
    finally:
        driver.quit()


In [None]:
# Example usage
articles = fetch_times_of_india_news()
for article in articles:
    print(article)

In [32]:
if __name__ == "__main__":
    crawl_news()

Portal: bbc
Time: 3 hours ago
Assigned category: Politics
Assigned category: Politics
Portal: bbc
Time: 8 hours ago
Assigned category: Art & Culture
Assigned category: Art & Culture
Portal: bbc
Assigned category: Top Stories
Assigned category: Top Stories
Portal: bbc
Time: 1 hour ago
Assigned category: Politics
Assigned category: Politics
Portal: bbc
Time: 4 hours ago
Assigned category: Politics
Assigned category: Politics
Portal: bbc
Time: 2 hours ago
Assigned category: Breaking
Assigned category: Breaking
Portal: bbc
Time: 2 hours ago
Assigned category: Politics
Assigned category: Politics
Portal: bbc
Time: 14 hours ago
Assigned category: Business
Assigned category: Business
Portal: bbc
Time: 14 hours ago
Assigned category: Politics
Assigned category: Politics
Portal: bbc
Assigned category: Top Stories
Assigned category: Top Stories
Portal: bbc
Time: 1 day ago
Assigned category: Politics
Assigned category: Politics
Portal: bbc
Time: 7 hours ago
Assigned category: Art & Culture
Assign

headline_tag: None
headline: Headline not found
author_tag: <div class="byline__names">
			By <a class="byline__link" href="https://www.cnn.com/profiles/tara-subramaniam"><span class="byline__name">Tara Subramaniam</span></a>, <span class="byline__name">Jack Guy</span>, <a class="byline__link" href="https://www.cnn.com/profiles/aditi-sandal"><span class="byline__name">Aditi Sangal</span></a>, <a class="byline__link" href="https://www.cnn.com/profiles/maureen-chowdhury"><span class="byline__name">Maureen Chowdhury</span></a> and <span class="byline__name">Mike Hayes</span>, CNN
		</div>
author_name: Tara Subramaniam
Failed to fetch full content from https://edition.cnn.com/webview/europe/live-news/russia-ukraine-war-news-04-03-23/index.html: time data '10:00 PM Eastern, Mon April 3, 2023' does not match format '%I:%M %p %Z, %a %B %d, %Y'
Assigned category: Top Stories
Assigned category: Top Stories
Portal: cnn
headline_tag: <h1 class="headline__text inline-placeholder vossi-headline-tex

headline_tag: <h1 class="headline__text inline-placeholder vossi-headline-text" data-editable="headlineText" id="maincontent">
      The $500 billion beauty industry’s ‘green’ ambitions are a patchwork at best. And they’re falling short
    </h1>
headline: The $500 billion beauty industry’s ‘green’ ambitions are a patchwork at best. And they’re falling short
author_tag: <div class="byline__names">
<a class="byline__link" href="https://www.cnn.com/profiles/aditi-sandal"><span class="byline__name">Aditi Sangal</span></a>, CNN
		</div>
author_name: Aditi Sangal
Failed to fetch full content from https://www.cnn.com/style/article/beauty-skincare-climate/index.html: time data '4:09 PM Eastern, Thu April 6, 2023' does not match format '%I:%M %p %Z, %a %B %d, %Y'
Assigned category: Top Stories
Assigned category: Top Stories
Portal: cnn
headline_tag: <h1 class="headline__text inline-placeholder vossi-headline-text" data-editable="headlineText" id="maincontent">
      A new approach to a Covid-1

headline_tag: <h1 class="headline__text inline-placeholder vossi-headline-text" data-editable="headlineText" id="maincontent">
      CEOs are tired of being held responsible for gun regulation
    </h1>
headline: CEOs are tired of being held responsible for gun regulation
author_tag: <div class="byline__names">
			By <a class="byline__link" href="https://www.cnn.com/profiles/nicole-goodkind"><span class="byline__name">Nicole Goodkind</span></a>, CNN
		</div>
author_name: Nicole Goodkind
Failed to fetch full content from https://www.cnn.com/2023/04/03/investing/premarket-stocks-trading/index.html: time data 'Published\n        7:19 AM Eastern, Mon April 3, 2023' does not match format '%I:%M %p %Z, %a %B %d, %Y'
Assigned category: Top Stories
Assigned category: Top Stories
Portal: cnn
headline_tag: <h1 class="headline__text inline-placeholder vossi-headline-text" data-editable="headlineText" id="maincontent">
      Opinion: ‘Until we meet again, brave little cat.’ The heartbreak and tabo

headline_tag: <h1 class="headline__text inline-placeholder vossi-headline-text" data-editable="headlineText" id="maincontent">
      Elon Musk’s Twitter promised a purge of blue check marks. Instead he singled out one account
    </h1>
headline: Elon Musk’s Twitter promised a purge of blue check marks. Instead he singled out one account
author_tag: <div class="byline__names">
			By <a class="byline__link" href="https://www.cnn.com/profiles/clare-duffy"><span class="byline__name">Clare Duffy</span></a>, CNN
		</div>
author_name: Clare Duffy
Failed to fetch full content from https://www.cnn.com/2023/04/03/tech/twitter-blue-checks/index.html: time data '4:06 PM Eastern, Mon April 3, 2023' does not match format '%I:%M %p %Z, %a %B %d, %Y'
Assigned category: Top Stories
Assigned category: Top Stories
Portal: cnn
headline_tag: <h1 class="headline__text inline-placeholder vossi-headline-text" data-editable="headlineText" id="maincontent">
      See the world’s deepest fish
    </h1>
headline:

headline_tag: <h1 class="headline__text inline-placeholder vossi-headline-text" data-editable="headlineText" id="maincontent">
      Shania Twain calls for equal play and more diversity in country music
    </h1>
headline: Shania Twain calls for equal play and more diversity in country music
author_tag: <div class="byline__names">
			By <a class="byline__link" href="https://www.cnn.com/profiles/lisa-france"><span class="byline__name">Lisa Respers France</span></a>, CNN
		</div>
author_name: Lisa Respers France
Failed to fetch full content from https://www.cnn.com/2023/04/03/entertainment/shania-twain-equal-diversity/index.html: time data '12:29 PM Eastern, Mon April 3, 2023' does not match format '%I:%M %p %Z, %a %B %d, %Y'
Assigned category: Top Stories
Assigned category: Top Stories
Saved article: Trump pleads not guilty to 34 felony counts
Saved article: Haberman reveals why Trump attacked judge and his family in speech
Saved article: What to know about the Trump indictment on the e

TypeError: fetch_full_article_times_of_india() takes 1 positional argument but 2 were given