In [5]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import random

# Define a custom user-agent header to mimic a browser
user_agent = f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(90, 100)}.0.0.0 Safari/537.36"

def scrape_news(search_term="reliance industries ltd", max_retries=3):
  """
  Scrapes Google News for articles mentioning the search term within the last 24 hours.

  Args:
      search_term (str, optional): The search term to use for Google News. Defaults to "reliance industries ltd".
      max_retries (int, optional): The maximum number of retries in case of network errors. Defaults to 3.

  Returns:
      list: A list of dictionaries containing the article URL and text snippet (if successful).
              An empty list if scraping fails after retries.
  """

  news_articles = []
  current_time = datetime.now()

  for attempt in range(max_retries + 1):
    try:
      # Construct Google News search URL
      url = f"https://news.google.com/search?q={search_term}&hl=en-IN&gl=IN&ceid=IN%3Aen"

      # Send a GET request with a user-agent header
      headers = {'User-Agent': user_agent}
      response = requests.get(url, headers=headers)
      response.raise_for_status()  # Raise an exception for non-200 status codes

      # Parse the HTML content
      soup = BeautifulSoup(response.text, 'html.parser')

      # Find links and dates (logic similar to previous version)
      links = soup.find_all('a', class_='JtKRv')
      dates = soup.find_all('time', class_='hvbAAd')

      for date, link in zip(dates, links):
        date_text = date.get_text(strip=True)

        if 'hours' in date_text or 'hour' in date_text:
          hours_ago = int(date_text.split()[0])
          publication_date = current_time - timedelta(hours=hours_ago)

          if publication_date >= current_time - timedelta(hours=24):
            href = link.get('href')
            if href and 'article' in href:
              article_url = 'https://www.google.com' + href.lstrip('.')
              article_text = link.get_text()

              news_articles.append({'source': article_url, 'text': article_text})

      return news_articles

    except requests.exceptions.RequestException as e:
      # Handle network errors and retry if within attempts limit
      print(f"Error occurred during attempt {attempt + 1}: {e}")
      if attempt < max_retries:
        print(f"Retrying in 5 seconds...")
        time.sleep(5)
      else:
        print("Maximum retries reached. Scraping failed.")
        return []  # Return empty list on failure

  # Shouldn't reach here, but return empty list in case
  return []

# Example usage
result = scrape_news()

if result:
  print(result)
else:
  print("No articles found or scraping failed.")


[{'source': 'https://www.google.com/articles/CBMie2h0dHBzOi8vaW5kaWFuZXhwcmVzcy5jb20vYXJ0aWNsZS9pbmRpYS9kaXJlY3RvcnMtb2Ytb3duZXItZmlybXMtb2YtZG9ub3Itbm8tMy1xd2lrLXN1cHBseS1hcmUtcmVsaWFuY2UtZXhlY3V0aXZlcy05MjE3MDQ2L9IBgAFodHRwczovL2luZGlhbmV4cHJlc3MuY29tL2FydGljbGUvaW5kaWEvZGlyZWN0b3JzLW9mLW93bmVyLWZpcm1zLW9mLWRvbm9yLW5vLTMtcXdpay1zdXBwbHktYXJlLXJlbGlhbmNlLWV4ZWN1dGl2ZXMtOTIxNzA0Ni9saXRlLw?hl=en-IN&gl=IN&ceid=IN%3Aen', 'text': 'Directors of owner firms of Donor No. 3 Qwik Supply are Reliance executives'}, {'source': 'https://www.google.com/articles/CBMiigFodHRwczovL3RpbWVzb2ZpbmRpYS5pbmRpYXRpbWVzLmNvbS9pbmRpYS8yLXF3aWstc3VwcGx5LWRpcmVjdG9ycy1saW5rZWQtdG8tcmVsaWFuY2UtZW50aXRpZXMtbm90LXN1YnNpZGlhcnktcmlsL2FydGljbGVzaG93LzEwODUzNDA2MC5jbXPSAY4BaHR0cHM6Ly90aW1lc29maW5kaWEuaW5kaWF0aW1lcy5jb20vaW5kaWEvMi1xd2lrLXN1cHBseS1kaXJlY3RvcnMtbGlua2VkLXRvLXJlbGlhbmNlLWVudGl0aWVzLW5vdC1zdWJzaWRpYXJ5LXJpbC9hbXBfYXJ0aWNsZXNob3cvMTA4NTM0MDYwLmNtcw?hl=en-IN&gl=IN&ceid=IN%3Aen', 'text': '2 Qwik Supply direct