In [6]:
from bs4 import BeautifulSoup
import requests
import time
from random import randint
from requests.exceptions import RequestException

In [5]:
def get_headers():
    """Build the request headers used to imitate a desktop browser.

    Returns:
        dict: A new dictionary on every call containing the User-Agent,
        Accept, Accept-Language, Connection and Upgrade-Insecure-Requests
        headers, in that order.
    """
    # The User-Agent is split across lines only for readability; the pieces
    # concatenate to a single Chrome 118 on Windows 10 identifier.
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/118.0.0.0 Safari/537.36'
    )
    accepted_types = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'

    browser_headers = {}
    browser_headers['User-Agent'] = user_agent
    browser_headers['Accept'] = accepted_types
    browser_headers['Accept-Language'] = 'en-US,en;q=0.5'
    browser_headers['Connection'] = 'keep-alive'
    browser_headers['Upgrade-Insecure-Requests'] = '1'
    return browser_headers

def fetch_with_retry(url, max_retries=3, delay=2):
    """Fetch ``url`` with a polite pre-request pause, retries and backoff.

    Args:
        url: Address to GET.
        max_retries: Maximum number of attempts. A value <= 0 makes no
            request at all.
        delay: Base delay in seconds. Every attempt is preceded by a pause of
            ``delay`` plus 1-3 random seconds; after the k-th failed attempt
            (0-based) an extra ``delay * 2**k`` seconds is waited before
            retrying.

    Returns:
        The ``requests.Response`` from the first attempt that completes with
        a non-error status, or ``None`` when ``max_retries`` is not positive.

    Raises:
        requests.exceptions.RequestException: Re-raised from the last
            attempt, or immediately when the server answers with a 4xx
            status that a retry cannot change.
    """
    # 4xx statuses that are worth retrying (request timeout, too early,
    # rate limited). Any other client error fails identically every time.
    retryable_client_errors = {408, 425, 429}

    for attempt in range(max_retries):
        try:
            # Add a small random delay to be polite to the server
            time.sleep(delay + randint(1, 3))

            # Make the request with browser-like headers
            response = requests.get(url, headers=get_headers(), timeout=10)
            response.raise_for_status()
            return response

        except RequestException as e:
            # Read the status via getattr/None checks: a Response with an
            # error status is falsy, so `if e.response:` would miss it.
            failed_response = getattr(e, 'response', None)
            status = getattr(failed_response, 'status_code', None)
            permanent_failure = (
                status is not None
                and 400 <= status < 500
                and status not in retryable_client_errors
            )

            if permanent_failure or attempt == max_retries - 1:
                print(f"Error fetching {url}: {e}")
                raise

            print(f"Attempt {attempt + 1} failed, retrying...")
            # True exponential backoff: delay, 2*delay, 4*delay, ...
            time.sleep(delay * 2 ** attempt)

    # Only reached when max_retries <= 0 (the loop body never ran).
    return None

In [7]:
try:
    # Go through fetch_with_retry (polite delay + retries) rather than a bare requests.get
    url = "https://www.newsnow.com/ng/Economy/Cryptocurrencies/"
    page = fetch_with_retry(url)

    if page:
        soup = BeautifulSoup(page.content, "html.parser")
        results = soup.find_all("div", class_="nws-article")

        if results:
            print(f"Found {len(results)} articles:\n")
            for article in results:
                # Each article is handled in isolation so one malformed entry
                # does not abort the remaining ones.
                try:
                    title = article.find("h3", class_="nws-article__headline")
                    link = article.find("a")

                    if not (title and link):
                        print("Skipping article - missing title or link\n")
                        continue

                    print(f"Title: {title.get_text().strip()}")
                    print(f"Link: {link['href']}\n")

                except Exception as e:
                    print(f"Error processing article: {str(e)}\n")
        else:
            # Nothing matched the selector: show the start of the document to
            # help tell a markup change apart from a blocked request.
            print("No articles found. The page structure might have changed or blocking might be in place.")
            print("\nFirst 200 characters of the response:")
            print(soup.prettify()[:200])

except Exception as e:
    print(f"Error: {str(e)}")

No articles found. The page structure might have changed or blocking might be in place.

First 200 characters of the response:
<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <title>
   Crypto News | Latest Cryptocurrency News - NewsNow
  </title>
  <link href="//c.newsnow.com" rel="dns-prefetch"/>
  <meta content=


In [10]:
from requests_html import HTMLSession

session = HTMLSession()
url = "https://www.newsnow.com/ng/Economy/Cryptocurrencies/"

r = session.get(url)
r.html.render(timeout=20)  # this runs the JavaScript

articles = r.html.find('div.nws-article')

for article in articles:
    title = article.find('h3.nws-article__headline', first=True)
    link = article.find('a', first=True)
    if title and link:
        print(f"Title: {title.text.strip()}")
        print(f"Link: {link.attrs.get('href')}")


RuntimeError: Cannot use HTMLSession within an existing event loop. Use AsyncHTMLSession instead.

In [11]:
# AsyncHTMLSession is the awaitable counterpart of HTMLSession; the synchronous
# session cannot be used here because the notebook already has an event loop
# running.
from requests_html import AsyncHTMLSession

# Module-level session shared by get_articles().
asession = AsyncHTMLSession()

async def get_articles():
    """Fetch the NewsNow crypto page, render it, and print each article.

    Uses the module-level ``asession``. For every ``div.nws-article`` that
    has both a headline and an anchor, prints its title and link to stdout;
    other entries are skipped silently. Returns ``None``.
    """
    response = await asession.get("https://www.newsnow.com/ng/Economy/Cryptocurrencies/")
    # Run the page's JavaScript before querying the DOM.
    await response.html.arender(timeout=20)

    for card in response.html.find('div.nws-article'):
        headline = card.find('h3.nws-article__headline', first=True)
        anchor = card.find('a', first=True)
        if not (headline and anchor):
            continue
        print(f"Title: {headline.text.strip()}")
        print(f"Link: {anchor.attrs.get('href')}")

# Run the coroutine. A bare top-level `await` is valid in a notebook cell
# because the kernel executes cells on an already-running event loop.
await get_articles()

[INFO] Starting Chromium download.


OSError: Chromium downloadable not found at https://storage.googleapis.com/chromium-browser-snapshots/Win_x64/1181205/chrome-win.zip: Received <?xml version='1.0' encoding='UTF-8'?><Error><Code>NoSuchKey</Code><Message>The specified key does not exist.</Message><Details>No such object: chromium-browser-snapshots/Win_x64/1181205/chrome-win.zip</Details></Error>.
