In [1]:
import requests
from bs4 import BeautifulSoup as bs


In [2]:
__urlAll="https://www.almayadeen.net/sitemaps/all.xml"


In [3]:
def get_xml(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        return f"Error fetching the URL: {e}"

In [4]:
def parse_xml(xml_content):
    soup = bs(xml_content, 'xml')
    urls = [loc.text for loc in soup.find_all('loc')]
    return urls

In [5]:
# Get the 10 first URLs from the XML
def get_all():
    xml_content = get_xml(__urlAll)
    if not xml_content.startswith("Error"):
        urls = parse_xml(xml_content)
        if len(urls) > 10:
            return urls[:10]  
        else:
            return urls
    else:
        return xml_content

In [6]:
def get_all_links():
    urls = get_all()
    news = []
    for url in urls:
        xml_content = get_xml(url)
        if not xml_content.startswith("Error"):
            news.append(parse_xml(xml_content))
    return news

In [7]:
print(get_all_links())

[['https://www.almayadeen.net/news/politics/الميادين-في-لقاء-خاص-مع--أبو-شجاع--الاسم-الذي-يؤرق-الإسرائيل', 'https://www.almayadeen.net/news/politics/شهيد-وحرق-منازل-وسيارات---اعتداءات-دموية-للمستوطنين-في-قلقيل', 'https://www.almayadeen.net/news/politics/المقاومة-تستهدف-قوات-الاحتلال-في--نتساريم----ومروحياته-تجلي', 'https://www.almayadeen.net/newscast/2024/8/15/المسائية---موقف-حماس-يحرج-الطرف-الإسرائيلي-أمام-الوسطاء', 'https://www.almayadeen.net/news/politics/قاسم--الرد-على-اغتيال-شكر-منفصل-عن-العدوان-على-غزة-ووقف-إطلا', 'https://www.almayadeen.net/news/economic/بسبب-العمليات-اليمنية-في-البحر-الأحمر---أرباح--موانئ-دبي-الع', 'https://www.almayadeen.net/tv-reports/هجوم-أوكرانيا-على-كورسك-الروسية--مغامرة-غير-محسوبة-النتائج', 'https://www.almayadeen.net/news/politics/حزب-الله-في-ذكرى-حرب-تموز--ملتزمون-بالوقوف-إلى-جانب-الشعب-ال', 'https://www.almayadeen.net/tv-reports/محمود-عباس--القدس-خط-أحمر-والمنطقة-لن-تستقر-دون-حل-للقضية-ال', 'https://www.almayadeen.net/tv-reports/قبل-إطلاقها-أي-صاروخ---

In [8]:
print(len(get_all_links()))

10


In [9]:
def scrape_article(url):
    html_content = get_xml(url)
    if html_content.startswith("Error"):
        return html_content
    soup = bs(html_content, 'html.parser')
    # Extract the title, paragraph, and date within <div class="details-white-box">
    details_white_box = soup.find('div', class_='details-white-box')
    title = details_white_box.find('h2').text if details_white_box and details_white_box.find('h2') else 'No title found'
    paragraph = details_white_box.find('p', class_='lg_para summary').text if details_white_box and details_white_box.find('p', class_='lg_para summary') else 'No paragraph found'
    date = details_white_box.find('li', class_='single-meta').text if details_white_box and details_white_box.find('li', class_='single-meta') else 'No date found'
    # Extract the image URL and caption within <div class="post-image paragraph-right">
    post_image_div = soup.find('div', class_='post-image paragraph-right')
    image_url = post_image_div.find('img')['src'] if post_image_div and post_image_div.find('img') else 'No image found'
    image_caption = post_image_div.find('div', class_='image-caption').text if post_image_div and post_image_div.find('div', class_='image-caption') else 'No caption found'
    return {
        "title": title,
        "paragraph": paragraph,
        "date": date,
        "image_url": image_url,
        "image_caption": image_caption
    }

In [10]:
urls = get_all()
if isinstance(urls, list):
    for url in urls:
        article_data = scrape_article(url)
        print(article_data)
    print("---------------------------------------------------------")
else:
    print(urls)

{'title': 'No title found', 'paragraph': 'No paragraph found', 'date': 'No date found', 'image_url': 'No image found', 'image_caption': 'No caption found'}
{'title': 'No title found', 'paragraph': 'No paragraph found', 'date': 'No date found', 'image_url': 'No image found', 'image_caption': 'No caption found'}
{'title': 'No title found', 'paragraph': 'No paragraph found', 'date': 'No date found', 'image_url': 'No image found', 'image_caption': 'No caption found'}
{'title': 'No title found', 'paragraph': 'No paragraph found', 'date': 'No date found', 'image_url': 'No image found', 'image_caption': 'No caption found'}
{'title': 'No title found', 'paragraph': 'No paragraph found', 'date': 'No date found', 'image_url': 'No image found', 'image_caption': 'No caption found'}
{'title': 'No title found', 'paragraph': 'No paragraph found', 'date': 'No date found', 'image_url': 'No image found', 'image_caption': 'No caption found'}
{'title': 'No title found', 'paragraph': 'No paragraph found', '

In [11]:
print(urls)

['https://www.almayadeen.net/sitemaps/all/sitemap-2024-8.xml', 'https://www.almayadeen.net/sitemaps/all/sitemap-2024-7.xml', 'https://www.almayadeen.net/sitemaps/all/sitemap-2024-6.xml', 'https://www.almayadeen.net/sitemaps/all/sitemap-2024-5.xml', 'https://www.almayadeen.net/sitemaps/all/sitemap-2024-4.xml', 'https://www.almayadeen.net/sitemaps/all/sitemap-2024-3.xml', 'https://www.almayadeen.net/sitemaps/all/sitemap-2024-2.xml', 'https://www.almayadeen.net/sitemaps/all/sitemap-2024-1.xml', 'https://www.almayadeen.net/sitemaps/all/sitemap-2023-12.xml', 'https://www.almayadeen.net/sitemaps/all/sitemap-2023-11.xml']


In [12]:
print(get_all_links()[:10])

[['https://www.almayadeen.net/news/politics/الميادين-في-لقاء-خاص-مع--أبو-شجاع--الاسم-الذي-يؤرق-الإسرائيل', 'https://www.almayadeen.net/news/politics/شهيد-وحرق-منازل-وسيارات---اعتداءات-دموية-للمستوطنين-في-قلقيل', 'https://www.almayadeen.net/news/politics/المقاومة-تستهدف-قوات-الاحتلال-في--نتساريم----ومروحياته-تجلي', 'https://www.almayadeen.net/newscast/2024/8/15/المسائية---موقف-حماس-يحرج-الطرف-الإسرائيلي-أمام-الوسطاء', 'https://www.almayadeen.net/news/politics/قاسم--الرد-على-اغتيال-شكر-منفصل-عن-العدوان-على-غزة-ووقف-إطلا', 'https://www.almayadeen.net/news/economic/بسبب-العمليات-اليمنية-في-البحر-الأحمر---أرباح--موانئ-دبي-الع', 'https://www.almayadeen.net/tv-reports/هجوم-أوكرانيا-على-كورسك-الروسية--مغامرة-غير-محسوبة-النتائج', 'https://www.almayadeen.net/news/politics/حزب-الله-في-ذكرى-حرب-تموز--ملتزمون-بالوقوف-إلى-جانب-الشعب-ال', 'https://www.almayadeen.net/tv-reports/محمود-عباس--القدس-خط-أحمر-والمنطقة-لن-تستقر-دون-حل-للقضية-ال', 'https://www.almayadeen.net/tv-reports/قبل-إطلاقها-أي-صاروخ---