In [None]:
import requests
import xml.etree.ElementTree as ET
from datetime import datetime

class RSSScraper:
    """Download an RSS feed and convert its ``<item>`` entries to dicts.

    Args:
        rss_url: URL of the feed to download.
        timeout: Seconds to wait for the HTTP response before giving up.
    """

    # Layout of ``<pubDate>`` values, e.g. "Tue, 20 Aug 2024 11:45:46 +0000".
    PUB_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %z"

    def __init__(self, rss_url, timeout=30):
        self.rss_url = rss_url
        self.timeout = timeout

    def fetch_rss(self):
        """Return the raw feed bytes, or None when the request fails."""
        try:
            # Without a timeout a stalled server would block this call forever.
            response = requests.get(self.rss_url, timeout=self.timeout)
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as e:
            print(f"Error fetching RSS feed: {e}")
            return None

    @staticmethod
    def _child_text(item, tag):
        """Return the text of child ``tag``, or None when the child is absent."""
        node = item.find(tag)
        return node.text if node is not None else None

    def _to_timestamp(self, pub_date):
        """Parse a ``pubDate`` string to a UNIX timestamp; None if unusable."""
        if not pub_date:
            return None
        try:
            parsed = datetime.strptime(pub_date.strip(), self.PUB_DATE_FORMAT)
        except ValueError:
            # A malformed date must not abort parsing of the remaining items.
            return None
        return int(parsed.timestamp())

    def parse_rss(self, rss_content):
        """Parse feed XML into a list of news dicts.

        Each dict has the keys ``title``, ``link``, ``pubDate``,
        ``description`` and ``timestamp``. A value is None when the
        corresponding element is missing (or, for ``timestamp``, when
        ``pubDate`` cannot be parsed).

        Raises:
            xml.etree.ElementTree.ParseError: if ``rss_content`` is not
                well-formed XML.
        """
        news_items = []
        root = ET.fromstring(rss_content)

        for item in root.findall(".//item"):
            news = {
                tag: self._child_text(item, tag)
                for tag in ('title', 'link', 'pubDate', 'description')
            }
            news['timestamp'] = self._to_timestamp(news['pubDate'])
            news_items.append(news)

        return news_items

    def save_news(self, news_items):
        """Print every news dict (placeholder for real persistence)."""
        for news in news_items:
            print(news)

    def run(self):
        """Fetch the feed and, if that succeeded, parse and output its items."""
        rss_content = self.fetch_rss()
        if rss_content:
            news_items = self.parse_rss(rss_content)
            self.save_news(news_items)

# Example usage: build a scraper for the Daily Hodl RSS feed and run it.
# run() downloads the feed and prints each parsed item, so executing this
# cell performs a network request.
rss_url = "https://dailyhodl.com/feed/"
scraper = RSSScraper(rss_url)
scraper.run()


{'title': 'Top Analyst Predicts One More Leg Down for Altcoins Before Fresh Rallies, Updates Forecast on Bitcoin', 'link': 'https://dailyhodl.com/2024/08/20/top-analyst-predicts-one-more-leg-down-for-altcoins-before-fresh-rallies-updates-forecast-on-bitcoin/', 'pubDate': 'Tue, 20 Aug 2024 11:45:46 +0000', 'description': '<img width="1024" height="600" src="https://dailyhodl.com/wp-content/uploads/2024/08/Before-Fresh-Rallies.jpg?w=1024" class="webfeedsFeaturedVisual wp-post-image" alt="" style="display: block; margin-bottom: 5px; clear:both;max-width: 100%;" link_thumbnail="" decoding="async" fetchpriority="high" srcset="https://dailyhodl.com/wp-content/uploads/2024/08/Before-Fresh-Rallies.jpg 1366w, https://dailyhodl.com/wp-content/uploads/2024/08/Before-Fresh-Rallies.jpg?resize=300,176 300w, https://dailyhodl.com/wp-content/uploads/2024/08/Before-Fresh-Rallies.jpg?resize=768,450 768w, https://dailyhodl.com/wp-content/uploads/2024/08/Before-Fresh-Rallies.jpg?resize=1024,600 1024w, htt

In [None]:
class DailyhodlScrapper(Scraper):
    """Scrape the articles linked from the Daily Hodl news listing page.

    ``loadPage``, ``saveInMongo`` and ``ProviderUrl`` are used here but not
    defined in this class; presumably they come from the project ``Scraper``
    base class (verify against its definition).
    """

    def __init__(self, page=None, back_to_back=None):
        """Create the scraper for https://dailyhodl.com/news/.

        Args:
            page: Optional value stored on ``self.page``.
            back_to_back: Optional value stored on ``self.back_to_back``.
        """
        super().__init__('https://dailyhodl.com/news/')
        # These used to be read from undefined names, which made the
        # constructor raise on every call; they are now optional arguments.
        self.page = page
        self.back_to_back = back_to_back

    def convert_relative_time(self, relative_time_str):
        """Convert relative time like '13 hours ago' to a UNIX timestamp.

        "a"/"an" count as 1. Input that has no recognisable amount or unit
        yields the current time.
        """
        current_time = datetime.now()
        parts = relative_time_str.split()
        if len(parts) < 2:
            return int(current_time.timestamp())

        amount = parts[0].lower()
        if amount in ("a", "an"):
            number = 1
        else:
            try:
                number = int(amount)
            except ValueError:
                return int(current_time.timestamp())

        unit = parts[1].lower()
        # Same precedence as the original if/elif chain.
        for keyword, kwarg in (("hour", "hours"), ("minute", "minutes"),
                               ("day", "days"), ("week", "weeks")):
            if keyword in unit:
                past_time = current_time - timedelta(**{kwarg: number})
                return int(past_time.timestamp())
        return int(current_time.timestamp())

    def getnews(self):
        """Load every article linked from ``self.soup`` and extract its fields.

        Returns:
            A list of dicts. ``link``, ``category`` and ``imgs`` are always
            present; ``title``, ``description``, ``pubDate``, ``thImage`` and
            ``creator`` only when the matching element was found.
        """
        from urllib.parse import urljoin

        newsItem = []
        # self.soup is assigned by start_scraping(); tolerate an earlier call.
        soup = getattr(self, 'soup', None)
        if not soup:
            return newsItem

        for listing in soup.find_all("article", class_="jeg_post"):
            anchor = listing.find("a")
            link = anchor.get("href") if anchor is not None else None
            if not link:
                continue
            # urljoin resolves relative links against the listing URL instead
            # of blindly appending them to it.
            article_url = link if link.startswith("http") else urljoin(self.ProviderUrl, link)
            article = self.loadPage(article_url)
            if not article:
                continue

            news = {}

            title = article.find('h1', class_='jeg_post_title')
            if title:
                news['title'] = title.text.strip()

            description = article.find("div", class_="content-inner")
            if description:
                description_paragraphs = description.find_all(['p', 'h2'])
                news['description'] = ''.join([para.text for para in description_paragraphs])

            date_node = article.find("div", class_="jeg_meta_date")
            date_str = date_node.text.strip() if date_node is not None else ''
            if "ago" in date_str:
                news['pubDate'] = self.convert_relative_time(date_str)
            elif date_str:
                try:
                    date_obj = datetime.strptime(date_str, "%B %d, %Y")
                    news['pubDate'] = int(date_obj.timestamp())
                except ValueError:
                    logger.warning(f'Unparseable publish date {date_str!r} for {article_url}')

            news['link'] = article_url

            # NOTE(review): takes the third-from-last path segment as the
            # category; confirm this matches Daily Hodl's URL layout.
            news['category'] = article_url.split('/')[-3]

            img_thum_div = article.find("div", class_="thumbnail-container")
            thumb_img = img_thum_div.find("img") if img_thum_div else None
            img_thum = thumb_img.get("src") if thumb_img is not None else None

            # No body container means no inline images rather than a crash.
            news['imgs'] = (
                [img["src"] for img in description.find_all("img") if img.get("src")]
                if description else []
            )

            if img_thum:
                news['thImage'] = img_thum

            creator = article.find("div", class_="jeg_meta_author")
            if creator:
                news['creator'] = creator.text.strip()

            newsItem.append(news)

        return newsItem

    def JsonItemStandard(self, newsItem):
        """Map a dict from getnews() onto the stored item layout.

        Raises:
            errors.DataProvidingException: wraps any failure during mapping.
        """
        try:
            item = {}
            item['title'] = unescape(newsItem.get('title', ''))
            item['articleBody'] = unescape(newsItem.get('description', ''))
            item['pubDate'] = newsItem.get('pubDate')

            category = newsItem.get('category', '')
            item['keywords'] = [category.lower()]

            item['link'] = newsItem.get('link', '')
            # Was 'newsbtc', copied from another scraper.
            item['provider'] = 'Dailyhodl'
            item['summary'] = ''
            item['thImage'] = newsItem.get('thImage', ' ')
            item['images'] = newsItem.get('imgs', ' ')
            item['category'] = newsItem.get('category', '')

            item['Negative'] = 0
            item['Neutral'] = 0
            item['Positive'] = 0

            creator = newsItem.get('creator')
            if not creator:
                item['author'] = item['provider']
            else:
                item['author'] = unescape(creator).strip().lower()

            item['scraped_date'] = int(datetime.now().timestamp())
            return item
        except Exception as err:
            # One handler: the two original clauses did exactly the same thing.
            logger.error(f'{str(err)} from SpecificSiteScraper')
            raise errors.DataProvidingException(f'{str(err)} from SpecificSiteScraper') from err

    def savegroupNews(self, newsItems):
        """Standardise each item and hand it to ``self.saveInMongo``.

        Raises:
            errors.DataProvidingException: on the first item that fails.
        """
        try:
            for raw_item in newsItems:
                self.saveInMongo(self.JsonItemStandard(raw_item))
        except Exception as err:
            logger.error(f'{str(err)} from SpecificSiteScraper')
            raise errors.DataProvidingException(f'{str(err)} from SpecificSiteScraper') from err

    def start_scraping(self):
        """Load the listing page, scrape its articles and store them.

        Failures are logged and not re-raised.
        """
        try:
            self.soup = self.loadPage(self.ProviderUrl)
            now = datetime.now()
            logger.info('Crawling of SpecificSite Started at ' + now.strftime('%a, %d %b %Y %H:%M:%S Z') + '!!')
            logger.info('+---------------------------------------------+')
            if self.soup:
                newsItems = self.getnews()
                self.savegroupNews(newsItems)
        except Exception as err:
            logger.error(f'{str(err)} from SpecificSiteScraper')



#base

In [1]:
!pip install pymongo

Collecting pymongo
  Downloading pymongo-4.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.6.1-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.6.1-py3-none-any.whl (307 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.7/307.7 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.6.1 pymongo-4.8.0


In [4]:
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
from abc import ABC, abstractmethod
from datetime import datetime , timedelta
from html import unescape

class BaseScraper(ABC):
    """Common base for site scrapers: a MongoDB collection plus page loading.

    Args:
        mongo_uri: Connection string passed to ``MongoClient``.
        db_name: Name of the database to use.
        collection_name: Name of the collection scraped articles go into.
    """

    def __init__(self, mongo_uri, db_name, collection_name):
        self.client = MongoClient(mongo_uri)
        self.db = self.client[db_name]
        self.collection = self.db[collection_name]
        # Browser-like user agent sent with every page request.
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}

    def load_page(self, url, timeout=30):
        """Fetch ``url`` and return it parsed with BeautifulSoup.

        Args:
            url: Page to download.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The parsed page, or None when the request fails or the status
            code is not 200.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            # Callers already treat None as "page unavailable", so a network
            # error is reported the same way as a non-200 status.
            print(f"Failed to retrieve page: {url} ({exc})")
            return None
        if response.status_code == 200:
            return BeautifulSoup(response.content, "html.parser")
        print(f"Failed to retrieve page: {url}")
        return None

    @abstractmethod
    def scrape_news(self):
        """Scrape the site and store its articles; implemented by subclasses."""
        pass

#########################################################################################################################################################

class CoingapeScraper(BaseScraper):
    """Scraper for the articles listed on the Coingape trending page."""

    def __init__(self, mongo_uri, db_name, collection_name, base_url = "https://coingape.com"):
        super().__init__(mongo_uri, db_name, collection_name)
        self.base_url = base_url

    @staticmethod
    def _text(node):
        """Return the stripped text of ``node``, or None when it is missing."""
        return node.text.strip() if node is not None else None

    def get_article_body(self, article_url):
        """Download one article and return its fields as a dict.

        Text fields are None and ``images`` is empty when the matching
        element is not on the page, so a layout change no longer raises
        AttributeError.

        Returns:
            The article dict, or None when the page could not be loaded.
        """
        soup = self.load_page(article_url)
        if not soup:
            return None

        img_thum_div = soup.find("div", class_="imgthum")
        thumb_img = img_thum_div.find("img") if img_thum_div is not None else None
        main = soup.find("div", class_="main")
        imgs = (
            [img["src"] for img in main.find_all("img") if img.get("src")]
            if main is not None else []
        )

        return {
            "title": self._text(soup.find("h1", class_="entry-title")),
            "url": article_url,
            "summary": self._text(soup.find("div", class_='postsummary')),
            "author": self._text(soup.find("span", class_="auth-name")),
            "publish_date": self._text(soup.find("div", class_="publishby")),
            "tags": [tag.text.strip() for tag in soup.find_all("a", rel="tag")],
            'img_thum': thumb_img.get("src") if thumb_img is not None else None,
            'images': imgs,
            "content": self._text(main),
            'scraped_date': datetime.now(),
        }

    def scrape_news(self):
        """Scrape every article on ``/trending/`` and insert it into MongoDB."""
        url = self.base_url + "/trending/"
        soup = self.load_page(url)
        if not soup:
            return

        articles = soup.find_all("div", class_="newscoverga")
        print(f'Number of articles found : {len(articles)}')
        for article in articles:
            anchor = article.find("a")
            link = anchor.get("href") if anchor is not None else None
            if not link:
                # Teaser without a usable link: nothing to fetch.
                continue
            article_url = link if link.startswith("http") else self.base_url + link
            article_data = self.get_article_body(article_url)
            if article_data:
                self.collection.insert_one(article_data)
                print(f"Saved to MongoDB: {article_data['title']}")

#########################################################################################################################################################

class DailyhodlScrapper(BaseScraper):
    """Scraper for the articles listed on the Daily Hodl news page."""

    def __init__(self, mongo_uri, db_name, collection_name, base_url ='https://dailyhodl.com'):
        super().__init__(mongo_uri, db_name, collection_name)
        self.base_url = base_url

    @staticmethod
    def _text(node):
        """Return the stripped text of ``node``, or None when it is missing."""
        return node.text.strip() if node is not None else None

    def get_article_body(self, article_url):
        """Download one article and return its fields as a dict.

        Text fields are None and ``images`` is empty when the matching
        element is not on the page, so a layout change no longer raises
        AttributeError.

        Returns:
            The article dict, or None when the page could not be loaded.
        """
        soup = self.load_page(article_url)
        if not soup:
            return None

        img_thum_div = soup.find("div", class_="thumbnail-container")
        thumb_img = img_thum_div.find("img") if img_thum_div is not None else None
        main = soup.find("div", class_="content-inner")
        imgs = (
            [img["src"] for img in main.find_all("img") if img.get("src")]
            if main is not None else []
        )

        return {
            "title": self._text(soup.find("h1", class_="jeg_post_title")),
            "url": article_url,
            "author": self._text(soup.find("div", class_="jeg_meta_author")),
            "publish_date": self._text(soup.find("div", class_="jeg_meta_date")),
            "tags": [tag.text.strip() for tag in soup.find_all("a", rel="tag")],
            'img_thum': thumb_img.get("src") if thumb_img is not None else None,
            'images': imgs,
            "content": self._text(main),
            "scraped_date": datetime.now(),
        }

    def scrape_news(self):
        """Scrape every article on ``/news/`` and insert it into MongoDB."""
        url = self.base_url + "/news/"
        soup = self.load_page(url)
        if not soup:
            return

        articles = soup.find_all("article", class_="jeg_post")
        print(f'Number of articles found : {len(articles)}')
        for article in articles:
            anchor = article.find("a")
            link = anchor.get("href") if anchor is not None else None
            if not link:
                # Teaser without a usable link: nothing to fetch.
                continue
            article_url = link if link.startswith("http") else self.base_url + link
            article_data = self.get_article_body(article_url)
            if article_data:
                self.collection.insert_one(article_data)
                print(f"Saved to MongoDB: {article_data['title']}")


#########################################################################################################################################################


class FinboldScraper(BaseScraper):
    """Scraper for the Finbold cryptocurrency news category page."""

    def __init__(self, mongo_uri, db_name, collection_name, base_url ='https://finbold.com/category/cryptocurrency-news/'):
        super().__init__(mongo_uri, db_name, collection_name)
        self.base_url = base_url

    def get_article_body(self, article_url):
        """Download one article and return its fields as a dict.

        Fields whose element is missing are None (``tags`` is an empty
        list). The original indexed the optional keys directly, which raised
        KeyError whenever one of them had not been found.

        Returns:
            The article dict, or None when the page could not be loaded.
        """
        article = self.load_page(article_url)
        if not article:
            return None

        title = article.find('h1', class_='entry-title')
        description = article.find('article', class_='status-publish')
        pub_date = article.find('time')
        creator = article.find('span', class_='author')

        category = article.find_all('a', class_='block text-blue-500 text-xs font-extrabold uppercase')
        tags = [tag.text.strip() for tag in category]

        img_thum_div = article.find("div", class_="main-image")
        thumb_img = img_thum_div.find("img") if img_thum_div is not None else None

        return {
            "title": title.text.strip() if title is not None else None,
            "url": article_url,
            "author": creator.text.strip() if creator is not None else None,
            "publish_date": pub_date.get('datetime') if pub_date is not None else None,
            "tags": tags,
            'img_thum': thumb_img.get("src") if thumb_img is not None else None,
            "content": description.text.strip() if description is not None else None,
        }

    def scrape_news(self):
        """Scrape every article on the category page and insert it into MongoDB."""
        from urllib.parse import urljoin

        url = self.base_url
        soup = self.load_page(url)
        if not soup:
            return

        articles = soup.find_all("div", class_="py-5")
        print(f'Number of articles found : {len(articles)}')
        for article in articles:
            anchor = article.find("a")
            link = anchor.get("href") if anchor is not None else None
            if not link:
                continue
            # base_url is a category URL, so a relative link must be resolved
            # against it, not appended to it.
            article_url = link if link.startswith("http") else urljoin(self.base_url, link)
            article_data = self.get_article_body(article_url)
            if article_data:
                self.collection.insert_one(article_data)
                print(f"Saved to MongoDB: {article_data['title']}")


#########################################################################################################################################################



class NewsbtcScraper(BaseScraper):
    """Scraper for the articles listed on the NewsBTC front page."""

    def __init__(self, mongo_uri, db_name, collection_name, base_url ='https://www.newsbtc.com/'):
        super().__init__(mongo_uri, db_name, collection_name)
        self.base_url = base_url

    def convert_relative_time(self, relative_time_str):
        """Convert relative time like '13 hours ago' to a UNIX timestamp.

        "a"/"an" count as 1. Input that has no recognisable amount or unit
        yields the current time.
        """
        current_time = datetime.now()
        parts = relative_time_str.split()
        if len(parts) < 2:
            return int(current_time.timestamp())

        amount = parts[0].lower()
        if amount in ("a", "an"):
            number = 1
        else:
            try:
                number = int(amount)
            except ValueError:
                return int(current_time.timestamp())

        unit = parts[1].lower()  # "hours", "minutes", "days", etc.
        # Same precedence as the original if/elif chain.
        for keyword, kwarg in (("hour", "hours"), ("minute", "minutes"),
                               ("day", "days"), ("week", "weeks")):
            if keyword in unit:
                past_time = current_time - timedelta(**{kwarg: number})
                return int(past_time.timestamp())
        return int(current_time.timestamp())

    def _publish_timestamp(self, pub_date):
        """Turn the date element into a UNIX timestamp, or None if unusable."""
        if pub_date is None:
            return None
        date_str = pub_date.get('datetime', pub_date.text.strip())
        # Handle relative time like "13 hours ago".
        if "ago" in date_str:
            return self.convert_relative_time(date_str)
        try:
            return int(datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S%z').timestamp())
        except ValueError:
            # An unexpected date layout should not drop the whole article.
            print(f"Unparseable publish date: {date_str!r}")
            return None

    def get_article_body(self, article_url):
        """Download one article and return its fields as a dict.

        Fields whose element is missing are None (``images`` is an empty
        list). The original indexed the optional keys directly, which raised
        KeyError whenever one of them had not been found.

        Returns:
            The article dict, or None when the page could not be loaded.
        """
        article = self.load_page(article_url)
        if not article:
            return None

        title = article.find('h1', class_='jeg_post_title')
        creator = article.find('div', class_='jeg_meta_author__name')

        description = article.find('div', class_='jeg_main_content')
        if description is not None:
            content = ''.join([para.text for para in description.find_all(['p', 'h2'])])
            imgs = [img["src"] for img in description.find_all("img") if img.get("src")]
        else:
            content, imgs = None, []

        img_thum_div = article.find("figure", class_="block-article__figure")
        thumb_img = img_thum_div.find("img") if img_thum_div is not None else None

        return {
            "title": title.text.strip() if title is not None else None,
            "url": article_url,
            "author": creator.text.strip() if creator is not None else None,
            "publish_date": self._publish_timestamp(article.find('div', class_='jeg_meta_date')),
            # Third-from-last path segment of the article URL.
            "tags": article_url.split('/')[-3],
            'img_thum': thumb_img.get("src") if thumb_img is not None else None,
            'images': imgs,
            "content": content,
        }

    def scrape_news(self):
        """Scrape every article on the front page and insert it into MongoDB."""
        from urllib.parse import urljoin

        url = self.base_url
        soup = self.load_page(url)
        if not soup:
            return

        articles = soup.find_all("article", class_="block-article")
        print(f'Number of articles found : {len(articles)}')
        for article in articles:
            anchor = article.find("a")
            link = anchor.get("href") if anchor is not None else None
            if not link:
                continue
            # base_url ends in "/", so appending "/path" produced "//path".
            article_url = link if link.startswith("http") else urljoin(self.base_url, link)
            article_data = self.get_article_body(article_url)
            if article_data:
                self.collection.insert_one(article_data)
                print(f"Saved to MongoDB: {article_data['title']}")


#########################################################################################################################################################

# scraper = NewsbtcScraper(
#     mongo_uri=os.environ["MONGO_URI"],  # credentials redacted: keep the connection string out of the notebook
#     db_name="crypto_news",
#     collection_name="newsbtc"
# )
# scraper.scrape_news()




None




None




None




None


KeyboardInterrupt: 

In [None]:
import requests
import xml.etree.ElementTree as ET
from datetime import datetime


class DailyhodlScrapper(Scraper):
    """Scrape Daily Hodl articles from the site's RSS feed.

    ``saveInMongo`` is used here but not defined in this class; presumably
    it comes from the project ``Scraper`` base class (verify).
    """

    CONTENT_TAG = '{http://purl.org/rss/1.0/modules/content/}encoded'
    CREATOR_TAG = '{http://purl.org/dc/elements/1.1/}creator'
    PUB_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %z"

    def __init__(self):
        super().__init__('https://dailyhodl.com/news/')
        self.rss_url = "https://dailyhodl.com/feed/"
        # Boilerplate fragments removed from every article body.
        self.unwanted_text = (
            "Check Price Action",
            "Follow us on X, Facebook and Telegram\n\n",
            "Surf The Daily Hodl Mix",
            "Disclaimer: Opinions expressed at The Daily Hodl are not investment advice. Investors should do their due diligence before making any high-risk investments in Bitcoin, cryptocurrency or digital assets. Please be advised that your transfers and trades are at your own risk, and any losses you may incur are your responsibility. The Daily Hodl does not recommend the buying or selling of any cryptocurrencies or digital assets, nor is The Daily Hodl an investment advisor. Please note that The Daily Hodl participates in affiliate marketing.",

        )

    def clean_content(self, content):
        """Remove every ``unwanted_text`` fragment from ``content``.

        The original tested the whole tuple with ``in``, which raises
        TypeError for a str; each fragment is now checked on its own.
        """
        for text in self.unwanted_text:
            if text in content:
                content = content.replace(text, "").strip()
        return content

    def fetch_rss(self, timeout=30):
        """Return the raw feed bytes, or None when the request fails."""
        try:
            # Without a timeout a stalled server would block this call forever.
            response = requests.get(self.rss_url, timeout=timeout)
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as e:
            print(f"Error fetching RSS feed: {e}")
            return None

    @staticmethod
    def _child_text(item, tag):
        """Return the text of child ``tag``, or '' when absent or empty."""
        node = item.find(tag)
        return node.text if node is not None and node.text else ''

    def getnews(self, rss_content):
        """Parse feed XML into a list of news dicts.

        Missing elements yield empty strings/lists (``timestamp`` is None
        when ``pubDate`` is missing or unparseable) instead of raising.

        Raises:
            xml.etree.ElementTree.ParseError: if the feed is not well-formed.
        """
        news_items = []
        root = ET.fromstring(rss_content)

        for item in root.findall(".//item"):
            news = {}
            news['title'] = self._child_text(item, 'title')
            news['link'] = self._child_text(item, 'link')
            news['pubDate'] = self._child_text(item, 'pubDate')

            try:
                pub_date = datetime.strptime(news['pubDate'], self.PUB_DATE_FORMAT)
                news['timestamp'] = int(pub_date.timestamp())
            except ValueError:
                news['timestamp'] = None

            soup_d = BeautifulSoup(self._child_text(item, 'description'), 'html.parser')
            soup_c = BeautifulSoup(self._child_text(item, self.CONTENT_TAG), 'html.parser')

            description_text = ' '.join(p.get_text() for p in soup_d.find_all(['p', 'h2']))
            # The last three paragraph/heading nodes are dropped, as before.
            content_text = ' '.join(p.get_text() for p in soup_c.find_all(['p', 'h2'])[:-3])
            content_text = self.clean_content(content_text)
            news['summery'] = description_text.strip()
            news['content'] = content_text.strip()

            images = [img['src'] for img in soup_c.find_all('img') if 'src' in img.attrs]
            news['images'] = images

            # An Element without children is falsy and attributes are read
            # with .get(), so the old `if enclosure` / ['url'] never worked.
            enclosure = item.find('enclosure')
            enclosure_url = enclosure.get('url') if enclosure is not None else None
            news['thImage'] = enclosure_url or (images[0] if images else '')

            categories = [category.text for category in item.findall('category') if category.text]
            news['category'] = categories
            news['keywords'] = categories

            soup_a = BeautifulSoup(self._child_text(item, self.CREATOR_TAG), 'html.parser')
            news['author'] = soup_a.get_text().strip()

            news_items.append(news)

        return news_items

    def JsonItemStandard(self, newsItem):
        """Map a dict from getnews() onto the stored item layout.

        Raises:
            errors.DataProvidingException: wraps any failure during mapping.
        """
        try:
            item = {}
            item['title'] = unescape(newsItem.get('title', ''))
            item['articleBody'] = unescape(newsItem.get('content', ''))
            item['pubDate'] = newsItem.get('timestamp')

            # getnews() stores a list of categories; calling .lower() on that
            # list raised AttributeError for every item.
            category = newsItem.get('category', '')
            categories = [category] if isinstance(category, str) else list(category)
            item['keywords'] = [name.lower() for name in categories]

            item['link'] = newsItem.get('link', '')
            item['provider'] = 'Dailyhodl'
            item['summary'] = newsItem.get('summery', '')
            item['thImage'] = newsItem.get('thImage', ' ')
            item['images'] = newsItem.get('images', ' ')
            item['category'] = newsItem.get('category', '')

            item['Negative'] = 0
            item['Neutral'] = 0
            item['Positive'] = 0

            creator = newsItem.get('author')
            if not creator:
                item['author'] = item['provider']
            else:
                item['author'] = unescape(creator).strip().lower()

            item['scraped_date'] = int(datetime.now().timestamp())
            return item
        except Exception as err:
            # One handler: the two original clauses did exactly the same thing.
            logger.error(f'{str(err)} from SpecificSiteScraper')
            raise errors.DataProvidingException(f'{str(err)} from SpecificSiteScraper') from err

    def savegroupNews(self, newsItems):
        """Standardise each item and hand it to ``self.saveInMongo``.

        Raises:
            errors.DataProvidingException: on the first item that fails.
        """
        try:
            for raw_item in newsItems:
                self.saveInMongo(self.JsonItemStandard(raw_item))
        except Exception as err:
            logger.error(f'{str(err)} from SpecificSiteScraper')
            raise errors.DataProvidingException(f'{str(err)} from SpecificSiteScraper') from err

    def start_scraping(self):
        """Fetch the feed, parse it and store every item.

        Failures are logged and not re-raised.
        """
        try:
            rss_content = self.fetch_rss()
            if rss_content:
                # Was self.parse_rss(), a method this class does not define.
                news_items = self.getnews(rss_content)
                print(f'number of news fetched : {len(news_items)}')
                now = datetime.now()
                logger.info('Crawling of SpecificSite Started at ' + now.strftime('%a, %d %b %Y %H:%M:%S Z') + '!!')
                logger.info('+---------------------------------------------+')
                self.savegroupNews(news_items)
        except Exception as err:
            logger.error(f'{str(err)} from SpecificSiteScraper')



NameError: name 'Scraper' is not defined

## heeellllllllpppppp

<div class="single-post-main-middle">
<div class="ambcr-before-content_2" id="ambcr-102562872"><style></style>
<div id="bsa-zone_1708432009695-0_123456"></div>
<p></p>
</div><ul>
<li><em><span style="font-weight: 400;">SOL’s trading volume has increased by 20%, indicating higher participation from traders.</span></em></li>
<li style="font-weight: 400;"><em><span style="font-weight: 400;">SOL’s Open Interest has increased by 4% in the last 24 hours as well, signaling growing interest from investors.</span></em></li>
</ul>
<p><span style="font-weight: 400;">In this highly volatile cryptocurrency market, two whales have purchased and staked a significant amount of <a data-wpel-link="internal" href="https://ambcrypto.com/category/solana-news/" rel="noopener" target="_blank">Solana [SOL]</a> from <a data-wpel-link="internal" href="https://ambcrypto.com/category/binance-news/" rel="noopener" target="_blank">Binance [BNB]</a>. </span></p>
<p><span style="font-weight: 400;">On the 22nd of Augu

In [None]:
import requests

# Dump the raw response body of one article page for inspection.
url = 'https://cryptobriefing.com/bitcoin-etf-resilience-recovery/'
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
print(response.content)




## complete codes

### bitcoinist

In [None]:
import requests
import xml.etree.ElementTree as ET
from datetime import datetime



class bitcoinistScrapper(BaseScraper):
    """Scrape Bitcoinist articles from the site's RSS feed into MongoDB."""

    CONTENT_TAG = '{http://purl.org/rss/1.0/modules/content/}encoded'
    CREATOR_TAG = '{http://purl.org/dc/elements/1.1/}creator'
    PUB_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %z"

    def __init__(self, mongo_uri, db_name, collection_name, base_url ='https://bitcoinist.com' , rss_url = "https://bitcoinist.com/feed/"):
        super().__init__(mongo_uri, db_name, collection_name)
        self.base_url = base_url
        self.rss_url = rss_url
        # Boilerplate fragments to strip from article bodies (none yet).
        self.unwanted_text = ()

    def clean_content(self, content):
        """Remove every ``unwanted_text`` fragment from ``content``."""
        for text in self.unwanted_text:
            if text in content:
                content = content.replace(text, "").strip()
        return content

    def fetch_rss(self, timeout=30):
        """Return the raw feed bytes, or None when the request fails."""
        try:
            # Without a timeout a stalled server would block this call forever.
            response = requests.get(self.rss_url, timeout=timeout)
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as e:
            print(f"Error fetching RSS feed: {e}")
            return None

    @staticmethod
    def _child_text(item, tag):
        """Return the text of child ``tag``, or '' when absent or empty."""
        node = item.find(tag)
        return node.text if node is not None and node.text else ''

    def parse_rss(self, rss_content):
        """Parse feed XML into a list of news dicts.

        Missing elements yield empty strings/lists (``timestamp`` is None
        when ``pubDate`` is missing or unparseable) instead of raising.

        Raises:
            xml.etree.ElementTree.ParseError: if the feed is not well-formed.
        """
        news_items = []
        root = ET.fromstring(rss_content)

        for item in root.findall(".//item"):
            news = {}
            news['title'] = self._child_text(item, 'title')
            news['link'] = self._child_text(item, 'link')
            news['pubDate'] = self._child_text(item, 'pubDate')

            try:
                pub_date = datetime.strptime(news['pubDate'], self.PUB_DATE_FORMAT)
                news['timestamp'] = int(pub_date.timestamp())
            except ValueError:
                news['timestamp'] = None

            soup_d = BeautifulSoup(self._child_text(item, 'description'), 'html.parser')
            soup_c = BeautifulSoup(self._child_text(item, self.CONTENT_TAG), 'html.parser')

            description_text = soup_d.get_text()
            content_text = ' '.join(p.get_text() for p in soup_c.find_all(['p', 'h2']))
            content_text = self.clean_content(content_text)
            news['summery'] = description_text.strip()
            news['content'] = content_text.strip()

            images = [img['src'] for img in soup_c.find_all('img') if 'src' in img.attrs]
            news['images'] = images

            # An Element without children is falsy and attributes are read
            # with .get(), so the old `if enclosure` / ['url'] never worked.
            enclosure = item.find('enclosure')
            enclosure_url = enclosure.get('url') if enclosure is not None else None
            news['thImage'] = enclosure_url or (images[0] if images else '')

            categories = [category.text for category in item.findall('category') if category.text]
            news['category'] = categories
            news['keywords'] = categories

            soup_a = BeautifulSoup(self._child_text(item, self.CREATOR_TAG), 'html.parser')
            news['author'] = soup_a.get_text().strip()

            news_items.append(news)

        return news_items

    def scrape_news(self):
        """Fetch the feed and insert every parsed item into MongoDB."""
        rss_content = self.fetch_rss()
        if not rss_content:
            return

        news_items = self.parse_rss(rss_content)
        print(f'number of news fetched : {len(news_items)}')
        for news in news_items:
            if news:
                self.collection.insert_one(news)
                print(f"Saved to MongoDB: {news['title']}")






import os

# The MongoDB connection string (user name and password) used to be written
# inline here. It is now read from the MONGO_URI environment variable so the
# secret stays out of the notebook; the previously embedded password should
# be rotated. A missing variable raises KeyError before any connection is made.
scraper = bitcoinistScrapper(
    mongo_uri=os.environ["MONGO_URI"],
    db_name="crypto_news",
    collection_name="bitcoinist"
)
scraper.scrape_news()


NameError: name 'BaseScraper' is not defined

### coinpedia

In [None]:
import requests
import xml.etree.ElementTree as ET
from datetime import datetime



class coinpediaScrapper(BaseScraper):
    """Read the Coinpedia RSS feed and store each entry via ``self.collection``.

    Every parsed entry is a dict with the keys ``title``, ``link``,
    ``pubDate``, ``timestamp``, ``summery``, ``content``, ``images``,
    ``thImage``, ``category``, ``keywords`` and ``author``.

    NOTE(review): ``self.collection`` is not assigned in this class; it is
    presumably created by ``BaseScraper.__init__`` -- confirm.
    """

    # Seconds to wait for the server; requests never times out by default.
    REQUEST_TIMEOUT = 30
    # Layout of the ``pubDate`` values, e.g. "Tue, 20 Aug 2024 11:45:46 +0000".
    PUB_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %z"
    # Namespaced tags used by the feed for the full body and the author.
    CONTENT_TAG = '{http://purl.org/rss/1.0/modules/content/}encoded'
    CREATOR_TAG = '{http://purl.org/dc/elements/1.1/}creator'

    def __init__(self, mongo_uri, db_name, collection_name, base_url ='https://bitcoinist.com' , rss_url = "https://coinpedia.org/feed/"):
        """Store the feed settings and pass the MongoDB settings to the base class.

        NOTE(review): the ``base_url`` default and the ``unwanted_text``
        phrases name other sites (Bitcoinist, The Daily Hodl); they look copied
        from sibling scrapers -- confirm before relying on them.
        """
        super().__init__(mongo_uri, db_name, collection_name)
        self.base_url = base_url
        self.rss_url = rss_url
        self.unwanted_text = (
            "Check Price Action",
            "Follow us on X, Facebook and Telegram\n\n",
            "Surf The Daily Hodl Mix",
            "Disclaimer: Opinions expressed at The Daily Hodl are not investment advice. Investors should do their due diligence before making any high-risk investments in Bitcoin, cryptocurrency or digital assets. Please be advised that your transfers and trades are at your own risk, and any losses you may incur are your responsibility. The Daily Hodl does not recommend the buying or selling of any cryptocurrencies or digital assets, nor is The Daily Hodl an investment advisor. Please note that The Daily Hodl participates in affiliate marketing.",
        )

    def clean_content(self, content):
        """Return *content* with every phrase in ``self.unwanted_text`` removed.

        The text is stripped after each phrase that was actually found, so
        the order of the phrases can influence later matches.
        """
        for text in self.unwanted_text:
            if text in content:
                content = content.replace(text, "").strip()
        return content

    def fetch_rss(self):
        """Download the feed and return its raw bytes, or None on a request error."""
        try:
            response = requests.get(self.rss_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as e:
            print(f"Error fetching RSS feed: {e}")
            return None

    @staticmethod
    def _child_text(item, tag):
        """Return the text of child element *tag*, or '' when missing or empty."""
        node = item.find(tag)
        return node.text if node is not None and node.text else ''

    def _to_timestamp(self, pub_date):
        """Convert a ``pubDate`` string to epoch seconds, or None if it does not parse."""
        try:
            return int(datetime.strptime(pub_date, self.PUB_DATE_FORMAT).timestamp())
        except ValueError:
            return None

    def parse_rss(self, rss_content):
        """Turn the raw feed XML into a list of news dicts.

        Args:
            rss_content: The feed document as returned by ``fetch_rss``.

        Returns:
            One dict per ``<item>`` element, in feed order.

        Raises:
            xml.etree.ElementTree.ParseError: If the document is not valid XML.
        """
        news_items = []
        root = ET.fromstring(rss_content)

        for item in root.findall(".//item"):
            news = {}
            news['title'] = self._child_text(item, 'title')
            news['link'] = self._child_text(item, 'link')
            news['pubDate'] = self._child_text(item, 'pubDate')
            news['timestamp'] = self._to_timestamp(news['pubDate'])

            soup_d = BeautifulSoup(self._child_text(item, 'description'), 'html.parser')
            soup_c = BeautifulSoup(self._child_text(item, self.CONTENT_TAG), 'html.parser')

            content_text = ' '.join(p.get_text() for p in soup_c.find_all(['p', 'h2']))
            content_text = self.clean_content(content_text)
            news['summery'] = soup_d.get_text().strip()
            news['content'] = content_text.strip()

            images = [img['src'] for img in soup_c.find_all('img') if 'src' in img.attrs]
            news['images'] = images

            # An ElementTree element is falsy when it has no children and does
            # not support ['url'], so test against None and read the attribute
            # with .get().
            enclosure = item.find('enclosure')
            enclosure_url = enclosure.get('url') if enclosure is not None else None
            news['thImage'] = enclosure_url or (images[0] if images else '')

            categories = [category.text for category in item.findall('category')]
            news['category'] = categories
            news['keywords'] = categories

            soup_a = BeautifulSoup(self._child_text(item, self.CREATOR_TAG), 'html.parser')
            news['author'] = soup_a.get_text().strip()

            news_items.append(news)

        return news_items

    def scrape_news(self):
        """Fetch, parse and insert every non-empty item through ``self.collection``."""
        rss_content = self.fetch_rss()
        if rss_content:
            news_items = self.parse_rss(rss_content)
            print(f'number of news fetched : {len(news_items)}')
            for news in news_items:
                if news:
                    self.collection.insert_one(news)
                    print(f"Saved to MongoDB: {news['title']}")






import os

# The connection string carries a username and password, so it is read from the
# MONGO_URI environment variable instead of being stored in the notebook.
# NOTE: the credentials that were previously inlined here should be rotated.
scraper = coinpediaScrapper(
    mongo_uri=os.environ["MONGO_URI"],
    db_name="crypto_news",
    collection_name="coinpedia"
)
scraper.scrape_news()


### cryptobriefing

In [None]:
import requests
import xml.etree.ElementTree as ET
from datetime import datetime
import base64


class cryptobriefingScrapper(BaseScraper):
    """Read the Crypto Briefing RSS feed, enrich items from the article pages and store them.

    The feed supplies ``title``, ``link``, ``pubDate``, ``timestamp``,
    ``summery``, ``category``, ``keywords`` and ``author``. ``thImage`` comes
    from the article page, as do ``content`` and ``images`` when the page has
    an ``article-content`` section.

    NOTE(review): ``self.collection`` is not assigned in this class; it is
    presumably created by ``BaseScraper.__init__`` -- confirm.
    """

    # Seconds to wait for the server; requests never times out by default.
    REQUEST_TIMEOUT = 30
    # Layout of the ``pubDate`` values, e.g. "Tue, 20 Aug 2024 11:45:46 +0000".
    PUB_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %z"
    CREATOR_TAG = '{http://purl.org/dc/elements/1.1/}creator'

    def __init__(self, mongo_uri, db_name, collection_name, base_url ='https://dailyhodl.com' , rss_url = "https://cryptobriefing.com/feed/"):
        """Store the feed settings and pass the MongoDB settings to the base class.

        NOTE(review): the ``base_url`` default names The Daily Hodl; it looks
        copied from a sibling scraper -- confirm.
        """
        super().__init__(mongo_uri, db_name, collection_name)
        self.base_url = base_url
        self.rss_url = rss_url
        self.unwanted_text = ()

    def clean_content(self, content):
        """Return *content* with every phrase in ``self.unwanted_text`` removed."""
        for text in self.unwanted_text:
            if text in content:
                content = content.replace(text, "").strip()
        return content

    def fetch_rss(self):
        """Download the feed and return its raw bytes, or None on a request error."""
        try:
            response = requests.get(self.rss_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as e:
            print(f"Error fetching RSS feed: {e}")
            return None

    @staticmethod
    def _child_text(item, tag):
        """Return the text of child element *tag*, or '' when missing or empty."""
        node = item.find(tag)
        return node.text if node is not None and node.text else ''

    def _to_timestamp(self, pub_date):
        """Convert a ``pubDate`` string to epoch seconds, or None if it does not parse."""
        try:
            return int(datetime.strptime(pub_date, self.PUB_DATE_FORMAT).timestamp())
        except ValueError:
            return None

    def _fetch_article_soup(self, link):
        """Download *link* and return it parsed, or None when the request fails."""
        try:
            response = requests.get(link, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching article {link}: {e}")
            return None
        return BeautifulSoup(response.content, 'html.parser')

    def parse_rss(self, rss_content):
        """Turn the raw feed XML into a list of news dicts.

        Each item's article page is downloaded as well. A page that cannot be
        fetched no longer aborts the run: the item is kept with an empty
        ``thImage`` and without ``content``/``images``.

        Args:
            rss_content: The feed document as returned by ``fetch_rss``.

        Returns:
            One dict per ``<item>`` element, in feed order.

        Raises:
            xml.etree.ElementTree.ParseError: If the document is not valid XML.
        """
        news_items = []
        root = ET.fromstring(rss_content)

        for item in root.findall(".//item"):
            news = {}
            news['title'] = self._child_text(item, 'title')
            news['link'] = self._child_text(item, 'link')
            news['pubDate'] = self._child_text(item, 'pubDate')
            news['timestamp'] = self._to_timestamp(news['pubDate'])

            soup_d = BeautifulSoup(self._child_text(item, 'description'), 'html.parser')
            news['summery'] = soup_d.get_text().strip()

            categories = [category.text for category in item.findall('category')]
            news['category'] = categories
            news['keywords'] = categories

            soup_a = BeautifulSoup(self._child_text(item, self.CREATOR_TAG), 'html.parser')
            news['author'] = soup_a.get_text().strip()

            article_soup = self._fetch_article_soup(news['link'])
            article = None
            thumbnail_img = None
            if article_soup is not None:
                article = article_soup.find('section', class_='article-content')
                thumbnail = article_soup.find('section', class_='article-thumbnail')
                if thumbnail is not None:
                    thumbnail_img = thumbnail.find('img')
            # Pages without a thumbnail section or image get an empty string
            # instead of raising.
            news['thImage'] = thumbnail_img.get('src', '') if thumbnail_img is not None else ''

            if article is not None:
                paragraphs = article.find_all(['p', 'h2'])
                news['content'] = ''.join([para.text for para in paragraphs])
                # Stored under 'images' (was misspelled 'imges') to match the
                # key used by the other scrapers in this file.
                news['images'] = [img['src'] for img in article.find_all('img') if 'src' in img.attrs]

            news_items.append(news)

        return news_items

    def scrape_news(self):
        """Fetch, parse and insert every non-empty item through ``self.collection``."""
        rss_content = self.fetch_rss()
        if rss_content:
            news_items = self.parse_rss(rss_content)
            print(f'number of news fetched : {len(news_items)}')
            for news in news_items:
                if news:
                    self.collection.insert_one(news)
                    print(f"Saved to MongoDB: {news['title']}")






import os

# The connection string carries a username and password, so it is read from the
# MONGO_URI environment variable instead of being stored in the notebook.
# NOTE: the credentials that were previously inlined here should be rotated.
scraper = cryptobriefingScrapper(
    mongo_uri=os.environ["MONGO_URI"],
    db_name="crypto_news",
    collection_name="cryptobriefing"
)
scraper.scrape_news()


### ambcrypto

In [None]:
import requests
import xml.etree.ElementTree as ET
from datetime import datetime



class ambcryptoScrapper(BaseScraper):
    """Read the AMBCrypto RSS feed, enrich items from the article pages and store them.

    The feed supplies ``title``, ``link``, ``pubDate``, ``timestamp``,
    ``summery``, ``thImage``, ``category``, ``keywords`` and ``author``.
    ``content`` and ``images`` are added when the article page has a
    ``single-post-main-middle`` container.

    NOTE(review): ``self.collection`` is not assigned in this class; it is
    presumably created by ``BaseScraper.__init__`` -- confirm.
    """

    # Seconds to wait for the server; requests never times out by default.
    REQUEST_TIMEOUT = 30
    # Layout of the ``pubDate`` values, e.g. "Tue, 20 Aug 2024 11:45:46 +0000".
    PUB_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %z"
    CREATOR_TAG = '{http://purl.org/dc/elements/1.1/}creator'

    def __init__(self, mongo_uri, db_name, collection_name, base_url ='https://dailyhodl.com' , rss_url = "https://ambcrypto.com/feed/"):
        """Store the feed settings and pass the MongoDB settings to the base class.

        NOTE(review): the ``base_url`` default and the ``unwanted_text``
        phrases name The Daily Hodl; they look copied from a sibling scraper
        -- confirm.
        """
        super().__init__(mongo_uri, db_name, collection_name)
        self.base_url = base_url
        self.rss_url = rss_url
        self.unwanted_text = (
            "Check Price Action",
            "Follow us on X, Facebook and Telegram\n\n",
            "Surf The Daily Hodl Mix",
            "Disclaimer: Opinions expressed at The Daily Hodl are not investment advice. Investors should do their due diligence before making any high-risk investments in Bitcoin, cryptocurrency or digital assets. Please be advised that your transfers and trades are at your own risk, and any losses you may incur are your responsibility. The Daily Hodl does not recommend the buying or selling of any cryptocurrencies or digital assets, nor is The Daily Hodl an investment advisor. Please note that The Daily Hodl participates in affiliate marketing.",
        )

    def clean_content(self, content):
        """Return *content* with every phrase in ``self.unwanted_text`` removed.

        The text is stripped after each phrase that was actually found, so
        the order of the phrases can influence later matches.
        """
        for text in self.unwanted_text:
            if text in content:
                content = content.replace(text, "").strip()
        return content

    def fetch_rss(self):
        """Download the feed and return its raw bytes, or None on a request error."""
        try:
            response = requests.get(self.rss_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as e:
            print(f"Error fetching RSS feed: {e}")
            return None

    @staticmethod
    def _child_text(item, tag):
        """Return the text of child element *tag*, or '' when missing or empty."""
        node = item.find(tag)
        return node.text if node is not None and node.text else ''

    def _to_timestamp(self, pub_date):
        """Convert a ``pubDate`` string to epoch seconds, or None if it does not parse."""
        try:
            return int(datetime.strptime(pub_date, self.PUB_DATE_FORMAT).timestamp())
        except ValueError:
            return None

    def _fetch_article_soup(self, link):
        """Download *link* and return it parsed, or None when the request fails."""
        try:
            response = requests.get(link, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching article {link}: {e}")
            return None
        return BeautifulSoup(response.content, 'html.parser')

    def parse_rss(self, rss_content):
        """Turn the raw feed XML into a list of news dicts.

        Each item's article page is downloaded as well. A page that cannot be
        fetched no longer aborts the run: the item is kept without
        ``content``/``images``.

        Args:
            rss_content: The feed document as returned by ``fetch_rss``.

        Returns:
            One dict per ``<item>`` element, in feed order.

        Raises:
            xml.etree.ElementTree.ParseError: If the document is not valid XML.
        """
        news_items = []
        root = ET.fromstring(rss_content)

        for item in root.findall(".//item"):
            news = {}
            news['title'] = self._child_text(item, 'title')
            news['link'] = self._child_text(item, 'link')
            news['pubDate'] = self._child_text(item, 'pubDate')
            news['timestamp'] = self._to_timestamp(news['pubDate'])

            soup_d = BeautifulSoup(self._child_text(item, 'description'), 'html.parser')
            news['summery'] = soup_d.get_text().strip()

            # The description may carry no <img>; fall back to '' rather than
            # subscripting None.
            thumbnail_img = soup_d.find('img')
            news['thImage'] = thumbnail_img.get('src', '') if thumbnail_img is not None else ''

            categories = [category.text for category in item.findall('category')]
            news['category'] = categories
            news['keywords'] = categories

            soup_a = BeautifulSoup(self._child_text(item, self.CREATOR_TAG), 'html.parser')
            news['author'] = soup_a.get_text().strip()

            article_soup = self._fetch_article_soup(news['link'])
            article = None
            if article_soup is not None:
                article = article_soup.find('div', class_='single-post-main-middle')

            if article is not None:
                paragraphs = article.find_all(['p', 'h2'])
                news['content'] = ''.join([para.text for para in paragraphs])
                # Stored under 'images' (was 'imgs') to match the key used by
                # the other scrapers in this file.
                news['images'] = [
                    img['src']
                    for img in article.find_all('img', decoding='async')
                    if 'src' in img.attrs
                ]
            news_items.append(news)

        return news_items

    def scrape_news(self):
        """Fetch, parse and insert every non-empty item through ``self.collection``."""
        rss_content = self.fetch_rss()
        if rss_content:
            news_items = self.parse_rss(rss_content)
            print(f'number of news fetched : {len(news_items)}')
            for news in news_items:
                if news:
                    self.collection.insert_one(news)
                    print(f"Saved to MongoDB: {news['title']}")






import os

# The connection string carries a username and password, so it is read from the
# MONGO_URI environment variable instead of being stored in the notebook.
# NOTE: the credentials that were previously inlined here should be rotated.
scraper = ambcryptoScrapper(
    mongo_uri=os.environ["MONGO_URI"],
    db_name="crypto_news",
    collection_name="ambcrypto"
)
scraper.scrape_news()


### zycrypto

In [10]:
import requests
import xml.etree.ElementTree as ET
from datetime import datetime



class ZycryptoScrapper(BaseScraper):
    """Read the ZyCrypto RSS feed and store each entry via ``self.collection``.

    Every parsed entry is a dict with the keys ``title``, ``link``,
    ``pubDate``, ``timestamp``, ``summery``, ``content``, ``images``,
    ``thImage``, ``category``, ``keywords`` and ``author``.

    NOTE(review): ``self.collection`` is not assigned in this class; it is
    presumably created by ``BaseScraper.__init__`` -- confirm.
    """

    # Seconds to wait for the server; requests never times out by default.
    REQUEST_TIMEOUT = 30
    # Layout of the ``pubDate`` values, e.g. "Tue, 20 Aug 2024 11:45:46 +0000".
    PUB_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %z"
    # Namespaced tags used by the feed for the full body and the author.
    CONTENT_TAG = '{http://purl.org/rss/1.0/modules/content/}encoded'
    CREATOR_TAG = '{http://purl.org/dc/elements/1.1/}creator'

    def __init__(self, mongo_uri, db_name, collection_name, base_url ='https://zycrypto.com/' , rss_url = "https://zycrypto.com/feed/"):
        """Store the feed settings and pass the MongoDB settings to the base class."""
        super().__init__(mongo_uri, db_name, collection_name)
        self.base_url = base_url
        self.rss_url = rss_url
        # Boilerplate and promotional phrases removed from the article body.
        self.unwanted_text = [
            'Disclaimer: This is a sponsored article, and views in it do not represent those of, nor should they be attributed to, ZyCrypto. Readers should conduct independent research before taking any actions related to the company, product, or project mentioned in this piece; nor can this article be regarded as investment advice. Please be aware that trading cryptocurrencies involve substantial risk as the volatility of the crypto market can lead to significant losses.',
            'Join BlockDAG Presale Now:',
            'Website: https://blockdag.network',
            'Presale: https://purchase.blockdag.network',
            'Telegram: https://t.me/blockDAGnetworkOfficial',
            'Discord: https://discord.gg/Q7BxghMVyu',
            'Learn more:',
            'Buy Presale',
            'Visit DTX Website',
            'Join The DTX Community',
        ]

    def clean_content(self, content):
        """Return *content* with every phrase in ``self.unwanted_text`` removed.

        The text is stripped after each phrase that was actually found, so
        the order of the phrases can influence later matches.
        """
        for text in self.unwanted_text:
            if text in content:
                content = content.replace(text, "").strip()
        return content

    def fetch_rss(self):
        """Download the feed and return its raw bytes, or None on a request error."""
        try:
            response = requests.get(self.rss_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as e:
            print(f"Error fetching RSS feed: {e}")
            return None

    @staticmethod
    def _child_text(item, tag):
        """Return the text of child element *tag*, or '' when missing or empty."""
        node = item.find(tag)
        return node.text if node is not None and node.text else ''

    def _to_timestamp(self, pub_date):
        """Convert a ``pubDate`` string to epoch seconds, or None if it does not parse."""
        try:
            return int(datetime.strptime(pub_date, self.PUB_DATE_FORMAT).timestamp())
        except ValueError:
            return None

    def parse_rss(self, rss_content):
        """Turn the raw feed XML into a list of news dicts.

        Args:
            rss_content: The feed document as returned by ``fetch_rss``.

        Returns:
            One dict per ``<item>`` element, in feed order.

        Raises:
            xml.etree.ElementTree.ParseError: If the document is not valid XML.
        """
        news_items = []
        root = ET.fromstring(rss_content)

        for item in root.findall(".//item"):
            news = {}
            news['title'] = self._child_text(item, 'title')
            news['link'] = self._child_text(item, 'link')
            news['pubDate'] = self._child_text(item, 'pubDate')
            news['timestamp'] = self._to_timestamp(news['pubDate'])

            soup_d = BeautifulSoup(self._child_text(item, 'description'), 'html.parser')
            soup_c = BeautifulSoup(self._child_text(item, self.CONTENT_TAG), 'html.parser')

            content_text = '\n\n'.join(p.get_text() for p in soup_c.find_all(['p', 'h2']))
            content_text = self.clean_content(content_text)
            news['summery'] = soup_d.get_text().strip()
            news['content'] = content_text.strip()

            images = [img['src'] for img in soup_c.find_all('img') if 'src' in img.attrs]
            news['images'] = images

            # Prefer the image in the description; if there is none, fall back
            # to the first body image, then to '' -- never subscript None.
            thumbnail_img = soup_d.find('img')
            thumbnail_src = thumbnail_img.get('src') if thumbnail_img is not None else None
            news['thImage'] = thumbnail_src or (images[0] if images else '')

            categories = [category.text for category in item.findall('category')]
            news['category'] = categories
            news['keywords'] = categories

            soup_a = BeautifulSoup(self._child_text(item, self.CREATOR_TAG), 'html.parser')
            news['author'] = soup_a.get_text().strip()

            news_items.append(news)

        return news_items

    def scrape_news(self):
        """Fetch, parse and insert every non-empty item through ``self.collection``."""
        rss_content = self.fetch_rss()
        if rss_content:
            news_items = self.parse_rss(rss_content)
            print(f'number of news fetched : {len(news_items)}')
            for news in news_items:
                if news:
                    self.collection.insert_one(news)
                    print(f"Saved to MongoDB: {news['title']}")






import os

# The connection string carries a username and password, so it is read from the
# MONGO_URI environment variable instead of being stored in the notebook.
# NOTE: the credentials that were previously inlined here should be rotated.
scraper = ZycryptoScrapper(
    mongo_uri=os.environ["MONGO_URI"],
    db_name="crypto_news",
    collection_name="Zycrypto"
)
scraper.scrape_news()


number of news fetched : 14
Saved to MongoDB: Analyst Predicts Bitcoin Rally to $86,000 If This Happens
Saved to MongoDB: Stablecoin Market Cap Hits New All-Time High, Setting Stage For Potential Astronomical Bitcoin Rally
Saved to MongoDB: Orderly Network’s Reveals Its Upcoming $ORDER Token Launch: A Step Closer To Changing The Defi Experience
Saved to MongoDB: Shiba Inu’s Lead Developer Showcases Why SHIB Ecosystem Security Is A Top Priority
Saved to MongoDB: Bitcoin Surge To $70,000 Looms as Price Breaches Long Consolidation
Saved to MongoDB: Transaction Failures for Ethereum’s L2s Surge Despite Dencun Upgrade
Saved to MongoDB: XRP $16 All-Time High In Play As Coin Prints Largest Bull Flag In History
Saved to MongoDB: PrimeXBT: Crypto Trading with Leverage up to x200
Saved to MongoDB: Prosperous Ethereum Trader Now Top Buyer in Presale of BNB-Chain’s Bitcoin-Alternative
Saved to MongoDB: SEC Reportedly Halts Solana ETF Approval Amid Ongoing Security Classification Debate
Saved to Mo

### beincrypto

In [24]:
import requests
import xml.etree.ElementTree as ET
from datetime import datetime



class BeincryptoScrapper(BaseScraper):
    """Read the BeInCrypto news RSS feed and store each entry via ``self.collection``.

    Every parsed entry is a dict with the keys ``title``, ``link``,
    ``pubDate``, ``timestamp``, ``summery``, ``content``, ``images``,
    ``thImage``, ``category``, ``keywords`` and ``author``.

    NOTE(review): ``self.collection`` and ``self.headers`` are not assigned in
    this class; they are presumably provided by ``BaseScraper`` -- confirm.
    """

    # Seconds to wait for the server; requests never times out by default.
    REQUEST_TIMEOUT = 30
    # Layout of the ``pubDate`` values, e.g. "Tue, 20 Aug 2024 11:45:46 +0000".
    PUB_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %z"
    # Namespaced tags used by the feed for the full body and the author.
    CONTENT_TAG = '{http://purl.org/rss/1.0/modules/content/}encoded'
    CREATOR_TAG = '{http://purl.org/dc/elements/1.1/}creator'
    # Prefix map needed to look up <media:thumbnail>.
    NAMESPACES = {'media': 'http://search.yahoo.com/mrss/'}

    def __init__(self, mongo_uri, db_name, collection_name, base_url ='https://beincrypto.com/' , rss_url = "https://beincrypto.com/news/feed/"):
        """Store the feed settings and pass the MongoDB settings to the base class.

        NOTE(review): the ``unwanted_text`` phrases mention ZyCrypto and look
        copied from the ZyCrypto scraper -- confirm they apply to this feed.
        """
        super().__init__(mongo_uri, db_name, collection_name)
        self.base_url = base_url
        self.rss_url = rss_url
        self.unwanted_text = [
            'Disclaimer: This is a sponsored article, and views in it do not represent those of, nor should they be attributed to, ZyCrypto. Readers should conduct independent research before taking any actions related to the company, product, or project mentioned in this piece; nor can this article be regarded as investment advice. Please be aware that trading cryptocurrencies involve substantial risk as the volatility of the crypto market can lead to significant losses.',
            'Join BlockDAG Presale Now:',
            'Website: https://blockdag.network',
            'Presale: https://purchase.blockdag.network',
            'Telegram: https://t.me/blockDAGnetworkOfficial',
            'Discord: https://discord.gg/Q7BxghMVyu',
            'Learn more:',
            'Buy Presale',
            'Visit DTX Website',
            'Join The DTX Community',
        ]

    def clean_content(self, content):
        """Return *content* with every phrase in ``self.unwanted_text`` removed.

        The text is stripped after each phrase that was actually found, so
        the order of the phrases can influence later matches.
        """
        for text in self.unwanted_text:
            if text in content:
                content = content.replace(text, "").strip()
        return content

    def fetch_rss(self):
        """Download the feed with ``self.headers``; return raw bytes or None on error."""
        try:
            response = requests.get(
                self.rss_url,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as e:
            print(f"Error fetching RSS feed: {e}")
            return None

    @staticmethod
    def _child_text(item, tag):
        """Return the text of child element *tag*, or '' when missing or empty."""
        node = item.find(tag)
        return node.text if node is not None and node.text else ''

    def _to_timestamp(self, pub_date):
        """Convert a ``pubDate`` string to epoch seconds, or None if it does not parse."""
        try:
            return int(datetime.strptime(pub_date, self.PUB_DATE_FORMAT).timestamp())
        except ValueError:
            return None

    def parse_rss(self, rss_content):
        """Turn the raw feed XML into a list of news dicts.

        Args:
            rss_content: The feed document as returned by ``fetch_rss``.

        Returns:
            One dict per ``<item>`` element, in feed order. ``thImage`` is
            None when the item has no ``media:thumbnail`` element.

        Raises:
            xml.etree.ElementTree.ParseError: If the document is not valid XML.
        """
        news_items = []
        root = ET.fromstring(rss_content)

        for item in root.findall(".//item"):
            news = {}
            news['title'] = self._child_text(item, 'title')
            news['link'] = self._child_text(item, 'link')
            news['pubDate'] = self._child_text(item, 'pubDate')
            news['timestamp'] = self._to_timestamp(news['pubDate'])

            soup_d = BeautifulSoup(self._child_text(item, 'description'), 'html.parser')
            soup_c = BeautifulSoup(self._child_text(item, self.CONTENT_TAG), 'html.parser')

            content_text = '\n\n'.join(p.get_text() for p in soup_c.find_all(['p', 'h2']))
            content_text = self.clean_content(content_text)
            news['summery'] = soup_d.get_text().strip()
            news['content'] = content_text.strip()

            images = [img['src'] for img in soup_c.find_all('img') if 'src' in img.attrs]
            news['images'] = images

            thumbnail = item.find('media:thumbnail', self.NAMESPACES)
            news['thImage'] = thumbnail.get('url') if thumbnail is not None else None

            categories = [category.text for category in item.findall('category')]
            news['category'] = categories
            news['keywords'] = categories

            soup_a = BeautifulSoup(self._child_text(item, self.CREATOR_TAG), 'html.parser')
            news['author'] = soup_a.get_text().strip()

            news_items.append(news)

        return news_items

    def scrape_news(self):
        """Fetch, parse and insert every non-empty item through ``self.collection``."""
        rss_content = self.fetch_rss()
        if rss_content:
            news_items = self.parse_rss(rss_content)
            print(f'number of news fetched : {len(news_items)}')
            for news in news_items:
                if news:
                    self.collection.insert_one(news)
                    print(f"Saved to MongoDB: {news['title']}")






import os

# The connection string carries a username and password, so it is read from the
# MONGO_URI environment variable instead of being stored in the notebook.
# NOTE: the credentials that were previously inlined here should be rotated.
scraper = BeincryptoScrapper(
    mongo_uri=os.environ["MONGO_URI"],
    db_name="crypto_news",
    collection_name="Beincrypto"
)
scraper.scrape_news()


<Element 'item' at 0x79613c5eaf70>
<Element 'item' at 0x79613c689da0>
<Element 'item' at 0x79613c614db0>
<Element 'item' at 0x79613c616a70>
<Element 'item' at 0x79613c45ba60>
<Element 'item' at 0x79613c45b8d0>
<Element 'item' at 0x79613c458ae0>
<Element 'item' at 0x79613c458450>
<Element 'item' at 0x79613c4599e0>
<Element 'item' at 0x79613c45a610>
<Element 'item' at 0x79613c45a250>
<Element 'item' at 0x79613c459cb0>
number of news fetched : 12
Saved to MongoDB: PEPE Price Poised for Gains as Short-Term Holdings Hit Lowest Point
Saved to MongoDB: Study Reveals Crypto Investors Prone to Psychopathy and Chaos
Saved to MongoDB: 3 AI Coins to Watch Ahead of NVIDIA Q2 Earnings
Saved to MongoDB: UAE Investment Firms Hodler and Gewan Launch $500 Million Fund to Accelerate DePIN and AI
Saved to MongoDB: Cardano (ADA) Price Targets $0.39 as Key Trendline Break Looms
Saved to MongoDB: 3 Factors Justin Sun Considers Before Buying Meme Coins
Saved to MongoDB: New Investors Pull Back as Notcoin (NOT

## finals

In [None]:
import requests
import xml.etree.ElementTree as ET
from datetime import datetime
from bs4 import BeautifulSoup
import html
from html import unescape

class AmbcryptoScrapper(Scraper):
    """Scrape AMBCrypto: parse the RSS feed, read each article page and save the result.

    ``parse_rss`` produces raw dicts, ``JsonItemStandard`` maps them to the
    stored schema and ``savegroupNews`` hands them to ``self.saveInMongo``.

    NOTE(review): ``saveInMongo``, ``logger`` and ``errors`` are not defined in
    this class; they are presumably supplied by ``Scraper`` and the
    surrounding module -- confirm.
    """

    # Seconds to wait for the server; requests never times out by default.
    REQUEST_TIMEOUT = 30
    # Layout of the ``pubDate`` values, e.g. "Tue, 20 Aug 2024 11:45:46 +0000".
    PUB_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %z"
    CREATOR_TAG = '{http://purl.org/dc/elements/1.1/}creator'

    def __init__(self):
        """Initialise the base scraper with the site URL and set the feed URL."""
        super().__init__('https://ambcrypto.com/')
        self.rss_url = "https://ambcrypto.com/feed/"
        self.unwanted_text = ()

    def clean_content(self, content):
        """Return *content* with the phrases in ``self.unwanted_text`` removed.

        ``unwanted_text`` may be a single string or an iterable of strings.
        The previous ``tuple in str`` test raised ``TypeError``.
        """
        phrases = self.unwanted_text
        if isinstance(phrases, str):
            phrases = (phrases,)
        for text in phrases:
            if text in content:
                content = content.replace(text, "").strip()
        return content

    def fetch_rss(self):
        """Download the feed and return its raw bytes, or None on a request error."""
        try:
            response = requests.get(self.rss_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as e:
            print(f"Error fetching RSS feed: {e}")
            return None

    @staticmethod
    def _child_text(item, tag):
        """Return the text of child element *tag*, or '' when missing or empty."""
        node = item.find(tag)
        return node.text if node is not None and node.text else ''

    def _to_timestamp(self, pub_date):
        """Convert a ``pubDate`` string to epoch seconds, or None if it does not parse."""
        try:
            return int(datetime.strptime(pub_date, self.PUB_DATE_FORMAT).timestamp())
        except ValueError:
            return None

    def _fetch_article_soup(self, link):
        """Download *link* and return it parsed, or None when the request fails."""
        try:
            response = requests.get(link, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching article {link}: {e}")
            return None
        return BeautifulSoup(response.content, 'html.parser')

    def parse_rss(self, rss_content):
        """Turn the raw feed XML into a list of news dicts.

        Each item's article page is downloaded as well. A page that cannot be
        fetched leaves the item without ``content``/``images`` instead of
        aborting the run.

        Args:
            rss_content: The feed document as returned by ``fetch_rss``.

        Returns:
            One dict per ``<item>`` element, in feed order.

        Raises:
            xml.etree.ElementTree.ParseError: If the document is not valid XML.
        """
        news_items = []
        root = ET.fromstring(rss_content)

        for item in root.findall(".//item"):
            news = {}
            news['title'] = self._child_text(item, 'title')
            news['link'] = self._child_text(item, 'link')
            news['pubDate'] = self._child_text(item, 'pubDate')
            news['timestamp'] = self._to_timestamp(news['pubDate'])

            soup_d = BeautifulSoup(self._child_text(item, 'description'), 'html.parser')
            news['summery'] = soup_d.get_text().strip()

            # The description may carry no <img>; fall back to '' rather than
            # subscripting None.
            thumbnail_img = soup_d.find('img')
            news['thImage'] = thumbnail_img.get('src', '') if thumbnail_img is not None else ''

            categories = [category.text for category in item.findall('category')]
            news['category'] = categories
            news['keywords'] = categories

            soup_a = BeautifulSoup(self._child_text(item, self.CREATOR_TAG), 'html.parser')
            news['author'] = soup_a.get_text().strip()

            article_soup = self._fetch_article_soup(news['link'])
            article = None
            if article_soup is not None:
                article = article_soup.find('div', class_='single-post-main-middle')

            if article is not None:
                paragraphs = article.find_all(['p', 'h2'])
                news['content'] = ''.join([para.text for para in paragraphs])
                news['images'] = [
                    img['src']
                    for img in article.find_all('img', decoding='async')
                    if 'src' in img.attrs
                ]
            news_items.append(news)

        return news_items

    def JsonItemStandard(self, newsItem):
        """Map a dict from ``parse_rss`` to the stored item schema.

        Args:
            newsItem: One raw news dict.

        Returns:
            A new dict with the standardised keys (``title``, ``articleBody``,
            ``pubDate``, ``keywords``, ``link``, ``provider``, ``summary``,
            ``thImage``, ``images``, ``category``, the three sentiment
            counters, ``author`` and ``scraped_date``).

        Raises:
            errors.DataProvidingException: If the conversion fails for any reason.
        """
        try:
            item = {}
            item['title'] = unescape(newsItem.get('title') or '')
            item['articleBody'] = unescape(newsItem.get('content') or '')
            item['pubDate'] = newsItem.get('timestamp')

            # ``parse_rss`` stores the categories as a list, so lower-case
            # each entry; calling ``.lower()`` on the list raised
            # AttributeError for every item. A plain string is still accepted.
            category = newsItem.get('category', '')
            if isinstance(category, str):
                item['keywords'] = [category.lower()]
            else:
                item['keywords'] = [entry.lower() for entry in category if entry]

            item['link'] = newsItem.get('link', '')
            item['provider'] = 'ambcrypto'
            item['summary'] = newsItem.get('summery', '')
            item['thImage'] = newsItem.get('thImage', ' ')
            item['images'] = newsItem.get('images', ' ')
            item['category'] = newsItem.get('category', '')

            item['Negative'] = 0
            item['Neutral'] = 0
            item['Positive'] = 0

            creator = newsItem.get('author')
            if not creator:
                item['author'] = item['provider']
            else:
                item['author'] = unescape(creator).strip().lower()

            item['scraped_date'] = int(datetime.now().timestamp())

            return item
        except Exception as err:
            logger.error(f'{str(err)} from SpecificSiteScraper')
            raise errors.DataProvidingException(f'{str(err)} from SpecificSiteScraper') from err

    def savegroupNews(self, newsItems):
        """Standardise every item and pass it to ``self.saveInMongo``.

        Raises:
            errors.DataProvidingException: On the first item that fails; the
                remaining items are not processed.
        """
        try:
            for item in newsItems:
                item = self.JsonItemStandard(item)
                self.saveInMongo(item)
        except Exception as err:
            logger.error(f'{str(err)} from SpecificSiteScraper')
            raise errors.DataProvidingException(f'{str(err)} from SpecificSiteScraper') from err

    def start_scraping(self):
        """Run one full cycle: fetch the feed, parse it and save the items.

        Every exception is logged and swallowed so the caller is never interrupted.
        """
        try:
            rss_content = self.fetch_rss()
            if rss_content:
                news_items = self.parse_rss(rss_content)
                print(f'number of news fetched : {len(news_items)}')
                # NOTE(review): datetime.now() is local time while the format
                # appends a literal "Z" -- confirm whether UTC was intended.
                now = datetime.now()
                logger.info('Crawling of SpecificSite Started at ' + now.strftime('%a, %d %b %Y %H:%M:%S Z') + '!!')
                logger.info('+---------------------------------------------+')
                self.savegroupNews(news_items)
        except Exception as err:
            logger.error(f'{str(err)} from SpecificSiteScraper')



In [None]:
import requests
import xml.etree.ElementTree as ET
from datetime import datetime
from bs4 import BeautifulSoup


class CryptobriefingScrapper(Scraper):
    """Scrape Crypto Briefing: parse the RSS feed, read each article page and save the result.

    ``parse_rss`` produces raw dicts, ``JsonItemStandard`` maps them to the
    stored schema and ``savegroupNews`` hands them to ``self.saveInMongo``.

    NOTE(review): ``saveInMongo``, ``logger`` and ``errors`` are not defined in
    this class; they are presumably supplied by ``Scraper`` and the
    surrounding module -- confirm.
    """

    # Seconds to wait for the server; requests never times out by default.
    REQUEST_TIMEOUT = 30
    # Layout of the ``pubDate`` values, e.g. "Tue, 20 Aug 2024 11:45:46 +0000".
    PUB_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %z"
    CREATOR_TAG = '{http://purl.org/dc/elements/1.1/}creator'

    def __init__(self):
        """Initialise the base scraper with the site URL and set the feed URL."""
        super().__init__('https://cryptobriefing.com/')
        self.rss_url = "https://cryptobriefing.com/feed/"
        self.unwanted_text = ()

    def clean_content(self, content):
        """Return *content* with the phrases in ``self.unwanted_text`` removed.

        ``unwanted_text`` may be a single string or an iterable of strings.
        The previous ``tuple in str`` test raised ``TypeError``.
        """
        phrases = self.unwanted_text
        if isinstance(phrases, str):
            phrases = (phrases,)
        for text in phrases:
            if text in content:
                content = content.replace(text, "").strip()
        return content

    def fetch_rss(self):
        """Download the feed and return its raw bytes, or None on a request error."""
        try:
            response = requests.get(self.rss_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as e:
            print(f"Error fetching RSS feed: {e}")
            return None

    @staticmethod
    def _child_text(item, tag):
        """Return the text of child element *tag*, or '' when missing or empty."""
        node = item.find(tag)
        return node.text if node is not None and node.text else ''

    def _to_timestamp(self, pub_date):
        """Convert a ``pubDate`` string to epoch seconds, or None if it does not parse."""
        try:
            return int(datetime.strptime(pub_date, self.PUB_DATE_FORMAT).timestamp())
        except ValueError:
            return None

    def _fetch_article_soup(self, link):
        """Download *link* and return it parsed, or None when the request fails."""
        try:
            response = requests.get(link, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching article {link}: {e}")
            return None
        return BeautifulSoup(response.content, 'html.parser')

    def parse_rss(self, rss_content):
        """Turn the raw feed XML into a list of news dicts.

        Each item's article page is downloaded as well. A page that cannot be
        fetched leaves the item with an empty ``thImage`` and without
        ``content``/``images`` instead of aborting the run.

        Args:
            rss_content: The feed document as returned by ``fetch_rss``.

        Returns:
            One dict per ``<item>`` element, in feed order.

        Raises:
            xml.etree.ElementTree.ParseError: If the document is not valid XML.
        """
        news_items = []
        root = ET.fromstring(rss_content)

        for item in root.findall(".//item"):
            news = {}
            news['title'] = self._child_text(item, 'title')
            news['link'] = self._child_text(item, 'link')
            news['pubDate'] = self._child_text(item, 'pubDate')
            news['timestamp'] = self._to_timestamp(news['pubDate'])

            soup_d = BeautifulSoup(self._child_text(item, 'description'), 'html.parser')
            news['summery'] = soup_d.get_text().strip()

            categories = [category.text for category in item.findall('category')]
            news['category'] = categories
            news['keywords'] = categories

            soup_a = BeautifulSoup(self._child_text(item, self.CREATOR_TAG), 'html.parser')
            news['author'] = soup_a.get_text().strip()

            article_soup = self._fetch_article_soup(news['link'])
            article = None
            thumbnail_img = None
            if article_soup is not None:
                article = article_soup.find('section', class_='article-content')
                thumbnail = article_soup.find('section', class_='article-thumbnail')
                if thumbnail is not None:
                    thumbnail_img = thumbnail.find('img')
            # Pages without a thumbnail section or image get an empty string
            # instead of raising.
            news['thImage'] = thumbnail_img.get('src', '') if thumbnail_img is not None else ''

            if article is not None:
                paragraphs = article.find_all(['p', 'h2'])
                news['content'] = ''.join([para.text for para in paragraphs])
                news['images'] = [img['src'] for img in article.find_all('img') if 'src' in img.attrs]

            news_items.append(news)

        return news_items

    def JsonItemStandard(self, newsItem):
        """Map a dict from ``parse_rss`` to the stored item schema.

        Args:
            newsItem: One raw news dict.

        Returns:
            A new dict with the standardised keys (``title``, ``articleBody``,
            ``pubDate``, ``keywords``, ``link``, ``provider``, ``summary``,
            ``thImage``, ``images``, ``category``, the three sentiment
            counters, ``author`` and ``scraped_date``).

        Raises:
            errors.DataProvidingException: If the conversion fails for any reason.
        """
        # This cell does not import ``unescape`` itself; importing it here
        # keeps the method usable when the cell is run on its own.
        from html import unescape

        try:
            item = {}
            item['title'] = unescape(newsItem.get('title') or '')
            item['articleBody'] = unescape(newsItem.get('content') or '')
            item['pubDate'] = newsItem.get('timestamp')

            # ``parse_rss`` stores the categories as a list, so lower-case
            # each entry; calling ``.lower()`` on the list raised
            # AttributeError for every item. A plain string is still accepted.
            category = newsItem.get('category', '')
            if isinstance(category, str):
                item['keywords'] = [category.lower()]
            else:
                item['keywords'] = [entry.lower() for entry in category if entry]

            item['link'] = newsItem.get('link', '')
            item['provider'] = 'cryptobriefing'
            item['summary'] = newsItem.get('summery', '')
            item['thImage'] = newsItem.get('thImage', ' ')
            item['images'] = newsItem.get('images', ' ')
            item['category'] = newsItem.get('category', '')

            item['Negative'] = 0
            item['Neutral'] = 0
            item['Positive'] = 0

            creator = newsItem.get('author')
            if not creator:
                item['author'] = item['provider']
            else:
                item['author'] = unescape(creator).strip().lower()

            item['scraped_date'] = int(datetime.now().timestamp())

            return item
        except Exception as err:
            logger.error(f'{str(err)} from SpecificSiteScraper')
            raise errors.DataProvidingException(f'{str(err)} from SpecificSiteScraper') from err

    def savegroupNews(self, newsItems):
        """Standardise every item and pass it to ``self.saveInMongo``.

        Raises:
            errors.DataProvidingException: On the first item that fails; the
                remaining items are not processed.
        """
        try:
            for item in newsItems:
                item = self.JsonItemStandard(item)
                self.saveInMongo(item)
        except Exception as err:
            logger.error(f'{str(err)} from SpecificSiteScraper')
            raise errors.DataProvidingException(f'{str(err)} from SpecificSiteScraper') from err

    def start_scraping(self):
        """Run one full cycle: fetch the feed, parse it and save the items.

        Every exception is logged and swallowed so the caller is never interrupted.
        """
        try:
            rss_content = self.fetch_rss()
            if rss_content:
                news_items = self.parse_rss(rss_content)
                print(f'number of news fetched : {len(news_items)}')
                # NOTE(review): datetime.now() is local time while the format
                # appends a literal "Z" -- confirm whether UTC was intended.
                now = datetime.now()
                logger.info('Crawling of SpecificSite Started at ' + now.strftime('%a, %d %b %Y %H:%M:%S Z') + '!!')
                logger.info('+---------------------------------------------+')
                self.savegroupNews(news_items)
        except Exception as err:
            logger.error(f'{str(err)} from SpecificSiteScraper')



In [None]:
import requests
import xml.etree.ElementTree as ET
from datetime import datetime
from bs4 import BeautifulSoup


class CoinpediaScrapper(Scraper):
    """RSS scraper for coinpedia.com.

    Downloads the site feed (``fetch_rss``), turns every ``<item>`` into a
    plain dict (``getnews``), converts it to the storage layout
    (``JsonItemStandard``) and stores it through ``self.saveInMongo``, which
    is not defined in this class (presumably inherited from ``Scraper`` --
    confirm).
    """

    def __init__(self):
        super().__init__('https://coinpedia.com/')
        self.name = 'coinpedia'
        self.rss_url = "https://coinpedia.com/feed/"
        # Boilerplate phrases to strip from article bodies (none configured).
        self.unwanted_text = ()

    def clean_content(self, content):
        """Return ``content`` with every configured unwanted phrase removed.

        ``self.unwanted_text`` may be a single string or an iterable of
        strings.  Each phrase is checked individually because testing a whole
        tuple/list for membership in a ``str`` raises ``TypeError``.
        """
        unwanted = self.unwanted_text
        if isinstance(unwanted, str):
            unwanted = (unwanted,)
        for phrase in unwanted:
            if phrase and phrase in content:
                content = content.replace(phrase, "").strip()
        return content

    def fetch_rss(self):
        """Download the RSS feed.

        Returns:
            The raw response body, or ``None`` when the request fails (the
            error is printed).
        """
        try:
            # A timeout keeps a stalled connection from blocking the crawl forever.
            response = requests.get(self.rss_url, timeout=10)
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as e:
            print(f"Error fetching RSS feed: {e}")
            return None

    def getnews(self, rss_content):
        """Parse the RSS document into a list of news dicts.

        Args:
            rss_content: The feed XML as ``str`` or ``bytes``.

        Returns:
            One dict per ``<item>`` with the keys ``title``, ``link``,
            ``pubDate``, ``timestamp``, ``summery``, ``content``, ``images``,
            ``thImage``, ``category``, ``keywords`` and ``author``.

        Raises:
            ValueError: If an item's ``pubDate`` does not match the expected
                ``"%a, %d %b %Y %H:%M:%S %z"`` layout.
        """
        content_tag = '{http://purl.org/rss/1.0/modules/content/}encoded'
        creator_tag = '{http://purl.org/dc/elements/1.1/}creator'
        news_items = []
        root = ET.fromstring(rss_content)

        for item in root.findall(".//item"):
            news = {}
            # findtext() yields '' instead of failing on a missing/empty element.
            news['title'] = item.findtext('title', default='')
            news['link'] = item.findtext('link', default='')
            news['pubDate'] = item.findtext('pubDate', default='')

            pub_date = datetime.strptime(news['pubDate'], "%a, %d %b %Y %H:%M:%S %z")
            news['timestamp'] = int(pub_date.timestamp())

            soup_d = BeautifulSoup(item.findtext('description', default=''), 'html.parser')
            soup_c = BeautifulSoup(item.findtext(content_tag, default=''), 'html.parser')

            description_text = ' '.join(p.get_text() for p in soup_d.find_all(['p', 'h2']))
            # The last three <p>/<h2> blocks are dropped -- presumably a fixed
            # footer appended to every article; TODO confirm.
            content_text = ' '.join(p.get_text() for p in soup_c.find_all(['p', 'h2'])[:-3])
            content_text = self.clean_content(content_text)
            news['summery'] = description_text.strip()
            news['content'] = content_text.strip()

            images = [img['src'] for img in soup_c.find_all('img') if 'src' in img.attrs]
            news['images'] = images

            # An Element's truth value reflects its child count and attributes
            # are read with .get(), so compare against None explicitly.
            enclosure = item.find('enclosure')
            enclosure_url = enclosure.get('url', '') if enclosure is not None else ''
            news['thImage'] = enclosure_url or (images[0] if images else '')

            categories = [category.text for category in item.findall('category')]
            news['category'] = categories
            news['keywords'] = categories

            author_html = item.findtext(creator_tag, default='')
            news['author'] = BeautifulSoup(author_html, 'html.parser').get_text().strip()

            news_items.append(news)

        return news_items

    def JsonItemStandard(self, newsItem):
        """Convert a dict produced by ``getnews`` into the storage layout.

        Args:
            newsItem: Mapping with the keys written by ``getnews``.

        Returns:
            dict with the keys ``title``, ``articleBody``, ``pubDate``,
            ``keywords``, ``link``, ``provider``, ``summary``, ``thImage``,
            ``images``, ``category``, ``Negative``, ``Neutral``, ``Positive``,
            ``author`` and ``scraped_date``.

        Raises:
            errors.DataProvidingException: If the conversion fails.
        """
        try:
            category = newsItem.get('category', '')
            # getnews stores the categories as a list; a bare string is still
            # accepted and treated as a single keyword.
            if isinstance(category, str):
                keywords = [category.lower()]
            else:
                keywords = [entry.lower() for entry in category if entry]

            item = {
                'title': unescape(newsItem.get('title', '')),
                'articleBody': unescape(newsItem.get('content', '')),
                'pubDate': newsItem.get('timestamp'),
                'keywords': keywords,
                'link': newsItem.get('link', ''),
                'provider': 'coinpedia',
                'summary': newsItem.get('summery', ''),
                'thImage': newsItem.get('thImage', ' '),
                'images': newsItem.get('images', ' '),
                'category': newsItem.get('category', ''),
                # Presumably sentiment scores filled in later -- TODO confirm.
                'Negative': 0,
                'Neutral': 0,
                'Positive': 0,
            }

            creator = newsItem.get('author')
            # Fall back to the provider name when the feed names no author.
            item['author'] = unescape(creator).strip().lower() if creator else item['provider']
            item['scraped_date'] = int(datetime.now().timestamp())
            return item
        except Exception as err:
            message = f'{str(err)} from {self.name}Scraper'
            logger.error(message)
            raise errors.DataProvidingException(message) from err

    def savegroupNews(self, newsItems):
        """Standardise and store every element of ``newsItems``.

        Raises:
            errors.DataProvidingException: On the first item that cannot be
                converted or saved; the remaining items are not processed.
        """
        try:
            for raw_item in newsItems:
                self.saveInMongo(self.JsonItemStandard(raw_item))
        except Exception as err:
            message = f'{str(err)} from {self.name}Scraper'
            logger.error(message)
            raise errors.DataProvidingException(message) from err

    def start_scraping(self):
        """Run one crawl: fetch the feed, parse it and store the items.

        Any ``Exception`` is logged and not propagated.
        """
        try:
            rss_content = self.fetch_rss()
            if not rss_content:
                return
            # getnews() is the feed parser defined by this class.
            news_items = self.getnews(rss_content)
            print(f'number of news fetched : {len(news_items)}')
            now = datetime.now()
            logger.info(f'Crawling of {self.name} Started at ' + now.strftime('%a, %d %b %Y %H:%M:%S Z') + '!!')
            logger.info('+---------------------------------------------+')
            self.savegroupNews(news_items)
        except Exception as err:
            logger.error(f'{str(err)} from {self.name}Scraper')


In [None]:
import requests
import xml.etree.ElementTree as ET
from datetime import datetime
from bs4 import BeautifulSoup


class BitcoinistScrapper(Scraper):
    """RSS scraper for bitcoinist.com.

    Downloads the site feed (``fetch_rss``), turns every ``<item>`` into a
    plain dict (``getnews``), converts it to the storage layout
    (``JsonItemStandard``) and stores it through ``self.saveInMongo``, which
    is not defined in this class (presumably inherited from ``Scraper`` --
    confirm).
    """

    def __init__(self):
        super().__init__('https://bitcoinist.com/')
        self.name = 'bitcoinist'
        self.rss_url = "https://bitcoinist.com/feed/"
        # Boilerplate phrases to strip from article bodies (none configured).
        self.unwanted_text = ()

    def clean_content(self, content):
        """Return ``content`` with every configured unwanted phrase removed.

        ``self.unwanted_text`` may be a single string or an iterable of
        strings.  Each phrase is checked individually because testing a whole
        tuple/list for membership in a ``str`` raises ``TypeError``.
        """
        unwanted = self.unwanted_text
        if isinstance(unwanted, str):
            unwanted = (unwanted,)
        for phrase in unwanted:
            if phrase and phrase in content:
                content = content.replace(phrase, "").strip()
        return content

    def fetch_rss(self):
        """Download the RSS feed.

        Returns:
            The raw response body, or ``None`` when the request fails (the
            error is printed).
        """
        try:
            # A timeout keeps a stalled connection from blocking the crawl forever.
            response = requests.get(self.rss_url, timeout=10)
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as e:
            print(f"Error fetching RSS feed: {e}")
            return None

    def getnews(self, rss_content):
        """Parse the RSS document into a list of news dicts.

        Args:
            rss_content: The feed XML as ``str`` or ``bytes``.

        Returns:
            One dict per ``<item>`` with the keys ``title``, ``link``,
            ``pubDate``, ``timestamp``, ``summery``, ``content``, ``images``,
            ``thImage``, ``category``, ``keywords`` and ``author``.

        Raises:
            ValueError: If an item's ``pubDate`` does not match the expected
                ``"%a, %d %b %Y %H:%M:%S %z"`` layout.
        """
        content_tag = '{http://purl.org/rss/1.0/modules/content/}encoded'
        creator_tag = '{http://purl.org/dc/elements/1.1/}creator'
        news_items = []
        root = ET.fromstring(rss_content)

        for item in root.findall(".//item"):
            news = {}
            # findtext() yields '' instead of failing on a missing/empty element.
            news['title'] = item.findtext('title', default='')
            news['link'] = item.findtext('link', default='')
            news['pubDate'] = item.findtext('pubDate', default='')

            pub_date = datetime.strptime(news['pubDate'], "%a, %d %b %Y %H:%M:%S %z")
            news['timestamp'] = int(pub_date.timestamp())

            soup_d = BeautifulSoup(item.findtext('description', default=''), 'html.parser')
            soup_c = BeautifulSoup(item.findtext(content_tag, default=''), 'html.parser')

            description_text = ' '.join(p.get_text() for p in soup_d.find_all(['p', 'h2']))
            # The last three <p>/<h2> blocks are dropped -- presumably a fixed
            # footer appended to every article; TODO confirm.
            content_text = ' '.join(p.get_text() for p in soup_c.find_all(['p', 'h2'])[:-3])
            content_text = self.clean_content(content_text)
            news['summery'] = description_text.strip()
            news['content'] = content_text.strip()

            images = [img['src'] for img in soup_c.find_all('img') if 'src' in img.attrs]
            news['images'] = images

            # An Element's truth value reflects its child count and attributes
            # are read with .get(), so compare against None explicitly.
            enclosure = item.find('enclosure')
            enclosure_url = enclosure.get('url', '') if enclosure is not None else ''
            news['thImage'] = enclosure_url or (images[0] if images else '')

            categories = [category.text for category in item.findall('category')]
            news['category'] = categories
            news['keywords'] = categories

            author_html = item.findtext(creator_tag, default='')
            news['author'] = BeautifulSoup(author_html, 'html.parser').get_text().strip()

            news_items.append(news)

        return news_items

    def JsonItemStandard(self, newsItem):
        """Convert a dict produced by ``getnews`` into the storage layout.

        Args:
            newsItem: Mapping with the keys written by ``getnews``.

        Returns:
            dict with the keys ``title``, ``articleBody``, ``pubDate``,
            ``keywords``, ``link``, ``provider``, ``summary``, ``thImage``,
            ``images``, ``category``, ``Negative``, ``Neutral``, ``Positive``,
            ``author`` and ``scraped_date``.

        Raises:
            errors.DataProvidingException: If the conversion fails.
        """
        try:
            category = newsItem.get('category', '')
            # getnews stores the categories as a list; a bare string is still
            # accepted and treated as a single keyword.
            if isinstance(category, str):
                keywords = [category.lower()]
            else:
                keywords = [entry.lower() for entry in category if entry]

            item = {
                'title': unescape(newsItem.get('title', '')),
                'articleBody': unescape(newsItem.get('content', '')),
                'pubDate': newsItem.get('timestamp'),
                'keywords': keywords,
                'link': newsItem.get('link', ''),
                'provider': 'bitcoinist',
                'summary': newsItem.get('summery', ''),
                'thImage': newsItem.get('thImage', ' '),
                'images': newsItem.get('images', ' '),
                'category': newsItem.get('category', ''),
                # Presumably sentiment scores filled in later -- TODO confirm.
                'Negative': 0,
                'Neutral': 0,
                'Positive': 0,
            }

            creator = newsItem.get('author')
            # Fall back to the provider name when the feed names no author.
            item['author'] = unescape(creator).strip().lower() if creator else item['provider']
            item['scraped_date'] = int(datetime.now().timestamp())
            return item
        except Exception as err:
            message = f'{str(err)} from {self.name}Scraper'
            logger.error(message)
            raise errors.DataProvidingException(message) from err

    def savegroupNews(self, newsItems):
        """Standardise and store every element of ``newsItems``.

        Raises:
            errors.DataProvidingException: On the first item that cannot be
                converted or saved; the remaining items are not processed.
        """
        try:
            for raw_item in newsItems:
                self.saveInMongo(self.JsonItemStandard(raw_item))
        except Exception as err:
            message = f'{str(err)} from {self.name}Scraper'
            logger.error(message)
            raise errors.DataProvidingException(message) from err

    def start_scraping(self):
        """Run one crawl: fetch the feed, parse it and store the items.

        Any ``Exception`` is logged and not propagated.
        """
        try:
            rss_content = self.fetch_rss()
            if not rss_content:
                return
            # getnews() is the feed parser defined by this class.
            news_items = self.getnews(rss_content)
            print(f'number of news fetched : {len(news_items)}')
            now = datetime.now()
            logger.info(f'Crawling of {self.name} Started at ' + now.strftime('%a, %d %b %Y %H:%M:%S Z') + '!!')
            logger.info('+---------------------------------------------+')
            self.savegroupNews(news_items)
        except Exception as err:
            logger.error(f'{str(err)} from {self.name}Scraper')


<Element 'rss' at 0x7a4536a28ae0>
number of news fetched : 10
Saved to MongoDB: Top Analyst Says Bitcoin Avoided Major Breakdown Despite Liquidations; Outlines Path to $50K
Saved to MongoDB: Massive $55M Loss: How a Simple Click Cost a Crypto Whale Everything!
Saved to MongoDB: Top Layer-2 (L2) Altcoins To Stack Before $ETH Reclaims $3K! 
Saved to MongoDB: Breakout Alert! MATIC Price Prediction for August 21
Saved to MongoDB: Altcoin Season Under Threat? Expert Shared His Price Perspective on Key Altcoins
Saved to MongoDB: Terraform Labs Sparks LUNA and LUNC Price Rally with Key Hearing Date
Saved to MongoDB: 2024 Crypto Forecast: The 3 Altcoins Poised for Explosive Growth This August
Saved to MongoDB: Cardano Loses 10th Place, ADA Price To Make A Comeback With 15% Jump? 
Saved to MongoDB: SunPummp’s Explosive Growth: Tron Meme Coins Set for 100x Surge?
Saved to MongoDB: Will These Tron-based Tokens Record A 100% Upside This Q3?


# base scraper

In [None]:
import requests
import logging
from utils import  errors
import os
logger = logging.getLogger('Rotating Log')
from datetime import datetime
import json
from NewsAppModel.models import NewsModel



class scraper:
    """Shared HTTP download and persistence helpers for the news scrapers.

    Attributes set in ``__init__``:
        ProviderUrl: The URL passed to the constructor.
        xmlData: Body of the last successful ``loadPage`` call, else ``None``.
        newsOBJ: ``NewsModel`` instance used for all database access.
    """

    def __init__(self, url):
        self.ProviderUrl = url
        self.xmlData = None
        self.newsOBJ = NewsModel()

    def loadPage(self, url):
        """Download ``url`` and return the response body.

        Args:
            url: Address of the page or RSS feed to fetch.

        Returns:
            The response content on HTTP 200 (also stored in
            ``self.xmlData``), otherwise ``-1``.  ``-1`` is truthy, so callers
            cannot detect failure with a plain truthiness check.

        Raises:
            errors.DataProvidingException: On any ``requests`` error.
        """
        try:
            headers = {
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
            resp = requests.get(url, timeout=3, headers=headers)
            if resp.status_code == 200:
                self.xmlData = resp.content
                return self.xmlData
            return -1

        except requests.exceptions.HTTPError as htError:
            logger.error('Http Error: %s', str(htError))
            # Format the template itself; ''.format(...) always produced ''.
            raise errors.DataProvidingException('  {message}'.format(message=str(htError))) from htError

        except requests.exceptions.ConnectionError as coError:
            logger.error('Connection Error: %s', str(coError))
            raise errors.DataProvidingException(str(coError)) from coError

        except requests.exceptions.Timeout as timeOutError:
            logger.error('TimeOut Error: %s', str(timeOutError))
            raise errors.DataProvidingException(str(timeOutError)) from timeOutError

        except requests.exceptions.RequestException as ReError:
            logger.error('Something was wrong: %s', str(ReError))
            raise errors.DataProvidingException(str(ReError)) from ReError

    def saveInMongo(self, item):
        """Store ``item`` through ``self.newsOBJ.save_to_DB``.

        Returns:
            Whatever ``save_to_DB`` returns.

        Raises:
            errors.DataProvidingException: If saving fails for any reason.
        """
        try:
            resp = self.newsOBJ.save_to_DB(item)
            dt = datetime.now()
            logger.info("save in MongoDB done {resp} from source number {date} ".format(resp=resp, date=dt))
            return resp
        except requests.exceptions.ConnectionError as er:
            # Format the template itself; ''.format(...) always produced ''.
            raise errors.DataProvidingException('Database error: {message} '.format(message=str(er))) from er
        except Exception as err:
            raise errors.DataProvidingException(str(err)) from err

    def JsonItemStandard(self, newsItem):
        """Placeholder for the per-provider conversion; returns ``None``.

        The scraper subclasses in this file define their own version taking
        ``(self, newsItem)``; ``self`` is declared here so the method can be
        called on an instance.
        """
        return None

    def checkForExist(self, link, title, author):
        """Return the result of ``self.newsOBJ.find_by_Link(link, title, author)``.

        Raises:
            errors.DataProvidingException: If the lookup fails for any reason.
        """
        try:
            return self.newsOBJ.find_by_Link(link, title, author)
        except Exception as err:
            raise errors.DataProvidingException(str(err)) from err




## zycrypto

In [None]:
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from datetime import datetime , timedelta

class ZycryptoScrapper(scraper):
    """RSS scraper for zycrypto.com.

    The feed is downloaded with ``self.loadPage`` and stored with
    ``self.saveInMongo``; neither is defined in this class.
    """

    def __init__(self, mongo_uri, db_name, collection_name, base_url ='https://zycrypto.com/' , rss_url = "https://zycrypto.com/feed/"):
        """Forward the storage settings to the base class and set the feed URLs."""
        super().__init__(mongo_uri, db_name, collection_name)
        self.base_url = base_url
        self.name = 'zycrypto'
        self.rss_url = rss_url
        # Boilerplate phrases removed from every article body.
        self.unwanted_text = ['Disclaimer: This is a sponsored article, and views in it do not represent those of, nor should they be attributed to, ZyCrypto. Readers should conduct independent research before taking any actions related to the company, product, or project mentioned in this piece; nor can this article be regarded as investment advice. Please be aware that trading cryptocurrencies involve substantial risk as the volatility of the crypto market can lead to significant losses.',
                              'Join BlockDAG Presale Now:', 'Website: https://blockdag.network' , 'Presale: https://purchase.blockdag.network' , 'Telegram: https://t.me/blockDAGnetworkOfficial' ,
                               'Discord: https://discord.gg/Q7BxghMVyu', 'Learn more:' , 'Buy Presale' , 'Visit DTX Website' ,'Join The DTX Community'
        ]

    def clean_content(self, content):
        """Return ``content`` with every configured unwanted phrase removed.

        ``self.unwanted_text`` may be a single string or an iterable of
        strings.  Each phrase is checked individually because testing a whole
        list for membership in a ``str`` raises ``TypeError``.
        """
        unwanted = self.unwanted_text
        if isinstance(unwanted, str):
            unwanted = (unwanted,)
        for phrase in unwanted:
            if phrase and phrase in content:
                content = content.replace(phrase, "").strip()
        return content

    def parse_rss(self, rss_content):
        """Parse the RSS document into a list of news dicts.

        Args:
            rss_content: The feed XML as ``str`` or ``bytes``.

        Returns:
            One dict per ``<item>`` with the keys ``title``, ``link``,
            ``pubDate``, ``timestamp``, ``summery``, ``content``, ``images``,
            ``thImage``, ``category``, ``keywords`` and ``author``.

        Raises:
            ValueError: If an item's ``pubDate`` does not match the expected
                ``"%a, %d %b %Y %H:%M:%S %z"`` layout.
        """
        content_tag = '{http://purl.org/rss/1.0/modules/content/}encoded'
        creator_tag = '{http://purl.org/dc/elements/1.1/}creator'
        news_items = []
        root = ET.fromstring(rss_content)

        for item in root.findall(".//item"):
            news = {}
            # findtext() yields '' instead of failing on a missing/empty element.
            news['title'] = item.findtext('title', default='')
            news['link'] = item.findtext('link', default='')
            news['pubDate'] = item.findtext('pubDate', default='')

            pub_date = datetime.strptime(news['pubDate'], "%a, %d %b %Y %H:%M:%S %z")
            news['timestamp'] = int(pub_date.timestamp())

            soup_d = BeautifulSoup(item.findtext('description', default=''), 'html.parser')
            soup_c = BeautifulSoup(item.findtext(content_tag, default=''), 'html.parser')

            description_text = soup_d.get_text()
            content_text = '\n'.join(p.get_text() for p in soup_c.find_all(['p', 'h2']))
            content_text = self.clean_content(content_text)
            news['summery'] = description_text.strip()
            news['content'] = content_text.strip()

            images = [img['src'] for img in soup_c.find_all('img') if 'src' in img.attrs]
            news['images'] = images

            # The thumbnail is the first image of the description; fall back
            # to the first body image when the description carries none.
            thumb = soup_d.find('img')
            if thumb is not None and thumb.get('src'):
                news['thImage'] = thumb['src']
            else:
                news['thImage'] = images[0] if images else ''

            categories = [category.text for category in item.findall('category')]
            news['category'] = categories
            news['keywords'] = categories

            author_html = item.findtext(creator_tag, default='')
            news['author'] = BeautifulSoup(author_html, 'html.parser').get_text().strip()

            news_items.append(news)

        return news_items

    def JsonItemStandard(self, newsItem):
        """Convert a dict produced by ``parse_rss`` into the storage layout.

        Args:
            newsItem: Mapping with the keys written by ``parse_rss``.

        Returns:
            dict with the keys ``title``, ``articleBody``, ``pubDate``,
            ``keywords``, ``link``, ``provider``, ``summary``, ``thImage``,
            ``images``, ``category``, ``Negative``, ``Neutral``, ``Positive``,
            ``author`` and ``scraped_date``.

        Raises:
            errors.DataProvidingException: If the conversion fails.
        """
        try:
            category = newsItem.get('category', '')
            item = {
                'title': unescape(newsItem.get('title', '')),
                'articleBody': unescape(newsItem.get('content', '')),
                'pubDate': newsItem.get('timestamp'),
                # Empty <category/> elements yield None and are skipped.
                'keywords': [keyword.lower() for keyword in category if keyword],
                'link': newsItem.get('link', ''),
                'provider': 'zycrypto',
                'summary': newsItem.get('summery', ''),
                'thImage': newsItem.get('thImage', ' '),
                'images': newsItem.get('images', ' '),
                'category': 'cryptocurrency',
                # Presumably sentiment scores filled in later -- TODO confirm.
                'Negative': 0,
                'Neutral': 0,
                'Positive': 0,
            }

            creator = newsItem.get('author')
            # Fall back to the provider name when the feed names no author.
            item['author'] = unescape(creator).strip().lower() if creator else item['provider']
            item['scraped_date'] = int(datetime.now().timestamp())
            return item
        except Exception as err:
            message = f'{str(err)} from {self.name}Scraper'
            logger.error(message)
            raise errors.DataProvidingException(message) from err

    def savegroupNews(self, newsItems):
        """Standardise and store every element of ``newsItems``.

        Raises:
            errors.DataProvidingException: On the first item that cannot be
                converted or saved; the remaining items are not processed.
        """
        try:
            for raw_item in newsItems:
                self.saveInMongo(self.JsonItemStandard(raw_item))
        except Exception as err:
            message = f'{str(err)} from {self.name}Scraper'
            logger.error(message)
            raise errors.DataProvidingException(message) from err

    def start_scraping(self):
        """Run one crawl: download the feed, parse it and store the items.

        Any ``Exception`` is logged and not propagated.
        """
        try:
            rss_content = self.loadPage(self.rss_url)
            if not rss_content:
                return
            if isinstance(rss_content, int):
                # A numeric result (e.g. a failure sentinel such as -1) is
                # truthy but is not feed data.
                logger.error(f'No feed content (got {rss_content!r}) from {self.name}Scraper')
                return
            news_items = self.parse_rss(rss_content)
            print(f'number of news fetched : {len(news_items)}')
            now = datetime.now()
            logger.info(f'Crawling of {self.name} Started at ' + now.strftime('%a, %d %b %Y %H:%M:%S Z') + '!!')
            logger.info('+---------------------------------------------+')
            self.savegroupNews(news_items)
        except Exception as err:
            logger.error(f'{str(err)} from {self.name}Scraper')



## cointelegraph

In [5]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime , timedelta

class FinboldScraper(BaseScraper):
    """HTML scraper for the cointelegraph.com front page.

    NOTE(review): the class is called ``FinboldScraper`` although it targets
    cointelegraph; the name is kept so existing references keep working.
    NOTE(review): ``__init__`` does not call ``BaseScraper.__init__`` (its
    signature is not visible here) -- confirm that whatever ``loadPage`` and
    ``saveInMongo`` need is initialised elsewhere.
    """

    def __init__(self):
        self.url = 'https://cointelegraph.com/'
        self.name = 'cointelegraph'

    def getnews(self):
        """Collect the articles linked from the page held in ``self.soup``.

        ``self.soup`` must hold the raw front-page HTML (``start_scraping``
        sets it); it is replaced by the parsed document.  Every linked article
        is downloaded with ``self.loadPage``.  Cards without a link and
        articles that cannot be downloaded are skipped.

        Returns:
            list of dicts, each with ``link``, ``category`` and ``imgs`` plus,
            when present on the page, ``title``, ``summery``, ``pubDate``,
            ``description``, ``thImage`` and ``creator``.
        """
        from urllib.parse import urljoin

        newsItem = []
        self.soup = BeautifulSoup(self.soup, 'html.parser')
        # self.ProviderUrl is only available if a base class set it; this
        # class itself stores the site address in self.url.
        base_url = getattr(self, 'ProviderUrl', None) or self.url

        cards = self.soup.find_all("article", class_="post-card__article rounded-lg")
        print(f'number  of articles found : {len(cards)}')
        for card in cards:
            anchor = card.find("a")
            href = anchor.get("href") if anchor is not None else None
            if not href:
                continue
            # urljoin keeps absolute links and resolves relative ones without
            # producing a double slash.
            article_url = urljoin(base_url, href)

            page_html = self.loadPage(article_url)
            if not page_html or isinstance(page_html, int):
                # Nothing usable came back (e.g. a failure sentinel such as -1).
                continue
            article = BeautifulSoup(page_html, "html.parser")

            news = {}

            title_tag = article.title
            title = title_tag.string if title_tag is not None else None
            if title:
                news['title'] = str(title).strip()

            summery = article.find("meta", attrs={"name": "description"})
            if summery is not None and summery.has_attr("content"):
                news['summery'] = summery["content"]

            pub_date = article.find('time')
            if pub_date is not None and pub_date.has_attr('datetime'):
                news['pubDate'] = pub_date['datetime']

            description = article.find('div', class_='post-content')
            if description is not None:
                description_paragraphs = description.find_all(['p', 'h2'], recursive=False)
                news['description'] = ''.join(para.text for para in description_paragraphs)

            news['link'] = article_url

            category_ul = article.find('ul', class_='tags-list__list')
            tags_items = category_ul.find_all('li', class_='tags-list__item') if category_ul is not None else []
            news['category'] = [tag.get_text(strip=True) for tag in tags_items]

            img_thum_div = article.find("div", class_="post-cover__image")
            img_thum_tag = img_thum_div.find("img") if img_thum_div is not None else None
            img_thum = img_thum_tag.get("src") if img_thum_tag is not None else None
            if img_thum:
                news['thImage'] = img_thum

            # Body images are only available when the content container exists.
            if description is not None:
                news['imgs'] = [
                    img["src"]
                    for img in description.find_all("img", attrs={"pinger-seen": "true"}, recursive=False)
                    if img.has_attr("src")
                ]
            else:
                news['imgs'] = []

            creator = article.find('div', class_='post-meta__author-name')
            if creator is not None:
                news['creator'] = creator.text.strip()

            newsItem.append(news)

        return newsItem

    def JsonItemStandard(self, newsItem):
        """Convert a dict produced by ``getnews`` into the storage layout.

        Args:
            newsItem: Mapping with the keys written by ``getnews``.

        Returns:
            dict with the keys ``title``, ``articleBody``, ``keywords``,
            ``link``, ``provider``, ``summary``, ``thImage``, ``images``,
            ``category``, ``Negative``, ``Neutral``, ``Positive``, ``author``
            and ``scraped_date``.  No ``pubDate`` key is written.

        Raises:
            errors.DataProvidingException: If the conversion fails.
        """
        try:
            item = {}

            item['title'] = newsItem.get('title', '')
            item['articleBody'] = newsItem.get('description', '')

            # NOTE(review): the pubDate conversion below is disabled, so the
            # stored item has no 'pubDate'; the other scrapers in this file
            # store an integer timestamp under that key -- confirm whether it
            # should be restored.
            # pubDate = newsItem.get('pubDate')
            # if pubDate:
            #     currentDate = datetime.strptime(pubDate, '%Y-%m-%dT%H:%M:%S%z')
            #     item['pubDate'] = int(currentDate.timestamp())
            # else:
            #     item['pubDate'] = int(datetime.now().timestamp())

            category = newsItem.get('category', '')
            item['keywords'] = [keyword.lower() for keyword in category]

            item['link'] = newsItem.get('link', '')
            item['provider'] = 'cointelegraph'
            item['summary'] = newsItem.get('summery', '')
            item['thImage'] = newsItem.get('thImage', ' ')
            item['images'] = newsItem.get('imgs', ' ')
            item['category'] = 'cryptocurrencies'

            # Presumably sentiment scores filled in later -- TODO confirm.
            item['Negative'] = 0
            item['Neutral'] = 0
            item['Positive'] = 0

            creator = newsItem.get('creator')
            # Fall back to the provider name when the page names no author.
            item['author'] = unescape(creator).strip().lower() if creator else item['provider']

            item['scraped_date'] = int(datetime.now().timestamp())
            return item
        except Exception as err:
            message = f'{str(err)} from {self.name}Scraper'
            logger.error(message)
            raise errors.DataProvidingException(message) from err

    def savegroupNews(self, newsItems):
        """Standardise and store every element of ``newsItems``.

        Raises:
            errors.DataProvidingException: On the first item that cannot be
                converted or saved; the remaining items are not processed.
        """
        try:
            for raw_item in newsItems:
                self.saveInMongo(self.JsonItemStandard(raw_item))
        except Exception as err:
            message = f'{str(err)} from {self.name}Scraper'
            logger.error(message)
            raise errors.DataProvidingException(message) from err

    def start_scraping(self):
        """Run one crawl: download the front page, scrape and store articles.

        Any ``Exception`` is logged and not propagated.
        """
        try:
            self.soup = self.loadPage(self.url)
            now = datetime.now()
            logger.info(f'Crawling of {self.name} Started at ' + now.strftime('%a, %d %b %Y %H:%M:%S Z') + '!!')
            logger.info('+---------------------------------------------+')
            if not self.soup:
                return
            if isinstance(self.soup, int):
                # A numeric result (e.g. a failure sentinel such as -1) is
                # truthy but is not page content.
                logger.error(f'No page content (got {self.soup!r}) from {self.name}Scraper')
                return
            newsItems = self.getnews()
            self.savegroupNews(newsItems)
        except Exception as err:
            logger.error(f'{str(err)} from {self.name}Scraper')



## BeinCrypto

In [None]:
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from datetime import datetime , timedelta
import html
from html import unescape

class BeinCryptoScrapper(scraper):
    """RSS scraper for beincrypto.com.

    The feed is downloaded with ``self.loadPage`` and stored with
    ``self.saveInMongo``; neither is defined in this class.
    """

    def __init__(self, mongo_uri, db_name, collection_name, base_url ='https://beincrypto.com/' , rss_url = "https://beincrypto.com/news/feed/"):
        """Forward the storage settings to the base class and set the feed URLs."""
        super().__init__(mongo_uri, db_name, collection_name)
        self.base_url = base_url
        self.name = 'beincrypto'
        self.rss_url = rss_url

    def parse_rss(self, rss_content):
        """Parse the RSS document into a list of news dicts.

        Args:
            rss_content: The feed XML as ``str`` or ``bytes``.

        Returns:
            One dict per ``<item>`` with the keys ``title``, ``link``,
            ``pubDate``, ``timestamp``, ``summery``, ``content``, ``images``,
            ``thImage`` (``None`` when the item has no ``media:thumbnail``),
            ``category``, ``keywords`` and ``author``.

        Raises:
            ValueError: If an item's ``pubDate`` does not match the expected
                ``"%a, %d %b %Y %H:%M:%S %z"`` layout.
        """
        content_tag = '{http://purl.org/rss/1.0/modules/content/}encoded'
        creator_tag = '{http://purl.org/dc/elements/1.1/}creator'
        namespaces = {'media': 'http://search.yahoo.com/mrss/'}
        news_items = []

        root = ET.fromstring(rss_content)

        for item in root.findall(".//item"):
            news = {}
            # findtext() yields '' instead of failing on a missing/empty element.
            news['title'] = item.findtext('title', default='')
            news['link'] = item.findtext('link', default='')
            news['pubDate'] = item.findtext('pubDate', default='')

            pub_date = datetime.strptime(news['pubDate'], "%a, %d %b %Y %H:%M:%S %z")
            news['timestamp'] = int(pub_date.timestamp())

            soup_d = BeautifulSoup(item.findtext('description', default=''), 'html.parser')
            soup_c = BeautifulSoup(item.findtext(content_tag, default=''), 'html.parser')

            description_text = soup_d.get_text()
            content_text = '\n\n'.join(p.get_text() for p in soup_c.find_all(['p', 'h2']))
            news['summery'] = description_text.strip()
            news['content'] = content_text.strip()

            images = [img['src'] for img in soup_c.find_all('img') if 'src' in img.attrs]
            news['images'] = images

            thumbnail = item.find('media:thumbnail', namespaces)
            news['thImage'] = thumbnail.get('url') if thumbnail is not None else None

            categories = [category.text for category in item.findall('category')]
            news['category'] = categories
            news['keywords'] = categories

            author_html = item.findtext(creator_tag, default='')
            news['author'] = BeautifulSoup(author_html, 'html.parser').get_text().strip()

            news_items.append(news)

        return news_items

    def JsonItemStandard(self, newsItem):
        """Convert a dict produced by ``parse_rss`` into the storage layout.

        Args:
            newsItem: Mapping with the keys written by ``parse_rss``.

        Returns:
            dict with the keys ``title``, ``articleBody``, ``pubDate``,
            ``keywords``, ``link``, ``provider``, ``summary``, ``thImage``,
            ``images``, ``category``, ``Negative``, ``Neutral``, ``Positive``,
            ``author`` and ``scraped_date``.

        Raises:
            errors.DataProvidingException: If the conversion fails.
        """
        try:
            category = newsItem.get('category', '')
            item = {
                'title': unescape(newsItem.get('title', '')),
                'articleBody': unescape(newsItem.get('content', '')),
                'pubDate': newsItem.get('timestamp'),
                # Empty <category/> elements yield None and are skipped.
                'keywords': [keyword.lower() for keyword in category if keyword],
                'link': newsItem.get('link', ''),
                'provider': 'beincrypto',
                'summary': newsItem.get('summery', ''),
                'thImage': newsItem.get('thImage', ' '),
                'images': newsItem.get('images', ' '),
                'category': 'cryptocurrency',
                # Presumably sentiment scores filled in later -- TODO confirm.
                'Negative': 0,
                'Neutral': 0,
                'Positive': 0,
            }

            creator = newsItem.get('author')
            # Fall back to the provider name when the feed names no author.
            item['author'] = unescape(creator).strip().lower() if creator else item['provider']
            item['scraped_date'] = int(datetime.now().timestamp())
            return item
        except Exception as err:
            message = f'{str(err)} from {self.name}Scraper'
            logger.error(message)
            raise errors.DataProvidingException(message) from err

    def savegroupNews(self, newsItems):
        """Standardise and store every element of ``newsItems``.

        Raises:
            errors.DataProvidingException: On the first item that cannot be
                converted or saved; the remaining items are not processed.
        """
        try:
            for raw_item in newsItems:
                self.saveInMongo(self.JsonItemStandard(raw_item))
        except Exception as err:
            message = f'{str(err)} from {self.name}Scraper'
            logger.error(message)
            raise errors.DataProvidingException(message) from err

    def start_scraping(self):
        """Run one crawl: download the feed, parse it and store the items.

        Any ``Exception`` is logged and not propagated.
        """
        try:
            rss_content = self.loadPage(self.rss_url)
            if not rss_content:
                return
            if isinstance(rss_content, int):
                # A numeric result (e.g. a failure sentinel such as -1) is
                # truthy but is not feed data.
                logger.error(f'No feed content (got {rss_content!r}) from {self.name}Scraper')
                return
            news_items = self.parse_rss(rss_content)
            print(f'number of news fetched : {len(news_items)}')
            now = datetime.now()
            logger.info(f'Crawling of {self.name} Started at ' + now.strftime('%a, %d %b %Y %H:%M:%S Z') + '!!')
            logger.info('+---------------------------------------------+')
            self.savegroupNews(news_items)
        except Exception as err:
            logger.error(f'{str(err)} from {self.name}Scraper')

