In [1]:
import os
import xml.etree.ElementTree as ET
from typing import List, Tuple

import deepl
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
### https://selenium-python.readthedocs.io/waits.html

### deepl setting
# The DeepL API key is a secret, so it is read from the environment instead of
# being written into the notebook. Export DEEPL_AUTH_KEY before starting the
# kernel; any key that was previously committed here should be revoked.
auth_key = os.environ.get('DEEPL_AUTH_KEY')
if not auth_key:
    raise RuntimeError(
        "DEEPL_AUTH_KEY is not set; export your DeepL API key before running this notebook"
    )
translator = deepl.Translator(auth_key)

    

In [2]:
class NewsArticle:
    """Plain container for one news item: title, link, image link and source."""

    def __init__(self, title:str, article_url:str, img_url:str, source:str):
        """
        Store the four fields that describe one news article.

        Args:
            title (str): headline of the article
            article_url (str): URL used to open the article. HeadLine passes the
                URL reached after Google's redirect, or the Google redirect link
                itself when that lookup fails.
            img_url (str): URL of the image shown with the article
            source (str): text of the RSS ``<source>`` element when built by HeadLine
        """

        self.title          = title
        self.article_url    = article_url
        self.img_url        = img_url
        self.source         = source

    def translate(self, target_lang:str = "EN_US") -> "NewsArticle":
        """
        Return a new NewsArticle whose title is translated; other fields are copied.

        Uses the module-level DeepL ``translator``. This article is not modified.

        Args:
            target_lang (str): language code handed to ``translate_text``.
                NOTE(review): DeepL documents American English as "EN-US";
                confirm the underscore form used as the default is accepted.

        Returns:
            NewsArticle: copy of this article with ``title`` as a translated ``str``.
        """

        translated_title = self.title
        if translated_title:
            # translate_text returns a result object, not a str; keep only its
            # text so that `title` stays a plain string like every other article.
            translated_title = translator.translate_text(
                translated_title, target_lang=target_lang
            ).text

        return NewsArticle(
            title          =   translated_title,
            article_url    =   self.article_url,
            img_url        =   self.img_url,
            source         =   self.source
        )
        

class HeadLine:
    """Articles collected from one Google News RSS feed."""

    # Seconds to wait for the browser to leave the google.com redirect page.
    _REDIRECT_WAIT_SECONDS = 5
    # Seconds before the RSS HTTP request is abandoned instead of hanging forever.
    _RSS_TIMEOUT_SECONDS = 10
    # Image used when the article's own og:image cannot be obtained.
    _PLACEHOLDER_IMG_URL = "https://via.placeholder.com/150"

    def __init__(self, country:str, headline_gnews_rss_url:str):
        """
        Download the RSS feed and build one NewsArticle per ``<item>``.

        Each item's link is opened in a headless Firefox session to follow the
        redirect and read the page's ``og:image`` meta tag. When that lookup
        fails, the article keeps the link from the feed and a placeholder image.

        Args:
            country (str): label for this feed; stored as ``self.country``
            headline_gnews_rss_url (str): URL of the RSS feed to read

        Raises:
            requests.RequestException: the feed could not be downloaded
            xml.etree.ElementTree.ParseError: the response is not valid XML
            ValueError: the XML has no ``<channel>`` element
        """

        self.country = country
        self.articles: List[NewsArticle] = []

        ####################################################################
        ### Using RSS request(XML parsing), save articles information
        ####################################################################

        rss_req = requests.get(headline_gnews_rss_url, timeout=self._RSS_TIMEOUT_SECONDS)
        rss_req.raise_for_status()
        # Parse the raw bytes so the XML parser applies the encoding declared by
        # the document rather than the one requests guessed from HTTP headers.
        xml_root = ET.fromstring(rss_req.content)

        channel = xml_root.find('channel')
        if channel is None:
            raise ValueError("RSS response has no <channel> element")
        self.last_build_date: str = channel.findtext('lastBuildDate', default='')

        ####################################################################
        ### Sharing browser setting for getting news article main image
        ####################################################################

        # The browser is started only after the feed was read successfully, and
        # the finally clause guarantees it is closed even if the loop aborts.
        _options = Options()
        _options.add_argument('--headless')
        _driver = webdriver.Firefox(options=_options)
        try:
            for item in channel.findall('item'):

                article_url     = item.findtext('link', default='')
                img_url         = self._PLACEHOLDER_IMG_URL
                try:
                    article_url, found_img_url = self._getNewsUrls(_driver, article_url)
                    # A meta tag without a content attribute yields None.
                    img_url = found_img_url or self._PLACEHOLDER_IMG_URL
                    print('okayyyyyy')
                except Exception:
                    # Best effort: keep the feed link and the placeholder image.
                    # `Exception` (not a bare except) lets Ctrl-C stop the loop.
                    print("exception occurred")

                self.articles.append(
                    NewsArticle(
                        title           = item.findtext('title', default=''),
                        article_url     = article_url,
                        img_url         = img_url,
                        source          = item.findtext('source', default='')
                    )
                )
        finally:
            _driver.quit()

    @staticmethod
    def _getNewsUrls(driver, redirect_url:str) -> Tuple[str, str]:
        """
        Open a feed link and return where it leads plus that page's image URL.

        Args:
            driver: the Selenium WebDriver shared by all lookups
            redirect_url (str): link taken from the RSS item

        Returns:
            Tuple[str, str]: (URL the browser ended on, ``content`` of the
            page's ``og:image`` meta tag)

        Raises:
            Selenium exceptions when the URL still contains 'google.com' after
            the wait, or when the page has no ``og:image`` meta tag.
        """
        driver.get(url = redirect_url)
        WebDriverWait(driver, HeadLine._REDIRECT_WAIT_SECONDS).until_not(EC.url_contains('google.com'))

        article_url = driver.current_url
        element = driver.find_element(By.XPATH, "//meta[@property='og:image']")

        return article_url, element.get_attribute('content')

    def getNewsArticles(self) -> List[NewsArticle] :
        """
        Returns news articles

        Returns:
            List[NewsArticle]: the list stored in ``self.articles`` (not a copy)
        """

        return self.articles
            
        

In [3]:
# Build the 'kr' headline set from its Google News RSS topic feed.
headline_kr = HeadLine(
    country='kr',
    headline_gnews_rss_url='https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZxYUdjU0FtdHZHZ0pMVWlnQVAB?ceid=KR:ko&oc=3',
)

okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
exception occurred
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
exception occurred
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
exception occurred
exception occurred
okayyyyyy
okayyyyyy
okayyyyyy
exception occurred
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
exception occurred
okayyyyyy
okayyyyyy
okayyyyyy
exception occurred
okayyyyyy
okayyyyyy
exception occurred
okayyyyyy
okayyyyyy
okayyyyyy
exception occurred
okayyyyyy
exception occurred
okayyyyyy
okayyyyyy
okayyyyyy
exception occurred
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
okayyyyyy
exception occurred
okayyyyyy
exception occurred
okayyyyyy
okayyyyyy
okayyyyyy
exception occurred


In [4]:
# Show the URL stored for the article at index 6.
headline_kr.getNewsArticles()[6].article_url

'https://news.google.com/rss/articles/CBMie0FVX3lxTE5fWldyR0NTdFpYeXJhR1dTTVREckJRTFAzc2tOaFRBUW5xUlFVUjk1Y1QtNGNiUkFjNmE3RDZxZmNKVWdOYndIeWNNZWpEbFpfR2xaWHlNY0VMRVpPQWEzcEJtbUEzWmo1cTlnMkgzTVM1RUI5M2JlX3c0WdIBV0FVX3lxTE9QbFB0VGZEZkJmbDlDNnpsaVFWd0lXc1B2VkpaZnh1ZG1STk9ZNzBBYnhwRjJERkx2QUd6M0FWUm1qeFEtaC1ma2tBV0dDMlJzQWcxMTEwQQ?oc=5'

In [5]:
# NOTE(review): the NewsArticle class defined in this notebook sets only title,
# article_url, img_url and source, so `redirect_url` raises AttributeError on a
# fresh kernel. The output below presumably came from an earlier class
# definition still alive in the kernel -- confirm and either restore the
# attribute or drop this cell.
headline_kr.articles[6].redirect_url

'https://news.google.com/rss/articles/CBMie0FVX3lxTE5fWldyR0NTdFpYeXJhR1dTTVREckJRTFAzc2tOaFRBUW5xUlFVUjk1Y1QtNGNiUkFjNmE3RDZxZmNKVWdOYndIeWNNZWpEbFpfR2xaWHlNY0VMRVpPQWEzcEJtbUEzWmo1cTlnMkgzTVM1RUI5M2JlX3c0WdIBV0FVX3lxTE9QbFB0VGZEZkJmbDlDNnpsaVFWd0lXc1B2VkpaZnh1ZG1STk9ZNzBBYnhwRjJERkx2QUd6M0FWUm1qeFEtaC1ma2tBV0dDMlJzQWcxMTEwQQ?oc=5'