# Reuters Data fetch and save

This notebook downloads all the Reuters data about a specific topic and saves it to the database.

In [1]:
import asyncio
import json
import math
import os.path
from time import sleep

import aiohttp
import ray
import requests
from bs4 import BeautifulSoup
from dateutil import parser
from IPython.display import display
from ipywidgets import IntProgress
from pyppeteer import launch

In [3]:
import duckdb

class NewsMetaRepository:
    """Repository for news-article metadata kept in an in-memory DuckDB database.

    Rows live in the ``news_meta`` table (``id``, ``title``, ``url``,
    ``timestamp``, ``term`` -- all VARCHAR, with ``title`` and ``timestamp``
    declared UNIQUE). The data is persisted with DuckDB's ``EXPORT DATABASE``
    and reloaded with ``IMPORT DATABASE``.
    """

    # Shared WHERE fragment for the date filters; the two ``?`` placeholders
    # are the inclusive lower and upper bound in ``YYYY-MM-DD`` form.
    _DATE_RANGE_SQL = (
        "strptime(timestamp, '%Y-%m-%dT%H:%M:%S%z') "
        "BETWEEN strptime(?, '%Y-%m-%d') AND strptime(?, '%Y-%m-%d')"
    )

    def __init__(self, csv_file=None):
        """Open the in-memory database and load a previous export if one exists.

        Args:
            csv_file: Despite its name, the *directory* of a previous export.
                It is imported when it contains a ``news_meta.csv`` file.
                With ``None``, or without that file, an empty table is created.
        """
        self.connection = duckdb.connect(database=':memory:', read_only=False)

        has_export = csv_file is not None and os.path.isfile(f'{csv_file}/news_meta.csv')
        if not has_export:
            self._create_table()
            return

        try:
            self.connection.execute(f"IMPORT DATABASE '{self._sql_literal(csv_file)}';")
        except Exception as e:
            # Best effort: fall back to an empty table, but say why instead of
            # hiding the failure. NOTE(review): the export below overwrites the
            # directory that could not be imported.
            print(f"Could not import database from '{csv_file}', starting empty: {e}")
            self._create_table()
            self.export(csv_file)

    @staticmethod
    def _sql_literal(value):
        """Return ``value`` as text that is safe inside a single-quoted SQL literal."""
        return str(value).replace("'", "''")

    @staticmethod
    def _as_params(value):
        """Wrap a single value into the parameter list passed to ``execute``.

        Lists and tuples are passed through, so callers that already supply a
        parameter sequence keep working.
        """
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    def _create_table(self):
        """Create the empty ``news_meta`` table and the sequence that feeds ``id``."""
        self.connection.execute("CREATE TABLE news_meta (id VARCHAR, title VARCHAR UNIQUE, url VARCHAR, timestamp VARCHAR UNIQUE,term VARCHAR)")
        self.connection.execute("CREATE SEQUENCE id_sequence START 1 INCREMENT BY 1;")

    def insert(self, news_meta):
        """Insert one row; rows violating a UNIQUE constraint are skipped.

        Args:
            news_meta: Mapping with the keys ``title``, ``url``, ``timestamp``
                and ``term``. ``title`` must already have its single quotes
                doubled (``insert_article_list_to_db`` does this), so it is
                still placed into the statement text as before. The other
                three values are bound as parameters and need no escaping.
        """
        self.connection.execute(
            "INSERT INTO news_meta "
            f"VALUES (nextval('id_sequence'), '{news_meta['title']}', ?, ?, ?) "
            "ON CONFLICT DO NOTHING;",
            # str() keeps the old behaviour of storing every value as text.
            [str(news_meta['url']), str(news_meta['timestamp']), str(news_meta['term'])],
        )

    def select_all(self):
        """Return every row as a DataFrame."""
        return self.connection.execute("SELECT * FROM news_meta").fetchdf()

    def select_by_id(self, id):
        """Return the rows whose ``id`` equals ``id`` as a DataFrame."""
        return self.connection.execute(
            "SELECT * FROM news_meta WHERE id = ?", self._as_params(id)
        ).fetchdf()

    def select_by_title(self, title):
        """Return the rows whose ``title`` equals ``title`` as a DataFrame."""
        return self.connection.execute(
            "SELECT * FROM news_meta WHERE title = ?", self._as_params(title)
        ).fetchdf()

    def select_by_url(self, url):
        """Return the rows whose ``url`` equals ``url`` as a DataFrame."""
        return self.connection.execute(
            "SELECT * FROM news_meta WHERE url = ?", self._as_params(url)
        ).fetchdf()

    def select_by_term(self, source):
        """Return the rows stored for the search term ``source`` as a DataFrame."""
        return self.connection.execute(
            "SELECT * FROM news_meta WHERE term = ?", [str(source)]
        ).fetchdf()

    def select_by_date(self, date_from, date_to):
        """Return the rows whose timestamp lies between two ``YYYY-MM-DD`` dates."""
        return self.connection.execute(
            f"SELECT * FROM news_meta WHERE {self._DATE_RANGE_SQL}",
            [str(date_from), str(date_to)],
        ).fetchdf()

    def select_by_date_and_term(self, date_from, date_to, term):
        """Return the rows for ``term`` whose timestamp lies between two ``YYYY-MM-DD`` dates."""
        return self.connection.execute(
            f"SELECT * FROM news_meta WHERE term = ? AND {self._DATE_RANGE_SQL}",
            [str(term), str(date_from), str(date_to)],
        ).fetchdf()

    def delete_all(self):
        """Remove every row from ``news_meta``."""
        self.connection.execute("DELETE FROM news_meta")

    def export(self, csv_file):
        """Export the database to the directory ``csv_file``."""
        self.connection.execute(f"EXPORT DATABASE '{self._sql_literal(csv_file)}';")

    def close(self):
        """Close the DuckDB connection."""
        self.connection.close()

In [4]:
# Number of articles requested per search page; also the step between two page offsets.
MAX_PAGE_SIZE = 50
# Reuters search endpoint queried for the article lists (the response is parsed as JSON).
REUTERS_QUERY_URL = 'https://www.reuters.com/pf/api/v3/content/fetch/articles-by-search-v2'
PAGES_REQUESTS_SIZE = 5    # number of page requests run in parallel per batch, i.e. 5 pages are loaded at the same time

In [5]:

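'''
The functions in this cell query Reuters and fetch the matching articles one page at a time.
Each query contains the keyword, the result offset, the sort order (newest first) and the page size.
'''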

# Number of CPUs handed to the local Ray instance started below.
num_cpus = 4
    
# ignore_reinit_error=True: presumably so that re-running this cell does not
# fail when Ray is already initialised -- confirm against the Ray docs.
ray.init(ignore_reinit_error=True, num_cpus=num_cpus)

# Shared metadata repository used by the functions below; a previous export
# in "meta/reuters" is loaded when it exists.
newsMetaRepo = NewsMetaRepository("meta/reuters")

def get_query_param(keyword, offset, size=None):
    """Build the query-string parameters for one page of a Reuters search.

    The ``query`` value is serialised with ``json.dumps`` so that keywords
    containing quotes or backslashes still produce valid JSON.

    Args:
        keyword: Search term.
        offset: Index of the first result to return.
        size: Number of results per page; defaults to ``MAX_PAGE_SIZE``.

    Returns:
        dict with the keys ``query`` (compact JSON string), ``d`` and ``_website``.
    """
    if size is None:
        size = MAX_PAGE_SIZE

    query = {
        "keyword": str(keyword),
        "offset": int(offset),
        "orderby": "display_date:desc",
        "size": int(size),
        "website": "reuters",
    }
    return {
        # Compact separators and ensure_ascii=False keep the text identical to
        # the previous hand-written format for ordinary keywords.
        "query": json.dumps(query, separators=(",", ":"), ensure_ascii=False),
        "d": 201,
        "_website": "reuters"
    }

def insert_article_list_to_db(article_list, term):
    """Store the metadata of every article in ``article_list`` under ``term``.

    Each article must provide ``id``, ``canonical_url``, ``title`` and
    ``published_time``; a missing key raises ``KeyError``. Single quotes in
    the title are doubled before the row is handed to ``newsMetaRepo.insert``.
    """
    wanted_fields = ("id", "canonical_url", "title", "published_time")

    for article in article_list:
        # Read all wanted fields up front, in order, so the first missing one
        # raises before anything is inserted for this article.
        picked = dict((field, article[field]) for field in wanted_fields)
        escaped_title = picked['title'].replace("'", "''")

        newsMetaRepo.insert({
            'title': escaped_title,
            'term': term,
            'timestamp': picked['published_time'],
            'url': picked['canonical_url'],
        })

# resources = {"requests_resource": 1}, num_cpus=num_cpus
@ray.remote
def try_request(url, headers, params):
    """GET ``url`` as a Ray task and return the parsed JSON body.

    Args:
        url: Address to request.
        headers: HTTP headers to send.
        params: Query-string parameters; returned unchanged on failure.

    Returns:
        ``(True, parsed_json)`` on success, ``(False, params)`` when the
        request fails, the status code is 300 or above, or the body is not
        valid JSON.
    """
    # The request itself must be inside the try block, otherwise timeouts and
    # connection errors escape the handlers and fail the whole Ray task.
    try:
        response_page = requests.request("GET", url, headers=headers, params=params, timeout=30)
    except requests.exceptions.Timeout:
        print("Timed out")
        return False, params
    except requests.exceptions.RequestException:
        print(f"Bad Request: GET {url} \n")
        return False, params

    # Check the status before parsing: error responses often carry no JSON.
    if response_page.status_code >= 300:
        print("Status code error: " + str(response_page.status_code))
        return False, params

    try:
        return True, response_page.json()
    except ValueError as e:
        # requests' JSONDecodeError is a ValueError subclass.
        print(f"Bad Request: GET {url} \n Status Code: {response_page.status_code} | Error : {e}")
        return False, params

def get_news_articles_list(keyword):
    """Fetch the Reuters search results for ``keyword`` and store their metadata.

    Pages are requested in batches of ``PAGES_REQUESTS_SIZE`` parallel Ray
    tasks, starting at offset 0. After every batch the repository is exported
    to ``meta/reuters``. Fetching stops when

    * all batches announced by the response's ``total_size`` were requested,
    * a response has no ``result`` or no articles,
    * a response cannot be processed, or
    * every request of a batch failed.

    Args:
        keyword: Search term; also stored as the ``term`` of each article.
    """
    headers = {
        'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126"',
        'DNT': '1',
        'sec-ch-ua-mobile': '?0',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
        'sec-ch-ua-arch': '"arm"',
        'sec-ch-device-memory': '8',
        'Referer': 'https://www.reuters.com/site-search/?query=Tesla&offset=0',
        'sec-ch-ua-full-version-list': '"Not/A)Brand";v="8.0.0.0", "Chromium";v="126.0.6478.57"',
        'sec-ch-ua-model': '""',
        'sec-ch-ua-platform': '"macOS"',
        # NOTE(review): browser session cookie committed in source. Set the
        # REUTERS_COOKIE environment variable to override it with a fresh one.
        'Cookie': os.environ.get(
            'REUTERS_COOKIE',
            'datadome=5DoGfdp_v7Y64hF3yeg8zmFRJvUZ_kY9APCBHhhY8Q5HnPWurio8XinFbT~guJr5RlIj0N41S2aQQNBEiMf05rg2Mb3BBC9zMM7Akw1aS8O9285Y~CC1YlyPQoYKfgLi; reuters-geo={"country":"AT", "region":"-"}'
        )
    }

    pages_range = None   # number of batches needed; known after the first usable response
    next_offset = 0      # offset of the first page of the next batch (page one is offset 0)
    progress_bar = IntProgress(min=0, max=10)  # max is a placeholder until pages_range is known
    display(progress_bar)

    has_results = True
    page_idx = 0

    while has_results:
        batch_offsets = [next_offset + MAX_PAGE_SIZE * i for i in range(PAGES_REQUESTS_SIZE)]
        next_offset += MAX_PAGE_SIZE * PAGES_REQUESTS_SIZE

        tasks = [
            try_request.remote(
                REUTERS_QUERY_URL,
                params=get_query_param(keyword=keyword, offset=offset),
                headers=headers)
            for offset in batch_offsets
        ]
        results_list = ray.get(tasks)

        any_request_ok = False
        for offset, (request_ok, json_parsed) in zip(batch_offsets, results_list):
            if request_ok:
                any_request_ok = True
                try:
                    result = json_parsed["result"]
                    if result is None:
                        has_results = False
                    else:
                        if pages_range is None:
                            pagination = result.get("pagination") or {}
                            total_size = pagination.get("total_size")
                            if total_size is not None:
                                total_page_size = math.ceil(total_size / MAX_PAGE_SIZE)
                                pages_range = math.ceil(total_page_size / PAGES_REQUESTS_SIZE)
                                progress_bar.max = pages_range

                        articles = result["articles"]
                        if articles:
                            insert_article_list_to_db(articles, keyword)
                        else:
                            # An empty page means the end of the results was passed.
                            has_results = False
                except Exception as e:
                    has_results = False
                    print(f"Error getting articles! {str(json_parsed)} | Error : {e!r}")
            else:
                print(f"Error at task {offset}")
            sleep(0.5)  # pacing kept from the original loop

        if not any_request_ok:
            # Without this guard a persistent failure (e.g. blocked requests)
            # would keep requesting ever higher offsets forever.
            print(f"\nAll requests of the batch starting at offset {batch_offsets[0] if batch_offsets else next_offset} failed, stopping.")
            has_results = False

        print("\rLoading page %d / %d" % (page_idx + 1, pages_range if pages_range is not None else 0), end="")

        page_idx += 1
        progress_bar.value = page_idx
        newsMetaRepo.export("meta/reuters")

        if pages_range is not None and page_idx >= pages_range:
            has_results = False

        if has_results:
            sleep(1)

# get_news_articles_list("Tesla")

2024-06-21 17:28:41,074	INFO worker.py:1715 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


In [6]:
def _extract_article_text(page_html):
    """Return the paragraph text of the first ``<article>`` element, or ``''`` if there is none."""
    start = page_html.find("<article")
    if start == -1:
        return ""
    end = page_html.find("</article>", start)
    if end == -1:
        return ""
    article_html = page_html[start:end + len("</article>")]

    # The page is HTML, so parse it with an HTML parser rather than "xml".
    soup = BeautifulSoup(article_html, "html.parser")

    full_text = ""
    for body_part in soup.select('div[class*="article-body__content__"]'):
        for paragraph in body_part.parent.select('div[data-testid*="paragraph-"]'):
            full_text += " " + paragraph.text

    return full_text.replace("\n", "")


async def download_to_database(news_meta_path, news_meta_id, filename):
    """Download one Reuters article and save its text to ``articles/reuters/{filename}.txt``.

    Despite the name, the text is written to a file, not to the database.
    Nothing is downloaded when the target file already exists. Failures while
    reading, parsing or writing are printed and not raised.

    A browser cookie can be supplied through the ``REUTERS_COOKIE`` environment
    variable; no cookie is sent when it is unset.

    Args:
        news_meta_path: URL path of the article, appended to ``https://www.reuters.com``.
        news_meta_id: Identifier of the article, used only in log messages.
        filename: Target file name without the ``.txt`` extension.
    """
    article_file = f'articles/reuters/{filename}.txt'

    # Check the very path that is written below, so finished downloads are skipped.
    if os.path.isfile(article_file):
        return

    url = f"https://www.reuters.com{news_meta_path}"

    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'en-GB,en;q=0.9',
        'cache-control': 'max-age=0',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
    }
    # Session cookies do not belong in source code; read them from the environment.
    cookie = os.environ.get('REUTERS_COOKIE')
    if cookie:
        headers['Cookie'] = cookie

    async with aiohttp.ClientSession(headers=headers) as session:
        print(f'Downloading {news_meta_id} , {url}')

        async with session.get(url) as response:
            if response.status != 200:
                print(f"Error downloading article {news_meta_id} | {response.status}")
                return

            try:
                full_text = _extract_article_text(await response.text())

                if not full_text.strip():
                    # Do not leave an empty file behind: it would make later
                    # runs skip this article.
                    print(f"Error downloading article {news_meta_id} | {response.status} | no article text found")
                    return

                os.makedirs(os.path.dirname(article_file), exist_ok=True)
                with open(article_file, 'w', encoding='utf-8') as f:
                    f.write(full_text)
            except Exception as e:
                print(f"Error downloading article {news_meta_id} | {response.status} | {e!r}")

async def load_article(search_term):
    """Download the article text for every stored search result of ``search_term``.

    The articles are fetched one after another with short pauses in between,
    plus a longer pause after every 50th article. A failing article is
    reported and skipped; it is not retried.

    Args:
        search_term: Term whose stored metadata rows should be downloaded.
    """
    news_meta_list = newsMetaRepo.select_by_term(search_term)

    def get_file_name(title, timestamp):
        """Build the file name from the title and the publication time as epoch seconds."""
        safe_title = title.replace(' ', '-').replace("/", "-")
        return f"{safe_title}-{parser.parse(timestamp).timestamp()}"

    rows = zip(
        news_meta_list['url'],
        news_meta_list['id'],
        news_meta_list['title'],
        news_meta_list['timestamp'],
    )

    for position, (url, news_id, title, timestamp) in enumerate(rows, start=1):
        try:
            await download_to_database(url, news_id, get_file_name(title, timestamp))
            # asyncio.sleep instead of time.sleep: a blocking sleep inside a
            # coroutine stalls the whole event loop.
            await asyncio.sleep(0.500)
        except Exception as e:
            print(f"ERROR DOWNLOADING article {news_id}: {e!r} ... SKIPPING")

        if position % 50 == 0:
            await asyncio.sleep(3)

        await asyncio.sleep(0.300)

# Download the article text for every stored "Tesla" search result
# (top-level await is used directly in the cell).
await load_article("Tesla")

Downloading 1 , https://www.reuters.com/business/autos-transportation/china-based-ev-makers-hit-with-european-union-tariffs-2024-06-12/
Error downloading article 1 | 401
Downloading 2 , https://www.reuters.com/sustainability/decarbonizing-industries/ev-sales-slip-can-auto-industry-navigate-bumps-road-net-zero-2024-06-12/
Error downloading article 2 | 401
Downloading 3 , https://www.reuters.com/legal/tesla-shareholder-sues-musk-return-billions-alleged-unlawful-profits-2024-06-11/
Error downloading article 3 | 401
Downloading 4 , https://www.reuters.com/business/autos-transportation/chinas-nio-says-commitment-europe-ev-market-unwavering-despite-increased-tariffs-2024-06-12/
Error downloading article 4 | 401
Downloading 5 , https://www.reuters.com/sustainability/climate-energy/orsted-install-tesla-battery-uk-offshore-wind-farm-2024-06-12/
Error downloading article 5 | 401
Downloading 6 , https://www.reuters.com/business/autos-transportation/swedish-fund-manager-vote-against-musks-56-bln-t

CancelledError: 

In [17]:
import os
import requests
from bs4 import BeautifulSoup
import random

# Connection parameters provided by Brightdata.
# The credentials are read from the environment so they never end up in the
# notebook: set BRIGHTDATA_USERNAME and BRIGHTDATA_PASSWORD before running.
# NOTE(review): the zone password that used to be hard-coded in this cell
# should be treated as leaked and rotated.
host = os.environ.get('BRIGHTDATA_HOST', 'brd.superproxy.io')
port = int(os.environ.get('BRIGHTDATA_PORT', '22225'))
username = os.environ['BRIGHTDATA_USERNAME']
password = os.environ['BRIGHTDATA_PASSWORD']
session_id = random.random()

# Format the proxy URL; the session id is appended to the user name.
proxy_url = ('http://{}-session-{}:{}@{}:{}'.format(username, session_id,
                                                    password, host, port))

# Use the same proxy for plain and TLS traffic.
proxies = {'http': proxy_url, 'https': proxy_url}

# Send a GET request to the website through the proxy. The timeout keeps the
# cell from hanging when the proxy does not answer.
url = "https://brightdata.com/"
response = requests.get(url, proxies=proxies, timeout=30)
response.raise_for_status()

# Use BeautifulSoup to parse the HTML content of the website
soup = BeautifulSoup(response.content, "html.parser")

# Find all the links on the website
links = soup.find_all("a")

# Print all the links
for link in links:
    print(link.get("href"))

ProxyError: HTTPSConnectionPool(host='brightdata.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 403 Forbidden')))