# Reuters Data fetch and save

This notebook downloads all the data from the reuters about a specific topic and saves it to the database

In [31]:
import html5lib
from io import StringIO, BytesIO
import lxml.html
from lxml import html
import xml.etree.ElementTree as ET

from ipywidgets import IntProgress
from IPython.display import clear_output, display
from bs4 import BeautifulSoup


import requests
import math
import ray


In [43]:
MAX_PAGE_SIZE = 50
REUTERS_QUERY_URL = 'https://www.reuters.com/pf/api/v3/content/fetch/articles-by-search-v2'
PAGES_REQUESTS_SIZE = 5    # the amount of request ran in parallel to get pages faster, 10 pages loaded at the same time

In [57]:
from time import sleep

'''
This functions queries reuters and gets the first few articles
This query contains the 
'''

num_cpus = 4
    
ray.init(ignore_reinit_error=True, num_cpus=num_cpus)

def get_query_param(keyword, offset):
    print(offset)
    
    return {
        "query": '{"keyword":"%s","offset":%d,"orderby":"display_date:desc","size":%d,"website":"reuters"}' % (keyword, offset, MAX_PAGE_SIZE),
        "d": 179,
        "_website": "reuters"
    }

def append_articles(articles_list, query_list):
    requests_keys = ["id","canonical_url","title","published_time"]

    for query_articles in query_list:
        articles_list.append({key: query_articles[key] for key in requests_keys})

    return articles_list

# resources = {"requests_resource": 1}, num_cpus=num_cpus
@ray.remote
def try_request(url, headers, params):
        
    response_page = requests.request("GET", url, headers=headers, params=params, timeout=30)
    try:
        json_response = response_page.json()
                
        if response_page.status_code > 300:
            print("Status code error: " + str(response_page.status_code))
            return False, params

        return True, json_response
    except requests.exceptions.JSONDecodeError as e:
        print(f"Bad Request: GET {url} \n Status Code: {response_page.status_code} | Error : {e}")
        return False, params
    except requests.exceptions.Timeout:
        print("Timed out")
        return False, params
    except requests.exceptions.RequestException as e:
        print(f"Bad Request: GET {url} \n")
        return False, params

def query_reuters_news_meta(keyword):

    # full query list
    query_list = []

    # build header. has just the cookie
    headers = {
        'Cookie': 'datadome=5DoGfdp_v7Y64hF3yeg8zmFRJvUZ_kY9APCBHhhY8Q5HnPWurio8XinFbT~guJr5RlIj0N41S2aQQNBEiMf05rg2Mb3BBC9zMM7Akw1aS8O9285Y~CC1YlyPQoYKfgLi; reuters-geo={"country":"AT", "region":"-"}'
    }

    # build the query params. "query" property must be a string of JSON, not an actual dict. must contain keyword and max page size
    params = {
        "query": '{"keyword":"%s","offset":0,"orderby":"display_date:desc","size":%d,"website":"reuters"}' % (keyword,MAX_PAGE_SIZE),
        "d": 179,
        "_website": "reuters"
    }
    
    # make the first request
    response_first = requests.request("GET", REUTERS_QUERY_URL, headers=headers, params=params).json()
    
    # query_list.append({key: article[key] for key in requests_keys})

    # print("Request done")
    # check if the request was valid by the given status code, else raise exception
    if response_first['statusCode'] == 400:
        raise Exception(f"Bad Request: Query size of {MAX_PAGE_SIZE} is bigger than 100 with is not valid." )

    # get pegination data and load all pages
    query_result_total_size = response_first["result"]["pagination"]["total_size"] 
    print("Query size is ", query_result_total_size)
    
    total_page_size = math.ceil(query_result_total_size / MAX_PAGE_SIZE)
    
    print("total_page_size is ", total_page_size)

    pages_range = math.ceil(total_page_size / PAGES_REQUESTS_SIZE)

    print("pages_range is ", pages_range)
    
    f = IntProgress(min=0, max=pages_range) # instantiate the bar

    # add first response to query list
    # query_list = append_articles(query_list, response_first["result"]["articles"])

    display(f) # display the bar
    f.value += 1

    #start at two and get all the messages available
    # we will load 10 pages in parallel and  until we have all the pages.
    last_offset = 0
    
    for i in range(pages_range):
        print("\rLoading page %d / %d" % (i+1, pages_range), end="")

        tasks = []
        for _ in range(PAGES_REQUESTS_SIZE):
            offset = MAX_PAGE_SIZE + last_offset
            tasks.append(
                try_request.remote(
                    REUTERS_QUERY_URL, 
                    params = get_query_param(keyword=keyword, offset=offset), 
                    headers = headers)
            )
            last_offset = offset
        
        results_list = ray.get(tasks)
        
        for (request_ok, json_parsed) in results_list:
            if request_ok:
                try:
                    print(len(json_parsed["result"]["articles"]))
                    query_list = append_articles(query_list, json_parsed["result"]["articles"])
                except Exception as e:
                    print(f"Error getting articles! {str(json_parsed)}")
                    pass
            else:
                print(f"Error at task {last_offset}")

        f.value += 1

    return query_list

meta_list = query_reuters_news_meta("Tesla")
print("\n Result Size: ", len(meta_list))


2024-03-12 20:50:07,459	INFO worker.py:1558 -- Calling ray.init() again after it has already been called.


Query size is  5117
total_page_size is  103
pages_range is  21


IntProgress(value=0, max=21)

Loading page 1 / 2150
100
150
200
250
50
50
50
50
50
Loading page 2 / 21300
350
400
450
500
50
50
50
50
50
Loading page 3 / 21550
600
650
700
750
50
50
50
50
50
Loading page 4 / 21800
850
900
950
1000
49
50
49
50
50
Loading page 5 / 211050
1100
1150
1200
1250
50
50
50
50
50
Loading page 6 / 211300
1350
1400
1450
1500
50
50
50
50
50
Loading page 7 / 211550
1600
1650
1700
1750
50
50
50
50
50
Loading page 8 / 211800
1850
1900
1950
2000
50
50
50
50
50
Loading page 9 / 212050
2100
2150
2200
2250
50
49
50
50
50
Loading page 10 / 212300
2350
2400
2450
2500
50
50
50
50
50
Loading page 11 / 212550
2600
2650
2700
2750
50
50
50
50
50
Loading page 12 / 212800
2850
2900
2950
3000
50
50
50
50
50
Loading page 13 / 213050
3100
3150
3200
3250
50
50
50
50
49
Loading page 14 / 213300
3350
3400
3450
3500
50
50
50
50
50
Loading page 15 / 213550
3600
3650
3700
3750
50
50
49
50
50
Loading page 16 / 213800
3850
3900
3950
4000
50
48
50
49
50
Loading page 17 / 214050
4100
4150
4200
4250
50
48
48
50
49
Loading p

In [34]:
import xml.etree.ElementTree

def load_single_article_text(canonical_url):
    # for meta in meta_list:
    url = f"https://www.reuters.com{canonical_url}"
    
    payload = {}
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-GB,en;q=0.9,en-US;q=0.8,de;q=0.7',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
        'Cookie': 'datadome=5DoGfdp_v7Y64hF3yeg8zmFRJvUZ_kY9APCBHhhY8Q5HnPWurio8XinFbT~guJr5RlIj0N41S2aQQNBEiMf05rg2Mb3BBC9zMM7Akw1aS8O9285Y~CC1YlyPQoYKfgLi; reuters-geo={"country":"-", "region":"-"}'
    }
    
    response = requests.request("GET", url, headers=headers, data=payload)
    
    # get content in article
    index_article_start = response.text.index("<article")
    index_article_end = response.text.index("</article>")
    article_text_html =  response.text[index_article_start:index_article_end+10]
    
    bs = BeautifulSoup(article_text_html, "xml")
    
    full_text = ""
    
    for EachPart in bs.select('div[class*="article-body__content__"]'):
        for paragraph in EachPart.parent.select('div[data-testid*="paragraph-"]'):
            full_text += " " + paragraph.text
        
    full_text = full_text.replace("\n","")
    
    print(full_text)
    
# load_single_article_text("/business/aerospace-defense/us-preparing-new-weapons-package-ukraine-officials-2024-03-12/")

def load_from_meta_list_into_db(articles):
    for article in articles:
        load_single_article_text(article["canonical_url"])

load_from_meta_list_into_db(meta_list)


NameError: name 'meta_list' is not defined