# Reuters Data fetch and save

This notebook downloads all the data from the reuters about a specific topic and saves it to the database

In [16]:
from lxml import etree
from io import StringIO, BytesIO

from ipywidgets import IntProgress
from IPython.display import clear_output, display

import requests
import math
import ray


In [17]:
MAX_PAGE_SIZE = 50
REUTERS_QUERY_URL = 'https://www.reuters.com/pf/api/v3/content/fetch/articles-by-search-v2'
PAGES_REQUESTS_SIZE = 10    # the amount of request ran in parallel to get pages faster, 10 pages loaded at the same time

In [22]:
from time import sleep

'''
This functions queries reuters and gets the first few articles
This query contains the 
'''

num_cpus = 4
    
ray.init(ignore_reinit_error=True, num_cpus=num_cpus)

def get_query_param(keyword, offset):
    # print(offset)
    
    return {
        "query": '{"keyword":"%s","offset":%d,"orderby":"display_date:desc","size":%d,"website":"reuters"}' % (keyword, offset, MAX_PAGE_SIZE),
        "d": 179,
        "_website": "reuters"
    }

def append_articles(articles_list, query_list):
    requests_keys = ["id","canonical_url","title","published_time"]

    for query_articles in query_list:
        articles_list.append({key: query_articles[key] for key in requests_keys})

    return articles_list

# resources = {"requests_resource": 1}, num_cpus=num_cpus
@ray.remote
def try_request(url, headers, params):
        
    response_page = requests.request("GET", url, headers=headers, params=params)
    try:
        json_response = response_page.json()
                
        if response_page.status_code > 300:
            print("Status code error: " + str(response_page.status_code))
            return False, None

        return True, json_response
    except requests.exceptions.JSONDecodeError as e:
        print(f"Bad Request: GET {url} \n Status Code: {response_page.status_code} | Error : {e}")
        return False, None

def query_reuters_news_meta(keyword):

    # full query list
    query_list = []

    # build header. has just the cookie
    headers = {
        'Cookie': 'datadome=5DoGfdp_v7Y64hF3yeg8zmFRJvUZ_kY9APCBHhhY8Q5HnPWurio8XinFbT~guJr5RlIj0N41S2aQQNBEiMf05rg2Mb3BBC9zMM7Akw1aS8O9285Y~CC1YlyPQoYKfgLi; reuters-geo={"country":"AT", "region":"-"}'
    }

    # build the query params. "query" property must be a string of JSON, not an actual dict. must contain keyword and max page size
    params = {
        "query": '{"keyword":"%s","offset":0,"orderby":"display_date:desc","size":%d,"website":"reuters"}' % (keyword,MAX_PAGE_SIZE),
        "d": 179,
        "_website": "reuters"
    }
    
    # make the first request
    response_first = requests.request("GET", REUTERS_QUERY_URL, headers=headers, params=params).json()
    
    # query_list.append({key: article[key] for key in requests_keys})

    # print("Request done")
    # check if the request was valid by the given status code, else raise exception
    if response_first['statusCode'] == 400:
        raise Exception(f"Bad Request: Query size of {MAX_PAGE_SIZE} is bigger than 100 with is not valid." )

    # get pegination data and load all pages
    query_result_total_size = response_first["result"]["pagination"]["total_size"] 
    print("Query size is ", query_result_total_size)
    
    total_page_size = math.ceil(query_result_total_size / MAX_PAGE_SIZE)
    
    # print("total_page_size is ", total_page_size)

    pages_range = math.ceil(total_page_size / PAGES_REQUESTS_SIZE) + 1

    # print("pages_range is ", pages_range)
    
    f = IntProgress(min=0, max=pages_range) # instantiate the bar

    # add first reponse to query list
    query_list = append_articles(query_list, response_first["result"]["articles"])

    display(f) # display the bar
    f.value += 1

    #start at two and get all the messages available
    # we will load 10 pages in parallel and  until we have all the pages.
    last_offset = 0
    
    for i in range(1,pages_range+1):
        print("\rLoading page %d / %d" % (i, pages_range), end="")

        tasks = []
        for _ in range(1,PAGES_REQUESTS_SIZE):
            offset = MAX_PAGE_SIZE + last_offset
            tasks.append(
                try_request.remote(
                    REUTERS_QUERY_URL, 
                    params = get_query_param(keyword=keyword, offset=offset), 
                    headers = headers)
            )
            last_offset = offset
        
        results_list = ray.get(tasks)
        
        for (request_ok, json_parsed) in results_list:
            if request_ok:
                try:
                    query_list = append_articles(query_list, json_parsed["result"]["articles"])
                except Exception as e:
                    print(f"Error getting articles! {str(json_parsed)}")
                    pass
            else:
                print(f"Error at task {last_offset}")

        f.value += 1

    return query_list

meta_list = query_reuters_news_meta("Tesla")
print("\n Result Size: ", len(meta_list))


2024-03-08 15:40:33,142	INFO worker.py:1540 -- Connecting to existing Ray cluster at address: 127.0.0.1:6379...
2024-03-08 15:40:33,143	INFO worker.py:1558 -- Calling ray.init() again after it has already been called.


Query size is  5102
total_page_size is  103
pages_range is  12


IntProgress(value=0, max=12)

Loading page 1 / 1250
100
150
200
250
300
350
400
450
Loading page 2 / 12500
550
600
650
700
750
800
850
900
Loading page 3 / 12950
1000
1050
1100
1150
1200
1250
1300
1350
Loading page 4 / 121400
1450
1500
1550
1600
1650
1700
1750
1800
Loading page 5 / 121850
1900
1950
2000
2050
2100
2150
2200
2250
Loading page 6 / 122300
2350
2400
2450
2500
2550
2600
2650
2700
Loading page 7 / 122750
2800
2850
2900
2950
3000
3050
3100
3150
Loading page 8 / 123200
3250
3300
3350
3400
3450
3500
3550
3600
Loading page 9 / 123650
3700
3750
3800
3850
3900
3950
4000
4050
Loading page 10 / 124100
4150
4200
4250
4300
4350
4400
4450
4500
Loading page 11 / 124550
4600
4650
4700
4750
4800
4850
4900
4950
Loading page 12 / 125000
5050
5100
5150
5200
5250
5300
5350
5400
Error getting articles! {'statusCode': 400, 'result': None, '_id': 'c95c5ade6a77bb806de112e589fa2540cfe383543b2c63d08c9c7a1ff3cdc43f'}
Error getting articles! {'statusCode': 400, 'result': None, '_id': 'c95c5ade6a77bb806de112e589fa2540cfe383543b2c63