In [2]:
import json
import os
from time import sleep

import pandas as pd
import requests

# The API key is read from the environment so the credential never lives in
# the notebook or its version history. Export DOWJONES_USER_KEY before
# starting the kernel.
# NOTE: a key that was previously committed in this cell should be treated as
# exposed and rotated.
user_key = os.environ.get("DOWJONES_USER_KEY", "")
if not user_key:
    print("Warning: DOWJONES_USER_KEY is not set; API requests will not be authenticated.")

base_url = "https://api.dowjones.com"

# Headers sent with every request made in this notebook.
headers = {
    "user-key": user_key,
    "Content-Type": "application/json",
    "X-API-VERSION": "3.0"
}

def check_response(response):
    """
    Return the decoded JSON body of a successful API response.

    Args:
        response: Object exposing `status_code`, `text` and `json()`
            (for example a `requests.Response`).

    Returns:
        The parsed JSON payload when the status code is 200 or 201 and the
        body is valid JSON. Otherwise None, after printing the status code
        and the raw body.
    """
    if response.status_code not in (200, 201):
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

    try:
        return response.json()
    except ValueError:
        # A success status with a non-JSON body (empty body, HTML error page
        # from a proxy, ...) makes json() raise a ValueError subclass; report
        # it the same way as any other failure instead of crashing the cell.
        print(f"Error: {response.status_code} response did not contain valid JSON")
        print(response.text)
        return None

In [17]:
def create_explain(query_body, timeout=30):
    """
    Create an explain request to check how many documents match a query.

    Args:
        query_body: JSON-serializable query definition sent as the request body.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to `requests.post`.

    Returns:
        The parsed JSON response, or None when `check_response` reports a
        failure.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    url = f"{base_url}/extractions/documents/_explain"
    # Without an explicit timeout, requests waits indefinitely on an
    # unresponsive server and the notebook cell never returns.
    response = requests.post(
        url,
        headers=headers,
        data=json.dumps(query_body),
        timeout=timeout,
    )
    return check_response(response)

def check_explain_status(job_id, timeout=30):
    """
    Check the status of an explain job.

    Args:
        job_id: Identifier of the explain job, interpolated into the URL path.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to `requests.get`.

    Returns:
        The parsed JSON response, or None when `check_response` reports a
        failure.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    url = f"{base_url}/extractions/documents/{job_id}/_explain"
    # Without an explicit timeout, requests waits indefinitely on an
    # unresponsive server, which would stall any polling loop built on this.
    response = requests.get(url, headers=headers, timeout=timeout)
    return check_response(response)

def wait_for_explain_completion(job_id, max_retries=100, sleep_time=10,
                                failed_states=("JOB_STATE_FAILED", "JOB_STATE_CANCELLED")):
    """
    Poll an explain job until it completes, fails, or the retry budget runs out.

    Args:
        job_id: Identifier of the explain job to poll.
        max_retries: Maximum number of status checks to perform.
        sleep_time: Seconds to pause between consecutive status checks.
        failed_states: States treated as terminal failures, so polling stops
            immediately instead of running until `max_retries`.
            NOTE(review): the default names are assumed from the
            "JOB_STATE_*" naming of the done state; confirm against the API
            documentation.

    Returns:
        The status payload once `current_state` is "JOB_STATE_DONE".
        None when a status check fails, the payload has no `current_state`,
        the job reaches one of `failed_states`, or `max_retries` is exhausted.
    """
    for attempt in range(max_retries):
        status = check_explain_status(job_id)
        if not status:
            return None

        try:
            current_state = status['data']['attributes']['current_state']
        except (KeyError, TypeError):
            # Payload does not have the expected shape; show it and stop
            # rather than failing with an opaque KeyError mid-poll.
            print("Unexpected status payload (no current_state):")
            print(status)
            return None
        print(f"Current state: {current_state}")

        if current_state == "JOB_STATE_DONE":
            return status

        if current_state in failed_states:
            # A terminal failure will never turn into DONE; stop polling now.
            print(f"Job ended without completing (state: {current_state}).")
            return None

        # No point sleeping after the final attempt.
        if attempt < max_retries - 1:
            print(f"Waiting {sleep_time} seconds...")
            sleep(sleep_time)

    print("Max retries reached. Job might still be running.")
    return None

In [25]:
# Base query: English-language documents with a publication_datetime on or
# after 2015-01-01, restricted through `includes` to industry code "i82".
# NOTE(review): the original comment labels i82 as the financial services
# industry code; confirm that against the Dow Jones industry taxonomy.
divest_query = dict(
    query=dict(
        where=(
            "language_code='en' "
            "AND publication_datetime >= '2015-01-01 00:00:00'"
        ),
        includes=dict(industry_codes=["i82"]),
    )
)

# # Add keywords related to divestment to the query
# divest_keywords = ["divestment", "divestiture", "divest", "non-core asset", 
#                   "portfolio simplification", "business simplification"]

# # Create a keyword condition using LIKE for the body field
# keyword_conditions = " OR ".join([f"body LIKE '%{keyword}%'" for keyword in divest_keywords])
# divest_query["query"]["where"] += f" AND ({keyword_conditions})"

# Run the explain job to see how many documents match the query
# Submit the explain request for the query defined above.
explain_result = create_explain(divest_query)

if explain_result:
    print("\nExplain job created:")
    job_id = explain_result['data']['id']
    print(f"Job ID: {job_id}")

    # Block until the job finishes (or polling gives up and returns None).
    final_status = wait_for_explain_completion(job_id)

    # Report the match count only when the finished job actually carries one.
    if final_status:
        if 'counts' in final_status['data']['attributes']:
            count = final_status['data']['attributes']['counts']
            print(f"\nNumber of documents matching the query: {count}")


Explain job created:
Job ID: b6f430d1-c624-4179-84f5-783a827ebf68


In [23]:
# Get a few sample documents to review.
# NOTE: this ID is pinned to a previously created explain job; it is not the
# `job_id` produced by the explain cell. Update it to sample a different job.
explain_job_id = "41f85d43-2267-410d-a70b-2b4bb290a416"
# Built from base_url so the host is configured in exactly one place.
samples_url = f"{base_url}/extractions/samples/{explain_job_id}?num_samples=5"
# An explicit timeout keeps the cell from hanging on an unresponsive server.
samples_response = requests.get(samples_url, headers=headers, timeout=30)
# check_response prints the status and body and yields None on failure, so an
# error payload is never mistaken for sample data.
samples = check_response(samples_response)

In [24]:
# Display the samples payload fetched above
samples

{'data': {'id': '41f85d43-2267-410d-a70b-2b4bb290a416',
  'type': 'explain',
  'attributes': {'counts': 5,
   'current_state': 'JOB_STATE_DONE',
   'sample': [{'an': 'SOLRADIN20190320ef3k005mt',
     'company_codes': ',cndp,cndp,cndp,edf,',
     'company_codes_about': ',edf,cndp,',
     'company_codes_occur': ',cndp,',
     'industry_codes': ',i16,i82,i82002,ieutil,ifinal,iutil,',
     'ingestion_datetime': '2019-03-20T16:10:12.000Z',
     'modification_datetime': '2019-03-20T16:10:12.000Z',
     'publication_datetime': '2019-03-20T16:09:31.000Z',
     'availability_datetime': '2019-03-20T17:40:26.360Z',
     'publisher_name': 'Il Sole 24 Ore SpA',
     'region_codes': ',uk,italy,eecz,eurz,medz,weurz,',
     'region_of_origin': 'EUR ITALY MEDZ WEURZ ',
     'source_code': 'SOLRADIN',
     'source_name': '24 Ore Radiocor-Newswire International Edition',
     'subject_codes': ',c151,c15,ccat,ncat,nfact,nfcpin,',
     'title': 'RTE FY net 603 mln eur, up 62%',
     'word_count': 145,
    