## Extracting articles metadata without downloading its content
* use keywords and industry codes discussed with Sophie
* check possibility and limits of Snapshots API

In [1]:
import requests
import json
import pandas as pd
from time import sleep
from pprint import pprint

user_key = "LKbI9fY6ZEPU8RcJUxQHjwKbld52WGt0"

base_url = "https://api.dowjones.com"
headers = {
    "user-key": user_key,
    "Content-Type": "application/json",
    "X-API-VERSION": "3.0"
}

def check_response(response):
    if response.status_code in [200, 201]:
        return response.json()
    else:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

In [2]:
def create_explain(query_body):
    """
    Create an explain request to check how many documents match a query
    """
    url = f"{base_url}/extractions/documents/_explain"
    response = requests.post(url, headers=headers, data=json.dumps(query_body))
    return check_response(response)

def check_explain_status(job_id):
    """
    Check the status of an explain job
    """
    url = f"{base_url}/extractions/documents/{job_id}/_explain"
    response = requests.get(url, headers=headers)
    return check_response(response)

def wait_for_explain_completion(job_id, max_retries=100, sleep_time=10):
    """
    Wait for an explain job to complete
    """
    for i in range(max_retries):
        status = check_explain_status(job_id)
        if not status:
            return None
            
        current_state = status['data']['attributes']['current_state']
        print(f"Current state: {current_state}")
        
        if current_state == "JOB_STATE_DONE":
            return status
        
        if i < max_retries - 1:
            print(f"Waiting {sleep_time} seconds...")
            sleep(sleep_time)
    
    print("Max retries reached. Job might still be running.")
    return None

def run_explain(input_query, num_samples=5):
    explain_result = create_explain(input_query)

    if explain_result:
        print("\nExplain job created:")
        print(f"Job ID: {explain_result['data']['id']}")
        
        job_id = explain_result['data']['id']
        final_status = wait_for_explain_completion(job_id)

        print(f"\nGetting samples for job ID: {job_id}")
        if final_status and 'counts' in final_status['data']['attributes']:
            count = final_status['data']['attributes']['counts']
            print(f"\nNumber of documents matching the query: {count}")
            
            
            samples_url = f"https://api.dowjones.com/extractions/samples/{job_id}?num_samples={num_samples}"
            samples_response = requests.get(samples_url, headers=headers)
            samples = samples_response.json()
            pprint(samples,compact=True)
            return final_status, samples

### Making explain queries
v0: ton of keywords and industries - leads to lot of irrelevant stuff

In [3]:
industry_codes = [
    "i814", "i815", "i82", "i831", "iabls", "ialtinv", "ibnk", "ibusdev", "icaslty", 
    "ichalbk", "iclins", "icrowfd", "iextrfu", "ifinal", "ifmsoft", "igovspon", 
    "ihedge", "iibnk", "iinsurt", "iinv", "ipension", "iplastic", "ippf", "ipricr", 
    "iprivhea", "irbank", "iresinv", "isover", "iventure", "iwealth"
]

data_providers_industry_codes = ["i8395463", "ifinal", "iplastic", "i83102","i83109","iadmin"]

industry_codes.extend(data_providers_industry_codes)
industry_codes = list(set(industry_codes))  # Remove duplicates

carveout_keywords = [
    "portfolio", "strategic actions",
    "review of strategic alternatives", "divestment", "divestiture", "divest", 
    "divesting", "disposal", "non-core asset", "focus on core assets", 
    "refocus on core assets", "reduce leverage", "improve liquidity", 
    "bonds maturing", "notes maturing", "debt maturing", "rationalize costs",
    "restructuring program", "in talks to divest", "capital return",
    "special dividend",  "simplification",   
    "carve-out", "spin-off", "strategic review", "portfolio optimization",
    "asset sale", "streamline operations", "exit markets", "discontinue operations",
    "strategic disposal", "non-strategic assets", "shed assets", "optimize footprint",
    "new CEO", "management change", "leadership change", "new management team",
    #TODO: add more keywords related to legal, management, strategy
]

keyword_conditions = " OR ".join([f"body LIKE '%{keyword}%'" for keyword in carveout_keywords])

query = {
    "query": {
        "where": f"language_code='en' AND publication_datetime >= '2024-05-01 00:00:00' AND ({keyword_conditions})",
        "includes": {
            "industry_codes": industry_codes,
            "region_codes": ["eurz"]
        },
    }
}

run_explain(query)


Explain job created:
Job ID: c87eb315-fda0-440b-bb64-382235bd5e53
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_DONE

Getting samples for job ID: c87eb315-fda0-440b-bb64-382235bd5e53

Number of documents matching the query: 813808
{'data': {'attributes': {'counts': 5,
                         'current_state': 'JOB_STATE_DONE',
                         'sample': [{'an': 'OHFSEC0020240702ek7200001',
                                     'company_codes': ',omniho,omniho,',
                                     'company_codes_about': '',
                                     'company_codes_occur': ',omniho,',
                         

({'data': {'id': 'c87eb315-fda0-440b-bb64-382235bd5e53',
   'type': 'explain',
   'attributes': {'counts': 813808, 'current_state': 'JOB_STATE_DONE'}},
  'links': {'self': 'https://api.dowjones.com/extractions/documents/c87eb315-fda0-440b-bb64-382235bd5e53/_explain'}},
 {'data': {'id': 'c87eb315-fda0-440b-bb64-382235bd5e53',
   'type': 'explain',
   'attributes': {'counts': 5,
    'current_state': 'JOB_STATE_DONE',
    'sample': [{'an': 'OHFSEC0020240702ek7200001',
      'company_codes': ',omniho,omniho,',
      'company_codes_about': '',
      'company_codes_occur': ',omniho,',
      'industry_codes': ',i98203,ibcs,iscsv,',
      'ingestion_datetime': '2024-07-02T18:31:51.000Z',
      'modification_datetime': '2024-07-02T18:31:51.000Z',
      'publication_datetime': '2024-07-02T00:00:00.000Z',
      'publisher_name': 'Brownstein & Egusa',
      'region_codes': ',eurz,',
      'region_of_origin': 'EEURZ EUR HUNG ',
      'source_code': 'OHFSEC',
      'source_name': '150sec',
      'su

### Version 1: intent+action for future-oriented search

In [None]:
action_tags = [
  "%cactio%", "%cspinoff%", "%cdivest%", "%cmger%", "%crestruc%"
]
tag_clause = " OR ".join([f"subject_codes LIKE '{t}'" for t in action_tags])

# 2) Intent & action (with extra verbs)
intents = ["plan to","intend to","looking to","considering","mulling","weighing"]
actions = ["divest","sell","spin-off","dispose","carve-out"]
intent_clause = " OR ".join([f"body LIKE '%{i}%'" for i in intents])
action_clause = " OR ".join([f"body LIKE '%{a}%'" for a in actions])


where = (
"language_code='en' " 
"AND publication_datetime >= '2024-12-01 00:00:00' "
"AND region_codes LIKE '%eurz%' "
f"AND ({tag_clause}) "
f"AND ({intent_clause}) "
f"AND ({action_clause})"
)
query = {
  "query": {
    "where": where,
    "includesList":{
      "industry_codes": ["2ce88edb-3f5e-43c5-bf4b-48eb22624ff1"],
    },
  }
}

status, samples = run_explain(query)


Explain job created:
Job ID: 8de60215-3280-4f03-a27e-af67c127206a
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...


### Writing extraction query and getting the data! **DANGER - do not execute due to limits**
Dangerous cells will be commented out

### Viewing samples

In [None]:
explain_job_id = "fc06f298-dba6-4002-b4e8-06819322f34b"
samples_url = f"https://api.dowjones.com/extractions/samples/{explain_job_id}?num_samples=5"
samples_response = requests.get(samples_url, headers=headers)
samples = samples_response.json()

In [None]:
samples

{'data': {'id': 'fc06f298-dba6-4002-b4e8-06819322f34b',
  'type': 'explain',
  'attributes': {'counts': 5,
   'current_state': 'JOB_STATE_DONE',
   'sample': [{'an': 'OHFSEC0020240524ek5m00001',
     'company_codes': ',omniho,omniho,',
     'company_codes_about': '',
     'company_codes_occur': ',omniho,',
     'industry_codes': ',i41,i412,i4122,i4221,i4222,icnp,ifood,',
     'ingestion_datetime': '2024-05-24T00:30:29.000Z',
     'modification_datetime': '2024-05-24T00:30:29.000Z',
     'publication_datetime': '2024-05-22T00:00:00.000Z',
     'publisher_name': 'Brownstein & Egusa',
     'region_codes': ',eurz,uk,weurz,',
     'region_of_origin': 'EEURZ EUR HUNG ',
     'source_code': 'OHFSEC',
     'source_name': '150sec',
     'subject_codes': ',c22,ccat,cemis,centrp,cenvire,cesg,cexpro,cpartn,csmlbs,ncat,nfact,nfcpex,nfcpin,',
     'title': 'UK’s Pet-Tech Venture Launches Cultivated Chicken And Other Meat Products',
     'word_count': 490,
     'newswires_codes': '',
     'restrictor