In [2]:
import json
import os
from io import StringIO
from pathlib import Path
from time import sleep

import pandas as pd
import requests

# The API key is read from the environment so it never lives in the notebook
# (or its version history). Export FACTIVA_USERKEY before starting the kernel.
# Any key that was ever stored in a notebook in plain text should be rotated.
user_key = os.environ.get("FACTIVA_USERKEY", "")
if not user_key:
    print("Warning: FACTIVA_USERKEY is not set; requests will be sent without a user key.")

# Root of the API; the request helpers append endpoint paths to this.
base_url = "https://api.dowjones.com"

# Headers attached to every request made by the helpers in this notebook.
headers = {
    "user-key": user_key,
    "Content-Type": "application/json",
    "X-API-VERSION": "3.0"
}

def check_response(response):
    """
    Turn an HTTP response into parsed JSON, or report the failure.

    Returns response.json() when the status code is 200 or 201. For any other
    status the code and the body text are printed and None is returned.
    """
    succeeded = response.status_code in (200, 201)
    if not succeeded:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None
    return response.json()

In [12]:
def create_explain(query_body, timeout=30):
    """
    Create an explain request to check how many documents match a query.

    Args:
        query_body: JSON-serialisable object; it is dumped with json.dumps and
            POSTed to the extractions/documents/_explain endpoint.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on an unresponsive host.

    Returns:
        The parsed JSON body on HTTP 200/201, otherwise None (check_response
        prints the status code and body).

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    url = f"{base_url}/extractions/documents/_explain"
    response = requests.post(
        url,
        headers=headers,
        data=json.dumps(query_body),
        timeout=timeout,
    )
    return check_response(response)

def check_explain_status(job_id, timeout=30):
    """
    Check the status of an explain job.

    Args:
        job_id: Identifier of the explain job, inserted into the request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on an unresponsive host.

    Returns:
        The parsed JSON body on HTTP 200/201, otherwise None (check_response
        prints the status code and body).

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    url = f"{base_url}/extractions/documents/{job_id}/_explain"
    response = requests.get(url, headers=headers, timeout=timeout)
    return check_response(response)

def wait_for_explain_completion(job_id, max_retries=100, sleep_time=10):
    """
    Wait for an explain job to complete.

    Polls check_explain_status up to max_retries times, sleeping sleep_time
    seconds between attempts (no sleep after the final attempt).

    Args:
        job_id: Identifier of the explain job to poll.
        max_retries: Maximum number of status checks.
        sleep_time: Seconds to pause between consecutive checks.

    Returns:
        The full status payload once current_state is "JOB_STATE_DONE".
        None if a status check fails, the job ends in a failure state, or the
        retry budget is exhausted.
    """
    # States from which the job will not progress to JOB_STATE_DONE. Polling on
    # would only burn the whole retry budget.
    # NOTE(review): state names assumed from the API's JOB_STATE_* naming -- confirm.
    failed_states = {"JOB_STATE_FAILED", "JOB_STATE_CANCELLED"}

    for attempt in range(max_retries):
        status = check_explain_status(job_id)
        if not status:
            # check_explain_status already printed the HTTP error.
            return None

        current_state = status['data']['attributes']['current_state']
        print(f"Current state: {current_state}")

        if current_state == "JOB_STATE_DONE":
            return status

        if current_state in failed_states:
            print(f"Job ended in state {current_state}. Stopping.")
            return None

        # Skip the pause after the last attempt; nothing follows it.
        if attempt < max_retries - 1:
            print(f"Waiting {sleep_time} seconds...")
            sleep(sleep_time)

    print("Max retries reached. Job might still be running.")
    return None

In [36]:
# Industry codes the query is restricted to (passed under "includes").
industry_codes = [
    "i814", "i815", "i82", "i831", "iabls", "ialtinv",
    "ibnk", "ibusdev", "icaslty", "ichalbk", "iclins", "icrowfd",
    "iextrfu", "ifinal", "ifmsoft", "igovspon", "ihedge", "iibnk",
    "iinsurt", "iinv", "ipension", "iplastic", "ippf", "ipricr",
    "iprivhea", "irbank", "iresinv", "isover", "iventure", "iwealth",
]

# Phrases searched for in the document body. Each one becomes a substring
# match (LIKE '%phrase%'), so short entries such as "sold" or "divest" also
# match any longer word that contains them.
carveout_keywords = [
    # first batch
    "portfolio",
    "strategic actions",
    "review of strategic alternatives",
    "divestment", "divestiture", "divest", "divesting",
    "disposal",
    "non-core asset",
    "focus on core assets", "refocus on core assets",
    "reduce leverage", "improve liquidity",
    "bonds maturing", "notes maturing", "debt maturing",
    "rationalize costs",
    "restructuring program",
    "in talks to divest",
    "capital return",
    "sold", "divested",
    "signed an agreement to sell", "entered into an agreement to sell",
    "special dividend",
    "simplification",

    # second batch
    "carve-out", "spin-off",
    "strategic review", "portfolio optimization",
    "asset sale", "streamline operations",
    "exit markets", "discontinue operations",
    "strategic disposal", "non-strategic assets",
    "shed assets", "optimize footprint",
]

# One LIKE clause per keyword, OR-ed together in list order.
keyword_conditions = " OR ".join(
    "body LIKE '%" + keyword + "%'" for keyword in carveout_keywords
)

# Request body for the explain endpoint: fixed language/date filters AND-ed
# with the keyword alternatives, plus the industry and region restrictions.
query = {
    "query": {
        "where": " AND ".join([
            "language_code='en'",
            "publication_datetime >= '2020-01-01 00:00:00'",
            f"({keyword_conditions})",
        ]),
        "includes": {
            "industry_codes": industry_codes,
            "region_codes": ["eurz"],
        },
    }
}


# Submit the explain request. A falsy result means the request was rejected
# and the error details were already printed.
explain_result = create_explain(query)

if explain_result:
    print("\nExplain job created:")
    job_id = explain_result['data']['id']
    print(f"Job ID: {job_id}")

    # Blocks while polling; yields None when the job did not reach the done state.
    final_status = wait_for_explain_completion(job_id)

    if final_status:
        if 'counts' in final_status['data']['attributes']:
            count = final_status['data']['attributes']['counts']
            print(f"\nNumber of documents matching the query: {count}")


Explain job created:
Job ID: 006a8d45-6d1f-4bdb-9343-81b4f5111034
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds...
Current state: JOB_STATE_RUNNING
Waiting 10 seconds..

In [32]:
# Get a few sample documents to review.
# The job ID is pinned to a specific explain job rather than taken from a
# variable, so this cell can be run on its own.
explain_job_id = "108f5c9a-639d-425e-8873-fc9d7b746a7f"
samples_url = f"{base_url}/extractions/samples/{explain_job_id}?num_samples=5"
# A timeout stops the cell from hanging forever on an unresponsive server.
samples_response = requests.get(samples_url, headers=headers, timeout=30)
# Same error handling as the other API calls: prints the error and gives None
# on a non-200/201 status instead of trying to parse an error body as JSON.
samples = check_response(samples_response)

In [33]:
# Bare last expression: the notebook renders the parsed samples payload.
samples

{'data': {'id': '108f5c9a-639d-425e-8873-fc9d7b746a7f',
  'type': 'explain',
  'attributes': {'counts': 5,
   'current_state': 'JOB_STATE_DONE',
   'sample': [{'an': 'OHFSEC0020240612ek6c0002u',
     'company_codes': ',acptef,acptef,amzcom,amzcom,andac,andac,apcbac,apcbac,auifch,auifch,binter,eucmm,euruno,euruno,farada,farada,gognew,gognew,hpjqvv,hpjqvv,iesesp,iesesp,intth,intth,prwth,prwth,qexwvp,qexwvp,rcktig,rcktig,unchld,unpom,unpom,unwest,unwest,vubfub,vubfub,',
     'company_codes_about': '',
     'company_codes_occur': ',vubfub,unwest,unpom,rcktig,qexwvp,prwth,intth,iesesp,hpjqvv,gognew,farada,euruno,auifch,apcbac,andac,amzcom,acptef,',
     'industry_codes': '',
     'ingestion_datetime': '2024-06-12T18:30:34.000Z',
     'modification_datetime': '2024-06-12T18:30:34.000Z',
     'publication_datetime': '2024-06-12T00:00:00.000Z',
     'publisher_name': 'Brownstein & Egusa',
     'region_codes': ',castil,eecz,eurz,lamz,medz,spain,weurz,',
     'region_of_origin': 'EEURZ EUR HUNG 

In [3]:
def get_taxonomy(taxonomy_type="industries", file_type="csv", timeout=30):
    """
    Download a taxonomy file from the API.

    Args:
        taxonomy_type: Taxonomy name inserted into the URL path.
        file_type: File format inserted into the URL path. "csv" responses are
            parsed into a DataFrame.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on an unresponsive host.

    Returns:
        A pandas DataFrame when file_type is "csv", the raw response text for
        any other file_type, or None when the status code is not 200 (the
        status and body are printed).

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    url = f"{base_url}/taxonomies/{taxonomy_type}/{file_type}"
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

    if file_type == "csv":
        # Parse from memory; the CSV text never touches disk.
        return pd.read_csv(StringIO(response.text))
    return response.text

# Fetch the complete industries taxonomy as a DataFrame (None if the request
# failed). Narrowing it down to financial-services codes happens in the
# separate filtering cells, not here.
financial_taxonomy = get_taxonomy("industries", "csv")

In [10]:
# Export the filtered taxonomy for manual review. The target is resolved from
# the current user's home directory rather than a hardcoded absolute path, so
# the cell also works on other machines.
relevant_df.to_csv(Path.home() / "Desktop" / "stuff.csv", index=False)

In [6]:
# Word fragments that mark a taxonomy description as financial-services related
financial_terms = [
    'bank', 'financ', 'invest', 'insur', 'asset', 'capital',
    'credit', 'fund', 'wealth', 'broker', 'payment',
]

# Keep rows whose lower-cased description contains any of the fragments
# (joined into one regex alternation); rows with a missing description are dropped.
filtered_df = financial_taxonomy.loc[
    financial_taxonomy['description']
    .str.lower()
    .str.contains('|'.join(financial_terms), na=False)
]


In [8]:
# Codes that matched the term filter but are excluded by hand, grouped by reason.
irrelevant_codes = [
    # Real Estate focused
    'i834',
    # Highly specialized insurance
    'iagins', 'ipetin', 'itvlins', 'itfins',
    'itci', 'imoins', 'islins', 'icatins',
    # Non-financial services
    'iinvest',
    # Highly specialized financial
    'iislam', 'imicro',
]

# Drop every row whose code is on the exclusion list.
relevant_df = filtered_df.loc[~filtered_df['code'].isin(irrelevant_codes)]

In [9]:
relevant_df.to

Unnamed: 0,code,description
398,i814,Banking
399,i81401,Central Banking
400,i81402,Commercial Banking
402,i81404,Digital Banking
403,i81501,Credit Types/Services
...,...,...
864,irbank,Regional Banks
871,iresinv,Sustainable Investment
895,isover,Sovereign Wealth Fund
931,iventure,Venture Capital
