In [1]:
import ast
import json
import os
import time
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed

import jwt
import pandas as pd
import requests
from lxml import etree as ElementTree

In [None]:
# API credentials. They are read from the environment so real secrets never
# have to be saved inside the notebook; the original placeholder strings are
# kept as fallbacks, so replacing them by hand still works.
CLIENT_ID = os.environ.get("VISS_CLIENT_ID", "INSERT ID")
CLIENT_SECRET = os.environ.get("VISS_CLIENT_SECRET", "INSERT CLIENT SECRET")

In [1447]:
#generate jwt assertion
def get_assertion():
    """Build the signed JWT client assertion used to request an access token.

    The RSA private key is read from ``KEYNAME.key`` in the current working
    directory. The token is signed with RS256, uses ``CLIENT_ID`` as both
    issuer and subject, and expires 3600 seconds after it is created.

    Returns:
        str: The compact-serialised JWT.

    Raises:
        OSError: If the private key file cannot be read.
    """
    audience = "https://ha.viss.gov.lv/STS/VISS.Pfas.STS/oauth2/token"

    with open('KEYNAME.key', 'rb') as privatefile:
        private_key = privatefile.read()

    # Placeholder for the certificate that matches the private key.
    public_key = """INSERT KEY"""

    # Header for JWT
    # NOTE(review): RFC 7515 describes "x5c" as a JSON array of base64-encoded
    # DER certificates; a bare string is sent here - confirm what the server
    # expects before changing it.
    header = {
        "typ": "JWT",
        "alg": "RS256",
        "x5c": public_key
    }

    # Take one timestamp so "nbf" and "exp" are derived from the same instant.
    now = int(time.time())

    # Claim Set
    claim_set = {
        "sub": CLIENT_ID,
        "jti": str(uuid.uuid4()),  # unique id per assertion
        "iss": CLIENT_ID,
        "aud": audience,
        "exp": now + 3600,
        "nbf": now
    }

    # Generate the JWT
    assertion = jwt.encode(claim_set, private_key, algorithm="RS256", headers=header)

    # PyJWT 1.x returns bytes while 2.x returns str; always hand back text so
    # callers can embed the assertion in a request without a stray b'...'.
    if isinstance(assertion, bytes):
        assertion = assertion.decode("utf-8")

    return assertion

In [4]:
#Request an OAuth2 access token and build the Authorization header value

def get_auth():
    """Request an access token and return the ``Authorization`` header value.

    Performs a client-credentials token request against the API gateway,
    authenticating with the JWT assertion from ``get_assertion()`` together
    with ``CLIENT_ID`` and ``CLIENT_SECRET``.

    Returns:
        str: ``'Bearer '`` followed by the access token.

    Raises:
        requests.HTTPError: If the token endpoint answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
        KeyError: If the response JSON carries no ``access_token``.
    """
    url = "https://apigw.viss.gov.lv/token"

    # Passing a dict lets requests URL-encode every value (a secret may hold
    # characters such as '&' or '+') and set the form Content-Type header.
    form = {
        'client_assertion_type': 'urn:ietf:params:oauth:client-assertion-type:jwt-bearer',
        'client_assertion': get_assertion(),
        'grant_type': 'client_credentials',
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
    }
    response = requests.post(url, data=form, timeout=60)

    # Fail loudly on a rejected request instead of a confusing KeyError below.
    response.raise_for_status()

    # The body is JSON; ast.literal_eval would choke on true/false/null.
    token = response.json()['access_token']

    return 'Bearer ' + token

In [1570]:
#Get legal entity info
# Looks up a single legal entity; the registration number is hard-coded in the URL.
Authorization = get_auth()
url = "https://apigw.viss.gov.lv/legalentity/v1.0/legal-entity/40203406251/"

headers = {'Authorization':Authorization}

response = requests.get(url, headers=headers, timeout=60)
# Stop on a 4xx/5xx answer instead of parsing an error body as if it were data.
response.raise_for_status()

data = response.json()

In [1632]:
#Get company annual reports
# Queries the annual-reports endpoint of one legal entity; the registration
# number is hard-coded in the URL.
Authorization = get_auth()
url = "https://apigw.viss.gov.lv/legalentity/v1.0/legal-entity/40008259890/annual-reports"
headers = {
  'Authorization': Authorization
}

annual_reports = requests.get(url, headers=headers, timeout=60)
# Stop on a 4xx/5xx answer instead of parsing an error body as if it were data.
annual_reports.raise_for_status()
list_of_jsons = annual_reports.json()

In [1689]:
#Download and extract the contents of all financial reports

def get_report_content(report_number, authorization):
    """Download one annual report and flatten its sections into a dict.

    Only the part of the document before the first ``<PielikumsPazimes>`` tag
    is scanned. For every section name in ``report_names`` the
    ``<{name}Beigas>...</{name}Beigas>`` fragment is cut out and parsed with a
    recovering XML parser.

    Args:
        report_number: File id of the report; interpolated into the content URL.
        authorization: Value sent as the ``Authorization`` request header.

    Returns:
        dict: Element text keyed by ``'{section}_{tag}'``. Children of elements
        whose tag contains ``'Apaksposteni'`` are keyed by
        ``'{section}_{parent tag without "Apaksposteni"}_{child tag}'``.
        Elements without text are left out; the dict is empty when no known
        section is present.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    url = f"https://apigw.viss.gov.lv/annualreport/v1.0/annual-report/{report_number}/content"
    # Use the token handed in by the caller, not whatever global happens to exist.
    headers = {
    'Authorization': authorization
    }
    response = requests.get(url, headers=headers, timeout=60)
    # An error page contains no sections and would otherwise look like an empty report.
    response.raise_for_status()
    data = response.text.split('<PielikumsPazimes>',1)[0]
    report_names = ['BA','BP','PZA1','PZA2','NPP1','NPP2','PKIP']
    reports = {}

    for report_name in report_names:
        report_start = f'<{report_name}Beigas>'
        report_end = f'</{report_name}Beigas>'

        start = data.find(report_start)
        if start == -1:
            continue
        # Look for the closing tag after the opening one; skip a section that
        # was cut off rather than failing the whole report.
        end = data.find(report_end, start)
        if end == -1:
            continue
        xml_text = data[start:end + len(report_end)]

        parser = ElementTree.XMLParser(recover=True)
        tree = ElementTree.fromstring(xml_text, parser)
        if tree is None:
            # The recovering parser could not salvage a root element.
            continue

        for elem in tree:
            # lxml yields comments / processing instructions with a non-string tag.
            if not isinstance(elem.tag, str):
                continue
            if 'Apaksposteni' in elem.tag:
                # Computed outside the f-string: reusing the enclosing quote
                # inside an f-string is a syntax error before Python 3.12.
                group = elem.tag.replace('Apaksposteni', '')
                for child in elem:
                    if isinstance(child.tag, str) and child.text is not None:
                        reports[f'{report_name}_{group}_{child.tag}'] = child.text
            elif elem.text is not None:
                reports[f'{report_name}_{elem.tag}'] = elem.text
    return reports

def fetch_report(report,authorization):
    """Merge the downloaded content of a report into its listing entry.

    Best-effort: any exception raised while fetching or merging is printed
    together with the entry, and the entry is returned as it stands.

    Args:
        report: Dict with a ``'file_id'`` key; updated in place.
        authorization: Passed through to ``get_report_content``.

    Returns:
        The same ``report`` dict that was passed in.
    """
    try:
        content = get_report_content(report['file_id'], authorization)
        report.update(content)
    except Exception as exc:
        print(exc, report)
    return report

def _fetch_report_page(url, headers, params):
    """Fetch one page of the annual-report listing and return the decoded JSON."""
    response = requests.get(url, headers=headers, params=params, timeout=300)
    response.raise_for_status()
    return response.json()


def _select_report(report, year):
    """Return the download stub for a listing entry, or None if it is to be skipped."""
    # Malformed entries and entries flagged as removed are ignored.
    if not isinstance(report, dict) or 'removedOn' in report:
        return None
    financial = report.get('financialData')
    if not isinstance(financial, dict):
        return None
    wanted = (
        financial.get('sourceType') in ('UGP', 'UKGP')
        and report.get('type') in ('ANNUAL_REPORT', 'CONSOLIDATED_STATEMENT')
        # Explicit comparison: a missing or null flag must not count as "not annulled".
        and report.get('isAnnulled') == False
        # Compare as text so year='2023' and year=2023 behave the same.
        and str(report.get('year')) == str(year)
    )
    if not wanted:
        return None
    try:
        return {
            'reg_nr': report['legalEntityRegistrationNumber'],
            'file_id': report['fileId'],
            'type': financial['sourceType'],
        }
    except KeyError:
        return None


def get_annual_reports(year, page_start=0,pages_max=None):
    """Page through the annual-report listing and download matching reports.

    Listing entries are kept when they are not removed or annulled, have
    source type ``UGP``/``UKGP``, type ``ANNUAL_REPORT`` or
    ``CONSOLIDATED_STATEMENT`` and a ``year`` equal to ``year``. The content
    of every kept entry is fetched concurrently through ``fetch_report``.

    Args:
        year: Report year (str or int). Also used to build the
            ``lastChangedAt`` filter ``{year}-01-01T00:00:00``.
        page_start: Index of the first listing page to request.
        pages_max: Upper bound on the number of pages to process. When
            ``None`` the ``totalPages`` value of the first response is used.

    Returns:
        list[dict]: One dict per kept report with ``reg_nr``, ``file_id``,
        ``type`` and whatever fields ``fetch_report`` managed to add.

    Raises:
        requests.HTTPError: If a listing request answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    url = 'https://apigw.viss.gov.lv/annualreports/v1.0/annual-reports'
    headers = {
    'Authorization': get_auth()
    }
    token_time = time.time()
    run_started = token_time
    params = {
        'order':'DESC',
        'size':10000,
        'lastChangedAt': f'{int(year)}-01-01T00:00:00',
        'page': page_start,
    }
    all_reports = []

    # The first page doubles as the source of the total page count and is
    # reused below instead of being requested a second time.
    data = _fetch_report_page(url, headers, params)
    if pages_max is not None:
        max_pages = pages_max
    else:
        max_pages = int(data['page']['totalPages'])
        print('max_pages',max_pages)

    for page_index in range(max_pages):
        page_started = time.time()
        # Renew the token once it is older than 3300 s.
        # NOTE(review): presumably chosen to stay below a one-hour token lifetime - confirm.
        if (page_started-token_time)>3300:
            print('changing token')
            token_time = time.time()
            headers['Authorization'] = get_auth()
        if page_index > 0:
            data = _fetch_report_page(url, headers, params)

        # Tolerate a page that carries no embedded list.
        report_list = data.get('_embedded', {}).get('annualReportList', [])
        annual_report_list = []
        for report in report_list:
            stub = _select_report(report, year)
            if stub is not None:
                annual_report_list.append(stub)

        with ThreadPoolExecutor(max_workers=25) as executor:
            # Hand workers the current token, which may have been renewed above.
            futures = [
                executor.submit(fetch_report, report, headers['Authorization'])
                for report in annual_report_list
            ]
            for future in as_completed(futures):
                result = future.result()
                if result:
                    all_reports.append(result)
                    if len(all_reports)%500==0:
                        print('Done reports: ', len(all_reports))

        # No "next" link means this was the last page of the listing.
        if 'next' not in data.get('_links', {}):
            break

        print('page', params['page'])
        params['page'] +=1
        now = time.time()
        print('total', now-run_started, 'this page', now-page_started)
    return all_reports


In [None]:
# Collect the 2023 reports, starting at listing page 0 and processing at most 10 pages.
reps = get_annual_reports(
    year='2023',
    page_start=0,
    pages_max=10,
)