In [11]:
import pandas as pd
import duckdb

# Read the pickled source tables in one pass. The variable names are significant:
# the SQL query in this cell refers to these DataFrames by name.
pickle_paths = (
    "pickle/zpravy.pkl",
    "pickle/vazby.pkl",
    "pickle/dokumentace.pkl",
    "pickle/material.pkl",
    "pickle/vykony.pkl",
    "pickle/vykpac.pkl",
)
zpravy, vazby, dokumentace, materialy, vykony, vykpac = map(
    pd.read_pickle, pickle_paths
)

# Build one row per (report day, AMBNUM) holding the report texts plus the
# procedures ("vykony") and materials ("materialy") recorded for that patient
# on that day. The table names used inside the SQL (zpravy, vazby, dokumentace,
# vykony, materialy) match the pandas DataFrames loaded earlier in this cell;
# DuckDB is expected to resolve them from the Python scope by name.
#
# CTE stages:
#   1. zpravy_a_vazby
#      Left-join reports to vazby on (serial, ambnum) and parse vazby.DATFR
#      ('%d.%m.%Y %H:%M'), truncated to the day, as the report date. Reports
#      with no matching vazby row (or a NULL DATFR) end up with a NULL date —
#      presumably the NaT rows at the end of `res`.
#   2. zpravy_a_vazby_a_dokumentace_tmp
#      ASOF left join to dokumentace on equal ambnum and a documentation day
#      that is >= the report day; this is where CISPAC comes from.
#      `select distinct` removes duplicate rows (their cause is still an open
#      question, see the inline SQL note).
#   3. zpravy_a_vazby_a_dokumentace
#      Collapse to one row per (report day, ambnum): distinct report texts are
#      collected into `contents`, and min(cispac) is kept.
#   4. zpravy_a_vazby_a_dokumentace_a_vykony_tmp / ..._a_vykony
#      Left-join vykony on equal cispac and procedure day == report day, then
#      pack the matches into a list of structs. A row with no match still gets
#      a one-element list whose struct fields are all NULL.
#   5. zpravy_a_vazby_a_dokumentace_a_vykony_a_materialy_tmp / ..._a_materialy
#      Same join-then-aggregate pattern for materialy.
#
# NOTE(review): none of the array_agg calls has an ORDER BY, so the element
# order inside `contents`, `vykony` and `materialy` is presumably not
# guaranteed — confirm whether downstream code relies on it.
res = duckdb.sql(
    """
    with zpravy_a_vazby as (
        select
            z.serial as zprava_serial,
            z.rc,
            z.ambnum,
            z.content,
            date_trunc('day', STRFTIME(STRPTIME(vazby.DATFR, '%d.%m.%Y %H:%M'), '%Y-%m-%dT%H:%M:%S')::timestamp) as datum_a_cas_zpravy,
            vazby.DEPARTM,
            vazby.WHO,
        from zpravy z
            left join vazby using (serial, ambnum)
    )
    , zpravy_a_vazby_a_dokumentace_tmp as (
        select distinct  -- why do we have duplicates
            datum_a_cas_zpravy,
            date_trunc('day', strptime(d.DATUM_CAS::text, '%Y%m%d_%H%M%S')) as datum_a_cas_dokumentace,
            zv.ambnum,
            zv.content,
            d.CISPAC,
        from zpravy_a_vazby zv
            asof left join dokumentace d
                on zv.ambnum = d.ambnum
                and date_trunc('day', strptime(d.DATUM_CAS::text, '%Y%m%d_%H%M%S')) >= datum_a_cas_zpravy
    )
    , zpravy_a_vazby_a_dokumentace as (
        select
            datum_a_cas_zpravy,
            ambnum,
            array_agg(distinct content) as contents,
            min(cispac) as cispac,
        from zpravy_a_vazby_a_dokumentace_tmp
        group by 
            datum_a_cas_zpravy,
            ambnum
    )
    , zpravy_a_vazby_a_dokumentace_a_vykony_tmp as (
        select
            date_trunc('day', strptime(datum::text, '%d.%m.%Y %H:%M')) as datum_vykonu,
            z.*,
            u.CDOKL,
            u.kod as kod_vykonu,
            u.odbornost,
            u.mnozstvi as mnozstvi_vykonu,
            u.body,
        from zpravy_a_vazby_a_dokumentace z
            left join vykony u on
                z.cispac = u.cispac
                and date_trunc('day', strptime(datum::text, '%d.%m.%Y %H:%M')) = datum_a_cas_zpravy
    )
    , zpravy_a_vazby_a_dokumentace_a_vykony as (
        select
            datum_a_cas_zpravy,
            contents,
            AMBNUM,
            cispac,
            array_agg(
                struct_pack(
                    CDOKL,
                    kod_vykonu,
                    odbornost,
                    mnozstvi_vykonu,
                    body
                )
            ) as vykony
        from zpravy_a_vazby_a_dokumentace_a_vykony_tmp
        group by 
            datum_a_cas_zpravy,
            contents,
            AMBNUM,
            cispac,
    )
    , zpravy_a_vazby_a_dokumentace_a_vykony_a_materialy_tmp as (
        select
            z.*,
            m.cdokl,
            m.kod as kod_materialu,
            m.mnozstvi as mnozstvi_materialu,
        from zpravy_a_vazby_a_dokumentace_a_vykony z
            left join materialy m on
                z.cispac = m.cispac
                and date_trunc('day', strptime(m.datum::text, '%d.%m.%Y %H:%M')) = datum_a_cas_zpravy
    )
    , zpravy_a_vazby_a_dokumentace_a_vykony_a_materialy as (
        select
            datum_a_cas_zpravy,
            contents,
            AMBNUM,
            cispac,
            vykony,
            array_agg(
                struct_pack(
                    cdokl,
                    kod_materialu,
                    mnozstvi_materialu
                )
            ) as materialy
        from zpravy_a_vazby_a_dokumentace_a_vykony_a_materialy_tmp
        group by 
            datum_a_cas_zpravy,
            contents,
            AMBNUM,
            cispac,
            vykony,
    )
    select *
    from zpravy_a_vazby_a_dokumentace_a_vykony_a_materialy
    order by datum_a_cas_zpravy ASC
    """
).df()
# Render the resulting DataFrame as the cell output.
display(res)

Unnamed: 0,datum_a_cas_zpravy,contents,AMBNUM,cispac,vykony,materialy
0,2023-01-01,[MUDr. Kryštofová Dominika\nC163 \nPacientka s...,17345,882713,"[{'CDOKL': 255384, 'kod_vykonu': 42022, 'ODBOR...","[{'CDOKL': 255384, 'kod_materialu': '0007955',..."
1,2023-01-01,[MUDr. Čmejlová Vlastimila\nC504 \nZ511 \nKOnt...,16300,258464,"[{'CDOKL': 256748, 'kod_vykonu': 42022, 'ODBOR...","[{'CDOKL': 256748, 'kod_materialu': '0168721',..."
2,2023-01-01,[MUDr. Čmejlová Vlastimila\nC508 \nKontrola př...,41203,1834132,"[{'CDOKL': None, 'kod_vykonu': None, 'ODBORNOS...","[{'CDOKL': None, 'kod_materialu': None, 'mnozs..."
3,2023-01-01,[MUDr. Novák Tomáš\nC61 \nPET/CT 27.12.2022 \n...,40215,1063949,"[{'CDOKL': None, 'kod_vykonu': None, 'ODBORNOS...","[{'CDOKL': None, 'kod_materialu': None, 'mnozs..."
4,2023-01-02,[MUDr. Danesh Adéla\nC61 \nAmbulantní aplikace...,49135,2040855,"[{'CDOKL': 199836, 'kod_vykonu': 42022, 'ODBOR...","[{'CDOKL': 199836, 'kod_materialu': '0193478',..."
...,...,...,...,...,...,...
66379,NaT,[MUDr. Novák Tomáš\nC713 \nTel. kontakt s pac....,53259,507227,"[{'CDOKL': None, 'kod_vykonu': None, 'ODBORNOS...","[{'CDOKL': None, 'kod_materialu': None, 'mnozs..."
66380,NaT,[MUDr. Kryštofová Dominika\nC498 \nMDT 19.12.2...,59981,2288731,"[{'CDOKL': None, 'kod_vykonu': None, 'ODBORNOS...","[{'CDOKL': None, 'kod_materialu': None, 'mnozs..."
66381,NaT,"[doc. MUDr. Kopečková Kateřina, Ph.D.\nC821 \n...",54659,2126585,"[{'CDOKL': None, 'kod_vykonu': None, 'ODBORNOS...","[{'CDOKL': None, 'kod_materialu': None, 'mnozs..."
66382,NaT,[MUDr. Pacas Pavel\nC20 \nNO: 67-letá pacientk...,57487,562493,"[{'CDOKL': None, 'kod_vykonu': None, 'ODBORNOS...","[{'CDOKL': None, 'kod_materialu': None, 'mnozs..."


In [30]:
import math


# Build the evaluation set from the aggregated query result `res`: one example
# per qualifying row, pairing the concatenated report text with the list of
# procedures recorded for it.
EVALSET_SIZE = 50  # number of examples kept after shuffling
EVALSET_SEED = 42  # fixed seed so the shuffle (and the uploaded sample) is reproducible

result = []
# Iterate the two needed columns directly instead of doing several
# `res.iloc[i][...]` lookups per row. The loop variables are deliberately not
# called `vykony` / `materialy`: those names hold the DataFrames that the DuckDB
# query reads by name, and rebinding them here would silently replace the
# DataFrames with per-row values for any cell executed afterwards.
for report_parts, row_vykony in zip(res["contents"], res["vykony"]):
    # Rows without any matching procedure carry a single all-None struct, so
    # requiring more than one entry filters them out.
    # NOTE(review): this also drops rows with exactly one real procedure —
    # confirm that is intended.
    if len(row_vykony) <= 1:
        continue
    result.append(
        {
            "inputs": {"report": "\n".join(report_parts)},
            "outputs": {
                "vykony": [
                    {"code": x["kod_vykonu"], "body": x["BODY"]}
                    for x in row_vykony
                ]
            },
        }
    )

# Shuffle deterministically, renumber the rows, and keep the first EVALSET_SIZE.
evalset = (
    pd.DataFrame(result)
    .sample(frac=1, random_state=EVALSET_SEED)
    .reset_index(drop=True)
    .head(EVALSET_SIZE)
)

evalset


Unnamed: 0,inputs,outputs
0,"{'report': 'doc. MUDr. Kopečková Kateřina, Ph....","{'vykony': [{'code': 42022, 'body': '435.000'}..."
1,"{'report': 'doc. MUDr. Kopečková Kateřina, Ph....","{'vykony': [{'code': 42022, 'body': '435.000'}..."
2,{'report': 'MUDr. Casas Mendez Luis Fernando C...,"{'vykony': [{'code': 42022, 'body': '435.000'}..."
3,{'report': 'MUDr. Danesh Adéla C19 Pacient je...,"{'vykony': [{'code': 42023, 'body': '435.000'}..."
4,{'report': 'MUDr. Čmejlová Vlastimila C504 KO...,"{'vykony': [{'code': 42022, 'body': '435.000'}..."
5,{'report': 'MUDr. Šustrová Darja C509 Kontrol...,"{'vykony': [{'code': 42022, 'body': '435.000'}..."
6,{'report': 'MUDr. Nohejlová Medková Anna C549...,"{'vykony': [{'code': 42022, 'body': '435.000'}..."
7,{'report': 'MUDr. Bulová Alena C099 Pacient j...,"{'vykony': [{'code': 42022, 'body': '435.000'}..."
8,"{'report': 'doc. MUDr. Kopečková Kateřina, Ph....","{'vykony': [{'code': 42022, 'body': '435.000'}..."
9,{'report': 'MUDr. Kryštofová Dominika C504 Pa...,"{'vykony': [{'code': 42022, 'body': '435.000'}..."


In [19]:
from dotenv import load_dotenv
from langsmith import Client

# Load environment variables from a .env file into the process environment.
# This must run before the client is constructed.
load_dotenv()

# LangSmith client used by the later cells to create the dataset and upload
# the examples.
# NOTE(review): presumably reads its API key / endpoint from the environment
# populated above — confirm the expected variable names in the .env file.
client = Client()

In [31]:
# Create the LangSmith dataset only when it does not exist yet, so this cell can
# be re-run (e.g. Restart Kernel -> Run All) without failing on a name conflict.
# Either branch yields the Dataset object, which is shown as the cell output.
if client.has_dataset(dataset_name="rakathon-oncoders-hard"):
    dataset = client.read_dataset(dataset_name="rakathon-oncoders-hard")
else:
    dataset = client.create_dataset(
        dataset_name="rakathon-oncoders-hard",
    )

dataset

Dataset(name='rakathon-oncoders-hard', description=None, data_type=<DataType.kv: 'kv'>, id=UUID('c0fecbb6-cf6e-4e28-b68a-ebfac991f5e0'), created_at=datetime.datetime(2025, 4, 13, 3, 0, 41, 838462, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2025, 4, 13, 3, 0, 41, 838462, tzinfo=datetime.timezone.utc), example_count=0, session_count=0, last_session_start_time=None, inputs_schema=None, outputs_schema=None, transformations=None)

In [32]:
from dotenv import load_dotenv
from langsmith import Client

# Convert every evalset row into an {"inputs": ..., "outputs": ...} record and
# upload the whole batch to the dataset in a single call.
example_records = evalset.to_dict(orient="records")
client.create_examples(
    examples=example_records,
    dataset_name="rakathon-oncoders-hard",
)


{'example_ids': ['3f048e3a-cd12-4906-996c-34f6dd85e907',
  'b075038b-e508-4cb9-87b6-f31acd7b5259',
  '0e06ab28-4c8f-4f5e-b099-4ba007ce03ef',
  '45e159d1-b70e-4a81-a984-15d2fac527e7',
  '97d47b13-02d9-4dc2-a6ef-9a357dc5fcd3',
  '11e763d0-c79d-4b69-844e-c1f36fdd0b3e',
  'e3407ab2-5c05-46c1-bfdf-f6f1de49b5ba',
  'cc054c16-f1b6-4bf3-be12-82802f3ecfd8',
  '9bd53515-98c3-4702-bb22-6e04cfe67b16',
  '339fb271-baf5-4bb1-bf25-5bcc7cd30f72',
  '2c3ecb1d-a526-43a1-891f-27f4ef6601fc',
  '934b0c57-1aa2-49b9-a5d8-7013008c8151',
  'dfd5bb54-830a-429f-9c01-1015df7c41df',
  '6a81d53f-27e0-44ab-94b2-527638cdacbc',
  '3a67e938-4890-40d2-b29b-015d5230cc09',
  'e1509e5f-a5a2-411e-8c87-92d86d5c249c',
  'd64f3220-20eb-44f2-a52c-1ffd8111bd82',
  '00014fb7-8e06-41d5-b8d9-326929bf03b9',
  'fa3406c3-2d5c-4470-a55f-2368c8469f8c',
  '88ad25c6-a3f5-4c2c-8c54-d0e10207236b',
  'f5d0f708-0b40-420f-8124-2153bf12ce66',
  'a5962ffe-2e9e-4e79-b61d-7436bce20708',
  '02347357-118d-4312-8387-7c482655e272',
  'f35c143b-5159-43

In [7]:
# Ad-hoc inspection of the raw `dokumentace` table (the DataFrame read from
# pickle/dokumentace.pkl in the first cell); the bare expression renders it as
# the cell output.
dokumentace

Unnamed: 0,FILE,POR,UDALOST_CISLO,UDALOST,DATUM,ODDEL,LEKAR,DATUM_CAS,DIAGNOZA,DATUM_OD,...,HODNOCENI,JMENO,RC,AMBNUM,CISPAC,PSC,DATUM_NAR,VEKR,VEKM,VEKD
0,2023_01,1,2,RATO,20230125,10248,22217,20230125_101800,,00010101_000000,...,0,,640XXXXXXX,53835,2124171,11000,10101,58,8,14
1,2023_01,2,5,RATP,20230125,10370,93786,20230125_102500,C19,00010101_000000,...,0,,716XXXXXXX,58891,2259382,28601,10101,51,3,14
2,2023_01,3,5,RATP,20230116,10370,22298,20230116_110000,C051,00010101_000000,...,0,,660XXXXXXX,55647,2193508,27201,10101,56,9,29
3,2023_01,4,5,RATP,20230104,10452,22422,20230104_112500,"C493, C810",00010101_000000,...,0,,685XXXXXXX,26033,508766,16000,10101,54,6,19
4,2023_01,5,5,RATP,20230105,10370,22298,20230105_110000,C162,00010101_000000,...,0,,660XXXXXXX,50413,298937,10000,10101,56,10,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79157,2023_12,5414,14,NAVS,20010904,0,0,20010904_000000,,00010101_000000,...,0,,670XXXXXXX,63433,2252314,26601,10101,34,1,11
79158,2023_12,5415,14,NAVS,20010904,0,0,20010904_000000,,00010101_000000,...,0,,675XXXXXXX,42308,1868487,35137,10101,34,2,16
79159,2023_12,5416,14,NAVS,20010904,0,0,20010904_000000,,00010101_000000,...,0,,675XXXXXXX,42308,1868487,35137,10101,34,2,16
79160,2023_12,5417,14,NAVS,20010904,0,0,20010904_000000,,00010101_000000,...,0,,391XXXXXXX,30856,1416262,26718,10101,61,9,26
