In [1]:
import pandas as pd
from utils import preprocess_data, get_matches, select_matches

In [2]:
# Load the free-text substance names from the small test extract.
# To process the full export instead, read "C:/Substanzen/substances.csv"
# (";"-separated, latin1-encoded) and replace missing or empty "Bezeichnung"
# values with the string "NA" before matching.
sub_data = pd.read_csv(
    filepath_or_buffer="C:/Substanzen/Test_Daten.csv",
    sep=";",
    encoding="utf-8",
)
sub_data.head()

Unnamed: 0,Bezeichnung
0,Cisplatin
1,Etoposid
2,Carboplatin
3,Calciumfolinat
4,Avelumab


In [3]:
# Raw download link of the reference substance classification
# (columns seen in the preview below: therapieart, substanz, code).
URL_LINK = "https://gitlab.opencode.de/robert-koch-institut/zentrum-fuer-krebsregisterdaten/cancerdata-references/-/raw/main/data/v2/Klassifikationen/substanz.csv?ref_type=heads"

# The reference file is ";"-separated, like the local input.
reference_list = pd.read_csv(
    filepath_or_buffer=URL_LINK,
    sep=";",
)
reference_list.head()

Unnamed: 0,therapieart,substanz,code
0,HO,Abarelix,L02BX01
1,IM,Abatacept,L04AA24
2,ZS,Abemaciclib,L01EF03
3,IM,Abetimus,L04AA22
4,HO,Abirateron,L02BX03


In [4]:
# Pick the two Series the matching pipeline works on:
# the reference substance names to search for ...
col_with_ref_substances = reference_list["substanz"]
# ... and the free-text entries to be scanned.
col_with_substances = sub_data["Bezeichnung"]

In [5]:
def create_service_variable(
    col_with_free_text: pd.Series,
    col_with_refs: pd.Series,
    threshold_parameter: int = 85,
    pattern_to_split: str = r"[/,;+]|\bund\b|\boder\b",
) -> pd.DataFrame:
    """Run the full matching pipeline from utils.py on a free-text column.

    The steps are: ``preprocess_data`` on the free text, ``get_matches``
    against the reference column, ``select_matches`` on the candidates, and
    finally a left join of the selected matches onto the preprocessed input
    via the shared ``"ID"`` column.

    Args:
        col_with_free_text (pd.Series): Column whose text is scanned for
            substances.
        col_with_refs (pd.Series): Column with the reference substances to
            search for in the text.
        threshold_parameter (int, optional): Forwarded to ``get_matches``;
            controls the accuracy, a higher value means more accuracy.
            Defaults to 85.
        pattern_to_split (str, optional): Regular expression forwarded to
            ``select_matches``; defines when more than one match is allowed.
            The default covers the characters ``/ , ; +`` and the whole
            words ``und`` / ``oder``.

    Raises:
        ValueError: If an ID of the preprocessed input does not occur in the
            selected matches.
        ValueError: If the preprocessed input and the selected matches have
            a different number of rows.

    Returns:
        pd.DataFrame: The preprocessed input (including the original text)
        extended by the matched substances and their similarity score.
    """
    prepared = preprocess_data(col_with_free_text)
    candidates = get_matches(
        prepared, col_with_refs, threshold_parameter=threshold_parameter
    )
    chosen = select_matches(candidates, pattern_to_split=pattern_to_split)

    # Sanity check 1: every input row must have a counterpart in the result.
    ids_without_result = ~prepared["ID"].isin(chosen["ID"])
    if ids_without_result.any():
        raise ValueError("Not all IDs from input are in output")

    # Sanity check 2: the selection step must neither drop nor add rows.
    if len(chosen) != len(prepared):
        raise ValueError("Length of input and output differs")

    # Left join keeps the row set of the preprocessed input.
    return prepared.merge(chosen, on="ID", how="left")

In [6]:
# Match every free-text entry against the reference substances using the
# default threshold and split pattern ...
substances_with_service_variable = create_service_variable(
    col_with_free_text=col_with_substances,
    col_with_refs=col_with_ref_substances,
)
# ... and persist the result as a ";"-separated CSV without the index column.
substances_with_service_variable.to_csv(
    "output_AG_daten.csv",
    sep=";",
    index=False,
)

In [7]:
# Preview the first rows of the matching result.
substances_with_service_variable.head(n=5)

Unnamed: 0,ID,Original,Preprocessed_text,input,match,matched_to,similarity
0,1,Cisplatin,Cisplatin,Cisplatin,Cisplatin,Cisplatin,100
1,2,Etoposid,Etoposid,Etoposid,Etoposid,Etoposid,100
2,3,Carboplatin,Carboplatin,Carboplatin,Carboplatin,Carboplatin,100
3,4,Calciumfolinat,folinsäure,folinsäure,folinsäure,Folinsäure,100
4,5,Avelumab,Avelumab,Avelumab,Avelumab,Avelumab,100


In [8]:
# Share of rows for which a reference substance was found: a row counts as
# matched when "matched_to" is neither missing nor an empty string.
matched_to_col = substances_with_service_variable["matched_to"]
has_match = matched_to_col.notna() & matched_to_col.ne("")

matches_counter = has_match.sum()
total_rows = len(substances_with_service_variable)
proportion = matches_counter / total_rows

print(proportion)

0.5935378767309258
