In [None]:
import spacy
from spaczz.matcher import FuzzyMatcher
import pandas as pd
import Levenshtein
import re
import string

In [11]:
# Free-text substance entries to be coded; the file is semicolon-separated
# and read with latin1 encoding.
# NOTE(review): absolute local path - the notebook only runs on a machine
# that has this directory; consider a configurable data directory.
sub_data = pd.read_csv("C:/Substanzen/substances.csv", sep = ";", encoding="latin1")
#sub_data["Bezeichnung"] = sub_data["Bezeichnung"].replace({pd.NA: 'NA', '': 'NA'})
# Reference table of substances, read with the same separator and encoding.
ref_tab = pd.read_csv("C:/Substanzen/substanz_referenz.csv", sep = ";", encoding="latin1")
ref_tab.head()

Unnamed: 0,Therapieart,Substanz,Code
0,HO,Abarelix,L02BX01
1,IM,Abatacept,L04AA24
2,ZS,Abemaciclib,L01EF03
3,IM,Abetimus,L04AA22
4,HO,Abirateron,L02BX03


In [9]:
# Preview the first rows of the free-text table loaded above.
sub_data.head()

Unnamed: 0,idx,text,cnt,atc,created_on,ID
0,0,allogene SZT,1,,23.10.2023 10:30,1
1,1,"SIOP 2001/GPOH, Version 3",2,,23.10.2023 10:30,2
2,2,Bleomycinsulfat,1,,23.10.2023 10:30,3
3,3,Ganzkörperbestrahlung (o.n.A.),6,,23.10.2023 10:30,4
4,4,Buserelinacetat,3,,23.10.2023 10:30,5


In [12]:
# Reference substance names that the free text is matched against.
col_with_refs = ref_tab["Substanz"]
# Free-text entries that should be mapped to a reference substance.
col_with_free_text = sub_data["text"]

In [None]:
def prepare_free_text(input_col):
    """Build the working table for the free-text substance column.

    Every entry gets a running ``ID`` (starting at 1). Missing values and
    empty strings are replaced by the placeholder string ``'NA'`` so that the
    string functions applied later always receive text.

    Args:
        input_col (pandas.Series): column with free text for substances

    Returns:
        pandas.DataFrame: table with the columns ``ID`` and ``Original``
    """
    input_data = pd.DataFrame({
        "ID": range(1, len(input_col) + 1),
        "Original": input_col,
    })
    # fillna treats None, NaN and pd.NA alike; this does not depend on how
    # replace() compares pd.NA with the NaN values produced by read_csv.
    input_data["Original"] = input_data["Original"].fillna("NA").replace("", "NA")

    return input_data

def remove_short_words(s):
    """Drop every whitespace-separated token shorter than three characters.

    Args:
        s (string): string from free text field

    Returns:
        string: the remaining tokens joined by single spaces
    """
    return " ".join(filter(lambda token: len(token) > 2, s.split()))


def remove_unwanted_words(s):
    """Remove common filler words and unit abbreviations before matching.

    Matching is case-insensitive. Two groups are treated differently:

    * longer filler words (e.g. "wöchentlich", "version") are removed
      wherever they occur, including inside inflected forms;
    * short tokens ("bis", "mg", "kg", "m2", "o.n.a", "i.v") are only removed
      when they are not embedded in a longer word, so substance names such as
      "Bisoprolol" stay intact. A directly preceding digit is allowed
      ("100mg" -> "100").

    The dots of the abbreviations are matched literally.

    Args:
        s (string): string from free text field

    Returns:
        string: input string without unwanted words
    """
    filler_words = (
        r"wöchentlich|weekly|woche|allgemein|entsprechend|beendet|zyklus|version|"
        r"bezeichnet|entfällt"
    )
    # [^\W\d_] is "any letter": the lookarounds forbid a letter directly
    # before or after the token but still allow digits and punctuation.
    short_tokens = r"(?<![^\W\d_])(?:bis|mg|kg|m2|o\.n\.a\.?|i\.v\.?)(?![^\W\d_])"
    unwanted_words_pattern = filler_words + "|" + short_tokens
    return re.sub(unwanted_words_pattern, "", s, flags=re.IGNORECASE)


def find_5FU(s):
    """Normalise the abbreviation "5FU" and misspellings of Fluorouracil.

    Every variant listed in the pattern is replaced, case-insensitively, by
    the string "fluorouracil".

    Args:
        s (string): input string from free text field

    Returns:
        string: the input with each listed variant replaced; unchanged when
        no variant occurs
    """
    fu_variants = re.compile(
        r"5 fu|5fu|5-fu|5_fu|Fluoruracil|flourouracil|5-fluoruuracil|"
        r"5-fluoro-uracil|5-fluoruuracil|5-fluoruracil|floururacil|"
        r"5-fluorounacil|flourouraci|5-fluourouracil",
        re.IGNORECASE,
    )
    return fu_variants.sub("fluorouracil", s)

def calciumfolinat_to_folin(s):
    """Rewrite the whole word "Calciumfolinat" as "folinsäure".

    Entries are often reported as <Folinsaure (Calciumfolinat)>; translating
    the bracketed name avoids a separate match on calciumfolinat.
    The replacement is case-insensitive and only applies to the whole word.

    Args:
        s (string): input string from free text field

    Returns:
        string: the input with every whole-word occurrence replaced
    """
    whole_word = re.compile(r"\b(Calciumfolinat)\b", flags=re.IGNORECASE)
    return whole_word.sub("folinsäure", s)

def find_gemcitabin(s):
    """Fix the common typo "Gemcibatin" (also "Gemcibatine", optionally
    followed by the word "Mono") and replace it with "gemcitabin".

    The variants are expressed as one pattern with optional parts. As a plain
    alternation starting with the shortest spelling, the longer variants
    could never match and left "e Mono" behind.

    Args:
        s (string): input string from free text field

    Returns:
        string: Same string or string with fixed typo
    """
    # \b after "Mono" keeps longer words such as "Monotherapie" untouched.
    gemcitabin_pattern = r"Gemcibatine?(?: Mono\b)?"
    return re.sub(gemcitabin_pattern, "gemcitabin", s, flags=re.IGNORECASE)

def find_Paclitaxel_nab(s):
    """Rewrite "nab-Paclitaxel" / "nabPaclitaxel" as "Paclitaxel_nab".

    The replacement is case-insensitive.

    Args:
        s (string): input string from free text field

    Returns:
        string: the input with both spellings replaced; unchanged when
        neither occurs
    """
    nab_prefix_form = re.compile(r"nab-Paclitaxel|nabPaclitaxel", re.IGNORECASE)
    return nab_prefix_form.sub("Paclitaxel_nab", s)


def remove_special_symbols(s):
    """Delete symbol characters that hinder matching.

    Removes every character in the Unicode ranges listed in the pattern as
    well as the literal unit "m²".

    Args:
        s (string): input string from free text field

    Returns:
        string: the input without those symbols
    """
    symbol_matcher = re.compile(
        r"[\u24C0-\u24FF\u2100-\u214F\u2200-\u22FF\u2300-\u23FF\u2600-\u26FF\u2700-\u27BF\u2B50\u2B06]|m²"
    )
    return symbol_matcher.sub("", s)

def remove_trailing_leading_characters(s):
    """Strip commas, semicolons and whitespace from both ends of a string.

    All three kinds of characters are removed together, in any order and
    mix, so inputs such as "abc,;" or "abc , " are cleaned completely.
    Characters inside the string are left untouched.

    Args:
        s (string): input string from free text field

    Returns:
        string: the input without leading/trailing ",", ";" and whitespace
    """
    return re.sub(r"\A[\s,;]+|[\s,;]+\Z", "", s)


def preprocess_data(col_with_free_text):
    """Run the complete text clean-up on the free-text column.

    The cleaning functions are applied one after another to the ``Original``
    column; the final result is stored in ``Preprocessed_text``.

    Args:
        col_with_free_text (pandas.Series): column with free text for substances

    Returns:
        pandas.DataFrame: output of ``prepare_free_text`` with the additional
        column ``Preprocessed_text``
    """
    df = prepare_free_text(col_with_free_text)

    # Order matters: abbreviations such as "5 fu" must be expanded before
    # short words are dropped.
    cleaning_steps = (
        remove_unwanted_words,
        find_5FU,
        find_gemcitabin,
        find_Paclitaxel_nab,
        remove_short_words,
        remove_special_symbols,
        remove_trailing_leading_characters,
    )

    cleaned = df["Original"]
    for step in cleaning_steps:
        cleaned = cleaned.apply(step)

    df["Preprocessed_text"] = cleaned
    return df



In [30]:
# Preprocess the free-text column and preview original vs. cleaned text.
my_test_data = preprocess_data(col_with_free_text)
my_test_data.head()

Unnamed: 0,ID,Original,Preprocessed_text
0,1,allogene SZT,allogene SZT
1,2,"SIOP 2001/GPOH, Version 3",SIOP 2001/GPOH
2,3,Bleomycinsulfat,Bleomycinsulfat
3,4,Ganzkörperbestrahlung (o.n.A.),Ganzkörperbestrahlung
4,5,Buserelinacetat,Buserelinacetat


In [96]:
def get_matches(substance_df, ref_substance, threshold_parameter = 85):
    """Fuzzy-match every preprocessed free text against the reference substances.

    Each reference substance is registered as a pattern under the label
    "Substance" in a spaczz ``FuzzyMatcher``. Every match whose ratio is
    strictly greater than ``threshold_parameter`` produces one output row.
    Texts without such a match produce a single row with empty ``match``,
    ``matched_to`` and ``similarity``.

    NOTE(review): the matcher applies its own minimum ratio before matches
    are returned here - confirm the library default if a lower threshold is
    ever needed.

    Args:
        substance_df (pandas.DataFrame): table with the columns ``ID`` and
            ``Preprocessed_text``
        ref_substance (iterable): reference substance names
        threshold_parameter (int, optional): ratio a match has to exceed.
            Defaults to 85.

    Returns:
        pandas.DataFrame: columns ``ID``, ``input``, ``match``,
        ``matched_to``, ``similarity``, sorted by ``ID``; empty (but with
        these columns) when ``substance_df`` has no rows
    """
    nlp = spacy.blank("en")
    matcher = FuzzyMatcher(nlp.vocab)
    matcher.add("Substance", [nlp(str(sub)) for sub in ref_substance])

    result_columns = ["ID", "input", "match", "matched_to", "similarity"]
    results = []
    for id_num, text in zip(substance_df["ID"], substance_df["Preprocessed_text"]):
        doc = nlp(str(text))

        hits = [
            {
                "ID": id_num,
                "input": text,
                "match": doc[start:end].text,
                "matched_to": pattern,
                "similarity": ratio,
            }
            for _match_id, start, end, ratio, pattern in matcher(doc)
            if ratio > threshold_parameter
        ]

        if not hits:
            # Keep one placeholder row so that every ID appears in the output.
            hits = [{
                "ID": id_num,
                "input": text,
                "match": "",
                "matched_to": "",
                "similarity": "",
            }]

        results.extend(hits)

    # Explicit columns keep sort_values working for an empty result; the
    # stable sort preserves the order of the matches within one ID.
    results_df = pd.DataFrame(results, columns=result_columns).sort_values(
        by="ID", ascending=True, kind="stable")

    return results_df


In [97]:

# Match the preprocessed texts against the reference substances and store the
# raw matches as a semicolon-separated CSV in the working directory.
matches_found_df = get_matches(my_test_data, col_with_refs, threshold_parameter=85)
matches_found_df.to_csv("output_matches.csv", sep= ";")
matches_found_df.head()

Unnamed: 0,ID,input,match,matched_to,similarity
0,1,allogene SZT,,,
1,2,SIOP 2001/GPOH,,,
2,3,Bleomycinsulfat,,,
3,4,Ganzkörperbestrahlung,,,
4,5,Buserelinacetat,,,


In [None]:
def select_best_rows(group):
    """Pick the single best candidate row from the matches of one ID.

    Priority: the first row flagged ``exact_match``, otherwise the first row
    flagged ``detected_match``, otherwise the row with the smallest
    ``LV_distance``.

    Args:
        group (pandas.DataFrame): candidate rows with the columns
            ``exact_match``, ``detected_match`` and ``LV_distance``

    Returns:
        pandas.Series: the selected row
    """
    for flag_column in ("exact_match", "detected_match"):
        flagged = group.loc[group[flag_column] == 1]
        if len(flagged) > 0:
            return flagged.iloc[0]

    closest_label = group["LV_distance"].idxmin()
    return group.loc[closest_label]

def select_matches(matches_found_df, pattern_to_split = r"[/,;+]|\bund\b|\boder\b"):
    """Reduce the raw matches to one collapsed row per ID.

    Inputs that contain a separator (``pattern_to_split``) keep all of their
    matches. For every other input that has more than one match, only the
    best match is kept (see ``select_best_rows``). Finally all rows of one ID
    are collapsed into a single row whose ``match``, ``matched_to`` and
    ``similarity`` values are joined with "; ".

    NOTE(review): ``matched_to`` and ``similarity`` are de-duplicated while
    joining, ``match`` is not - the joined lists can therefore differ in
    length; confirm this is intended.

    Args:
        matches_found_df (pandas.DataFrame): output of ``get_matches`` with
            the columns ``ID``, ``input``, ``match``, ``matched_to``,
            ``similarity``
        pattern_to_split (str, optional): regular expression (matched
            case-insensitively) that marks an input as containing several
            substances.

    Returns:
        pandas.DataFrame: one row per ID, sorted by ``ID``
    """
    original_columns = matches_found_df.columns.tolist()

    has_separator = matches_found_df["input"].str.contains(
        pattern_to_split, case=False, regex=True, na=False)
    # Rows WITHOUT a separator: these are expected to name a single substance.
    single_substance_df = matches_found_df[~has_separator].copy()

    single_substance_df["match_count"] = single_substance_df.groupby("ID")["ID"].transform("count")
    # Stable sort keeps the order of the matches within one ID, which decides
    # which row select_best_rows sees "first".
    select_df = single_substance_df[single_substance_df['match_count'] > 1].sort_values(
        by = "ID", ascending = True, kind = "stable")

    if select_df.empty:
        # Nothing is ambiguous. The row-wise apply calls below do not return
        # a Series for an empty frame, so skip the selection step entirely.
        results_df = matches_found_df
    else:
        select_df['exact_match'] = (select_df['input'].astype(str) == select_df['matched_to'].astype(str)).astype(int)

        # Flag rows whose whole input occurs inside the reference name.
        select_df['detected_match'] = select_df.apply(
            lambda row: str(row['input']).lower() in str(row['matched_to']).lower(), axis=1).astype(int)

        select_df['LV_distance'] = select_df.apply(lambda row: Levenshtein.distance(str(row['input']), str(row['matched_to'])), axis=1)
        best_matches = select_df.groupby('ID')[select_df.columns.tolist()].apply(select_best_rows, include_groups=True).reset_index(drop=True)
        selected_matches = best_matches[original_columns].copy()

        # Replace all candidate rows of the ambiguous IDs by their best match.
        subset_df1 = matches_found_df[~matches_found_df['ID'].isin(selected_matches['ID'])]

        results_df = pd.concat([subset_df1, selected_matches], ignore_index=True)

    collapsed_df = (
        results_df.groupby("ID").agg({
            "input": "first",
            "match": lambda x: "; ".join(x.dropna().astype(str)),
            "matched_to": lambda x: "; ".join(dict.fromkeys(x.dropna().astype(str))),
            "similarity": lambda x: "; ".join(dict.fromkeys(x.dropna().astype(str))),
        }).reset_index()
    )

    return collapsed_df.sort_values(by = "ID", ascending = True)

In [101]:
# Collapse the raw matches to one row per ID.
test = select_matches(matches_found_df)

In [107]:
def create_service_variable(col_with_free_text, col_with_refs,
                            threshold_parameter = 85, pattern_to_split = r"[/,;+]|\bund\b|\boder\b"):
    """Run the whole pipeline: preprocess, match, select and merge.

    Args:
        col_with_free_text (pandas.Series): column with free text for substances
        col_with_refs (iterable): reference substance names
        threshold_parameter (int, optional): ratio a fuzzy match has to
            exceed. Defaults to 85.
        pattern_to_split (str, optional): regular expression that marks an
            input as containing several substances.

    Returns:
        pandas.DataFrame: the preprocessed table left-joined on ``ID`` with
        the selected matches

    Raises:
        ValueError: if an input ID is missing from the selected matches or
            if the number of rows differs between input and output
    """
    preprocessed_data = preprocess_data(col_with_free_text)

    matches_df = get_matches(preprocessed_data, col_with_refs, threshold_parameter = threshold_parameter)

    selected_matches_df = select_matches(matches_df, pattern_to_split = pattern_to_split)

    if not preprocessed_data['ID'].isin(selected_matches_df['ID']).all():
        raise ValueError("Not all IDs from input are in output")

    if len(preprocessed_data) != len(selected_matches_df):
        raise ValueError("Length of input and output differs")

    # Merge with the table built in THIS call, not with a notebook global,
    # so the result only depends on the arguments.
    out_df = preprocessed_data.merge(selected_matches_df, on="ID", how='left')

    return out_df

# Run the complete pipeline on the loaded columns and preview the result.
# Note: this reuses the name `test` assigned in an earlier cell.
test = create_service_variable(col_with_free_text, col_with_refs)
test.head()


Unnamed: 0,ID,Original,Preprocessed_text,input,match,matched_to,similarity
0,1,allogene SZT,allogene SZT,allogene SZT,,,
1,2,"SIOP 2001/GPOH, Version 3",SIOP 2001/GPOH,SIOP 2001/GPOH,,,
2,3,Bleomycinsulfat,Bleomycinsulfat,Bleomycinsulfat,,,
3,4,Ganzkörperbestrahlung (o.n.A.),Ganzkörperbestrahlung,Ganzkörperbestrahlung,,,
4,5,Buserelinacetat,Buserelinacetat,Buserelinacetat,,,


In [109]:
# Share of input rows that received at least one match: a row counts as
# matched when `matched_to` is neither missing nor the empty string.
matches_counter = (test['matched_to'].notna() & (test['matched_to'] != "")).sum()
total_rows = len(test)
proportion = matches_counter / total_rows

print(proportion)

0.6585051546391752
