In [1]:
import numpy as numpy
import pandas as pandas
import json
import re
import ast


df = pandas.read_csv('inferencia.csv')
df['Farmacos'] = df['Farmacos'].apply(ast.literal_eval)

# Previous work

The previous step was the development of a entity recognition model to identify drugs on legal texts. The model was trained with SpaCy and the dataset resulting dataset is shown below.

The comumn "Farmacos" contains the drugs recognized by the model for each text.

In [36]:
df

Unnamed: 0,id,text,Farmacos
0,4031,.sentença nº. /2017 - tipo: bpje n º. 0804317-...,"[FRATURAS, FORTÉO, TERIPARATIDA DERIVADA DE, F..."
1,4032,processo nº: 0809729-87.2020.4.05.8100 - proce...,[]
2,4033,"pessoa humana, uma vez que se encontra intimam...",[SANDOSTATIN]
3,4034,sentença/2017 - tipo bprocesso: 0806234-74.201...,[ZYTIGA]
4,4035,aos municípios para o pagamento de ações e ser...,[PEGVISOMANTO]
...,...,...,...
395,4426,presença cumulativa dos seguintes requisitos: ...,"[VIDAZA, AZACITIDINA]"
396,4427,processo nº: 0802964-42.2016.4.05.8100 - proce...,"[RITUXIMABE, RITUXIMABE, MABTHERA]"
397,4428,poder judiciário justiça federal de primeiro g...,"[REVOLADE, ELTROMBOPAG, REVOLADE]"
398,4429,na constituição federal. o ponto turbulento re...,[ZYTIGA]


# Lets explore the labels


In [2]:
# Total number of drug recognized (not unique)
num_total_farmacos = df['Farmacos'].apply(len).sum()
num_total_farmacos

722

In [38]:
# Function to generate a list of unique drug
def lista_farmacos_unicos(farmacos):    
    
    # Iniciate a empty set to store the unique names
    farmacos_unic = set()

    # Iterate over each list in the list of lists
    for lista in farmacos:
        # Add all elements from the current list to the unique name set
        farmacos_unic.update(lista)

    # Convert the set of unique names back to a list (optional)
    farmacos_unic = list(farmacos_unic)
    return farmacos_unic

farmacos_unic = lista_farmacos_unicos(df['Farmacos'])

# Print the number of unique names
print(len(farmacos_unic))

# Print the inference labels 
farmacos_unic

185


['SOMATULINE AUTOGEL',
 'DALINVI',
 'DOXORRUBICINA',
 'ABIRATERONA',
 'NESTE CENÁRIO CLÍNICO',
 'LIPOSSOMAL',
 'INOTERSEN',
 'VENCLEXTA',
 'KYPROLIS',
 'FRATURAS',
 'BRENTUXIMAB VEDOTINA',
 'ACETATO DE OCTREOTIDA',
 'ILOPROST',
 'BELIMUMABE',
 'IBRANCE',
 'REVOLADE',
 'ESILATO DE NINTEDANIBE',
 'CYSTAGON',
 'STELARA',
 'CDB',
 'RCUI',
 'ACETATO ABIRATERONA',
 'KUVAN',
 'FORTEO',
 'TRASTUZUMABE',
 'REVLIMID',
 'METOTREXATO',
 'TEMOZOLAMINA',
 'JAVAKI',
 'REPLAGAL',
 'NATALIZUMABE',
 'PLERIXAFOR',
 'ADCETRIS',
 'DEXAMETASONA',
 'AFINITOR',
 'HERCEPTIN',
 'ENZALUTAMIDA',
 'ZYTIGA',
 'RANIBIZUMABE',
 'INTERFERON ALFA',
 'NEXAVAR',
 'ELETROMBOPAG OLAMINA',
 'USTEQUINUMABE',
 'MABTHERA',
 'ZOLADEX',
 'ILARIS',
 'OCREVUS',
 'ACETATO DE CIPROTERONA',
 'VERTEBRAIS',
 'ALECENSA',
 'GAZYVA',
 'MIGALASTAT',
 'FORTÉO',
 'EVEROLIMUS',
 'VENVANSE',
 'REPAGLAL',
 'ADALIMUMABE',
 'NUSINERSEN',
 'TEMOZOLOMIDA',
 'OPDIVO',
 'HUMIRA',
 'METILFENIDATO',
 'BRENTUXIMAB VEDOTIN',
 'VENETOCLAX',
 'TEMOZOLAMIDA

# Levenshtein Distance

We could see that we have 185 unique labels, with a total 722 labels recognized. A common problem on entity recognition is ambiguity, where the same entity has a different meaning depending on the context. In this case, we lead with something like the opposite. We have different labels that refer to the same drug. This situation happens because some texts contains the active ingredient, the commercial name or both. In the scenario of Business Intelligence, that is a problem. Besides that, there are always typing erros.  So, we are going to use the levenshtein distance to identify the variations of the same name and reduce them into only one.

In [39]:
import textdistance

# Function to calculate the Levenshtein distance between two elements
def levenshtein_distance(s1, s2):
    return textdistance.levenshtein.distance(s1, s2)

# Function to find pairs with smallest Levenshtein distance
def encontrar_pares_menor_distancia(lista):
    menor_distancia = float('inf')
    pares_menor_distancia = []

    for i in range(len(lista)):
        for j in range(i + 1, len(lista)):
            distancia = levenshtein_distance(lista[i], lista[j])
            if distancia < menor_distancia and distancia != 0:
                menor_distancia = distancia
                pares_menor_distancia = [(lista[i], lista[j])]
            elif distancia == menor_distancia:
                pares_menor_distancia.append((lista[i], lista[j]))
    return pares_menor_distancia

resultado = encontrar_pares_menor_distancia(farmacos_unic)

In [40]:
import itertools

#Function to remove duplicated pairs
def removeDuplicates(lst):
    lst.sort()
    grouped = itertools.groupby(lst)
    unique = [key for key,_ in grouped]
    return unique
 
resultado = removeDuplicates(resultado)
resultado

[('ADALIMUMABE', 'ADALIMIMABE'),
 ('BEVACIZUMABE', 'BEVACIZUMAB'),
 ('BRENTUXIMAB VEDOTIN', 'BRENTUXIMAB VEDITIN'),
 ('BRENTUXIMAB VEDOTIN', 'BRENTUXIMABE VEDOTIN'),
 ('BRENTUXIMAB VEDOTINA', 'BRENTUXIMAB VEDOTIN'),
 ('CETUXIMABE', 'CETUXIMAB'),
 ('ESILATO DE NINTEDANIBE', 'ESTILATO DE NINTEDANIBE'),
 ('FORTEO', 'FORTÉO'),
 ('NUSINERSEN', 'NUSINERSENA'),
 ('OBINUTUZUMAB', 'OBINUTUZUMABE'),
 ('OCTREOTIDE', 'OCTREOTIDA'),
 ('PAZOPANIB', 'PAZOPANIBE'),
 ('SANDOSTANTIN LAR', 'SANDOSTATIN LAR'),
 ('SUTENT', 'SUSTENT'),
 ('TEMOZOLAMINA', 'TEMOZOLAMIDA'),
 ('TEMOZOLOMIDA', 'TEMOZOLAMIDA'),
 ('XTADI', 'XTANDI'),
 ('ZYTIGA', 'ZITIGA')]

In [41]:
from collections import defaultdict

def agrupar_tuplas(tuplas):
    # Create a dictionary to map each node to the set of nodes connected to it
    graph = defaultdict(set)
    for tupla in tuplas:
        for elemento in tupla:
            graph[elemento].update(tupla)

    # Function to find all nodes connected to a given node
    def encontrar_conectados(nodo, visitados):
        visitados.add(nodo)
        for vizinho in graph[nodo]:
            if vizinho not in visitados:
                encontrar_conectados(vizinho, visitados)

    # Find groups of connected elements and group their corresponding tuples
    grupos = []
    visitados = set()
    for nodo in graph:
        if nodo not in visitados:
            grupo = set()
            encontrar_conectados(nodo, grupo)
            novo_grupo = [tupla for tupla in tuplas if any(elem in grupo for elem in tupla)]
            if novo_grupo not in grupos:
                grupos.append(novo_grupo)

    return grupos

tuplas_combinadas = agrupar_tuplas(resultado)

# Typing erros

Those are some variations of the same name. But how can we find the correct one? That is simple. We just need to compare to a certified document that contains the medicine registration. In this case, the anvisa table is consulted.

In [42]:
tuplas_combinadas

[[('ADALIMUMABE', 'ADALIMIMABE')],
 [('BEVACIZUMABE', 'BEVACIZUMAB')],
 [('BRENTUXIMAB VEDOTIN', 'BRENTUXIMAB VEDITIN'),
  ('BRENTUXIMAB VEDOTIN', 'BRENTUXIMABE VEDOTIN'),
  ('BRENTUXIMAB VEDOTINA', 'BRENTUXIMAB VEDOTIN')],
 [('CETUXIMABE', 'CETUXIMAB')],
 [('ESILATO DE NINTEDANIBE', 'ESTILATO DE NINTEDANIBE')],
 [('FORTEO', 'FORTÉO')],
 [('NUSINERSEN', 'NUSINERSENA')],
 [('OBINUTUZUMAB', 'OBINUTUZUMABE')],
 [('OCTREOTIDE', 'OCTREOTIDA')],
 [('PAZOPANIB', 'PAZOPANIBE')],
 [('SANDOSTANTIN LAR', 'SANDOSTATIN LAR')],
 [('SUTENT', 'SUSTENT')],
 [('TEMOZOLAMINA', 'TEMOZOLAMIDA'), ('TEMOZOLOMIDA', 'TEMOZOLAMIDA')],
 [('XTADI', 'XTANDI')],
 [('ZYTIGA', 'ZITIGA')]]

In [43]:
# Function to transform the list of tuple lists into a unique list for each tuple list
def transformar_lista(lista_de_tuplas):
    lista_final = []
    for sublist in lista_de_tuplas:
        elementos_unicos = set()
        for tupla in sublist:
            elementos_unicos.update(tupla)
        lista_final.append(list(elementos_unicos))
    return lista_final

listas_combinadas = transformar_lista(tuplas_combinadas)

In [44]:
listas_combinadas

[['ADALIMUMABE', 'ADALIMIMABE'],
 ['BEVACIZUMAB', 'BEVACIZUMABE'],
 ['BRENTUXIMAB VEDOTINA',
  'BRENTUXIMABE VEDOTIN',
  'BRENTUXIMAB VEDITIN',
  'BRENTUXIMAB VEDOTIN'],
 ['CETUXIMAB', 'CETUXIMABE'],
 ['ESILATO DE NINTEDANIBE', 'ESTILATO DE NINTEDANIBE'],
 ['FORTÉO', 'FORTEO'],
 ['NUSINERSEN', 'NUSINERSENA'],
 ['OBINUTUZUMABE', 'OBINUTUZUMAB'],
 ['OCTREOTIDA', 'OCTREOTIDE'],
 ['PAZOPANIBE', 'PAZOPANIB'],
 ['SANDOSTATIN LAR', 'SANDOSTANTIN LAR'],
 ['SUTENT', 'SUSTENT'],
 ['TEMOZOLAMIDA', 'TEMOZOLOMIDA', 'TEMOZOLAMINA'],
 ['XTADI', 'XTANDI'],
 ['ZITIGA', 'ZYTIGA']]

# Comparison with anvisa table

The anvisa table below contains the commercial name and the corresponding active ingredient

In [45]:
anvisa = pandas.read_csv('AVISA.csv', sep=';')
anvisa = anvisa[['nome_comercial','principio_ativo']]
anvisa.drop_duplicates(inplace=True)
anvisa

Unnamed: 0,nome_comercial,principio_ativo
0,BAYCUTEN N,21-ACETATO DE DEXAMETASONA;CLOTRIMAZOL
1,ORENCIA,ABATACEPTE
4,REOPRO,ABCIXIMABE
6,VERZENIOS,ABEMACICLIBE
14,CIBINQO,ABROCITINIBE
...,...,...
29312,ACCUVIT,ÓXIDO CÚPRICO;SELENATO DE SÓDIO;ACETATO DE RAC...
29313,SIMECO PLUS,ÓXIDO DE MAGNÉSIO;SIMETICONA;HIDRÓXIDO DE ALUM...
29314,VITAGLÓS,ÓXIDO DE ZINCO
29316,PRATIGLÓS,ÓXIDO DE ZINCO;RETINOL;COLECALCIFEROL


## Generating dictionary to map the correct commercial names

In [46]:
# Dictionary to store the results
mapping_nome_comercial = {}

# Loop to map the commercial names from the Anvisa DataFrame to the list of possible names
for nome_anvisa in anvisa['nome_comercial'].tolist():
    for lista in listas_combinadas:
        if nome_anvisa in lista:
            mapping_nome_comercial[nome_anvisa] = lista

## Generating dictionary to map the correct active ingredients

In [47]:
# Dictionary to store the results
mapping_principio_ativo = {}

# Loop to map the active ingredients from the Anvisa DataFrame to the list of possible names
for nome_anvisa in anvisa['principio_ativo'].tolist():
    for lista in listas_combinadas:
        if nome_anvisa in lista:
            mapping_principio_ativo[nome_anvisa] = lista

# Combining the two dictionaries
mapping_farmacos = mapping_nome_comercial | mapping_principio_ativo
mapping_farmacos

{'ZYTIGA': ['ZITIGA', 'ZYTIGA'],
 'XTANDI': ['XTADI', 'XTANDI'],
 'ESILATO DE NINTEDANIBE': ['ESILATO DE NINTEDANIBE',
  'ESTILATO DE NINTEDANIBE'],
 'SUTENT': ['SUTENT', 'SUSTENT'],
 'TEMOZOLOMIDA': ['TEMOZOLAMIDA', 'TEMOZOLOMIDA', 'TEMOZOLAMINA'],
 'FORTEO': ['FORTÉO', 'FORTEO'],
 'ADALIMUMABE': ['ADALIMUMABE', 'ADALIMIMABE'],
 'BEVACIZUMABE': ['BEVACIZUMAB', 'BEVACIZUMABE'],
 'CETUXIMABE': ['CETUXIMAB', 'CETUXIMABE'],
 'NUSINERSENA': ['NUSINERSEN', 'NUSINERSENA'],
 'OBINUTUZUMABE': ['OBINUTUZUMABE', 'OBINUTUZUMAB'],
 'OCTREOTIDA': ['OCTREOTIDA', 'OCTREOTIDE']}

### The following code is used to remove drugs that have one letter difference and are on the Anvisa list, indicating that there is no typing error

In [48]:
# Inverting the dictionary to have lists as keys and original keys as values
inverted_dict = {}
for chave, valores in mapping_farmacos.items():
    valores_tupla = tuple(valores)
    if valores_tupla not in inverted_dict:
        inverted_dict[valores_tupla] = [chave]
    else:
        inverted_dict[valores_tupla].append(chave)

# Removing duplicate lists
for valores, chaves in inverted_dict.items():
    if len(chaves) > 1:
        for chave in chaves[1:]:
            del mapping_farmacos[chave]

## Correction dictionary

Now we have the dict to fix the typing errors

In [49]:
dicionario_correcao = {}
for chave, valores in mapping_farmacos.items():
    for valor in valores:
        if valor != chave:
            dicionario_correcao[valor] = chave

dicionario_correcao

{'ZITIGA': 'ZYTIGA',
 'XTADI': 'XTANDI',
 'ESTILATO DE NINTEDANIBE': 'ESILATO DE NINTEDANIBE',
 'SUSTENT': 'SUTENT',
 'TEMOZOLAMIDA': 'TEMOZOLOMIDA',
 'TEMOZOLAMINA': 'TEMOZOLOMIDA',
 'FORTÉO': 'FORTEO',
 'ADALIMIMABE': 'ADALIMUMABE',
 'BEVACIZUMAB': 'BEVACIZUMABE',
 'CETUXIMAB': 'CETUXIMABE',
 'NUSINERSEN': 'NUSINERSENA',
 'OBINUTUZUMAB': 'OBINUTUZUMABE',
 'OCTREOTIDE': 'OCTREOTIDA'}

# Fixing the dataset labels

In [50]:
# Function to fix drug names according to the dictionary
def corrigir_medicamentos(lista_medicamentos):
    lista_corrigida = lista_medicamentos.copy()  # Create a copy of the list data
    for i, medicamento in enumerate(lista_corrigida):
        if medicamento in dicionario_correcao:
            lista_corrigida[i] = dicionario_correcao[medicamento]
    return lista_corrigida

# Applying the function to the DataFrame's medicine column
df['farmacos_corrigidos'] = df['Farmacos']
df['farmacos_corrigidos'] = df['farmacos_corrigidos'].apply(corrigir_medicamentos)


In [51]:
# Total number of labels in the dataset
num_total_farmacos_corrigidos = df['farmacos_corrigidos'].apply(len).sum()
num_total_farmacos_corrigidos

722

In [52]:
#Remove the empty labels lists
farmacos_unic = lista_farmacos_unicos(df['farmacos_corrigidos'])
#Print the number of unic labels
len(farmacos_unic)
#The number of unic labels reduced from 185 to 172 in consequence of the replacement of the names with spelling errors

172

In [53]:
dicionario_principios_ativos = anvisa.set_index('principio_ativo')['nome_comercial'].to_dict()

#Function to count the number of active ingredients in the dataset labels
def count_principios_ativos(df):
    principios_ativos = []
    for index,row in df.iterrows():
        for farmaco in row['farmacos_corrigidos']:
            if farmaco in dicionario_principios_ativos:
                principios_ativos.append(farmaco)
    return principios_ativos

principios_ativos = count_principios_ativos(df)
len(principios_ativos)

226

In [54]:
#Print the number of unique active ingredients
principios_ativos_unic = lista_farmacos_unicos(principios_ativos)
len(principios_ativos_unic)

22

## Replacement of the active ingredient with a commercial name using the Anvisa table

In [55]:
# Generating a dictionary to replace the active ingredient with the commercial name
dicionario_principios_ativos = anvisa.set_index('principio_ativo')['nome_comercial'].to_dict()

# Function to replace active ingredient in the drug column
def corrigir_farmacos(row):
    farmacos_corrigidos = []
    for farmaco in row['farmacos_corrigidos']:
        if farmaco in dicionario_principios_ativos:
            farmacos_corrigidos.append(dicionario_principios_ativos[farmaco])
        else:
            farmacos_corrigidos.append(farmaco)
    return farmacos_corrigidos

# Applying the correction to the ataset
df['farmacos_final'] = df.apply(corrigir_farmacos, axis=1)


In [56]:
#Print the number of unique labels
farmacos_unic = lista_farmacos_unicos(df['farmacos_final'])
#Remove os rótulos vazios
len(farmacos_unic)

149

# Results

The redunant labels were removed. The number of unique labels decreased from 185 to 149. That is almost 20%.

Now, the labels colunms contains only commercial names.

In [57]:
df

Unnamed: 0,id,text,Farmacos,farmacos_corrigidos,farmacos_final
0,4031,.sentença nº. /2017 - tipo: bpje n º. 0804317-...,"[FRATURAS, FORTÉO, TERIPARATIDA DERIVADA DE, F...","[FRATURAS, FORTEO, TERIPARATIDA DERIVADA DE, F...","[FRATURAS, FORTEO, TERIPARATIDA DERIVADA DE, F..."
1,4032,processo nº: 0809729-87.2020.4.05.8100 - proce...,[],[],[]
2,4033,"pessoa humana, uma vez que se encontra intimam...",[SANDOSTATIN],[SANDOSTATIN],[SANDOSTATIN]
3,4034,sentença/2017 - tipo bprocesso: 0806234-74.201...,[ZYTIGA],[ZYTIGA],[ZYTIGA]
4,4035,aos municípios para o pagamento de ações e ser...,[PEGVISOMANTO],[PEGVISOMANTO],[SOMAVERT]
...,...,...,...,...,...
395,4426,presença cumulativa dos seguintes requisitos: ...,"[VIDAZA, AZACITIDINA]","[VIDAZA, AZACITIDINA]","[VIDAZA, XPREZA]"
396,4427,processo nº: 0802964-42.2016.4.05.8100 - proce...,"[RITUXIMABE, RITUXIMABE, MABTHERA]","[RITUXIMABE, RITUXIMABE, MABTHERA]","[TRUXIMA, TRUXIMA, MABTHERA]"
397,4428,poder judiciário justiça federal de primeiro g...,"[REVOLADE, ELTROMBOPAG, REVOLADE]","[REVOLADE, ELTROMBOPAG, REVOLADE]","[REVOLADE, ELTROMBOPAG, REVOLADE]"
398,4429,na constituição federal. o ponto turbulento re...,[ZYTIGA],[ZYTIGA],[ZYTIGA]
