In [10]:
import numpy as np
import torch as torch
import matplotlib.pyplot as plt
import pickle
from sklearn.decomposition import PCA

# Função para associar cada categoria a um número inteiro, na ordem listada

In [11]:
def createCategoryIndex(categories):
    """Build two lookup tables between category names and integer positions.

    Parameters
    ----------
    categories : iterable
        Category names (hashable values), numbered 0, 1, 2, ... in iteration order.

    Returns
    -------
    tuple of (dict, dict)
        ``category_index`` maps each category to its position and
        ``index_category`` is the inverse mapping (position -> category).
        When a category appears more than once, its last position is the one
        kept, and only that position is present in ``index_category``.
    """
    # Forward table: category name -> position in the input sequence.
    category_index = {}
    for position, name in enumerate(categories):
        category_index[name] = position

    # Reverse table, derived from the forward one so both always agree.
    index_category = dict(
        (position, name) for name, position in category_index.items()
    )

    return category_index, index_category

# Carrega a lista de categorias

In [12]:
# Load the category (illness) names from the saved NumPy array and turn them into a list.
illnessNames = list(np.load('Categories/illnessNames.npy'))
# Bare last expression so the notebook displays the loaded list.
illnessNames

['Arthritis',
 'Breathlessness',
 'Bronchitis',
 'Bursitis',
 'Conjunctivitis',
 'Dermatitis',
 'Epilepsy',
 'Esophagitis',
 'Essential tremor',
 'Fatty liver',
 'Gastritis',
 'Gastroesophageal reflux',
 'Glaucoma',
 'Gout',
 'Hiatus hernia',
 'Hypertriglyceridemia',
 'Hyperuricemia',
 'Inflammation of the tibial nerve',
 'Keratoconus',
 'Kidney failure',
 'Kidney stones',
 'Ligament problems',
 'Lumbar tumor',
 'Lymphedema of the lower members',
 'Migraine',
 'Poliomyelitis',
 'Polycystic kidney disease',
 'Progressive breast cancer',
 'Prostate disease',
 'Prostatitis',
 'Rheumatism',
 'Sickle cell anemia',
 'Sinusitis',
 'Skin cancer',
 'Skin disease',
 'Spine problem',
 'Stroke',
 'Varicose veins of the legs',
 'Vascular leakage of the eyes',
 'Anemia',
 'Arrhythmia',
 'Ascending colon cancer',
 'Deep vein thrombosis',
 'Depression',
 'Endometriosis',
 'Hashimoto disease',
 'Heel spurs',
 'Hepatitis B',
 'Intestine disorder',
 'Melanoma',
 "Parkinson's disease",
 'Prostatic hyperpl

# Carrega os vetores do banco de dados BioWordVec

In [19]:
from gensim.models import KeyedVectors

# Load the pre-trained word vectors, stored in binary word2vec format.
model = KeyedVectors.load_word2vec_format('wordEmbeddings/bio_embedding_extrinsic', binary=True)
# Alternative embedding file, kept for reference:
# model = KeyedVectors.load_word2vec_format('wordEmbeddings/BioWordVec_PubMed_MIMICIII_d200.vec.bin', binary=True)

# gensim >= 4.0 removed `KeyedVectors.vocab` (accessing it raises AttributeError);
# the word table is now `key_to_index`. Fall back to `vocab` so the cell still
# runs on gensim 3.x, where `key_to_index` does not exist.
vocabularySize = len(model.key_to_index if hasattr(model, 'key_to_index') else model.vocab)
print('São',vocabularySize,'palavras')

São 2324849 palavras


Por exemplo, a palavra 'allergy' tem o seguinte vetor:

In [20]:
# Show the embedding vector stored in the model for the word 'allergy'.
print(model['allergy'])

[ 0.25168544 -0.01934091 -0.40316743 -0.02268038  0.5688322   0.27722085
 -0.0312943  -0.48896465 -0.5185682   0.01411594  0.461625   -0.4290582
  0.00461783  0.01684239  0.17359051 -0.13645998 -0.05102435  0.01890971
  0.07020991 -0.16557398  0.16522539 -0.24913496 -0.3511754  -0.2106227
 -0.16456136 -0.40960917 -0.59124935  0.10545944  0.2148732  -0.21801609
  0.27340266  0.23580965 -0.04663253 -0.12203694  0.26754642 -0.05086051
  0.250879    0.45118695 -0.20331602 -0.41336206 -0.3653256  -0.22952196
  0.25110513 -0.41036364 -0.37013587 -0.02889536 -0.16636217  0.04712204
 -0.52487886 -0.14184186  0.20138535  0.2842646   0.34517968 -0.09384063
 -0.75165975 -0.5146247   0.16460857  0.07442839 -0.416404   -0.06224834
  0.00303303 -0.0937304   0.07938774 -0.05778766 -0.15059349  0.19243024
 -0.5404742   0.24347076  0.74062157 -0.5394397  -0.4908475  -0.03027103
  0.46166545  0.08245455  0.51724416 -0.11777794 -0.03820566  0.33912063
 -0.1630622  -0.27157554 -0.3743193  -0.48528415  0.2

Note que cada vetor é um vetor do Numpy com 200 números.

# Modifica os nomes da lista de categorias para adequar às palavras existentes no banco de palavras

Como algumas palavras da lista de categorias não estão escritas da mesma forma que no banco BioWordVec, é necessário modificar algumas delas. No banco, todas as palavras têm apenas letras minúsculas e os espaços são representados por hifens.

In [14]:
illnessNames = list(np.load('Categories/illnessNames.npy'))

# Ordered (old, new) substring substitutions that map the category spelling to
# the spelling used as a key in the embedding vocabulary. They are applied
# one after another, in this order, to the lower-cased, hyphenated name.
# NOTE(review): 'chodromalacia' presumably mirrors the spelling used in the
# category file -- confirm before "fixing" it.
nameSubstitutions = (
    ('_total', ''),
    ('hiatus-hernia', 'hernia-hiatal'),
    ('inflammation-of-the-tibial-nerve', 'tarsal-tunnel-syndrome'),
    ('ligament-problems', 'ligament-injured'),
    ('lumbar-tumor', 'spinal-cord-neoplasms'),
    ('lymphedema-of-the-lower-members', 'lymphedema'),
    ('polycystic-kidney-disease', 'polycystic-kidney-diseases'),
    ('progressive-breast-cancer', 'breast-cancer'),
    ('prostate-disease', 'prostate-related'),
    ('sickle-cell-anemia', 'anemia-sickle-cell'),
    ('spine-problem', 'back-pain'),
    ('varicose-veins-of-the-legs', 'varicose-veins'),
    ('vascular-leakage-of-the-eyes', 'retinopathy'),
    ('ascending-colon-cancer', 'colorectal-cancer'),
    ('deep-vein-thrombosis', 'vein-thrombosis'),
    ('heel-spurs', 'heel-spur'),
    ('intestine-disorder', 'gastrointestinal-diseases'),
    ("parkinson's-disease", 'parkinson-disease'),
    ("thalassemia-minor", 'thalassemia'),
    ('vasovagal-syncope', 'syncope-vasovagal'),
    ('herniated-lumbar-disc', 'intervertebral-disc-displacement'),
    ('chodromalacia-of-the-knees', 'chondromalacia-patellae'),
)

# Build the modified list in the same order as illnessNames, so position i in
# one list corresponds to position i in the other.
illnessNamesModified = []
for originalName in illnessNames:
    # Lower-case the name and use hyphens instead of spaces.
    modifiedName = originalName.lower().replace(' ', '-')
    for oldText, newText in nameSubstitutions:
        modifiedName = modifiedName.replace(oldText, newText)
    illnessNamesModified.append(modifiedName)



# Guarda os vetores correspondentes a cada uma das categorias em uma matriz w do PyTorch 

Cada um dos vetores tem 200 dimensões. 

In [15]:
# Fail early, listing every category name that has no vector in the model,
# instead of stopping with a bare KeyError on the first missing one.
missingNames = [name for name in illnessNamesModified if name not in model]
if missingNames:
    raise KeyError('Names not found in the embedding model: ' + ', '.join(missingNames))

# One row per category; the row width comes from the loaded model rather than
# a hard-coded 200, so a different embedding file works without edits.
w = torch.zeros(len(illnessNamesModified), model.vector_size)
for i, name in enumerate(illnessNamesModified):
    # Row i holds the embedding of the i-th category (names are already lower-cased).
    w[i,:] = torch.tensor(model[name])
w


tensor([[ 0.4582,  0.2512, -0.2147,  ..., -0.2887, -0.1088,  0.0061],
        [ 0.0910,  0.4931, -0.0159,  ...,  0.3702,  0.1964, -0.0037],
        [ 0.6462,  0.5825, -0.1736,  ..., -0.0066,  0.1296,  0.4769],
        ...,
        [ 0.3149,  0.1600, -0.3912,  ...,  0.3093, -0.1735, -0.0782],
        [ 0.3069,  0.1590, -0.5044,  ...,  0.3318, -0.0114, -0.0812],
        [ 0.2178,  0.1325, -0.2937,  ...,  0.3365,  0.0918,  0.0417]])

# Redução de dimensionalidade de 200 para 3 utilizando PCA.

In [16]:
# PCA configured to keep 3 components.
pca = PCA(n_components=3)
# Fit the PCA on the embedding matrix (converted to NumPy) and project every
# row onto the 3 components; the result is wrapped back into a torch tensor.
wIllness = torch.tensor(pca.fit_transform(w.numpy()))

# Relaciona cada categoria a um número

In [17]:
# Build the name -> index and index -> name lookup tables for the illness categories.
illness_index, index_illness = createCategoryIndex(illnessNames)

Por exemplo, a palavra 'Rheumatism' é associada a um número.

In [24]:
# Look up the integer index assigned to the category 'Rheumatism'.
illness_index['Rheumatism']

30

Para saber o vetor de cada categoria, basta usar o índice correspondente àquela categoria para selecionar a linha da matriz wIllness. Por exemplo, o vetor para 'Rheumatism' é:

In [26]:
# Select the row of the PCA-reduced matrix at the index assigned to 'Rheumatism'.
wIllness[illness_index['Rheumatism'],:]

tensor([ 1.2883, -0.6475,  0.2300])

# Salva a matriz de vetores para uso posterior

In [29]:
# Write the PCA-reduced matrix to disk with torch.save.
torch.save(wIllness, 'wordEmbeddings/illNessVecs.pt')

# Referências

Yijia Zhang, Qingyu Chen, Zhihao Yang, Hongfei Lin & Zhiyong Lu, [**BioWordVec, improving biomedical word embeddings with subword information and MeSH**](https://www.nature.com/articles/s41597-019-0055-0), Scientific Data (2019)
