# Auto Vocab Mapping
___

## POC 1 - Vector Space Search

For the first POC I'll focus on source and target descriptions only. So I just need previously matched sources and targets.

In [None]:
import pandas as pd

Read CHUC example files and see what's in it

In [None]:
chuc_s_df = pd.read_csv("../lib/data/raw/source_codes_description/chuc/analises_cod_acto.csv")
chuc_s2c_df = pd.read_csv("../lib/data/raw/source_to_concept/chuc/source_to_standard_analises_cod_acto.csv")
concept = pd.read_csv("../lib/data/raw/vocabularies/CONCEPT.csv", low_memory=False)

In [None]:
concept.head()

In [None]:
concept['concept_id'].dtype

In [None]:
set_dtype = concept['concept_id'].dtype

Make dict to map quickly

In [None]:
target_dict = dict(zip(concept['concept_id'], concept['concept_name']))

From here I need concept_id and concept_name to map

In [None]:
chuc_s_df.head()

These are translations. We're not going into this for now. A separate exploration will be carried out for this topic alone. We could fine-tune our own medical data whichi has its specificities. We'll need: 
- Medical terms translation
- Acronym desambiguation

In [None]:
chuc_s2c_df.head()

From here I need the source code description and the target concept id. This is what well need in large quantities if we want to train a translator or a classifier. 

In [None]:
chuc_df = chuc_s2c_df[["source_code_description", "target_concept_id"]]
chuc_df.head()

### Map target concepts and check missing values

In [None]:
chuc_df.loc[:, 'concept_name'] = chuc_df['target_concept_id'].astype(set_dtype).map(target_dict)

In [None]:
chuc_df[chuc_df.isna().any(axis=1)]

In [None]:
chuc_s2c = chuc_df.dropna()

In [None]:
chuc_s2c.head()

In [None]:
sources = chuc_s2c["source_code_description"].tolist()
sources[:10]

In [None]:
targets = chuc_s2c["concept_name"].tolist()
targets[:10]

Some lm are trained as seq2seq and need the `query` and `passage` prefixes.

In [None]:
sources = [("query: " + i) for i in sources]
targets = [("query: " + i) for i in targets]

In [None]:
assert len(sources) == len(targets)

### Encode texts into fixed sized mean pooled vectors. 

Encode using torch. 

In [None]:
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel
import numpy as np


class TextEncoder:
    def __init__(self, model):
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModel.from_pretrained(model)

    def encode(self, texts):
        # Tokenize the input texts
        batch_dict = self.tokenizer(texts,
                                    max_length=512,
                                    padding=True,
                                    truncation=True,
                                    return_tensors='pt')
        outputs = self.model(**batch_dict)
        embeddings = TextEncoder.__average_pool(
            outputs.last_hidden_state, batch_dict['attention_mask'])

        # Normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)
        return np.array(embeddings.detach(), dtype=np.float32)

    @staticmethod
    def __average_pool(last_hidden_states: Tensor,
                       attention_mask: Tensor) -> Tensor:
        last_hidden = last_hidden_states.masked_fill(
            ~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

In [None]:
model_name = 'intfloat/multilingual-e5-small'
embeddings = TextEncoder(model_name).encode(sources)

By default, sentence_transformers disables the parallelism to avoid any hidden deadlock that would be hard to debug

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('intfloat/multilingual-e5-small')
sources_emb = model.encode(sources, normalize_embeddings=True)
targets_emb = model.encode(targets, normalize_embeddings=True)

Sentence_transformer's implementation is faster than my manual approach so I'll stick to that. If in any case it has some incopatibility with a newer model I'll use mine. 

In [None]:
embeddings.shape

In [None]:
embeddings[:10]

Everything seems fine with the resulting vector space.

# PCA
Exploring projections in the vector space

In [None]:
from sklearn.decomposition import PCA

def compute_pca(vectors):
    pca = PCA()
    pca.fit(vectors)
    pcs = pca.transform(vectors)
    return pcs

In [None]:
import plotly.express as px

def plot_pca(pcs, colors, names, title='PCA'):
    fig = px.scatter_3d(x=pcs[:,0],
                    y=pcs[:,1],
                    z=pcs[:,2],
                    color=colors,
                    size_max=18,
                    opacity=0.7,
                    hover_name=names,
                    labels={
                        "x":"PC1",
                        "y":"PC2",
                        "z":"PC3"
                    })

    fig.update_layout(title=title)
    fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
    fig.update_layout(showlegend=False)
    fig.show()

In [None]:
stacked = np.vstack([sources_emb, targets_emb])
print(stacked.shape)

In [None]:
stacked_pcs = compute_pca(stacked)

In [None]:
# labels
names = sources + targets
sources_ids = ["source" for _ in sources]
targets_ids = ["target" for _ in targets]
group_names = sources_ids + targets_ids
# colors
color_by_group = sources_ids + targets_ids
individual_names = targets + targets

In [None]:
plot_pca(pcs=stacked_pcs, colors=color_by_group, names=group_names)

Clusters relate to the languages.

matches (sources - targets) should be closer if we color them the same

In [None]:
plot_pca(pcs=stacked_pcs[:20], colors=individual_names[:20], names=individual_names[:20])

In [None]:
source_dict = dict(zip(range(len(sources)), sources))
target_dict = dict(zip(range(len(targets)), targets))

In [None]:
rand_number = np.random.choice(len(sources), 1, replace=True)[0]
source_example = sources_emb[rand_number]

In [None]:
print(f' source: {source_dict[rand_number]};\n target: {target_dict[rand_number]}')

### Test distance: Compute nomalized L2 inner product

In [None]:
import faiss


def norml2_innerproduct(feature_space, query):

    index = faiss.index_factory(
        feature_space.shape[1], "Flat", faiss.METRIC_INNER_PRODUCT)
    faiss.normalize_L2(feature_space)
    index.add(feature_space)
    distance, index = index.search(np.array([query]), k=feature_space.shape[0])

    return distance, index

In [None]:
distance, index = norml2_innerproduct(targets_emb, source_example)

In [None]:
print(f' source: {source_dict[rand_number]};\n target: {target_dict[index[0][0]]}')

In [None]:
index[0][0]

In [None]:
rand_number

In [None]:
top1 = 0
top5 = 0
top10 = 0
total = len(sources)
for i in range(total):
    source_example = sources_emb[i]
    distance, index = norml2_innerproduct(targets_emb, source_example)
    
    if i == index[0][0]:
        top1+=1
        top5+=1
        top10+=1
    elif i in index[0][:5]:
        top5+=1
        top10+=1
    elif i in index[0][:10]:
        top10+=1

    

In [None]:
print(f"""
      Top 1 match: {top1/total:.2%};
      Top 5 match: {top5/total:.2%};ok
      Top 10 match: {top10/total:.2%};
      Total number of tests: {len(sources)}
    """)

# Expand the number of examples


In [1]:
import sys
sys.path.insert(0, '..') # add parent folder path
from data_preprocessors import RawDataProcessor

In [2]:
hospital_folders = ["../lib/data/raw/source_to_concept/chuc/", "../lib/data/raw/source_to_concept/hds/"]
concept_vocab = "../lib/data/raw/vocabularies/CONCEPT.csv"

rdp = RawDataProcessor(vocab_file=concept_vocab, hospital_folders=hospital_folders)
sources, targets = rdp.join_source_target()

In [3]:
# sources, targets = rdp._prepare_4_encoding(sources, targets)

In [4]:
assert len(sources) == len(targets)

In [5]:
len(sources)

2222

In [6]:
source_dict = dict(zip(range(len(sources)), sources))
target_dict = dict(zip(range(len(targets)), targets))

In [30]:
import pickle 
with open('../lib/artifacts/dicts/sources.pickle', 'wb') as handle:
    pickle.dump(source_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('../lib/artifacts/dicts/targets.pickle', 'wb') as handle:
    pickle.dump(target_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)


# Selected models and why

In [16]:
list_of_models = [ 
                  "mixedbread-ai/mxbai-embed-large-v1",
                  'intfloat/multilingual-e5-small',
                  "intfloat/multilingual-e5-base", 
                  "intfloat/multilingual-e5-large", 
                  "sentence-transformers/all-MiniLM-L6-v2", 
                  # "Henrychur/MMedLM2", # too large for now
                  "medicalai/ClinicalBERT"
]

In [8]:
from tqdm import tqdm
from time import time
from sentence_transformers import SentenceTransformer


def test_models(models: list, sources: list, targets: list):

    # Store results
    results_df = []
    results = []

    for plm in tqdm(models, desc="Testing models: "):

        # Load model
        needs_remote_code = 0
        try:
            model = SentenceTransformer(plm, trust_remote_code=False)
        except ValueError:
            model = SentenceTransformer(plm, trust_remote_code=True)
            needs_remote_code = 1
        
        for query_prefix in ['', 'query: ']:

            mod_sources = [(query_prefix + i) for i in sources]
            mod_targets = [(query_prefix + i) for i in targets]

            # Track results
            top1 = 0
            top5 = 0
            top10 = 0
            total = len(sources)

            # Encode
            sources_emb = model.encode(mod_sources, normalize_embeddings=True)
            targets_emb = model.encode(mod_targets, normalize_embeddings=True)

            # Track Encoding Time
            start = time()
            for i in tqdm(range(total), leave=False):

                # Compute distances
                source_example = sources_emb[i]
                distance, index = norml2_innerproduct(targets_emb, source_example)

                # Check matches
                if i == index[0][0]:
                    top1 += 1
                    top5 += 1
                    top10 += 1
                elif i in index[0][:5]:
                    top5 += 1
                    top10 += 1
                elif i in index[0][:10]:
                    top10 += 1

            # Compute time
            end = time()
            elapsed_seconds = end - start

            results_df.append(
                {   
                    "plm": plm + '__query_prefix__' + query_prefix,
                    "remote_code": needs_remote_code,
                    "Top-1 match": top1/total,
                    "Top-5 match": top5/total,
                    "Top-10 match": top10/total,
                    "Total number of tests": len(sources),
                    "Elapsed seconds": elapsed_seconds,
                    "Predictions per second X 1000": len(sources)/elapsed_seconds/1000
                }
            )

            results.append(f"""
                            plm: {plm + '__query_prefix__' + query_prefix};
                            needs remote code: {needs_remote_code};
                            Top 1 match: {top1/total:.2%};
                            Top 5 match: {top5/total:.2%};
                            Top 10 match: {top10/total:.2%};
                            Total number of tests: {len(sources)},
                            Elapsed seconds: {elapsed_seconds};
                            Predictions per second X 1000: {len(sources)/elapsed_seconds/1000:.2}
                            """)
        
    [print(_) for _ in results]
    
    return results_df

In [17]:
results_df = test_models(list_of_models, sources, targets)

Testing models:  83%|████████▎ | 5/6 [02:19<00:25, 25.05s/it]No sentence-transformers model found with name medicalai/ClinicalBERT. Creating a new one with MEAN pooling.
Testing models: 100%|██████████| 6/6 [02:32<00:00, 25.34s/it]


                            plm: mixedbread-ai/mxbai-embed-large-v1__query_prefix__;
                            needs remote code: 0;
                            Top 1 match: 47.97%;
                            Top 5 match: 75.92%;
                            Top 10 match: 82.40%;
                            Total number of tests: 2222,
                            Elapsed seconds: 2.1125829219818115;
                            Predictions per second X 1000: 1.1
                            

                            plm: mixedbread-ai/mxbai-embed-large-v1__query_prefix__query: ;
                            needs remote code: 0;
                            Top 1 match: 46.71%;
                            Top 5 match: 73.54%;
                            Top 10 match: 80.69%;
                            Total number of tests: 2222,
                            Elapsed seconds: 2.0437729358673096;
                            Predictions per second X 1000: 1.1
                          




In [18]:
usagis = {"plm": 'USAGI', "Top-1 match": 0.42, "Top-5 match": 0.58, "Top-10 match": 0.62} # From toki paper
results_df.append(usagis)

In [19]:
import pandas as pd
res_df = pd.DataFrame.from_dict(results_df)
res_df

Unnamed: 0,plm,remote_code,Top-1 match,Top-5 match,Top-10 match,Total number of tests,Elapsed seconds,Predictions per second X 1000
0,mixedbread-ai/mxbai-embed-large-v1__query_pref...,0.0,0.479748,0.759226,0.824032,2222.0,2.112583,1.051793
1,mixedbread-ai/mxbai-embed-large-v1__query_pref...,0.0,0.467147,0.735374,0.806931,2222.0,2.043773,1.087205
2,intfloat/multilingual-e5-small__query_prefix__,0.0,0.484248,0.714671,0.775878,2222.0,1.193867,1.861179
3,intfloat/multilingual-e5-small__query_prefix__...,0.0,0.479748,0.718722,0.784878,2222.0,1.187206,1.871622
4,intfloat/multilingual-e5-base__query_prefix__,0.0,0.471647,0.710171,0.777678,2222.0,2.076834,1.069898
5,intfloat/multilingual-e5-base__query_prefix__q...,0.0,0.469397,0.693069,0.763726,2222.0,2.049134,1.084361
6,intfloat/multilingual-e5-large__query_prefix__,0.0,0.50045,0.743024,0.805581,2222.0,2.070066,1.073396
7,intfloat/multilingual-e5-large__query_prefix__...,0.0,0.509001,0.733123,0.792979,2222.0,2.759984,0.805077
8,sentence-transformers/all-MiniLM-L6-v2__query_...,0.0,0.414491,0.682268,0.754275,2222.0,1.145369,1.939986
9,sentence-transformers/all-MiniLM-L6-v2__query_...,0.0,0.39694,0.654365,0.725023,2222.0,1.155707,1.922632


In [25]:
res_df = res_df.loc[
    (res_df['Top-1 match'] >= usagis['Top-1 match']) &
    (res_df['Top-5 match'] >= usagis['Top-5 match']) &
    (res_df['Top-10 match'] >= usagis['Top-10 match']), :]

res_df

Unnamed: 0,plm,remote_code,Top-1 match,Top-5 match,Top-10 match,Total number of tests,Elapsed seconds,Predictions per second X 1000
0,mixedbread-ai/mxbai-embed-large-v1__query_pref...,0.0,0.479748,0.759226,0.824032,2222.0,2.112583,1.051793
1,mixedbread-ai/mxbai-embed-large-v1__query_pref...,0.0,0.467147,0.735374,0.806931,2222.0,2.043773,1.087205
2,intfloat/multilingual-e5-small__query_prefix__,0.0,0.484248,0.714671,0.775878,2222.0,1.193867,1.861179
3,intfloat/multilingual-e5-small__query_prefix__...,0.0,0.479748,0.718722,0.784878,2222.0,1.187206,1.871622
4,intfloat/multilingual-e5-base__query_prefix__,0.0,0.471647,0.710171,0.777678,2222.0,2.076834,1.069898
5,intfloat/multilingual-e5-base__query_prefix__q...,0.0,0.469397,0.693069,0.763726,2222.0,2.049134,1.084361
6,intfloat/multilingual-e5-large__query_prefix__,0.0,0.50045,0.743024,0.805581,2222.0,2.070066,1.073396
7,intfloat/multilingual-e5-large__query_prefix__...,0.0,0.509001,0.733123,0.792979,2222.0,2.759984,0.805077
12,USAGI,,0.42,0.58,0.62,,,


In [26]:
import plotly.graph_objs as go

def parallel(df):
    df = df.fillna(0)
    df['dummy'] = df.reset_index().index
    dimensions = list([
                dict(range = [min(df['Top-1 match']),max(df['Top-1 match'])],
                    constraintrange = [min(df['Top-1 match']),max(df['Top-1 match'])],
                    label = 'Top-1 match', values = df['Top-1 match']),
                dict(range = [min(df['Top-5 match']),max(df['Top-5 match'])],
                    constraintrange = [min(df['Top-5 match']),max(df['Top-5 match'])],
                    label = 'Top-5 match', values = df['Top-5 match']),
                dict(range = [min(df['Top-10 match']),max(df['Top-10 match'])],
                    constraintrange = [min(df['Top-10 match']),max(df['Top-10 match'])],
                    label = 'Top-10 match', values = df['Top-10 match']),
                dict(range = [min(df['Predictions per second X 1000']),max(df['Predictions per second X 1000'])],
                    constraintrange = [min(df['Predictions per second X 1000']),max(df['Predictions per second X 1000'])],
                    label = 'Predictions per second X 1000', values = df['Predictions per second X 1000']),
                dict(range=[df['dummy'].min(),df['dummy'].max()],
                       tickvals = df['dummy'], ticktext = df['plm'],
                       label='plm', values=df['dummy']),
                  ])

    fig = go.Figure(data=go.Parcoords(line = dict(color = df['dummy'], colorscale=['rgba(99,110,250,0.9)',
    'rgba(239,85,59,0.9)',
    'rgba(0,204,150,0.9)',
    'rgba(171,99,250,0.9)',
    'rgba(255,161,90,0.9)',
    'rgba(25,211,243,0.9)',
    'rgba(255,102,146,0.9)',
    'rgba(182,232,128,0.9)',
    'rgba(255,151,255,0.9)',
    'rgba(254,203,82,0.9)']
                    ), dimensions=dimensions))
    
    fig.update_layout(
        font=dict(
            family="Sans-serif",
            size=13,
            color="Black"
        )
    )
    
    fig.show()

In [None]:

parallel(res_df[1:])         