## Generate BioBERT embeddings and match diagnoses to ICD-10 codes

In [1]:
!pip install pandas regex scikit-learn transformers torch google-cloud-storage



In [2]:
import re

import numpy as np
import pandas as pd
import torch
from google.cloud import storage
from google.colab import auth
from pandas import testing
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

In [3]:
# Authenticate this Colab session with GCP.
# This runs before the storage client is created so the GCS calls later in
# the notebook are made after authentication.
auth.authenticate_user()

In [4]:
# Initialize the GCS client.
# `client` is a notebook-level global: download_blob() and upload_blob()
# read it directly instead of taking it as a parameter.
client = storage.Client()

In [5]:
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Copy one object from a GCS bucket to a local file.

    Uses the notebook-level ``client``.

    Args:
        bucket_name: Name of the bucket that holds the object.
        source_blob_name: Object path inside the bucket.
        destination_file_name: Local path the object is written to.
    """
    source_blob = client.bucket(bucket_name).blob(source_blob_name)
    source_blob.download_to_filename(destination_file_name)

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Copy one local file into a GCS bucket.

    Uses the notebook-level ``client``.

    Args:
        bucket_name: Name of the bucket that receives the object.
        source_file_name: Local path of the file to send.
        destination_blob_name: Object path to create inside the bucket.
    """
    target_blob = client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

In [7]:
# Fetch the three input files from GCS into the working directory:
# the cleaned NHS and Mayo Clinic scrapes, and the raw ICD-10 code table.
download_blob(
    bucket_name='scraped_data_clean',
    source_blob_name='nhs/nhs_data_clean.json',
    destination_file_name='./nhs_data_clean.json',
)
download_blob(
    bucket_name='scraped_data_clean',
    source_blob_name='mayo_clinic/mayo_clinic_data_clean.json',
    destination_file_name='./mayo_clinic_data_clean.json',
)
download_blob(
    bucket_name='icd_10_raw',
    source_blob_name='icd10cm-codes-2024.csv',
    destination_file_name='./icd10cm-codes-2024.csv',
)

In [9]:
# Read the pre-processed data sets (JSON-lines files) and the ICD-10 table.
# The ICD-10 CSV has no header row, so the column names are supplied here.
# TODO (original author): change paths to the buckets
db1 = pd.read_json(path_or_buf='nhs_data_clean.json', lines=True)
db2 = pd.read_json(path_or_buf='mayo_clinic_data_clean.json', lines=True)
icd10 = pd.read_csv(
    filepath_or_buffer='icd10cm-codes-2024.csv',
    header=None,
    names=['code', 'description'],
)

In [12]:
# function to remove upper cases and strange symbols

def preprocess_text(text):
    """Lowercase ``text`` and strip every character that is not a letter, digit or whitespace.

    Characters are deleted, not replaced by a space, so "type-2" becomes "type2".

    Args:
        text: The string to normalise. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as empty text; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The normalised string ('' for missing input).
    """
    # A missing cell reaches this function through Series.apply as None or NaN;
    # return empty text instead of failing on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Convert to lowercase
    text = str(text).lower()
    # Remove special characters (keeping spaces)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

In [13]:
# Apply preprocess_text to the text fields that will be embedded:
# the `diagnosis` column of both scraped data sets and the ICD-10 `description`.
# The columns are overwritten in place, so the original casing and punctuation
# are no longer available on these frames after this cell runs.
db1['diagnosis'] = db1['diagnosis'].apply(preprocess_text)
db2['diagnosis'] = db2['diagnosis'].apply(preprocess_text)
icd10['description'] = icd10['description'].apply(preprocess_text)

In [14]:
# load tokenizer and model for bioBERT
# The checkpoint id is defined once so the tokenizer and the model are always
# loaded from the same checkpoint.
BIOBERT_CHECKPOINT = 'dmis-lab/biobert-v1.1'
tokenizer = BertTokenizer.from_pretrained(BIOBERT_CHECKPOINT)
model = BertModel.from_pretrained(BIOBERT_CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

In [15]:
# function to create the embeddings of the diagnosis and descriptions
def text_to_embedding(text):
    """Embed text with the notebook-level BioBERT ``tokenizer`` and ``model``.

    The embedding is the mean of the last hidden state over the real
    (non-padding) tokens of each input.

    Args:
        text: A string, or a list of strings to embed as one batch.

    Returns:
        A numpy array with one row per input text (a single string yields
        one row).
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: without no_grad every call would build and retain an
    # autograd graph, which costs memory and time for no benefit.
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    # Average over real tokens only. For a single string the mask is all ones,
    # so this equals a plain mean; for a padded batch it keeps pad positions
    # from diluting the shorter texts.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    embedding = pooled.cpu().numpy()
    assert isinstance(embedding, np.ndarray)
    return embedding

In [None]:
# get embeddings
# Each row is embedded by its own call to text_to_embedding (one model call
# per text), so the runtime of this cell grows with the number of rows.
db1_embeddings = db1['diagnosis'].apply(text_to_embedding)
db2_embeddings = db2['diagnosis'].apply(text_to_embedding)
icd10_embeddings = icd10['description'].apply(text_to_embedding)

In [None]:
# create an embeddings column on each frame for future use in the dashboard
db1['diagnosis_embedding'] = db1_embeddings
db2['diagnosis_embedding'] = db2_embeddings
icd10['diagnosis_embedding'] = icd10_embeddings

In [None]:
# merge the two initial data sets (NHS rows first, then Mayo Clinic rows)
# NOTE: pd.concat is called without ignore_index, so each frame keeps its own
# index labels and labels may repeat in merged_db.
merged_db = pd.concat([db1, db2])

In [None]:
# Stack the per-description embeddings into one 2-D matrix (one row per ICD-10
# entry) so cosine similarity can be computed against all of them at once.
icd10_embeddings_array = np.vstack(
    [vector.reshape(1, -1) for vector in icd10_embeddings]
)

In [None]:
# match ICD-10 codes using cosine similarity
def match_icd10(diagnosis_embedding):
    """Return the ICD-10 code whose description embedding is closest to the input.

    Closeness is cosine similarity against every row of the notebook-level
    ``icd10_embeddings_array``.

    Args:
        diagnosis_embedding: Embedding of one diagnosis, as a numpy array or
            anything ``np.asarray`` accepts. It is flattened to a single row.

    Returns:
        The value of ``icd10['code']`` at the position of the most similar row.
    """
    query = np.asarray(diagnosis_embedding).reshape(1, -1)
    similarities = cosine_similarity(query, icd10_embeddings_array)[0]
    best_match_index = similarities.argmax()
    # argmax is a row position in icd10_embeddings_array, so the code must be
    # looked up by position; label-based [] is only right for a default index.
    return icd10['code'].iloc[best_match_index]

In [None]:
# Attach the embeddings to merged_db, in the same db1-then-db2 order used to build it.
# NOTE(review): db1 and db2 already carry 'diagnosis_embedding' before the concat,
# so this likely re-assigns the same values; confirm whether the cell is still needed.
merged_db['diagnosis_embedding'] = pd.concat([db1_embeddings, db2_embeddings])

In [None]:
# Give every diagnosis the ICD-10 code returned by match_icd10 for its embedding
# (one similarity search per row).
merged_db['icd10_code'] = merged_db['diagnosis_embedding'].apply(match_icd10)

In [None]:
# The embeddings were only needed for matching; leave them out of the merged frame.
merged_db = merged_db.drop(columns=['diagnosis_embedding'])

In [None]:
# Bring in the ICD-10 description for each matched code. The left join keeps
# every diagnosis row; the join key from the ICD-10 side ('code') is dropped
# afterwards because 'icd10_code' already holds it.
merged_db = (
    merged_db
    .merge(
        icd10[['code', 'description']],
        how='left',
        left_on='icd10_code',
        right_on='code',
    )
    .drop(columns=['code'])
)

In [None]:
# Write the processed frames to disk first: upload_blob sends local files, and
# nothing earlier in the notebook creates them.
# NOTE(review): the frame-to-file mapping and the JSON-lines format are inferred
# from the file names and from the lines=True inputs; confirm before relying on it.
merged_db.to_json('./symptoms_icd10.json', orient='records', lines=True)
db2.to_json('./mayo_clinic_data_clean_embed.json', orient='records', lines=True)
db1.to_json('./nhs_data_clean_embed.json', orient='records', lines=True)
icd10.to_json('./icd10_embed.json', orient='records', lines=True)

# upload processed data back to GCS bucket
upload_blob('training_dataset_bloom', './symptoms_icd10.json', 'symptoms_icd10.json')
upload_blob('embedded_data', './mayo_clinic_data_clean_embed.json', 'mayo_clinic/mayo_clinic_data_clean_embed.json')
upload_blob('embedded_data', './nhs_data_clean_embed.json', 'nhs/nhs_data_clean_embed.json')
upload_blob('embedded_data', './icd10_embed.json', 'icd_10/icd10_embed.json')
