In [1]:
import numpy as np
from transformers import BertTokenizer, BertModel
import torch
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from inflect import engine
import csv
import ast
import re

model = BertModel.from_pretrained("dmis-lab/biobert-v1.1")
tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-v1.1") #tokenizes subwords instead of words

### Sentence Embeddings Function
returns a list of embeddings for every token in the sentence

### Word Embeddings Function
given a sentence and a word in the sentence:

tokenizes the term, then finds position of term tokens in sentence, and returns the pool of embeddings. Can choose between averaging and maxing for pooling.

In [3]:
# Function to calculate sentence embeddings
def calculate_sentence_embedding(tokens):

    # Convert tokens to numerical ids
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Truncate input sequence to a maximum length of 512 tokens
    if len(input_ids) > 512:
        input_ids = input_ids[:512]

    inputs = torch.tensor([input_ids])

    # Run ids through model to generate word embeddings
    outputs = model(inputs)
    word_embeddings = outputs.last_hidden_state

    return word_embeddings

# Given a sentence and a word, return the context-dependent embeddings
def word_embedding(sent, word, pooling='avg'):
    tokens = tokenizer.tokenize(sent)

    embeddings = calculate_sentence_embedding(tokens)
    term_split = tokenizer.tokenize(word)

    pool = []
    for t in term_split: # t are the subwords of a term
        t_index = tokens.index(t)  # Find the position of the subword in tokens
        t_embed = embeddings[0, t_index]
        pool.append(t_embed)

    # Convert embeddings to numpy
    pool_np = np.array([embedding.detach().numpy() for embedding in pool])

    # Compute the average or max of each feature along the first axis (embeddings)
    if pooling == 'max':
        average_features = torch.tensor(np.max(pool_np, axis=0))
    else:
        average_features = torch.tensor(np.mean(pool_np, axis=0))

    return average_features


### Find embeddings of medical term in context of definition
Kind of a shortcut, but concatanate the term and definition into a sentence of the form `"{definition} is {term}."`
Then, find the embedding of the term in the context of the sentence. Saved to term_embeddings.csv

In [21]:
input_file = 'inputs/definitions_POSNew.csv'
output_file = 'inputs/term_embeddings.csv'

# Read definitions from input file and calculate embeddings
definitions = []
with open(input_file, 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        term = row[0]
        definition = row[1]
        sentence = f"{definition} is {term}."

        tokens = tokenizer.tokenize(sentence)
        embeddings = calculate_sentence_embedding(tokens) 
        
        term_split = tokenizer.tokenize(term)
        print('generating embeddings for',term_split)
        
        average_tensor = word_embedding(sentence,term,'max')
        
        definitions.append([term, average_tensor])

# Save embeddings to output file
with open(output_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Term', 'Embedding'])
    for term, embedding in definitions:
        writer.writerow([term, embedding.tolist()])


generating embeddings for ['5', '-', 'alpha', 'red', '##uc', '##tase']
generating embeddings for ['5', '-', 'alpha', 'red', '##uc', '##tase', '##s']
generating embeddings for ['abdominal', 'muscles']
generating embeddings for ['abdominal', 'muscle']
generating embeddings for ['a', '##b', '##dom', '##ino', '##p', '##last', '##y']
generating embeddings for ['a', '##b', '##dom', '##ino', '##p', '##last', '##ies']
generating embeddings for ['a', '##b', '##duction']
generating embeddings for ['a', '##b', '##duction', '##s']
generating embeddings for ['a', '##bla', '##tion']
generating embeddings for ['a', '##bla', '##tions']
generating embeddings for ['a', '##bra', '##sion']
generating embeddings for ['a', '##bra', '##sions']
generating embeddings for ['a', '##bs', '##cess']
generating embeddings for ['a', '##bs', '##cess', '##es']
generating embeddings for ['a', '##but', '##ment']
generating embeddings for ['a', '##but', '##ments']
generating embeddings for ['acceptance', '-', 'based', 'th

### Cosine Similarity Example

In [10]:
sent1 = "he will try and go to the hospital"
sent2 = "I asked him, but I'm not sure if he's trying to go to the event"

embed1 = word_embedding(sent1,'go','max')
embed2 = word_embedding(sent2,'event','max')


cossim = torch.cosine_similarity(embed1.reshape(1,-1), embed2.reshape(1,-1))

if(cossim[0] > 0.70): print('above threshold:',cossim[0].detach())
else: print('below threshold:',cossim[0].detach())


below threshold: tensor(0.6662)


POS tag terms

In [11]:
# Open the CSV file for reading and writing
with open('inputs/definitions.csv', 'r') as file:
    reader = csv.reader(file)
    rows = list(reader)

    # Iterate through each row, perform POS tagging, and update the third column
    for row in rows:
        term = row[0].split()[-1]  # Get the term from the first column
        tokens = word_tokenize(term)
        tagged_tokens = pos_tag(tokens)

        # Update the third column with the tagged term
        row[2] = tagged_tokens[0][1]


# Write the updated rows back to the CSV file
with open('inputs/definitions_POS.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(rows)


Using POS tagged terms, Duplicate Definitions for both plural and singular versions of terms.

In [19]:


# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()
plurals_engine = engine()

# Open the CSV file for reading and writing
with open('inputs/definitions_POS.csv', 'r') as file:
    reader = csv.reader(file)
    rows = list(reader)
    updated_rows = []

    # Iterate through each row
    for row in rows:
        term = row[0]  # Get the term from the first column
        last_word = term.split()[-1]
        definition = row[1]  # Get the definition from the second column
        tag = row[2]  # Get the POS-tagged term from the third column

        # Add the original row to the updated rows list
        updated_rows.append(row)
        
        # Check if the POS tag is "NNS"
        if tag == "NNS":
            # Singularize the term
            
            singular = lemmatizer.lemmatize(last_word, pos='n')
            
            # Create a new row with the singular term, same definition, and POS tag "NN"
            updated_rows.append([term.replace(last_word,singular), definition, 'NN'])

        if tag == 'NN':
            # Pluralize the term
            plural = plurals_engine.plural(last_word)

            # Create a new row with the plural term, same definition, and POS tag "NNS"
            updated_rows.append([term.replace(last_word,plural), definition, 'NNS'])
            
# Write the updated rows back to the CSV file
with open('inputs/definitions_POSNew.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(updated_rows)


process_transcript takes in 4 arguments: transcript, embedding, definitions, and output.

1. First, the code loads the terms and embeddings.
2. Search through the transcript for any terms. 
3. If a term is found, take the embeddings of the term in the transcript in the context of its sentence. 
4. Use cosine similarity to compare the embeddings of the term from definitions and the term in the transcript.
5. If the cosine is above a threshold (set as 0.66), replace the term in the transcript with the definition.

In [7]:
def load_terms_and_embeddings(file_path):
    terms = []
    embeddings = []

    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            terms.append(row[0])
            embeddings.append(row[1])

    return terms, embeddings

def load_term_definitions(file_path):
    term_definitions = {}

    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            term_definitions[row[0]] = row[1]

    return term_definitions

def process_transcripts(transcript_file, embedding_file, definition_file, output_file):
    terms, embeddings = load_terms_and_embeddings(embedding_file)
    term_definitions = load_term_definitions(definition_file)

    with open(transcript_file, 'r') as transcript_csv:
        transcripts = list(csv.reader(transcript_csv))

    for i in range(1, len(transcripts)):  # Skip the header row
        print("Transcript",i)
        transcript = transcripts[i][1]  # Assuming the transcripts are in the second column (index 1)
        for j in range(len(terms)):
            term = terms[j]
            term_regex = r"\b" + re.escape(term) + r"\b"  # Create a regex pattern to match the term as a whole word
            if re.search(term_regex, transcript):  # Check if the term is present in the transcript
                definition = term_definitions.get(term)
                
                if definition is not None:

                    # Find the sentences containing the term and print them
                    sentences = re.split(r'\.', transcript)  # Split the transcript into sentences
                    for sentence in sentences:
                        if term in sentence:
                            try:
                                transcript_embedding = word_embedding(sentence, term,'max')
                                def_embedding = torch.tensor(ast.literal_eval(embeddings[j]))
                                cossim = torch.cosine_similarity(transcript_embedding.reshape(1,-1), def_embedding.reshape(1,-1))

                                if(cossim[0] > 0.66): 
                                    print(term,'- above threshold:',cossim[0].detach())
                                    transcript = re.sub(term_regex, definition, transcript)  # Replace the term with its definition
                                else:
                                    print(term,'- below threshold')
                            except ValueError:
                                print("VALUEERROR")
                            except IndexError:
                                print("INDEXERROR")

        transcripts[i][1] = transcript

    with open(output_file, 'w', newline='') as output_csv:
        writer = csv.writer(output_csv)
        writer.writerows(transcripts)

process_transcripts('inputs/transcripts_100.csv', 'inputs/term_embeddings.csv', 'inputs/definitions_POSNew.csv', 'inputs/transcripts_with_definitions_100.csv')

Transcript 1
allergies - above threshold: tensor(0.7054)
allergies - above threshold: tensor(0.8067)
allergies - above threshold: tensor(0.7437)
asthma - above threshold: tensor(0.7981)
blood pressure - below threshold
mucosa - above threshold: tensor(0.7064)
will - below threshold
will - below threshold
Transcript 2
allergic - above threshold: tensor(0.7314)
apnea - above threshold: tensor(0.7636)
arrhythmia - above threshold: tensor(0.7417)
artery - below threshold
arthritis - below threshold
asthma - above threshold: tensor(0.6849)
atrial fibrillation - above threshold: tensor(0.8416)
blood pressure - above threshold: tensor(0.7286)
cancer - above threshold: tensor(0.7166)
carbohydrates - below threshold
cells - below threshold
cholesterol - above threshold: tensor(0.7699)
congestive heart failure - above threshold: tensor(0.7982)
coronary - above threshold: tensor(0.7254)
diabetes - below threshold
diabetes - above threshold: tensor(0.7004)
embolism - above threshold: tensor(0.7741

In [7]:
def write_first_100_rows(csv_file):
    with open(csv_file, 'r') as file:
        reader = csv.reader(file)
        rows = list(reader)[:100]

    with open('inputs/transcripts_100.csv', 'w', newline='') as new_file:
        writer = csv.writer(new_file)
        writer.writerows(rows)

# Usage:
write_first_100_rows('inputs/clean_transcriptions.csv')
