# Analysing Embeddings from the OpenAI Library #

In [22]:
from itertools import product

import os
import openai

import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F

In [27]:
# Raise pandas' display limits (rows, columns, line width) for printed frames.
for option_name, option_value in (
    ('display.max_rows', 50),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Allow printed tensors to use up to 200 characters per line.
torch.set_printoptions(linewidth=200)

## Setting up OpenAI key ##
Uses a personal API key, read from the `OPENAI_API_KEY` environment variable, to access the OpenAI models.

In [2]:
# Read the key from the environment so it never appears in the notebook source.
openai.api_key = os.getenv('OPENAI_API_KEY')
# Report only whether a key was found: printing the key itself would store the
# secret in the saved cell output.
print('OPENAI_API_KEY is set' if openai.api_key else 'OPENAI_API_KEY is NOT set')

sk-...[redacted: API keys must not be stored in notebook output]


## Sanity Tests ##
A few sanity tests for the API.

In [3]:
# Minimal chat request: a single user message sent to gpt-3.5-turbo.
hello_messages = [{'role': 'user', 'content': 'Hello World'}]
chat_completion = openai.ChatCompletion.create(
    model='gpt-3.5-turbo',
    messages=hello_messages,
)

In [4]:
# Show the full response object returned by the chat completion request.
print(chat_completion)

{
  "id": "chatcmpl-7wNuAnEiHhF3vwy2GHU1CDChslGAJ",
  "object": "chat.completion",
  "created": 1694149270,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "Hello! How can I assist you today?"
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 9,
    "completion_tokens": 9,
    "total_tokens": 18
  }
}


## Embeddings for reports ##
Get the embeddings and store them in local CSV files.

In [5]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector OpenAI produces for ``text``.

    Every newline in ``text`` is replaced by a single space before the
    request is sent.

    Args:
        text: The string to embed.
        model: Name of the OpenAI embedding model to query.

    Returns:
        The ``embedding`` entry of the first item in the response's
        ``data`` field.
    """
    single_line_text = " ".join(text.split("\n"))
    response = openai.Embedding.create(input=[single_line_text], model=model)
    first_item = response['data'][0]
    return first_item['embedding']

In [7]:
# Hand-written test sentences, grouped by finding. Order matters: the four
# pleural-effusion sentences come first, then the "no abnormalities" sentence,
# then the four consolidation sentences.
_pleural_effusion_sentences = [
    "The report shows small right-sided pleural effusion",
    "The report shows small left-sided pleural effusion",
    "The report shows large right-sided pleural effusion",
    "The report shows large left-sided pleural effusion",
]
_no_abnormality_sentences = [
    "There are no abnormalities in the report",
]
_consolidation_sentences = [
    "There is severe consolidation in the left side",
    "There is severe consolidation in the right side",
    "There is mild consolidation in the right side",
    "There is mild consolidation in the left side",
]
sentence_list = (
    _pleural_effusion_sentences
    + _no_abnormality_sentences
    + _consolidation_sentences
)

In [10]:
# Two phrasings of the same finding; the <...> placeholders are filled in below.
sentence1_base = "A <SizeModifier> <AbnormalReport> can be seen in the report in the <LocationModifier> part"
sentence2_base = "The report shows a <SizeModifier> <LocationModifier> <AbnormalReport>"

# Values substituted into the placeholders.
size_modifiers = ['small', 'large']
loc_modifiers = ['upper-left', 'lower-left', 'right-sided', 'left-sided']
abnormal_report = ['pleural effusion']


def _expand_template(template):
    """Return ``template`` filled with every size/location/abnormality combination."""
    filled_sentences = []
    for size_mod, loc_mod, ab_rep in product(size_modifiers, loc_modifiers, abnormal_report):
        sentence = template.replace('<SizeModifier>', size_mod)
        sentence = sentence.replace('<LocationModifier>', loc_mod)
        sentence = sentence.replace('<AbnormalReport>', ab_rep)
        filled_sentences.append(sentence)
    return filled_sentences


l1 = _expand_template(sentence1_base)
l2 = _expand_template(sentence2_base)

In [13]:
# Explicit negations of the two findings used in the other sentences.
negative_sentences = [
    'The report shows no pleural effusion',
    'The report shows no consolidation on any side',
]
# sentence_list[4:] skips the first four hand-written sentences
# (presumably because l2 already covers those pleural-effusion variants - confirm).
all_sentence_list = [*l1, *l2, *negative_sentences, *sentence_list[4:]]

In [12]:
# List every sentence that will be embedded, one per line.
print(*all_sentence_list, sep='\n')

A small pleural effusion can be seen in the report in the upper-left part
A small pleural effusion can be seen in the report in the lower-left part
A small pleural effusion can be seen in the report in the right-sided part
A small pleural effusion can be seen in the report in the left-sided part
A large pleural effusion can be seen in the report in the upper-left part
A large pleural effusion can be seen in the report in the lower-left part
A large pleural effusion can be seen in the report in the right-sided part
A large pleural effusion can be seen in the report in the left-sided part
The report shows a small upper-left pleural effusion
The report shows a small lower-left pleural effusion
The report shows a small right-sided pleural effusion
The report shows a small left-sided pleural effusion
The report shows a large upper-left pleural effusion
The report shows a large lower-left pleural effusion
The report shows a large right-sided pleural effusion
The report shows a large left-sid

In [14]:
# One row per test sentence.
df = pd.DataFrame({'report': all_sentence_list})

# get_embedding is called once per row, so this issues one API request per sentence.
sentence_embeddings = df['report'].apply(
    lambda sentence: get_embedding(sentence, model='text-embedding-ada-002')
)
df['ada_embedding'] = sentence_embeddings

# Persist the sentences and their embeddings so they can be reloaded later.
df.to_csv('embedded_test_sentences1.csv', index=False)

In [29]:
# Sample chest X-ray report texts used as embedding inputs.

# Sectioned "findings" report in which every listed structure is "normal".
dataset1_template = """xr- chest pa  view
findings
lungs: normal.
trachea: normal.
carina: normal.
right and left main bronchi: normal.
pleura: normal.
heart: normal.
right heart border: normal.
left heart border: normal.
pulmonary bay: normal.
pulmonary hila: normal.
aorta: normal.
thoracic spine: normal.
other visualized bones: normal.
visualized soft tissues: normal.
diaphragm: normal.
visualized abdomen:  normal.
visualized neck: normal."""

# Same sectioned layout, but the "pleura" line describes a blunted left
# costophrenic angle instead of "normal". Note this string starts with a
# newline and its header has a single space in "pa view".
dataset1_pleural_issue="""
xr- chest pa view
findings
lungs: normal.
trachea: normal.
carina: normal.
right and left main bronchi: normal.
pleura: left costophrenic angle is blunted with thin stripe of homogenous opacity along left lateral chest wall.
heart: normal.
right heart border: normal.
left heart border: normal.
pulmonary bay: normal.
pulmonary hila: normal.
aorta: normal.
thoracic spine: normal.
other visualized bones: normal.
visualized soft tissues: normal.
diaphragm: normal.
visualized abdomen:  normal.
visualized neck: normal."""

# Free-text report in a different format, ending with a "no abnormality" impression.
# NOTE(review): the first line carries pipe-separated numbers and a date that
# look like record identifiers - confirm they are not real patient data.
dataset2_template="""6191206|3862169|x-ray chest pa/ap view of 09-feb-2018:
results:
post cabg status.
no focal lesion seen in the lung parenchyma.
cp angles and domes of the diaphragm are normal.
both hila are normal. pulmonary vasculature is normal.
cardiac size and configuration is normal.
trachea is central; no mediastinal shift is seen.
bony thorax and soft tissues of the chest wall are normal.
impression: no abnormality detected in the view obtained.
"""

# Third free-text format with a "normal study" impression, followed by a
# signature block and a separator line.
# NOTE(review): the signature block contains a person's name - confirm it may be shared.
dataset3_template="""
x-ray chest (pa view)
the cardio thoracic ratio is normal.
the heart size and configuration are within normal limits.
the aortic arch is normal.
the lung fields show normal broncho-vascular markings.
both the pulmonary hila are normal in size.
the costophrenic and cardiophrenic recesses and the domes of
diaphragm are normal.
the bones and soft tissues of the chest wall show no abnormality.
impression : normal study.
dr.shakthi kumar
radiologist
ss
________________________________________________________
"""

In [30]:
# One row per sample report text.
report_texts = [dataset1_template, dataset1_pleural_issue, dataset2_template, dataset3_template]
df2 = pd.DataFrame({'report': report_texts})

# get_embedding is called once per row, so this issues one API request per report.
report_embeddings = df2['report'].apply(
    lambda report: get_embedding(report, model='text-embedding-ada-002')
)
df2['ada_embedding'] = report_embeddings

# Persist the reports and their embeddings so they can be reloaded later.
df2.to_csv('embedded_test_sentences2.csv', index=False)

## Cosine similarity matrix for the embeddings ##

In [38]:
def calc_cosine_sim_matrix(sentence_embeddings):
    """Compute pairwise cosine similarities between embeddings.

    Args:
        sentence_embeddings: Sequence of equally-shaped tensors, one per
            sentence, as accepted by ``torch.stack``.

    Returns:
        A tuple ``(stacked, similarities)`` where ``stacked`` is the result of
        stacking the inputs along a new first dimension and ``similarities``
        holds the cosine similarity of every pair of stacked entries.
    """
    embedding_matrix = torch.stack(sentence_embeddings)
    # Insert singleton axes so the two views broadcast against each other and
    # every entry is compared with every other entry.
    as_rows = torch.unsqueeze(embedding_matrix, 1)
    as_columns = torch.unsqueeze(embedding_matrix, 0)
    pairwise_similarity = F.cosine_similarity(as_rows, as_columns, dim=2)
    return embedding_matrix, pairwise_similarity

In [47]:
def toList(list_str):
    """Parse a stringified list of numbers (e.g. ``'[0.1, -0.2]'``) into floats.

    The first and last characters (the surrounding brackets) are dropped and
    the remainder is split on commas. Whitespace around each number is
    ignored, so both ``'[1.0, 2.0]'`` and ``'[1.0,2.0]'`` are accepted, and an
    empty list ``'[]'`` yields ``[]``.

    Args:
        list_str: String form of a flat list of numbers.

    Returns:
        A list of ``float`` values.

    Raises:
        ValueError: If a non-empty item cannot be converted to ``float``.
    """
    inner = list_str[1:-1]
    # float() tolerates surrounding whitespace; blank tokens (from '[]') are skipped.
    return [float(token) for token in inner.split(',') if token.strip()]

In [49]:
# Reload the saved sentences; the CSV stores each embedding as text, so convert
# that column back into lists of floats with toList.
df1 = pd.read_csv('embedded_test_sentences1.csv').assign(
    ada_embedding=lambda frame: frame['ada_embedding'].apply(toList)
)

In [50]:
# Print the first rows of the reloaded frame, then its summary statistics.
for summarise in (pd.DataFrame.head, pd.DataFrame.describe):
    print(summarise(df1))

                                              report                                      ada_embedding
0  A small pleural effusion can be seen in the re...  [-0.010283100418746471, 0.026601657271385193, ...
1  A small pleural effusion can be seen in the re...  [-0.01526658982038498, 0.026450255885720253, 0...
2  A small pleural effusion can be seen in the re...  [-0.018871692940592766, 0.026934247463941574, ...
3  A small pleural effusion can be seen in the re...  [-0.014083346351981163, 0.03057301975786686, 0...
4  A large pleural effusion can be seen in the re...  [-0.009492901153862476, 0.022884558886289597, ...
                                                   report                                      ada_embedding
count                                                  23                                                 23
unique                                                 23                                                 23
top     A small pleural effusion can be seen in t

In [51]:
# Build one 2-D tensor from the column of float lists. torch.tensor is the
# documented constructor for existing data; converting the Series with
# .tolist() first avoids depending on how the legacy torch.Tensor constructor
# walks a pandas Series.
sentence_embedding_matrix1 = torch.tensor(df1.ada_embedding.tolist())
# calc_cosine_sim_matrix expects a list of per-sentence tensors; only the
# similarity matrix is kept.
_, consine_sim_matrix1 = calc_cosine_sim_matrix(list(sentence_embedding_matrix1))

In [52]:
# Show the pairwise cosine similarities between the test-sentence embeddings.
print(consine_sim_matrix1)

tensor([[1.0000, 0.9907, 0.9726, 0.9775, 0.9891, 0.9837, 0.9609, 0.9670, 0.9690, 0.9528, 0.9464, 0.9519, 0.9624, 0.9520, 0.9372, 0.9443, 0.8964, 0.7881, 0.8011, 0.8569, 0.8461, 0.8709, 0.8781],
        [0.9907, 1.0000, 0.9738, 0.9817, 0.9775, 0.9903, 0.9584, 0.9679, 0.9643, 0.9645, 0.9500, 0.9584, 0.9495, 0.9625, 0.9332, 0.9443, 0.8953, 0.7906, 0.7997, 0.8613, 0.8471, 0.8748, 0.8851],
        [0.9726, 0.9738, 1.0000, 0.9861, 0.9617, 0.9659, 0.9881, 0.9745, 0.9488, 0.9421, 0.9655, 0.9556, 0.9384, 0.9382, 0.9571, 0.9460, 0.8960, 0.7991, 0.8047, 0.8683, 0.8771, 0.8979, 0.8870],
        [0.9775, 0.9817, 0.9861, 1.0000, 0.9680, 0.9748, 0.9754, 0.9896, 0.9544, 0.9510, 0.9557, 0.9681, 0.9469, 0.9521, 0.9472, 0.9614, 0.8953, 0.7985, 0.8005, 0.8822, 0.8660, 0.8900, 0.9026],
        [0.9891, 0.9775, 0.9617, 0.9680, 1.0000, 0.9902, 0.9718, 0.9778, 0.9509, 0.9337, 0.9284, 0.9359, 0.9737, 0.9558, 0.9493, 0.9551, 0.8912, 0.7897, 0.7975, 0.8640, 0.8541, 0.8602, 0.8678],
        [0.9837, 0.9903, 0.965

In [53]:
# Reload the saved reports; the CSV stores each embedding as text, so convert
# that column back into lists of floats with toList.
df2 = pd.read_csv('embedded_test_sentences2.csv').assign(
    ada_embedding=lambda frame: frame['ada_embedding'].apply(toList)
)

In [54]:
# Print the first rows of the reloaded frame, then its summary statistics.
for summarise in (pd.DataFrame.head, pd.DataFrame.describe):
    print(summarise(df2))

                                              report                                      ada_embedding
0  xr- chest pa  view\nfindings\nlungs: normal.\n...  [-0.018094871193170547, 0.01764216646552086, 0...
1  \nxr- chest pa view\nfindings\nlungs: normal.\...  [-0.021756017580628395, 0.028477167710661888, ...
2  6191206|3862169|x-ray chest pa/ap view of 09-f...  [-0.010397231206297874, 0.026680193841457367, ...
3  \nx-ray chest (pa view)\nthe cardio thoracic r...  [0.0025291878264397383, 0.019118065014481544, ...
                                                   report                                      ada_embedding
count                                                   4                                                  4
unique                                                  4                                                  4
top     xr- chest pa  view\nfindings\nlungs: normal.\n...  [-0.018094871193170547, 0.01764216646552086, 0...
freq                                        

In [55]:
# Build one 2-D tensor from the column of float lists. torch.tensor is the
# documented constructor for existing data; converting the Series with
# .tolist() first avoids depending on how the legacy torch.Tensor constructor
# walks a pandas Series.
report_embedding_matrix2 = torch.tensor(df2.ada_embedding.tolist())
# calc_cosine_sim_matrix expects a list of per-report tensors; only the
# similarity matrix is kept.
_, consine_sim_matrix2 = calc_cosine_sim_matrix(list(report_embedding_matrix2))

In [56]:
# Show the pairwise cosine similarities between the report embeddings.
print(consine_sim_matrix2)

tensor([[1.0000, 0.9637, 0.9024, 0.9125],
        [0.9637, 1.0000, 0.8912, 0.9063],
        [0.9024, 0.8912, 1.0000, 0.9116],
        [0.9125, 0.9063, 0.9116, 1.0000]])


## Misc Tests ##

In [57]:
# Compare the Series of float lists with the tensor built from it.
print(df2.ada_embedding)
print(torch.Tensor(df2.ada_embedding))

# Hand-checkable example: cos([1, 2, 3], [1, 1, 5]) = 18 / (sqrt(14) * sqrt(27)) ~= 0.9258.
toy_vectors = torch.Tensor([[1, 2, 3], [1, 1, 5]])
print(toy_vectors)
_, consine_sim_matrix3 = calc_cosine_sim_matrix(list(toy_vectors))
print(consine_sim_matrix3)


0    [-0.018094871193170547, 0.01764216646552086, 0...
1    [-0.021756017580628395, 0.028477167710661888, ...
2    [-0.010397231206297874, 0.026680193841457367, ...
3    [0.0025291878264397383, 0.019118065014481544, ...
Name: ada_embedding, dtype: object
tensor([[-0.0181,  0.0176,  0.0213,  ...,  0.0016, -0.0061, -0.0341],
        [-0.0218,  0.0285,  0.0143,  ...,  0.0073,  0.0019, -0.0346],
        [-0.0104,  0.0267,  0.0137,  ..., -0.0061, -0.0096, -0.0348],
        [ 0.0025,  0.0191,  0.0113,  ..., -0.0061,  0.0004, -0.0397]])
tensor([[1., 2., 3.],
        [1., 1., 5.]])
tensor([[1.0000, 0.9258],
        [0.9258, 1.0000]])
