In [73]:
# Imports
from openai import OpenAI
import openai
import pinecone
from nltk import sent_tokenize
import pandas as pd
import os
import dotenv

In [36]:
# df to store embeddings
df = pd.DataFrame()
df

In [37]:
# read file and add all sentences to a dataframe
def read(filename):
    file = open(f"./Data/{filename}")

    combined = ' '.join(file.readlines())
    combined = combined.replace('\n', '')
    # print(combined)

    chunked = sent_tokenize(combined)
    # print(chunked)

    df = pd.DataFrame()
    df['sentence'] = chunked
    df['doc'] = 'Cardiovascular System - Anatomy of Blood Vessels - Coursera.txt'
    # print(df.head())

    return df

In [38]:
# create dataframe with all files from data

df = pd.DataFrame()

for i in os.listdir('./Data'):
    nDf = read(filename=i)
    df = pd.concat([df, nDf])

df

Unnamed: 0,sentence,doc
0,The vital importance of the heart is obvious.,Cardiovascular System - Anatomy of Blood Vesse...
1,If one assumes an average rate of contraction ...,Cardiovascular System - Anatomy of Blood Vesse...
2,Each of the major pumping chambers of the hear...,Cardiovascular System - Anatomy of Blood Vesse...
3,This would be equal to 5.25 liters of fluid pe...,Cardiovascular System - Anatomy of Blood Vesse...
4,"Over one year, that would equal 10,000,000 lit...",Cardiovascular System - Anatomy of Blood Vesse...
...,...,...
691,""",Cardiovascular System - Anatomy of Blood Ves...",Cardiovascular System - Anatomy of Blood Vesse...
692,""",Cardiovascular System - Anatomy of Blood Ves...",Cardiovascular System - Anatomy of Blood Vesse...
693,""",Cardiovascular System - Anatomy of Blood Ves...",Cardiovascular System - Anatomy of Blood Vesse...
694,""",Cardiovascular System - Anatomy of Blood Ves...",Cardiovascular System - Anatomy of Blood Vesse...


In [39]:
# environment variables
EMBEDDING_MODEL = 'text-embedding-3-small'
env = dotenv.load_dotenv('./.env')
env

True

In [40]:
# environment keys
vals = dotenv.dotenv_values()

OPENAI_KEY = vals['OPENAI_KEY']
PINECONE_KEY = vals["PINECONE_KEY"]
PINECONE_HOST = vals["PINECONE_HOST"]

In [74]:
# embed all sentences in the dataframe and save to a csv

client = OpenAI(api_key=OPENAI_KEY)
openai.api_key = OPENAI_KEY

def get_embedding(text):
    embedding = client.embeddings.create(input = [text], model=EMBEDDING_MODEL).data[0].embedding
    # print(text)
    # print(embedding)
    return embedding

# df['ada_embedding'] = df['sentence'].apply(lambda x: get_embedding(x))
# df.to_csv('Data/total_embeddings.csv', index=False)
# df

In [42]:
# display df

df

Unnamed: 0,sentence,doc
0,The vital importance of the heart is obvious.,Cardiovascular System - Anatomy of Blood Vesse...
1,If one assumes an average rate of contraction ...,Cardiovascular System - Anatomy of Blood Vesse...
2,Each of the major pumping chambers of the hear...,Cardiovascular System - Anatomy of Blood Vesse...
3,This would be equal to 5.25 liters of fluid pe...,Cardiovascular System - Anatomy of Blood Vesse...
4,"Over one year, that would equal 10,000,000 lit...",Cardiovascular System - Anatomy of Blood Vesse...
...,...,...
691,""",Cardiovascular System - Anatomy of Blood Ves...",Cardiovascular System - Anatomy of Blood Vesse...
692,""",Cardiovascular System - Anatomy of Blood Ves...",Cardiovascular System - Anatomy of Blood Vesse...
693,""",Cardiovascular System - Anatomy of Blood Ves...",Cardiovascular System - Anatomy of Blood Vesse...
694,""",Cardiovascular System - Anatomy of Blood Ves...",Cardiovascular System - Anatomy of Blood Vesse...


In [43]:
# save sentences

df.to_csv("./sentences.csv")

In [44]:
# PC crashed here, so had to use text output
embeddings = open("embeddings.txt")
lines = embeddings.readlines()

In [45]:
# initializing pinecone database
p = pinecone.Pinecone(api_key=PINECONE_KEY)

In [46]:
# reference to serverside vector store
index = p.Index(host=PINECONE_HOST)

In [47]:
# for i in range(0, len(lines), 2):
#     sentence = lines[i]
#     embed_strings = lines[i+1][1:len(lines[i+1]) - 2].split(',')
#     embed_floats = [float(j) for j in embed_strings]
    
#     index.upsert([(str(i/2), embed_floats, {'sentence': sentence})])

In [61]:
# change this to change query
user_input = "What do increased levels of thyroid hormones do?"

In [50]:
# embed query
query = get_embedding()
query

What do increased levels of thyroid hormones do?
[-0.010554896667599678, 0.02628456801176071, 0.014528383500874043, 0.06406863033771515, 0.006915099918842316, 0.05597791448235512, 0.003131559817120433, -0.009769439697265625, -0.049488913267850876, 4.64840886706952e-05, 0.0035165876615792513, 0.04028931260108948, -0.01765994355082512, -0.021931186318397522, -0.020760701969265938, 0.008049648255109787, 0.019959842786192894, -0.03967326879501343, -0.011068266816437244, 0.03694213926792145, 0.042507074773311615, -0.02677740342915058, 0.032424475997686386, -0.010811581276357174, -0.03792781010270119, 0.01806037314236164, -0.015637263655662537, 0.006288787815719843, 0.029221046715974808, -0.015390845946967602, 0.008927511982619762, -0.017238980159163475, 0.03209592029452324, 0.005878091789782047, -0.025298895314335823, -0.01606849581003189, -0.010298211127519608, 0.016848817467689514, 0.0050643994472920895, 0.011776718311011791, 0.052733413875103, -0.008727298118174076, -0.013419503346085548

[-0.010554896667599678,
 0.02628456801176071,
 0.014528383500874043,
 0.06406863033771515,
 0.006915099918842316,
 0.05597791448235512,
 0.003131559817120433,
 -0.009769439697265625,
 -0.049488913267850876,
 4.64840886706952e-05,
 0.0035165876615792513,
 0.04028931260108948,
 -0.01765994355082512,
 -0.021931186318397522,
 -0.020760701969265938,
 0.008049648255109787,
 0.019959842786192894,
 -0.03967326879501343,
 -0.011068266816437244,
 0.03694213926792145,
 0.042507074773311615,
 -0.02677740342915058,
 0.032424475997686386,
 -0.010811581276357174,
 -0.03792781010270119,
 0.01806037314236164,
 -0.015637263655662537,
 0.006288787815719843,
 0.029221046715974808,
 -0.015390845946967602,
 0.008927511982619762,
 -0.017238980159163475,
 0.03209592029452324,
 0.005878091789782047,
 -0.025298895314335823,
 -0.01606849581003189,
 -0.010298211127519608,
 0.016848817467689514,
 0.0050643994472920895,
 0.011776718311011791,
 0.052733413875103,
 -0.008727298118174076,
 -0.013419503346085548,
 0.00

In [54]:
# get top 5 documents from the vector store
docs = index.query(top_k=5, vector=query)
docs

{'matches': [{'id': '470.0', 'score': 0.633533716, 'values': []},
             {'id': '474.0', 'score': 0.55880785, 'values': []},
             {'id': '472.0', 'score': 0.485366166, 'values': []},
             {'id': '471.0', 'score': 0.480531275, 'values': []},
             {'id': '540.0', 'score': 0.416634649, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

In [57]:
# get all valid ideas from the server response with threshold
valid_ids = []

for i in docs['matches']:
    if(i['score'] > 0.4):
        valid_ids.append(i['id'])
valid_ids

['470.0', '474.0', '472.0', '471.0', '540.0']

In [59]:
# fetch all associated vectors with metadata
fetched = index.fetch(valid_ids)['vectors']
fetched

{'471.0': {'id': '471.0',
  'metadata': {'sentence': 'The impact of thyroid hormone is typically of a '
                           'much longer duration than that of the '
                           'catecholamines.\n'},
  'values': [-0.0438908637,
             0.0296973,
             -0.00326724909,
             0.0422749817,
             -0.0138332648,
             0.0662075132,
             0.027273478,
             -0.00633251294,
             0.0124684991,
             -0.00995733,
             0.0118570849,
             -0.0299375,
             0.022818882,
             -0.0228407197,
             0.0118352482,
             0.00874541886,
             0.00508511718,
             -0.0225131754,
             0.0107761901,
             0.0190302934,
             0.0296317935,
             0.00793201849,
             0.0439127,
             0.00144528691,
             0.00564467115,
             -0.00278821634,
             -0.0374710076,
             0.0248933267,
             0.009

In [70]:
# create end query

queryText = f"Query: \n{user_input}\n\nAdditional Info:\n"
print(query)

Query: 
What do increased levels of thyroid hormones do?

Additional Info:
The impact of thyroid hormone is typically of a much longer duration than that of the catecholamines.
In addition to their stimulatory effects on HR, they also bind to both alpha and beta receptors on the cardiac muscle cell membrane to increase metabolic rate and the force of contraction.
Excessive levels of thyroxin may trigger tachycardia.
Thyroid Hormones :  In general, increased levels of thyroid hormone, or thyroxin, increase cardiac rate and contractility.
The physiologically active form of thyroid hormone, T3 or triiodothyronine, has been shown to directly enter cardiomyocytes and alter activity at the level of the genome.



In [71]:
# append all sentences to the end query
for val in fetched.keys():
    queryText += fetched[val]['metadata']['sentence']

print(queryText)

Query: 
What do increased levels of thyroid hormones do?

Additional Info:
The impact of thyroid hormone is typically of a much longer duration than that of the catecholamines.
In addition to their stimulatory effects on HR, they also bind to both alpha and beta receptors on the cardiac muscle cell membrane to increase metabolic rate and the force of contraction.
Excessive levels of thyroxin may trigger tachycardia.
Thyroid Hormones :  In general, increased levels of thyroid hormone, or thyroxin, increase cardiac rate and contractility.
The physiologically active form of thyroid hormone, T3 or triiodothyronine, has been shown to directly enter cardiomyocytes and alter activity at the level of the genome.



In [None]:
# query chatgpt
response = client.chat.completions.create(
    model='gpt-3.5-turbo',
    messages=[
        {"role":"system", "content":"You are a helpful assistant that answers the given query with the context given by the documents. The format of queries to you will be Query (line break) Additional Info: (additional info)"},
        {"role":"user", "content":query},
    ]
)

In [86]:
# return the response to the user
print(response.choices[0].message.content)

Increased levels of thyroid hormones, specifically thyroxin, have several effects on the body. Firstly, they increase the cardiac rate and contractility, leading to an increase in heart rate. This is due to the binding of thyroid hormones to both alpha and beta receptors on the cardiac muscle cell membrane. Additionally, excessive levels of thyroxin can trigger tachycardia, a condition characterized by an excessively fast heart rate. It is also important to note that the impact of thyroid hormones on the body is typically longer-lasting compared to the effects of catecholamines. Furthermore, the active form of thyroid hormone, T3 or triiodothyronine, can directly enter cardiomyocytes and alter activity at the level of the genome.
