In [249]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [250]:
import os
import requests
from transformers import AutoTokenizer
import json
import PyPDF2 as pdf_loader
import pandas as pd
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import scipy

# Install all of API keys for Groq

In [251]:
groq_api_key = os.environ.get("GROQ_API_KEY")

# Install the source documents

In [252]:
model = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model, clean_up_tokenization_spaces=False)

In [253]:
sample = "Hi, how are you doing???"
encoded = tokenizer(sample, padding=True, add_special_tokens=False)
print(encoded)

{'input_ids': [7632, 1010, 2129, 2024, 2017, 2725, 1029, 1029, 1029], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [254]:
source_folder = "/Users/presteddy56"
with open(f"{source_folder}/resources/doc_name.json", 'r') as file:
    data = json.load(file)
print(data)


{'countries': ['India', 'United States', 'United Kingdom'], 'files': ['clinical_practice_guidelines_for_the_management_of.11.pdf', 'APA_guideline.pdf', 'NICE_guideline.pdf']}


In [255]:
import PyPDF2 as pdf_loader
full_documents = {}
root = "/Users/presteddy56/resources/pdfs/"
for i in range(len(data['files'])):
    country_name = data["countries"][i]
    file_path = root+data["files"][i]
    contents = pdf_loader.PdfReader(file_path)
    pages = ""
    for m in range(40):
        try:
            page = contents.pages[m].extract_text()
            pages=pages + page
        except:pass
    full_documents[country_name] = pages

In [256]:
#loader ={}
#full_pages = {}
#for i in range(len(data['files'])):
#    country_name = data["countries"][i]
#    loader[country_name] = PyPDFLoader(f"{source_folder}/resources/pdfs/{data["files"][i]}",extract_images=False)
#    full_pages[country_name] = loader[country_name].load_and_split()

In [257]:
sample_text = full_documents["United States"]
for i in full_documents:
    print(i)

India
United States
United Kingdom


In [258]:
def split_into_many(text: str, tokenizer=AutoTokenizer.from_pretrained("distilbert-base-uncased",clean_up_tokenization_spaces=False), max_tokens: int = 512) -> list:
    """ Function to split a string into many strings of a specified number of tokens """

    
    sentences = text.split('. ') #A
    n_tokens = []
    new_sentences = ""
    for sentence in sentences:
        TK = tokenizer(" " + sentence, padding=True, add_special_tokens=True,truncation=True, max_length=512)
        n_tokens.append(len(TK["input_ids"]))
        sentence += ". "
        new_sentences += sentence
    
    new_sentences = new_sentences.split(". ")
 #B
    chunks = []
    tokens_so_far = 0
    chunk = []

    for sentence, token in zip(new_sentences, n_tokens): #C

        if tokens_so_far + token > max_tokens: #D 
            chunks.append(". ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        if token > max_tokens:#E 
            continue
        chunk.append(sentence) #F
        tokens_so_far += token + 1

    return chunks
#A Split the text into sentences
#B Get the number of tokens for each sentence
#C Loop through the sentences and tokens joined together in a tuple
#D If the number of tokens so far plus the number of tokens in the current sentence is greater than the max number of tokens, then add the chunk to the list of chunks and reset
#E If the number of tokens in the current sentence is greater than the max number of tokens, go to the next sentence
#F # Otherwise, add the sentence to the chunk and add the number of tokens to the total

In [259]:
def tokenize(text,max_tokens) -> pd.DataFrame:
    """ Function to split the text into chunks of a maximum number of tokens """

    
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased",clean_up_tokenization_spaces=True) #A

    limitted_doc = split_into_many(text, tokenizer)
    df = pd.DataFrame({'title':[], 'text':[],'n_tokens':[]})
    for i in range(len(limitted_doc)):
        df_tmp= pd.DataFrame({'title':[], 'text':[],'n_tokens':[]})
        df_tmp.at[0,'title']=0
        texts=str(limitted_doc[i])
        df_tmp.at[0,'text'] = texts
        TK = tokenizer(" " + texts, padding=True, add_special_tokens=True,truncation=True, max_length=512)
        df_tmp['n_tokens']= len(TK["input_ids"])
        df = pd.concat([df, df_tmp], axis=0)
    #B
    shortened = []

    for row in df.iterrows():

        if row[1]['text'] is None: #C
            continue

        if row[1]['n_tokens'] > max_tokens: #D
            shortened += split_into_many(row[1]['text'], tokenizer, max_tokens)

        else: #E
            shortened.append(row[1]['text'])

    df = pd.DataFrame(shortened, columns=['text'])
    
    df['n_tokens'] = df.text.apply(lambda x: len(tokenizer(x, padding=True, add_special_tokens=True,truncation=True, max_length=512)["input_ids"]))

    return df
#A Load the transformaer tokenizer which is designed to work with the distilbert-base-uncased
#B Tokenize the text and save the number of tokens to a new column
#C If the text is None, go to the next row
#D If the number of tokens is greater than the max number of tokens, split the text into chunks
#E Otherwise, add the text to the list of shortened texts

In [260]:
tokenized = pd.DataFrame(({'text':[], 'n_tokens':[],'countries':[]}))
for i in full_documents:
    single_tokenized = tokenize(full_documents[i],512)
    single_tokenized['countries'] = i
    tokenized = pd.concat([tokenized,single_tokenized],axis=0, ignore_index=True)

Management of depression involves comprehensive 
assessment and proper establishment of diagnosis. The 
assessment must be based on detailed history, physical 
examination and mental state examinations.History must be 
obtained from all sources, especially the family. The diagnosis 
must be recorded as per the current diagnostic criteria. 
Depression often presents with a combination of 
symptoms of depressed mood, loss of interest or pleasure, 
decreased energy and fatigue, reduced concentration 
and attention,  reduced self-esteem and self-confidence, 
ideas of guilt and unworthiness, bleak and pessimistic 
views of the future, ideas or acts of self-harm or suicide, 
disturbed sleep and diminished appetite. Depending on 
the severity of depression some of these symptoms may 
be more marked and develop characteristic features that 
are widely regarded as having special clinical significance. 
These symptoms are known as somatic symptoms of 
depression and include symptoms of loss of i

In [261]:
tokenized.head()

Unnamed: 0,text,n_tokens,countries
0,© 2017 Indian Journal of Psychiatry | Publishe...,467.0,India
1,When depression occurs in the \ncontext of med...,364.0,India
2,Indian J Psychiatry 2017;59:34-50.This is an o...,512.0,India
3,A careful \nevaluation of these patients often...,336.0,India
4,\nIt is always important to take the longitudi...,475.0,India


In [262]:
query = "What is the best treatment for depression? you should provide both pharmacology and psycho social therapy"

In [263]:
def BM25(corpus:str, query:str):
    tokenized_corpus = [corpus.split(" ")]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = query.split(" ")
    doc_scores=bm25.get_scores(tokenized_query)
    return doc_scores

In [264]:
bm_data = []
for pop in tokenized["text"]:
    bm_data_tmp = BM25(pop, query)
    bm_data.append(*bm_data_tmp)

tokenized["BM25"] = bm_data

In [265]:
hugging_face_api_key = os.environ.get("HF_TOKEN")
model_id = "sentence-transformers/all-MiniLM-L6-v2"
hf_token = hugging_face_api_key

api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Presteddy56 {hf_token}"}


In [266]:
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')



In [267]:
texts = list(tokenized["text"])

In [268]:
output=embedding_model.encode(texts)

In [269]:
tokenized["embeddings"]=list(output)

In [270]:
query_embedding = embedding_model.encode(query)

In [271]:
cos =[]
for emb in tokenized["embeddings"]:
    cos_tmp = scipy.spatial.distance.cosine(emb,query_embedding)
    cos.append(cos_tmp)
tokenized["cosine"]=cos

In [272]:
tokenized["cosine"].describe()

count    165.000000
mean       0.680152
std        0.196678
min        0.285604
25%        0.513172
50%        0.637463
75%        0.881490
max        1.081101
Name: cosine, dtype: float64

In [273]:
tokenized["BM25"].describe()

count    165.000000
mean      -2.200427
std        0.774615
min       -3.553068
25%       -2.739387
50%       -2.343468
75%       -1.726391
max        0.000000
Name: BM25, dtype: float64

In [274]:
BM25_selection_1SD = tokenized.loc[tokenized["BM25"]>=tokenized["BM25"].mean()+tokenized["BM25"].std()]
cos_BM25_selection_1SD=cos_selection.loc[cos_selection["cosine"]>=tokenized["cosine"].mean()+tokenized["cosine"].std()]
display(cos_BM25_selection_1SD)

Unnamed: 0,text,n_tokens,countries,BM25,embeddings,cosine
8,Simple measures \nwhich can help in improving ...,402.0,India,-1.399423,"[-0.04302568, -0.29623517, -0.21660659, 0.0448...",0.982414
43,Guidelines differ from standards \nin that sta...,449.0,United States,-2.181859,"[-0.02832953, -0.04548462, -0.13310984, -0.110...",0.962611
46,It addresses three developmental cohorts: \nch...,491.0,United States,-1.855989,"[-0.11591277, -0.06624296, -0.2720606, -0.0933...",0.971025
50,"Briefly, with the support of the \nRTI-UNC Ev...",449.0,United States,-1.123581,"[-0.44738993, 0.07474688, -0.08138804, 0.06965...",0.988492
52,"53 of the guideline. Further, the panel noted...",430.0,United States,-1.912541,"[-0.23610339, 0.23025295, 0.010320518, 0.03267...",0.911107
89,How the current guideline comple -\nments thes...,448.0,United States,-1.836406,"[-0.30415174, 0.1141779, 0.068952054, -0.15646...",0.957117
91,Another goal of the panel was to take a method...,479.0,United States,-2.015312,"[-0.040086612, 0.065011375, 0.020500641, -0.06...",0.93632
97,While intellectual affilia -\ntions were expec...,483.0,United States,-1.291226,"[-0.18655144, -0.074852206, -0.08843546, 0.045...",1.005038
102,To view the list of keywords used in \nsearch...,486.0,United States,-1.166759,"[-0.109651834, 0.05208563, -0.12957194, -0.081...",0.983134
106,\nAmong the reviews that were not provided or ...,480.0,United States,-1.606118,"[-0.18377659, -0.07242546, -0.049490474, -0.05...",0.908734


In [275]:
BM25_selection_mean = tokenized.loc[tokenized["BM25"]>=tokenized["BM25"].mean()]
cos_BM25_selection_mean=cos_selection.loc[cos_selection["cosine"]>=tokenized["cosine"].mean()]
display(cos_BM25_selection_mean)

Unnamed: 0,text,n_tokens,countries,BM25,embeddings,cosine
8,Simple measures \nwhich can help in improving ...,402.0,India,-1.399423,"[-0.04302568, -0.29623517, -0.21660659, 0.0448...",0.982414
43,Guidelines differ from standards \nin that sta...,449.0,United States,-2.181859,"[-0.02832953, -0.04548462, -0.13310984, -0.110...",0.962611
46,It addresses three developmental cohorts: \nch...,491.0,United States,-1.855989,"[-0.11591277, -0.06624296, -0.2720606, -0.0933...",0.971025
50,"Briefly, with the support of the \nRTI-UNC Ev...",449.0,United States,-1.123581,"[-0.44738993, 0.07474688, -0.08138804, 0.06965...",0.988492
52,"53 of the guideline. Further, the panel noted...",430.0,United States,-1.912541,"[-0.23610339, 0.23025295, 0.010320518, 0.03267...",0.911107
89,How the current guideline comple -\nments thes...,448.0,United States,-1.836406,"[-0.30415174, 0.1141779, 0.068952054, -0.15646...",0.957117
91,Another goal of the panel was to take a method...,479.0,United States,-2.015312,"[-0.040086612, 0.065011375, 0.020500641, -0.06...",0.93632
97,While intellectual affilia -\ntions were expec...,483.0,United States,-1.291226,"[-0.18655144, -0.074852206, -0.08843546, 0.045...",1.005038
102,To view the list of keywords used in \nsearch...,486.0,United States,-1.166759,"[-0.109651834, 0.05208563, -0.12957194, -0.081...",0.983134
106,\nAmong the reviews that were not provided or ...,480.0,United States,-1.606118,"[-0.18377659, -0.07242546, -0.049490474, -0.05...",0.908734


In [287]:
BM25_selection_max = tokenized.loc[tokenized["BM25"]>=tokenized["BM25"].max()]
cos_BM25_selection_max=cos_selection.loc[cos_selection["cosine"]>=tokenized["cosine"].max()]
display(cos_BM25_selection_max)

Unnamed: 0,text,n_tokens,countries,BM25,embeddings,cosine
121,13 \n1.4 Deliv ery of tr eatment s ..............,499.0,United Kingdom,-0.274653,"[0.1803402, 0.18596578, 0.4367375, -0.37179384...",1.10212
122,62 \n1.9 Further-line tr eatment ................,429.0,United Kingdom,0.0,"[0.05864713, 0.37716228, 0.22813606, 0.1660433...",1.090019


In [310]:
vectorstore.delete(ids=uuids[:])

OperationalError: attempt to write a readonly database

In [309]:
from uuid import uuid4
from langchain_core.documents import Document
documents=[]
for i in cos_BM25_selection_max["countries"].unique():
    selected_documents=""
    for pop in cos_BM25_selection_max["text"].loc[cos_BM25_selection_max["countries"]==i]:
        selected_documents +=pop
        
    document_i = Document(
        page_content= selected_documents,
        metadata={"source":i},
        id=i,
    )
    documents.append(document_i)
    print(documents)

from langchain_huggingface import HuggingFaceEmbeddings
# Initialize the embedding model
embeddings = HuggingFaceEmbeddings(model_name = "mixedbread-ai/mxbai-embed-large-v1")
from langchain_chroma import Chroma

vectorstore = Chroma(
    collection_name="WhatStandards",
    embedding_function=embeddings,
    persist_directory="/Users/presteddy56/resources/chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

uuids = [str(uuid4()) for _ in range(len(documents))]

vectorstore.add_documents(documents=documents, ids=uuids)


[Document(id='United Kingdom', metadata={'source': 'United Kingdom'}, page_content='13 \n1.4 Deliv ery of tr eatment s ................................................................................................................ 14 \n1.5 Treatment f or a new episode of less se vere depr ession ......................................................... 28 \n1.6 Treatment f or a new episode of mor e severe depr ession ....................................................... 47 \n1.7 Beha vioural couples t herap y for depr ession ............................................................................ 61 \n1.8 Preventing r elapse .......................................................................................................................62 \n1.9 Further-line tr eatment ................................................................................................................. 65 \n1.10 Chr onic depr essiv e sympt oms ....................................................

OperationalError: attempt to write a readonly database

In [301]:
#from langchain_community.document_loaders import PyPDFDirectoryLoader
#loader = PyPDFDirectoryLoader("/Users/presteddy56/resources/pdfs/")
#docs = loader.load_and_split()
#print(docs)

In [311]:
from langchain_groq import ChatGroq

llm = ChatGroq(model="llama-3.1-70b-versatile", api_key=groq_api_key, temperature = 0.1)

In [312]:
#from langchain_huggingface import HuggingFaceEmbeddings
# Initialize the embedding model
#embed_model = HuggingFaceEmbeddings(model_name = "mixedbread-ai/mxbai-embed-large-v1")

In [313]:
#from langchain_chroma import Chroma
#vectorstore = Chroma.from_documents(
#    ,
#    embedding = embed_model,
#    persist_directory = f"{source_folder}/resources",
#)

In [314]:
docs = vectorstore.similarity_search(query)
print(docs[0].page_content)

APA | Guideline for the Treatment of Depression  I
APA CLINICAL PRACTICE GUIDELINE 
for the Treatment of Depression 
Across Three Age Cohorts
GUIDELINE DEVELOPMENT PANEL FOR THE TREATMENT OF DEPRESSIVE DISORDERS
APPROVED BY APA COUNCIL OF REPRESENTATIVES  
FEBRUARY 2019II APA  | Guideline for the Treatment of DepressionCopyright © 2019 by the American Psychological Association. This material may be reproduced and distributed without permission provided that 
acknowledgment is given to the American Psychological Association. This material may not be reprinted, translated, or distributed electronically without 
prior permission in writing from the publisher. For permission, contact APA, Rights and Permissions, 750 First Street, NE, Washington, DC 20002-4242. 
This document was approved by the APA Council of Representatives over the course of its February 2019 meeting and is set to expire in approximately 
2024. It is available online at https:/ /www.apa.org/ depression-guideline. 
Please

In [282]:
retriever =vectorstore.as_retriever()

In [283]:
from langchain_core.prompts import PromptTemplate

template = (""" You are a medical doctor and advisor who precisely answers questions related to standard treatment in a diagnosis.
                Use the provided guidelines to answer the questions.
                If you don't know the answer, say so. Do not discuss the context in your response; just provide the answer directly. 
                Then, provide a reference in the given guidelines to review the details. 
                You should refer to the given guidelines as much as possible. 
                If you add something outside the given guideline, you should mention that references are from outside of the given guidelines.

                Guidelines:{guidelines}
                
                Question:{question}
                
                Answer:""")
rag_prompt = PromptTemplate.from_template(template)

In [284]:
# add to the prompt like "You are a clinical psychiatrist. You are supposed to treat depression. Provide precise and easy to understand answers"
# gather the data for schizoph

In [285]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
    {"guidelines": retriever, "question":RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)

In [286]:
from IPython.display import display, Markdown
response = rag_chain.invoke(query)
Markdown(response)

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.1-70b-versatile` in organization `org_01j81pvpwpe63rq3pa38hs789w` on tokens per minute (TPM): Limit 20000, Used 0, Requested 179295. Please try again in 7m57.885s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

In [246]:
import gradio as gr

def rag_memory_stream(text):
    partial_text =""
    for new_text in rag_chain.stream(text):
        partial_text += new_text
        # Yield an empty string to clearnup the message textbox and the updated conversation history
        yield partial_text
title = "Real-time AI App with Groq API and LangChain"
description = """<center> <img src="" alt = "logo" width="550"/></center>"""

demo = gr.Interface(
    title = title,
    description = description,
    fn = rag_memory_stream,
    inputs = "text",
    outputs= "text",
    live = False,
    allow_flagging="never",
)
demo.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "/Users/presteddy56/Library/Caches/pypoetry/virtualenvs/whatstandard-dAjhHrvy-py3.12/lib/python3.12/site-packages/gradio/queueing.py", line 536, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/presteddy56/Library/Caches/pypoetry/virtualenvs/whatstandard-dAjhHrvy-py3.12/lib/python3.12/site-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/presteddy56/Library/Caches/pypoetry/virtualenvs/whatstandard-dAjhHrvy-py3.12/lib/python3.12/site-packages/gradio/blocks.py", line 1935, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/presteddy56/Library/Caches/pypoetry/virtualenvs/whatstandard-dAjhHrvy-py3.12/lib/python3.12/site-packages/gradio/blocks.py", line 1532, in call_function
    pred