In [1]:
import itertools

from fastapi import FastAPI, Form, Request, Response, File, Depends, HTTPException, status
from fastapi.responses import RedirectResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from fastapi.encoders import jsonable_encoder
from langchain_community.llms import CTransformers, LlamaCpp
from langchain.chains import QAGenerationChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.docstore.document import Document
from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import RetrievalQA
import os 
import json
import time
import uvicorn
import aiofiles
from PyPDF2 import PdfReader
import csv
from functools import lru_cache

# FastAPI application object for the web front end.
app = FastAPI()

# Serve files from the local "static" directory under the /static URL prefix.
app.mount("/static", StaticFiles(directory="static"), name="static")

# Jinja2 template loader rooted at the local "templates" directory.
templates = Jinja2Templates(directory="templates")

#@lru_cache(maxsize=4096)  # Adjust maxsize as needed
def load_llm(model_path=None):
    """Build the CTransformers LLM used for question and answer generation.

    Args:
        model_path: Optional path to a local GGUF model file. When omitted,
            the original hard-coded Nous-Hermes-2-SOLAR path is used, so
            existing ``load_llm()`` calls behave exactly as before.

    Returns:
        A ``CTransformers`` instance configured with the settings below.
    """
    if model_path is None:
        # Default kept for backward compatibility with existing callers.
        # Earlier candidates: 'mistral-7b-instruct-v0.1.Q5_K_M.gguf',
        # "mistral-7b-instruct-v0.1.Q4_K_S.gguf".
        model_path = '/home/rps/projects/Question-Answer-Generation-App-using-Mistral-7B/NousResearch/gguf/nous-hermes-2-solar-10.7b.Q5_K_M.gguf'

    # Generation settings handed to the backend.
    config = {
        # NOTE(review): 10000 presumably means "offload every layer" - confirm.
        'gpu_layers': 10000,
        #'model_type': "mistral",
        'max_new_tokens': 2048,
        'context_length': 4096,
        'temperature': 0,
    }

    return CTransformers(model=model_path, config=config)

#######################################################################
def load_llm_llama(model_path=None):
    """Build a llama.cpp-backed LLM as an alternative to ``load_llm``.

    Args:
        model_path: Optional path to a GGUF model file. Defaults to the
            original 'qwen1_5-14b-chat-q5_0.gguf' (resolved relative to the
            working directory), so ``load_llm_llama()`` is unchanged for
            existing callers.

    Returns:
        A configured ``LlamaCpp`` instance.
    """
    if model_path is None:
        # Make sure the model path is correct for your system!
        model_path = 'qwen1_5-14b-chat-q5_0.gguf'

    n_gpu_layers = -1  # Number of layers to put on the GPU; -1 moves all of them.
    n_ctx = 4096       # Context window size in tokens.
    n_batch = 2048     # Should be between 1 and n_ctx; consider available VRAM.

    return LlamaCpp(
        model_path=model_path,
        n_gpu_layers=n_gpu_layers,
        n_batch=n_batch,
        # callback_manager=callback_manager,
        temperature=0,
        max_tokens=2048,
        # The LlamaCpp wrapper names its context-size field `n_ctx`. The former
        # `context_length=4096` keyword is not a LlamaCpp field, so the intended
        # window was not applied through it.
        n_ctx=n_ctx,
        f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
        #top_p=1,
        verbose=False,  # Verbose is required to pass to the callback manager
    )
########################################################################
# Module-level LLM (CTransformers backend) that pipe_vector and pipe_gen read as a global.
llm = load_llm()

def file_processing(file_path):
    """Load a PDF and split its text into two sets of chunks.

    The text of every page is concatenated into one string, which is split
    with chunk_size=1000 / chunk_overlap=100 for question generation. Those
    chunks are then split again with chunk_size=300 / chunk_overlap=30 for
    answer retrieval.

    Args:
        file_path: Path of the PDF handed to ``PyPDFLoader``.

    Returns:
        Tuple ``(document_ques_gen, document_answer_gen)``: the list of
        coarse ``Document`` chunks and the finer chunks derived from them.
    """
    pages = PyPDFLoader(file_path).load()

    # Merge all pages so chunk boundaries are not tied to page boundaries.
    full_text = "".join(page.page_content for page in pages)

    coarse_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    document_ques_gen = [
        Document(page_content=piece)
        for piece in coarse_splitter.split_text(full_text)
    ]

    fine_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=30)
    document_answer_gen = fine_splitter.split_documents(document_ques_gen)

    return document_ques_gen, document_answer_gen

def pipe_vector(file_path):
    """Generate candidate question lines for a PDF and index its text.

    The PDF is chunked by ``file_processing``. A "refine" summarize chain
    built on the module-level ``llm`` is run once per coarse chunk, and the
    outputs are flattened into individual lines. The fine chunks are embedded
    and stored in a FAISS index.

    Args:
        file_path: Path of the PDF to process.

    Returns:
        Tuple ``(ques, vector_store)``: the list of generated text lines
        (unfiltered) and the FAISS store built from the answer chunks.
    """
    document_ques_gen, document_answer_gen = file_processing(file_path)

    # Prompt applied to the first document of each chain run.
    initial_prompt = PromptTemplate(
        input_variables=["text"],
        template="""
    You are an expert at creating questions based in medicine materials and documentation.
    Your goal is to prepare a very good material for their exam and  tests.
    You do this by asking questions about the text below:

    ------------
    {text}
    ------------

    Create questions that will prepare medicine students for their tests.
    Make sure not to lose any important information.

    QUESTIONS:
    """,
    )

    # Prompt for refining existing questions with additional context.
    # NOTE(review): every run below receives a single-document list, so this
    # prompt presumably never fires - confirm against the refine chain's docs.
    followup_prompt = PromptTemplate(
        input_variables=["existing_answer", "text"],
        template="""
    You are an expert at creating practice questions based on medicine and books.
    Your goal is to help to prepare medicine students for a test.
    We have received some practice questions to a certain extent: {existing_answer}.
    We have the option to refine the existing questions or add new ones.
    (only if necessary) with some more context below.
    ------------
    {text}
    ------------

    Given the new context, refine the original questions in Portuguese.
    If the context is not helpful, please provide the original questions.
    QUESTIONS:
    """,
    )

    question_chain = load_summarize_chain(
        llm=llm,
        chain_type="refine",
        verbose=True,
        question_prompt=initial_prompt,
        refine_prompt=followup_prompt,
    )

    # One chain run per coarse chunk; each run yields a block of text.
    generated_blocks = []
    for chunk_doc in document_ques_gen:
        generated_blocks.append(question_chain.run([chunk_doc]))

    # Flatten the blocks into individual lines. Join-then-split is kept on
    # purpose: with no chunks it yields [''] rather than [].
    ques = "\n".join(generated_blocks).split("\n")

    # NOTE(review): an E5 model is loaded through the BGE embeddings wrapper;
    # confirm the wrapper's defaults suit this model.
    embedder = HuggingFaceBgeEmbeddings(model_name="intfloat/multilingual-e5-large-instruct")
    vector_store = FAISS.from_documents(document_answer_gen, embedder)

    return ques, vector_store

def pipe_gen(vector_store, ques):
    """Build the answer chain and keep only lines that look like questions.

    Args:
        vector_store: Store whose ``as_retriever()`` feeds the RetrievalQA chain.
        ques: Iterable of text lines produced by question generation.

    Returns:
        Tuple ``(answer_generation_chain, filtered_ques_list)``. The list
        holds the lines of ``ques`` ending in '?' or '.', in original order.
    """
    # str.endswith accepts a tuple of suffixes; no stripping is applied, so
    # lines with trailing whitespace are dropped.
    accepted_endings = ('?', '.')
    filtered_ques_list = [line for line in ques if line.endswith(accepted_endings)]

    retriever = vector_store.as_retriever()
    answer_generation_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
    )
    return answer_generation_chain, filtered_ques_list


def get_csv(file_path):
    """Run the full PDF -> questions -> answers pipeline and save a CSV.

    Args:
        file_path: Path of the PDF to process.

    Returns:
        Path of the written CSV file ('static/output/QAx.csv'). The file has
        a header row followed by one (question, answer) row per question.
    """
    # pipe_vector returns (ques, vector_store); both are needed by pipe_gen.
    ques, vector_store = pipe_vector(file_path)
    answer_generation_chain, ques_list = pipe_gen(vector_store, ques)

    base_folder = 'static/output/'
    # makedirs creates missing parent folders and tolerates an existing one.
    os.makedirs(base_folder, exist_ok=True)
    output_file = base_folder + "QAx.csv"

    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["Question", "Helpful Answer"])  # Writing the header row

        for question in ques_list:
            print("Question: ", question)
            answer = answer_generation_chain.run(question)
            print("Answer: ", answer)
            print("--------------------------------------------------\n\n")

            # Save answer to CSV file
            csv_writer.writerow([question, answer])
    return output_file

In [2]:
# Source PDF for this run; the commented line below is an alternative input.
# NOTE(review): the path begins with '//' - Linux resolves it like '/', but confirm it is not a typo.
file_path = '//home/rps/Downloads/REUMATOLOGIA.pdf'
#file_path = '/home/rps/Downloads/ansiedade_4_8.pdf'
# que: chunks for question generation (chunk_size=1000); ans: chunks for retrieval (chunk_size=300).
que,ans = file_processing(file_path)
len(que),len(ans)



(132, 525)

In [3]:
# Peek at the first two question-generation chunks.
que[:2]

[Document(page_content='J√©ssica Nicolau - 1 REUMATOLOGIA  \n \n_________________________________________________________________________________  \n \nT√ìPICOS  \n1. L√∫pus Eritematoso Sist√™mico  \n‚óè Introdu√ß√£o  \n‚óè Quadro cl√≠nico  \n‚óã Manifesta√ß√µes sist√™micas  \n‚óã Manifesta√ß√µes laboratoriais  \n‚óè Diagn√≥stico  \n‚óè Tratamento  \n‚óè Gravidez e contracep√ß√£o  \n‚óè S√≠ndrome do Anticorpo Antifosfolip√≠deo (SAF)  \n2. Esclerodermia  \n‚óè Introdu√ß√£o  \n‚óè Classifica√ß√£o  \n‚óã Forma localizada  \n‚óã Forma sist√™mica  \n‚ñ† Cut√¢nea difusa  \n‚ñ† Cut√¢nea limitada - CREST  \n‚ñ† Visceral  \n‚óè Quadro cl√≠nico  \n‚óè Diagn√≥stico  \n‚óè Tratamento  \n3. S√≠ndrome de Sj√∂gren  \n‚óè Introdu√ß√£o  \n‚óè Quadro cl√≠nico  \n‚óè Exames laboratoriais  \n \n \nJ√©ssica Nicolau - 2 ‚óè Diagn√≥stico  \n‚óè Tratamento  \n4. Vasculites  \n‚óè Introdu√ß√£o  \n‚óè Vasculites de grandes vasos  \n‚óã Arterite Temporal  \n‚óã Arterite de Takayasu  \n‚óè Vasculites de m√©dios v

In [4]:
# Print every question-generation chunk (the whole list, so the output is long).
for i in que:
    print(i)

page_content='J√©ssica Nicolau - 1 REUMATOLOGIA  \n \n_________________________________________________________________________________  \n \nT√ìPICOS  \n1. L√∫pus Eritematoso Sist√™mico  \n‚óè Introdu√ß√£o  \n‚óè Quadro cl√≠nico  \n‚óã Manifesta√ß√µes sist√™micas  \n‚óã Manifesta√ß√µes laboratoriais  \n‚óè Diagn√≥stico  \n‚óè Tratamento  \n‚óè Gravidez e contracep√ß√£o  \n‚óè S√≠ndrome do Anticorpo Antifosfolip√≠deo (SAF)  \n2. Esclerodermia  \n‚óè Introdu√ß√£o  \n‚óè Classifica√ß√£o  \n‚óã Forma localizada  \n‚óã Forma sist√™mica  \n‚ñ† Cut√¢nea difusa  \n‚ñ† Cut√¢nea limitada - CREST  \n‚ñ† Visceral  \n‚óè Quadro cl√≠nico  \n‚óè Diagn√≥stico  \n‚óè Tratamento  \n3. S√≠ndrome de Sj√∂gren  \n‚óè Introdu√ß√£o  \n‚óè Quadro cl√≠nico  \n‚óè Exames laboratoriais  \n \n \nJ√©ssica Nicolau - 2 ‚óè Diagn√≥stico  \n‚óè Tratamento  \n4. Vasculites  \n‚óè Introdu√ß√£o  \n‚óè Vasculites de grandes vasos  \n‚óã Arterite Temporal  \n‚óã Arterite de Takayasu  \n‚óè Vasculites de m√©dios vasos  \n‚ó

In [5]:
# Display every answer-retrieval chunk (the whole list, so the output is long).
ans

[Document(page_content='J√©ssica Nicolau - 1 REUMATOLOGIA  \n \n_________________________________________________________________________________  \n \nT√ìPICOS  \n1. L√∫pus Eritematoso Sist√™mico  \n‚óè Introdu√ß√£o  \n‚óè Quadro cl√≠nico  \n‚óã Manifesta√ß√µes sist√™micas  \n‚óã Manifesta√ß√µes laboratoriais  \n‚óè Diagn√≥stico  \n‚óè Tratamento'),
 Document(page_content='‚óè Tratamento  \n‚óè Gravidez e contracep√ß√£o  \n‚óè S√≠ndrome do Anticorpo Antifosfolip√≠deo (SAF)  \n2. Esclerodermia  \n‚óè Introdu√ß√£o  \n‚óè Classifica√ß√£o  \n‚óã Forma localizada  \n‚óã Forma sist√™mica  \n‚ñ† Cut√¢nea difusa  \n‚ñ† Cut√¢nea limitada - CREST  \n‚ñ† Visceral  \n‚óè Quadro cl√≠nico  \n‚óè Diagn√≥stico  \n‚óè Tratamento'),
 Document(page_content='‚óè Tratamento  \n3. S√≠ndrome de Sj√∂gren  \n‚óè Introdu√ß√£o  \n‚óè Quadro cl√≠nico  \n‚óè Exames laboratoriais  \n \n \nJ√©ssica Nicolau - 2 ‚óè Diagn√≥stico  \n‚óè Tratamento  \n4. Vasculites  \n‚óè Introdu√ß√£o  \n‚óè Vasculites de grandes vasos

In [6]:
# Load the PDF page by page (no chunking) and show the first two page documents.
loader = PyPDFLoader(file_path)
data = loader.load()
data[:2]

[Document(page_content=' \nJ√©ssica Nicolau - 1 REUMATOLOGIA  \n \n_________________________________________________________________________________  \n \nT√ìPICOS  \n1. L√∫pus Eritematoso Sist√™mico  \n‚óè Introdu√ß√£o  \n‚óè Quadro cl√≠nico  \n‚óã Manifesta√ß√µes sist√™micas  \n‚óã Manifesta√ß√µes laboratoriais  \n‚óè Diagn√≥stico  \n‚óè Tratamento  \n‚óè Gravidez e contracep√ß√£o  \n‚óè S√≠ndrome do Anticorpo Antifosfolip√≠deo (SAF)  \n2. Esclerodermia  \n‚óè Introdu√ß√£o  \n‚óè Classifica√ß√£o  \n‚óã Forma localizada  \n‚óã Forma sist√™mica  \n‚ñ† Cut√¢nea difusa  \n‚ñ† Cut√¢nea limitada - CREST  \n‚ñ† Visceral  \n‚óè Quadro cl√≠nico  \n‚óè Diagn√≥stico  \n‚óè Tratamento  \n3. S√≠ndrome de Sj√∂gren  \n‚óè Introdu√ß√£o  \n‚óè Quadro cl√≠nico  \n‚óè Exames laboratoriais  \n \n', metadata={'source': '//home/rps/Downloads/REUMATOLOGIA.pdf', 'page': 0}),
 Document(page_content=' \nJ√©ssica Nicolau - 2 ‚óè Diagn√≥stico  \n‚óè Tratamento  \n4. Vasculites  \n‚óè Introdu√ß√£o  \n‚óè Vasculi

In [7]:
# file_path = '//home/rps/Downloads/REUMATOLOGIA.pdf'

# document_ques_gen, document_answer_gen = file_processing(file_path)

# #llm_ques_gen_pipeline = load_llm()

# prompt_template = """
# You are an expert at creating questions based in medicine materials and documentation.
# Your goal is to prepare a very good material for their exam and  tests.
# You do this by asking questions about the text below:

# ------------
# {text}
# ------------

# Create questions that will prepare medicine students for their tests.
# Make sure not to lose any important information.

# QUESTIONS:
# """

# PROMPT_QUESTIONS = PromptTemplate(template=prompt_template, input_variables=["text"])

# refine_template = ("""
# You are an expert at creating practice questions based on medicine and books.
# Your goal is to help to prepare medicine students for a test.
# We have received some practice questions to a certain extent: {existing_answer}.
# We have the option to refine the existing questions or add new ones.
# (only if necessary) with some more context below.
# ------------
# {text}
# ------------

# Given the new context, refine the original questions in Portuguese.
# If the context is not helpful, please provide the original questions.
# QUESTIONS:
# """
# )

# REFINE_PROMPT_QUESTIONS = PromptTemplate(
#     input_variables=["existing_answer", "text"],
#     template=refine_template,
# )

# ques_gen_chain = load_summarize_chain(llm = llm, 
#                                         chain_type = "refine",
#                                         verbose = True, 
#                                         question_prompt=PROMPT_QUESTIONS, 
#                                         refine_prompt=REFINE_PROMPT_QUESTIONS)

# ques_l = [ ques_gen_chain.run( [q]) for q in document_ques_gen[:2]]
# ques = "\n".join(ques_l).split("\n")

In [8]:
# [element for element in ques if element.endswith('?') or element.endswith('.')]

In [9]:
# Generate question lines (one LLM chain run per coarse chunk) and build the FAISS store.
ques, vector_store = pipe_vector(file_path)



  warn_deprecated(




[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You are an expert at creating questions based in medicine materials and documentation.
    Your goal is to prepare a very good material for their exam and  tests.
    You do this by asking questions about the text below:

    ------------
    J√©ssica Nicolau - 1 REUMATOLOGIA  
 
_________________________________________________________________________________  
 
T√ìPICOS  
1. L√∫pus Eritematoso Sist√™mico  
‚óè Introdu√ß√£o  
‚óè Quadro cl√≠nico  
‚óã Manifesta√ß√µes sist√™micas  
‚óã Manifesta√ß√µes laboratoriais  
‚óè Diagn√≥stico  
‚óè Tratamento  
‚óè Gravidez e contracep√ß√£o  
‚óè S√≠ndrome do Anticorpo Antifosfolip√≠deo (SAF)  
2. Esclerodermia  
‚óè Introdu√ß√£o  
‚óè Classifica√ß√£o  
‚óã Forma localizada  
‚óã Forma sist√™mica  
‚ñ† Cut√¢nea difusa  
‚ñ† Cut√¢nea limitada - CREST  
‚ñ† Visceral  
‚óè Quadro cl√≠nico  
‚óè Diagn√≥stico 

In [11]:
# Number of generated text lines, before the '?'/'.' filter in pipe_gen is applied.
len(ques)

2015

In [12]:
# Build the answer chain and filter the generated lines down to questions.
answer_generation_chain, ques_list = pipe_gen(vector_store=vector_store, ques= ques)

base_folder = 'static/output/'
# makedirs creates missing parent folders and tolerates an existing one,
# replacing the check-then-mkdir pair.
os.makedirs(base_folder, exist_ok=True)
output_file = base_folder+"QAxy.csv"

# Answer each question and write one (question, answer) row per question.
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["Question", "Helpful Answer"])  # Writing the header row
    for question in ques_list:
        print("Question: ", question)
        answer = answer_generation_chain.run(question)
        print("Answer: ", answer)
        print("--------------------------------------------------\n\n")

        # Save answer to CSV file
        csv_writer.writerow([question, answer])
# 211 min (author's note - presumably this cell's runtime)

Question:  1. What are the topics covered in this material?
Answer:   The material covers concepts related to joint infection, clinical presentation of acute monoarthritis, PAN syndrome, cutaneous vasculitis, renal insufficiency and renovascular HAS, osteophyte formation, affected joints, extra-intestinal manifestations, and treatment guidelines for not prescribing antibiotics.
--------------------------------------------------


Question:      2. What is L√∫pus Eritematoso Sist√™mico and what are its manifestations?
Answer:    L√∫pus Eritematoso Sist√™mico (LES) is a chronic autoimmune disease that can affect various parts of the body, including the skin, joints, kidneys, blood cells, heart, and brain. Its manifestations include rash malar (a butterfly-shaped rash across the cheeks and nose), photosensitivity (increased sensitivity to sunlight), l√∫pus discoide (eritematous and desquamative lesions similar to psoriasis), non-evolving with systemic LES, anti-Ro positive, vasculite l√∫p

In [16]:
# Count the entries in the FAISS store's index-to-docstore-id mapping.
num_documents = len(vector_store.index_to_docstore_id)
print(f"Total number of documents: {num_documents}")

# Persist the store locally under "faiss_index".
vector_store.save_local("faiss_index")

Total number of documents: 525


In [18]:
# Spot-check retrieval with a Portuguese query ("how to treat septic arthritis?").
query = "como tratar artrite septica?"
docs = vector_store.similarity_search(query)
docs

[Document(page_content='apenas se d√∫vida quanto a possibilidade de artrite s√©ptica!  \n‚óè RX com achado t√≠pico de les√£o em saca -bocado ! \n \n  \n \n- Tratamento : \n‚óè Artrite Gotosa Aguda:  \n‚óã N√ÉO FAZER AAS E N√ÉO MEXER NAS DOSES DAS DROGAS HIPOURICEMIANTES \nNESSE MOMENTO, POIS TAIS MEDICAMENTOS VARIAM ABRUPTAMENTE OS'),
 Document(page_content='‚óè Tratar infec√ß√£o, se presente  \n \n \n \n \nJ√©ssica Nicolau - 54 5) ARTRITE PSORI√ÅSICA  \n- Ser√° abordada no cap√≠tulo de " Espondiloartrites"  \n‚óè Formas de artrite:  \n‚óã Oligoartrite assim√©trica (semelhante √† S√≠ndrome de Reiter)  \n‚óã Poliartrite sim√©trica (semelhante a AR)'),
 Document(page_content='‚óã Ciprofloxacino 500 mg 12/12h at√© completar 7 dias de tratamento  \n‚óã Lavagem por artroscopia raramente √© necess√°ria!  \n‚óè N√£o Gonoc√≥cica:  \n‚óã Antibioticoterapia:  \n‚ñ† Oxacilina  \n‚ñ† Ceftriaxone, se d√∫vida em origem gonoc√≥cica  \n‚óã Pun√ß√µes seriadas esvaziadores , pelo risco de destrui√ß√£o a

In [31]:
# Answer every filtered question with the retrieval chain and echo each pair.
for question in ques_list:
    print("Question: ", question)
    answer = answer_generation_chain.run(question)
    print("Answer: ", answer)

Question:  1. Qual √© a defini√ß√£o do Transtorno de Ansiedade Geralizado (TAG)?
Answer:   The Transtorno de Ansiedade Geralizado (TAG) is characterized by a continuous and exaggerated state of anxiety, worry, and tension that lasts for at least six months.
Question:      2. Quanto da popula√ß√£o est√° afetada pelo TAG?
Answer:   Entre 5% e 10% da popula√ß√£o est√° afetada pelo TAG.

Question:     3. Qual o sexo mais afetado pelo TAG?
Helpful Answer: As mulheres s√£o de duas a tr√™s vezes mais afetadas do que o sexo masculino.

Question:     4. Quais os principais tratamentos para o TAG?
Helpful Answer: O melhor tratamento √© a combina√ß√£o de m√©todos, incluindo psicoterapia cognitivo-comportamental.

Question:     5. Qual a idade em que o TAG geralmente come√ßa?
Helpful Answer: O TAG normalmente se inicia na adolesc√™ncia.

Question:     6. Quanto da popula√ß√£o est√° afetada pelo transtorno de ansiedade social (TAS)?
Helpful Answer: O TAS acomete cerca 13% da popula√ß√£o.

Question:

In [28]:
# Show the filtered question list that is fed to the answer chain.
ques_list

['1. Qual √© a defini√ß√£o do Transtorno de Ansiedade Geralizado (TAG)?',
 '    2. Quanto da popula√ß√£o est√° afetada pelo TAG?',
 '    3. Homens ou mulheres s√£o mais propensos a serem afetados pelo TAG?',
 '    4. A que idade o TAG geralmente come√ßa?',
 '    5. Como √© tratado o TAG?',
 '    6. Quais medicamentos s√£o considerados como primeira linha de tratamento para o TAG?',
 '    7. Qual √© a prognose para pacientes com TAG que n√£o recebem tratamento adequado?.']

In [4]:
# Switch to the second PDF and run the end-to-end pipeline; get_csv returns the CSV path.
file_path = '/home/rps/Downloads/ansiedade_4_8.pdf'

output_file = get_csv(file_path)

  warn_deprecated(




[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You are an expert at creating questions based in medicine materials and documentation.
    Your goal is to prepare a very good material for their exam and  tests.
    You do this by asking questions about the text below:

    ------------
    Estrat√©gia
MEDPSIQUIATRIA Transtorno de Ansiedade
Prof. Thales Thaumaturgo | Resumo Estrat√©gico | 202341.0 TRANSTORNOS DE ANSIEDADE
1.1 TRANSTORNO DE ANSIEDADE GENERALIZADA (TAG)
O transtorno de ansiedade generalizada (TAG)  √© caracterizado por um estado cont√≠nuo e exagerado de ansiedade, preocupa√ß√£o e 
tens√£o que dura pelo menos 6 meses. 
1.1.1 EPIDEMIOLOGIA, CURSO E PROGN√ìSTICO
O TAG afeta entre 5% e 10% da popula√ß√£o e as mulheres s√£o de duas a tr√™s vezes mais afetadas do que o sexo masculino. Esse 
transtorno normalmente se inicia na adolesc√™ncia, tem curso cr√¥nico e flutuante, que se mistura