In [1]:
# Install the notebook's dependencies into the runtime, one package per command.
# NOTE(review): tiktoken is the only package below without a version pin — consider
# pinning it so a fresh runtime resolves the same version on every run.
!pip install langchain==0.0.200
!pip install chromadb==0.3.26
!pip install pypdf==3.9.1
!pip install sentence-transformers==2.2.2
!pip install pandas==2.0.3
!pip install tiktoken
!pip install openai==0.28.1



In [2]:
import os
import pandas as pd
import chromadb
import openai

from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

from google.colab import drive
# Mount Google Drive at /content/drive; the source PDF is read from a folder under
# /content/drive/MyDrive later in the notebook.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [3]:
def load_data_file(FILE_NAME, docs_folder, first_page=19, last_page=54,
                   chunk_size=400, chunk_overlap=100):
    """Load a PDF and split a window of its pages into overlapping text chunks.

    Args:
        FILE_NAME: Name of the PDF file inside ``docs_folder``.
        docs_folder: Directory that contains the PDF.
        first_page: Zero-based index of the first page to keep (inclusive).
            Defaults to 19, the value previously hard-coded here.
        last_page: Zero-based index at which the selection stops (exclusive);
            ``None`` keeps everything through the last page. Defaults to 54.
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` handed to ``RecursiveCharacterTextSplitter``.

    Returns:
        The list produced by ``split_documents`` for the selected pages. The
        page window follows normal slice semantics, so a window that lies
        beyond the end of the PDF selects no pages.
    """
    file_path = os.path.join(docs_folder, FILE_NAME)
    print(file_path)

    # PyPDFLoader.load() yields one entry per PDF page.
    pages = PyPDFLoader(file_path).load()
    selected_pages = pages[first_page:last_page]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(selected_pages)

In [4]:
def embedding_model():
    """Create the HuggingFace embedding function used by the vector store.

    Returns:
        tuple: ``(model_name, embeddings)`` where ``model_name`` is the
        sentence-transformers model identifier and ``embeddings`` is the
        ``HuggingFaceEmbeddings`` instance built for it.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    # Evaluates to None when SENTENCE_TRANSFORMERS_HOME is not set; the value is
    # passed through unchanged as the cache folder.
    cache_dir = os.environ.get('SENTENCE_TRANSFORMERS_HOME')
    embedder = HuggingFaceEmbeddings(model_name=model_id, cache_folder=cache_dir)
    return model_id, embedder

In [5]:
def save_vector_db(vector_db_folder, vector_db_name, chunks, embeddings):
    """Embed ``chunks`` into a Chroma store and persist it to disk.

    Args:
        vector_db_folder: Parent directory for persisted vector stores.
        vector_db_name: Sub-directory name for this particular store.
        chunks: Documents to embed and index.
        embeddings: Embedding function used to vectorise the chunks.

    Returns:
        The persisted ``Chroma`` instance (earlier revisions returned ``None``,
        so existing callers that ignore the result are unaffected).
    """
    vector_db_path = os.path.join(vector_db_folder, vector_db_name)

    # No `metadatas=` keyword is passed: it is not a parameter of
    # Chroma.from_documents, which takes metadata from each document's own
    # `.metadata`. The previously supplied list of "{i}-wb23" sources therefore
    # never reached the store.
    db = Chroma.from_documents(
        chunks,
        embedding=embeddings,
        persist_directory=vector_db_path,
    )

    # Write the store to disk under vector_db_path.
    db.persist()
    return db

In [6]:
# Locations of the source PDF and of the persisted Chroma store.
docs_folder = '/content/drive/MyDrive/ColabNotebooks/data/'
FILE_NAME = 'ConceptsofBiology-WEB.pdf'
vector_db_folder = './vector_db'
vector_db_name = 'concepts_of_biology'

emb_model, embeddings = embedding_model()

# Build the vector store only when no persisted copy exists yet. With the build
# steps commented out, a fresh runtime had nothing on disk to load and later
# similarity searches ran against an empty store; with them always enabled,
# every run would re-embed the PDF.
_vector_db_path = os.path.join(vector_db_folder, vector_db_name)
if not os.path.isdir(_vector_db_path) or not os.listdir(_vector_db_path):
    chunks = load_data_file(FILE_NAME, docs_folder)
    save_vector_db(vector_db_folder, vector_db_name, chunks, embeddings)

In [7]:
def load_vector_db(vector_db_name, vector_db_folder, embeddings):
    """Open the Chroma store kept under ``<vector_db_folder>/<vector_db_name>``.

    Args:
        vector_db_name: Sub-directory name of the store.
        vector_db_folder: Parent directory holding persisted stores.
        embeddings: Embedding function the store uses for queries.

    Returns:
        A ``Chroma`` instance bound to that persist directory.
    """
    return Chroma(
        persist_directory=os.path.join(vector_db_folder, vector_db_name),
        embedding_function=embeddings,
    )

In [8]:
# Open the persisted store with the same embedding function used for indexing.
vector_db = load_vector_db(
    vector_db_name=vector_db_name,
    vector_db_folder=vector_db_folder,
    embeddings=embeddings,
)

In [9]:
# Colab exposes the notebook's stored secrets through the userdata module.
from google.colab import userdata

# Copy the stored OpenAI key into the process environment
# (presumably where the OpenAI-backed chat model looks for it — confirm).
os.environ.update(OPENAI_API_KEY=userdata.get('OPENAI_API_KEY'))

In [10]:
from langchain.chat_models import ChatOpenAI

# Chat model that the question-answering chain calls to produce the answer.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.7,
)

In [18]:
# Question-answering prompt: {context} receives the retrieved text and {question}
# the user's query.
prompt_template = """Use the following pieces of context to answer the question at the end.

{context}

Question: {question}

Answer:
"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)


In [12]:
# Question posed to the vector store and to the QA chain.
query = "What are the major topics discussed in the data"

In [13]:
q = query
# Retrieve the 4 chunks most similar to the question. `include_metadata` is not a
# parameter of Chroma.similarity_search, so it is no longer passed; the returned
# documents carry their metadata regardless.
v = vector_db.similarity_search(q, k=4)
print(v)
# Answer the question from the retrieved chunks with a "stuff" chain. The custom
# PROMPT is passed explicitly — without it the chain falls back to its built-in
# prompt and the template defined in this notebook is never used.
chain = load_qa_chain(llm, chain_type="stuff", prompt=PROMPT)
res = chain({"input_documents": v, "question": q})
output = res["output_text"]

[Document(page_content='records them. These data can be qualitativ e (descrip tive) or quantitativ e (consis ting o f numbers), and the r aw data\ncan be supplement ed with dr awings , pictur es, phot os, or videos . From man y obser vations , the scientis t can inf er\nconclusions (inductions) based on e videnc e. Inductiv e reasoning in volves formulating g ener alizations inf erred fr om', metadata={'source': '/content/drive/MyDrive/ColabNotebooks/data/ConceptsofBiology-WEB.pdf', 'page': 32}), Document(page_content='deposit ed in v arious databases . Using c omput er alg orithms and s tatis tical anal yses o f data in databases , a ne w field\nof so-called " data r esear ch" (also r eferred to as "in silic o" resear ch) pr ovides ne w methods o f data anal yses and\ntheir int erpr etation. This wil l incr ease the demand f or specialis ts in both biolog y and c omput er scienc e, a pr omising', metadata={'source': '/content/drive/MyDrive/ColabNotebooks/data/ConceptsofBiology-WEB.pdf



RateLimitError: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.

In [None]:
# Show the answer text taken from the QA chain result ("output_text").
print(output)