In [2]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

In [2]:
PINECONE_API_KEY = "ece10b1f-309d-40de-9575-d8c3e81ef022"
PINECONE_API_ENV = "gcp-starter"

In [3]:
#Extract data from the PDF
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [4]:
# extracted_data
extracted_data = load_pdf("data/")

In [5]:
extracted_data[:2]

[Document(page_content='Applied \nGenerative AI for \nBeginners\nPractical Knowledge on Diffusion Models, \nChatGPT, and Other LLMs\n—\nAkshay Kulkarni\nAdarsha Shivananda\nAnoosh Kulkarni\nDilip Gudivada', metadata={'source': 'data\\Applied Generative Ai for Beginners Notes.pdf', 'page': 0}),
 Document(page_content='Applied Generative AI for \nBeginners\nPractical Knowledge on\xa0Diffusion \nModels, ChatGPT, and\xa0Other LLMs\nAkshay\xa0Kulkarni\nAdarsha\xa0Shivananda\nAnoosh\xa0Kulkarni\nDilip\xa0Gudivada', metadata={'source': 'data\\Applied Generative Ai for Beginners Notes.pdf', 'page': 1})]

In [6]:
#Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [7]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 856


In [8]:
text_chunks[:25]

[Document(page_content='Applied \nGenerative AI for \nBeginners\nPractical Knowledge on Diffusion Models, \nChatGPT, and Other LLMs\n—\nAkshay Kulkarni\nAdarsha Shivananda\nAnoosh Kulkarni\nDilip Gudivada', metadata={'source': 'data\\Applied Generative Ai for Beginners Notes.pdf', 'page': 0}),
 Document(page_content='Applied Generative AI for \nBeginners\nPractical Knowledge on\xa0Diffusion \nModels, ChatGPT, and\xa0Other LLMs\nAkshay\xa0Kulkarni\nAdarsha\xa0Shivananda\nAnoosh\xa0Kulkarni\nDilip\xa0Gudivada', metadata={'source': 'data\\Applied Generative Ai for Beginners Notes.pdf', 'page': 1}),
 Document(page_content='Applied Generative AI for Beginners: Practical Knowledge on Diffusion Models, \nChatGPT, and Other LLMs\nISBN-13 (pbk): 978-1-4842-9993-7   ISBN-13 (electronic): 978-1-4842-9994-4\nhttps://doi.org/10.1007/978-1-4842-9994-4\nCopyright © 2023 by Akshay Kulkarni, Adarsha Shivananda, Anoosh Kulkarni,  \nDilip Gudivada\nThis work is subject to copyright. All rights are reserv

In [9]:
#download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [10]:
embeddings = download_hugging_face_embeddings()

In [11]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [12]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [13]:
# vector representation
query_result

[-0.034477267414331436,
 0.031023206189274788,
 0.006734929047524929,
 0.026108982041478157,
 -0.03936203196644783,
 -0.16030243039131165,
 0.06692398339509964,
 -0.006441489793360233,
 -0.04745049402117729,
 0.014758839271962643,
 0.07087527960538864,
 0.05552763119339943,
 0.01919335499405861,
 -0.026251323521137238,
 -0.010109569877386093,
 -0.02694045566022396,
 0.022307392209768295,
 -0.02222662791609764,
 -0.14969263970851898,
 -0.017493031919002533,
 0.007676294539123774,
 0.054352253675460815,
 0.0032544503919780254,
 0.031725917011499405,
 -0.08462144434452057,
 -0.029405998066067696,
 0.051595620810985565,
 0.048124048858881,
 -0.00331486901268363,
 -0.058279212564229965,
 0.041969265788793564,
 0.022210726514458656,
 0.12818878889083862,
 -0.02233896590769291,
 -0.011656217277050018,
 0.06292834132909775,
 -0.032876402139663696,
 -0.09122605621814728,
 -0.03117532841861248,
 0.05269954353570938,
 0.047034818679094315,
 -0.08420311659574509,
 -0.030056139454245567,
 -0.020744

In [14]:
#pip install -q --upgrade pinecone-client==2.2.4

In [15]:
#Initializing the Pinecone
pinecone.init(api_key=PINECONE_API_KEY,
              environment=PINECONE_API_ENV)

index_name="aichatbot"

#Creating Embeddings for Each of The Text Chunks & storing
docsearch=Pinecone.from_texts([t.page_content for t in text_chunks], embeddings, index_name=index_name)

### **If we already have an index in pinecone, so we can directly load it like this**

In [16]:
#If we already have an index we can load it like this
docsearch=Pinecone.from_existing_index(index_name, embeddings)

query = "Explain about LLMs using sklearn"

docs=docsearch.similarity_search(query, k=2)

print("Result", docs)

Result [Document(page_content='of tools.\nIn essence, Scikit-LLM represents a powerful synergy between state-of-the-art \nlanguage understanding and the analytical prowess of scikit-learn, enabling you to \nextract invaluable insights from text data that were once hidden in plain sight. It is easy \nto use and provides a range of features that make it a valuable resource for data scientists \nand machine learning practitioners.\nHere are some additional details about the features of Scikit-LLM:', metadata={}), Document(page_content='of tools.\nIn essence, Scikit-LLM represents a powerful synergy between state-of-the-art \nlanguage understanding and the analytical prowess of scikit-learn, enabling you to \nextract invaluable insights from text data that were once hidden in plain sight. It is easy \nto use and provides a range of features that make it a valuable resource for data scientists \nand machine learning practitioners.\nHere are some additional details about the features of Scik

In [17]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [18]:
PROMPT=PromptTemplate(template=prompt_template, 
                      input_variables=["context", "question"])

chain_type_kwargs={"prompt": PROMPT}

In [20]:
llm=CTransformers(model="C:\Users\NAVYA\Downloads\MY_PYTHON_PRACTICE\Module_4\testingopenai\Medical_Chatbot_Using_Llama2\model\llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.6})

RuntimeError: Failed to create LLM 'llama' from 'model\llama-2-7b-chat.ggmlv3.q2_K.bin'.

In [None]:
qa=RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

In [None]:
user_input=input(f"Input Prompt:")
result=qa({"query": user_input})
print("Response : ", result["result"])

Response :  Scikit-LLM is a library that combines natural language processing (NLP) techniques with scikit-learn's machine learning algorithms to extract insights from text data. It provides various features such as tokenization, stemming, lemmatization, named entity recognition, sentiment analysis, and topic modeling. Additionally, it allows users to train custom models using their own preprocessed text data.


In [22]:
# while True:
#     user_input=input(f"Input Prompt:") 
#     result=qa({"query": user_input})
#     print("Response : ", result["result"])