In [8]:
import os
import re

import nltk
import pinecone
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import CTransformers
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize


In [40]:
# Pinecone connection settings.
# The API key is a secret, so it is read from the environment rather than being
# stored in the notebook. Any key that was previously committed here should be
# rotated. When PINECONE_API_KEY is unset, API_KEY is None.
# NOTE(review): pinecone-client 2.x `init` is expected to fall back to the same
# environment variables when given None — confirm for the installed version.
API_KEY = os.environ.get("PINECONE_API_KEY")
API_ENV = os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter")
index_name = "prabal-knowledge-management-chatbot"


In [19]:
#Extract data
class Load_data():
    """Loads the PDF files of one directory through LangChain loaders.

    The constructor only stores the directory; nothing is read until
    ``load_pdf_data`` is called.
    """

    def __init__(self, data):
        # `data` is the directory handed to DirectoryLoader later on.
        self.data = data

    def load_pdf_data(self):
        """Load every file in ``self.data`` that matches the glob ``*.pdf``.

        Each match is parsed with ``PyPDFLoader``.

        Returns:
            Whatever ``DirectoryLoader.load()`` returns for that directory.
        """
        pdf_loader = DirectoryLoader(self.data, glob="*.pdf", loader_cls=PyPDFLoader)
        return pdf_loader.load()

In [20]:
# Create the loader for the PDF folder and read its documents.
# The folder can be overridden with the WIKIGPT_DATA_DIR environment variable so
# the notebook is not tied to one machine; the default keeps the original path.
DATA_DIR = os.environ.get(
    "WIKIGPT_DATA_DIR", "/Users/praba/Documents/GitHub/wikigpt-uca/data/"
)
loaded_data_object = Load_data(DATA_DIR)
loaded_data = loaded_data_object.load_pdf_data()

In [30]:
#Create text chunks
class Split_text():
    """Splits already-loaded documents into smaller chunks."""

    def __init__(self, loaded_data):
        # Documents to be split; stored untouched until text_split() runs.
        self.loaded_data = loaded_data

    def text_split(self):
        """Split ``self.loaded_data`` with a recursive character splitter.

        The splitter is configured with ``chunk_size=300`` and
        ``chunk_overlap=10``.

        Returns:
            The result of ``split_documents`` on the stored documents.
        """
        splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=10)
        return splitter.split_documents(self.loaded_data)

In [31]:
# Wrap the loaded documents in a Split_text helper and produce the chunk list
# that the later cells read as `text_chunks`.
chunks_object = Split_text(loaded_data)
text_chunks =chunks_object.text_split()

In [34]:
# Sanity check: report how many chunks the splitter produced.
print("number of chunks:", len(text_chunks))

number of chunks: 28


In [35]:
#download embedding model
def download_hugging_face_embeddings():
    """Create the embedding wrapper used for both indexing and querying.

    Returns:
        A ``HuggingFaceEmbeddings`` instance built for the model
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [38]:
# Instantiate the embedding model once; later cells reuse it for indexing and search.
text_embeddings = download_hugging_face_embeddings()

In [39]:
# Inspect the embedding wrapper's configuration.
print(text_embeddings)

client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
) model_name='sentence-transformers/all-MiniLM-L6-v2' cache_folder=None model_kwargs={} encode_kwargs={}


In [51]:
# Configure the Pinecone client with the key and environment defined in the
# connection-settings cell. The result is kept in `pc`, which is not referenced
# again in the cells visible here.
pc = pinecone.init(environment=API_ENV, api_key=API_KEY)

In [45]:
# Keep only the text of each chunk, then preview the first five entries.
list_chunks = [chunk.page_content for chunk in text_chunks]
print(list_chunks[:5])

['Employee Profile: Alex Thompson  \nIntroduction:  \n• Name:  Alex Thompson  \n• Position:  Machine Learning Engineer  \n• Department:  AI Research  \n• Joining Date:  March 15, 2018  \nProfessional Background:  \n• Education:  \n• MSc in Computer Science, Stanford University, 2017', '• BEng in Electrical Engineering, MIT, 2015  \n• Work Experience:  \n• Software Engineer, Google Research, 2017 -2018  \n• Intern, Apple AI Labs, Summer 2016  \nSkills and Expertise:  \n• Machine Learning, Deep Learning, Computer Vision  \nProjects:', '• Developed a real -time object detection system using convolutional neural networks (CNNs).  \n• Implemented a recommendation engine for personalized content delivery.  \nAchievements:  \n• Recognized as "Employee of the Year" in 2019 for outstanding contributions to project success.', 'Publications and Contributions:  \n• Co-authored three research papers on computer vision and machine learning, presented at top \nconferences.  \nProfessional Development

In [55]:
# Embed every text chunk and store the vectors in the Pinecone index `index_name`.
# The returned vector store is not assigned; the next cell re-opens the index instead.
# NOTE(review): presumably each run of this cell uploads the chunks again — confirm
# whether re-running duplicates vectors in the index.
Pinecone.from_texts(list_chunks, text_embeddings, index_name=index_name)

<langchain.vectorstores.pinecone.Pinecone at 0x134d2c80220>

In [56]:
# Open the index that already exists instead of re-uploading the chunks.
# Despite its name, `retrived_text` is the object that exposes `similarity_search`
# (presumably the LangChain Pinecone vector store), not retrieved text.
retrived_text = Pinecone.from_existing_index(index_name, text_embeddings)

# Smoke-test query against the index.
query = "Who is  Alex Thompson"

# k=1 asks for a single best-matching result.
docs = retrived_text.similarity_search(query, k=1)

print("Result", docs)

Result [Document(page_content='Employee Profile: Alex Thompson  \nIntroduction:  \n• Name:  Alex Thompson  \n• Position:  Machine Learning Engineer  \n• Department:  AI Research  \n• Joining Date:  March 15, 2018  \nProfessional Background:  \n• Education:  \n• MSc in Computer Science, Stanford University, 2017  \n• BEng in Electrical Engineering, MIT, 2015  \n• Work Experience:  \n• Software Engineer, Google Research, 2017 -2018  \n• Intern, Apple AI Labs, Summer 2016  \nSkills and Expertise:', metadata={})]


In [58]:
# Print only the text of each retrieved document.
for doc in docs:
    print(doc.page_content)

Employee Profile: Alex Thompson  
Introduction:  
• Name:  Alex Thompson  
• Position:  Machine Learning Engineer  
• Department:  AI Research  
• Joining Date:  March 15, 2018  
Professional Background:  
• Education:  
• MSc in Computer Science, Stanford University, 2017  
• BEng in Electrical Engineering, MIT, 2015  
• Work Experience:  
• Software Engineer, Google Research, 2017 -2018  
• Intern, Apple AI Labs, Summer 2016  
Skills and Expertise:


In [59]:
# Raw prompt text with two placeholders, {context} and {question}. A later cell
# wraps it in a PromptTemplate and passes it to the RetrievalQA chain.
prompt_template = """
Please provide the necessary details for me to assist you effectively. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Here's the format to follow:

Context: {context}
Question: {question}

Once you've provided the required information, I'll offer you the most helpful answer.

Helpful answer:
"""


In [60]:
# Wrap the raw template, declaring its two placeholders, and package it as the
# keyword arguments that are later passed to RetrievalQA.from_chain_type.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
chain_type_kwargs = dict(prompt=PROMPT)

In [62]:
# Show the prompt configuration that will be passed to the QA chain.
print(chain_type_kwargs)

{'prompt': PromptTemplate(input_variables=['context', 'question'], output_parser=None, partial_variables={}, template="\nPlease provide the necessary details for me to assist you effectively. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\nHere's the format to follow:\n\nContext: {context}\nQuestion: {question}\n\nOnce you've provided the required information, I'll offer you the most helpful answer.\n\nHelpful answer:\n", template_format='f-string', validate_template=True)}


In [68]:
# Load the local Llama-2 chat weights through CTransformers; generation is
# configured with max_new_tokens=256.
llm = CTransformers(
    model="../model/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config=dict(max_new_tokens=256),
)

In [73]:
# Assemble the retrieval QA chain: the local LLM, a "stuff" chain using the
# custom prompt, and a retriever that returns one document per query (k=1).
# The retriever is built from `retrived_text`, the handle created by
# Pinecone.from_existing_index in an earlier cell. The name `docsearch` used
# here before is not defined by any visible cell, so a fresh
# Restart & Run All would fail with NameError.
question_answer = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retrived_text.as_retriever(search_kwargs={'k': 1}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs)

In [74]:
# Display the assembled chain's configuration.
question_answer

RetrievalQA(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, combine_documents_chain=StuffDocumentsChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, input_key='input_documents', output_key='output_text', llm_chain=LLMChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, prompt=PromptTemplate(input_variables=['context', 'question'], output_parser=None, partial_variables={}, template="\nPlease provide the necessary details for me to assist you effectively. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\nHere's the format to follow:\n\nContext: {context}\nQuestion: {question}\n\nOnce you've provided the required information, I'll offer you the most helpful answer.\n\nHelpful answer:\n", template_format='f-string', validate_template=True), llm=CTransformers(cache=None, verbose=False, callbacks=None, callback

In [75]:
# Ask the user for a question interactively.
user_input = input("Input Prompt:")

In [76]:
# Echo the question that was typed in.
user_input

'Who is Alex ?'

In [77]:
# Run the chain on the typed question and print only the generated answer,
# which is stored under the "result" key of the returned mapping.
result = question_answer(dict(query=user_input))
print("Response : ", result["result"])

Response :  Alex Thompson is a Machine Learning Engineer at AI Research department in the company. He joined the company on March 15, 2018, and his professional background includes earning an MSc in Computer Science from Stanford University in 2017 and a BEng in Electrical Engineering from MIT in 2015. His work experience includes working as a Software Engineer at Google Research from 2017 to 2018 and interning at Apple AI Labs during the summer of 2016. He has skills and expertise in machine learning, computer science, electrical engineering, and software development.
