In [1]:
import os
import openai
import sys
import tiktoken
#Reading doc file
from langchain.document_loaders import Docx2txtLoader
#Reading ppt file
from langchain.document_loaders import UnstructuredPowerPointLoader
# Reading pdf
from langchain.document_loaders import PyPDFLoader
# # Reading jpg
# from langchain.document_loaders.image import UnstructuredImageLoader



# Retrievers
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

from langchain.llms import OpenAI
import glob


from dotenv import dotenv_values

# Read credentials from the local .env file into a dict.
secret = dotenv_values(".env")

# Hand the key to the openai client. Indexing (rather than .get) keeps the
# original fail-fast behaviour: a missing entry raises KeyError right here.
# The key is deliberately never left as a bare expression, so it cannot be
# echoed into the cell output.
openai.api_key = secret["OPENAI_API_KEY"]

# The LangChain wrappers used later (OpenAIEmbeddings, OpenAI, ChatOpenAI) are
# constructed without an explicit key, so export it for them as well.
# setdefault leaves an already-exported OPENAI_API_KEY untouched.
# NOTE(review): assumes those wrappers read OPENAI_API_KEY from the process
# environment -- confirm against the installed LangChain version.
os.environ.setdefault("OPENAI_API_KEY", secret["OPENAI_API_KEY"])


### Loading the Data

In [4]:


# Discover the input files by extension. glob() returns matches in a
# filesystem-dependent order, and later cells index `doc` by position, so each
# list is sorted to make the document order reproducible across machines.
paths_pdf = sorted(glob.glob("data/*.pdf"))
paths_word = sorted(glob.glob("data/*.docx"))
paths_ppt = sorted(glob.glob("data/*.pptx"))
paths_jpg = sorted(glob.glob("data/*.jpg"))  # collected only; the image loader is disabled

# One (paths, loader class) pair per supported format. The order of this table
# is the order in which documents end up in `doc`: PDFs, then Word, then PowerPoint.
loader_specs = [
    (paths_pdf, PyPDFLoader),
    (paths_word, Docx2txtLoader),
    (paths_ppt, UnstructuredPowerPointLoader),
    # (paths_jpg, UnstructuredImageLoader),  # needs the commented-out import above
]

# Build one loader per file; backslashes are normalised to forward slashes so
# the paths look the same regardless of the OS separator glob returned.
loaders = [
    loader_cls(path.replace("\\", "/"))
    for paths, loader_cls in loader_specs
    for path in paths
]

# Load every file and flatten the results into a single list.
doc = []
for loader in loaders:
    doc.extend(loader.load())

len(doc)



283

In [5]:
# Re-display the number of entries currently held in `doc`.
len(doc)

283

In [7]:
# Display the loader instances that were built (one was appended per matched file).
loaders

[<langchain.document_loaders.pdf.PyPDFLoader at 0x12b369f1f10>,
 <langchain.document_loaders.pdf.PyPDFLoader at 0x12b2c70e6d0>,
 <langchain.document_loaders.word_document.Docx2txtLoader at 0x12b369f1810>,
 <langchain.document_loaders.powerpoint.UnstructuredPowerPointLoader at 0x12b369f2150>]

In [8]:
# Show the metadata attached to the second entry of `doc`.
doc[1].metadata

{'source': 'data/SQLNotesForProfessionals.pdf', 'page': 1}

In [9]:
# Report how many entries were loaded and preview the start of one of them.
PREVIEW_INDEX = 280    # position in `doc` of the entry to preview
PREVIEW_CHARS = 1000   # number of leading characters to print

no_pages = len(doc)
print(f"No of pages in the document {no_pages}")

# The label is derived from the constants so it always matches what is shown
# (the old text claimed "first 100 char of first 1 page" for a 1000-char
# slice of entry 280).
first_pg = doc[PREVIEW_INDEX].page_content[:PREVIEW_CHARS]
print(f"First {PREVIEW_CHARS} chars of page {PREVIEW_INDEX}: {first_pg}")

No of pages in the document 283
First 100 char of first 1 page: 18.3. Python Machine Learning 109
Section 8.6 Boosting, page 203 and Section 14.5 Stochastic Gradient Boosting, page 390,
inApplied Predictive Modeling .
http://www.amazon.com/dp/1461468485?tag=inspiredalgor-20
Section 16.4 Boosting, page 556, Machine Learning: A Probabilistic Perspective .
http://www.amazon.com/dp/0262018020?tag=inspiredalgor-20
Chapter 10 Boosting and Additive Trees, page 337, The Elements of Statistical Learning:
Data Mining, Inference, and Prediction .
http://www.amazon.com/dp/0387848576?tag=inspiredalgor-20
18.3 Python Machine Learning
Python is a growing platform for applied machine learning. The strong attraction is because
Python is a fully featured programming language (unlike R) and as such you can use the same
code and libraries in developing your model as you use to deploy the model into operations.
The premier machine learning library in Python is scikit-learn built on top of SciPy.
Visit 

### Splitting the Data

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Target chunk length and the amount of text shared between neighbouring chunks.
chunk_size = 1500
chunk_overlap = 200

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    # The lookbehind entry is a raw string: "\." in a normal literal is an
    # invalid escape sequence (same value, but Python warns about it).
    # NOTE(review): this entry only acts as a sentence boundary if the
    # installed splitter interprets separators as regular expressions --
    # confirm for the LangChain version in use.
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
)

# Split every loaded document into chunks and report how many were produced.
splits = r_splitter.split_documents(doc)
len(splits)

468

### Embedding & Vector Store

In [11]:
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

# True  -> embed `splits` and upload them to the index (original behaviour).
# False -> attach to the index as it already is, without embedding or uploading.
# Leave this True only while creating a new vector store.
# NOTE(review): re-running with True presumably adds the same chunks to the
# index again -- confirm before doing a "Run All" against a populated index.
CREATE_NEW_VECTOR_STORE = True

embedding = OpenAIEmbeddings()

# Initialise the Pinecone client. The 'PINCONE_*' spelling is the key name
# looked up in `secret`, so it must match the entries in .env exactly.
pinecone.init(
    api_key=secret['PINCONE_API_KEY'],
    environment=secret['PINCONE_ENV'],
)
index_name = 'multi-source-qna'

# The OpenAI embedding model `text-embedding-ada-002` uses 1536 dimensions.
if CREATE_NEW_VECTOR_STORE:
    vectordb = Pinecone.from_documents(splits, embedding, index_name=index_name)
else:
    vectordb = Pinecone.from_existing_index(index_name, embedding)

  from tqdm.autonotebook import tqdm


In [12]:
# Smoke test: fetch the 5 nearest chunks for a two-topic question and show
# where each one came from.
query=" What is linear regression and How to alter table in SQL?"
docs=vectordb.similarity_search(query,k=5)
# The loop variable is not called `doc`: that name holds the list of loaded
# documents, and rebinding it here would break any re-run of the earlier cells.
for hit in docs:
    print(hit.metadata)

{'page': 92.0, 'source': 'data/SQLNotesForProfessionals.pdf'}
{'page': 8.0, 'source': 'data/SQLNotesForProfessionals.pdf'}
{'page': 152.0, 'source': 'data/SQLNotesForProfessionals.pdf'}
{'page': 1.0, 'source': 'data/SQLNotesForProfessionals.pdf'}
{'page': 3.0, 'source': 'data/SQLNotesForProfessionals.pdf'}


In [17]:
# use max_marginal_relevance_search directly:
# found_docs = vectordb.max_marginal_relevance_search(query, k=2, fetch_k=10)
# for i, doc in enumerate(found_docs):
#     print(f"{i + 1}.", doc.page_content, "\n")

# Using the contextual compression retrieval technique
def pretty_print_docs(docs):
    """Print the text of each document under a numbered heading.

    Each entry is rendered as "Document N:" followed by a blank line and the
    entry's ``page_content``; consecutive entries are separated by a line of
    100 dashes. Everything is written to stdout with a single print call.

    Args:
        docs: iterable of objects exposing a string ``page_content`` attribute.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, document in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + document.page_content)
    print(divider.join(sections))


# Wrap the vector store in a retriever that post-processes every hit with an LLM.
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

# The number of documents to fetch is set on the base retriever through
# search_kwargs; previously k=5 was passed to get_relevant_documents() instead.
# NOTE(review): assumes the installed LangChain version does not forward that
# call-time k to the vector-store search -- confirm.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 5}),
)

# Alternative question to try:
# query=" What is xgboost tips and tricks ?"
query="What is black hole?"

compressed_docs = compression_retriever.get_relevant_documents(query)
pretty_print_docs(compressed_docs)





### Q&A GPT model initialization

In [31]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Chat model used for answering. This rebinds `llm`, which until now referred
# to the completion model handed to the compressor.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# Question-answering chain that pulls its context from the compression retriever.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=compression_retriever,
)

# Ask a single question and show just the answer text.
query = " What is AI governance?"
result = qa_chain({"query": query})

result["result"]

'AI governance refers to the set of policies, regulations, and ethical frameworks that are put in place to guide the development, deployment, and use of artificial intelligence (AI) technologies. It involves addressing issues such as accountability, transparency, fairness, privacy, and security in AI systems. AI governance aims to ensure that AI is developed and used in a responsible and beneficial manner, while also mitigating potential risks and negative impacts.'

### Adding a PromptTemplate

In [32]:
from langchain.prompts import PromptTemplate

# Build prompt
# The text below is the instruction given to the model. It contains two
# placeholders, {context} and {question}, and asks for a short answer (three
# sentences at most) that admits when the answer is unknown and ends with
# "thanks for asking!".
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know,
don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""

# Turn the raw string into a PromptTemplate object for use in the QA chain.
QA_Chain_Prompt=PromptTemplate.from_template(template)

In [39]:
# Rebuild the RetrievalQA chain, this time answering through the custom prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=compression_retriever,
    chain_type_kwargs={"prompt": QA_Chain_Prompt},
    return_source_documents=True,  # lets us inspect the documents that were retrieved
)

# Ask the question and show only the answer text.
query = "What are different AIML strategies ?"
result = qa_chain({"query": query})

result["result"]



'Different AIML strategies include building a continuous learning program for employees, defining metrics, measuring and analyzing data, identifying areas for improvement, and incorporating feedback. Thanks for asking!'

RetrievalQA chain using the map_reduce chain type

In [37]:
# Same retriever and model, but the answer is assembled with the "map_reduce"
# chain type.
qa_chain_mr = RetrievalQA.from_chain_type(
    llm,
    chain_type="map_reduce",
    retriever=compression_retriever,
)

# Run the same question through the map_reduce chain and show the answer text.
query = " What are different AIML strategies ?"
result = qa_chain_mr({"query": query})

result["result"]



{'query': ' What are different AIML strategies ?',
 'result': "I'm sorry, but the given portion of the document does not provide any information about different AIML strategies."}

In [44]:
# Chain type "refine".
# NOTE(review): this rebinds `qa_chain_mr`, so from here on the name refers to
# a refine chain rather than a map_reduce one.
qa_chain_mr = RetrievalQA.from_chain_type(
    llm,
    chain_type="refine",
    retriever=compression_retriever,
)

# Run the same question through the refine chain and show the answer text.
query = " What are different AIML strategies ?"
result = qa_chain_mr({"query": query})

result["result"]

'In addition to the strategies mentioned earlier, when implementing AIML, it is important to consider the available tools and technologies. Here are some additional considerations:\n\n1. Research available AIML tools and technologies: There are numerous AIML tools and frameworks available, such as TensorFlow, PyTorch, scikit-learn, and IBM Watson. Researching and understanding the features, capabilities, and limitations of these tools can help in selecting the most suitable one for your specific needs.\n\n2. Choose the right tool or technology: Consider factors like the complexity of your problem, the size of your dataset, the scalability requirements, and the programming language you are comfortable with. Each tool or technology may have its own strengths and weaknesses, so choose the one that aligns with your requirements.\n\n3. Consider the learning curve: Some AIML tools and technologies may have a steeper learning curve than others. Evaluate the complexity of the tool and the avai

The refine chain type seems to work better than the other techniques tried above.

### Chatbot: adding memory for the conversation

In [41]:
## MEMORY
from langchain.memory import ConversationBufferMemory

# Conversation buffer exposed to the chain under the "chat_history" key, with
# return_messages switched on.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

### ConversationalRetrievalChain

In [50]:
from langchain.chains import ConversationalRetrievalChain

# Conversational chain: the same compression retriever and chat model, plus the
# memory object so earlier turns are available to later questions.
retriever = compression_retriever
qa = ConversationalRetrievalChain.from_llm(
    llm,
    chain_type='refine',
    memory=memory,
    retriever=retriever,
)

# First turn of the conversation; display the full result dict.
query = " What are different AIML strategies ?"
result = qa({"question": query})

result

{'question': ' What are different AIML strategies ?',
 'chat_history': [HumanMessage(content=' What are different AIML strategies ?', additional_kwargs={}, example=False),
  AIMessage(content="In addition to the strategies mentioned earlier, there are some additional considerations when implementing AIML:\n\n1. Research available AIML tools and technologies: It is important to explore the various tools and technologies available in the market for AIML implementation. This includes frameworks, libraries, and platforms that can support your specific requirements.\n\n2. Choose the right tool or technology: Once you have researched the available options, carefully evaluate and choose the tool or technology that best aligns with your organization's needs. Consider factors such as scalability, compatibility with existing systems, ease of integration, and the specific AIML capabilities offered.\n\n3. Consider the learning curve: Implementing AIML may require learning new technologies and conc

In [48]:
# Follow-up turn: the question only makes sense with the previous exchange,
# which the chain receives through `memory`.
query = "Can you explain any one of the strategy?"
result = qa({"question": query})

result["answer"]



"The additional context provided is indeed useful for refining the original answer. Here's an updated response:\n\nOne of the AIML strategies mentioned in the context is defining metrics. Defining metrics is crucial for evaluating the success and effectiveness of an AI or machine learning project. However, before defining metrics, it is important to consider several factors related to data management and governance.\n\nFirstly, organizations need to identify all the sources of data they currently possess and assess their quality and relevance to the AIML objective. This includes structured data from databases and spreadsheets, as well as unstructured data from sources like social media, customer feedback, and sensor data.\n\nData quality plays a critical role in AIML projects. It is essential to evaluate the accuracy, completeness, consistency, and timeliness of the data, while also identifying any gaps or inconsistencies. Data cleansing, normalization, and transformation may be necess