In [33]:
import os
import openai
import sys
import tiktoken

#Retrivers
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

from langchain.llms import OpenAI

from dotenv import dotenv_values
secret=dotenv_values(".env")
secret["OPENAI_API_KEY"]


openai.api_key  = secret["OPENAI_API_KEY"]


In [14]:
openai.api_key, secret['PINCONE_API_KEY']


('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',
 '9b7203fb-987c-4a1a-8084-4e4606af6450')

### Loading the Data

In [3]:
import glob
from langchain.document_loaders import PyPDFLoader


pdf_paths = glob.glob("data/*.pdf")

# Replace double backslashes with single backslashes and print
loaders=[]
for pdf_path in pdf_paths:
   loaders.append(PyPDFLoader(pdf_path.replace("\\", "/")))
doc=[]
for loader in loaders:
    doc.extend(loader.load())

len(doc)

728

In [4]:
doc[710].metadata

{'source': 'data/XGBoost with Python Gradient Boosted Trees with XGBoost and scikit-learn (Jason Brownlee) (z-lib.org).pdf',
 'page': 97}

In [5]:
no_pages=len(doc)
print(f"No of pages in the document {no_pages}")

first_pg=doc[310].page_content[0:1000] # First 100 char of first 1 page
print (f"First 100 char of first 1 page: {first_pg}")

No of pages in the document 728
First 100 char of first 1 page: 7.6 Local Regression 303
20 30 40 50 60 70 800 50 100 200 300AgeWageSmoothing Spline16 Degrees of Freedom6.8 Degrees of Freedom (LOOCV)
FIGURE 7.8. Smoothing spline fits to the Wagedata. The red curve results
from specifying 16effective degrees of freedom. For the blue curve, λwas found
automatically by leave-one-out cross-validation, which resulted in 6.8effective
degrees of freedom.
is preferable, since in general simpler models are better unless the data
provides evidence in support of a more complex model.
7.6 Local Regression
Local regression is a different approach for fitting flexible non-linear func-local
regressiontions, which involves computing the fit at a target point x0using only the
nearby training observations. Figure 7.9illustrates the idea on some simu-
lated data, with one target point near 0.4, and another near the boundary
at0.05. In this figure the blue line represents the function f(x)from which
the d

### Splitting the Data

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
chunk_size=1500
chunk_overlap=200

r_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
    )

splits=r_splitter.split_documents(doc)
len(splits)

1512

### Embedding & Vector Store

In [None]:
# from langchain.vectorstores import Chroma
# from langchain.embeddings.openai import OpenAIEmbeddings
# persist_directory = 'data/chroma/'
# embedding = OpenAIEmbeddings()
# vectordb = Chroma.from_documents(
#     documents=splits,
#     embedding=embedding,
#     persist_directory=persist_directory
# )
# print(vectordb._collection.count())


In [23]:
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
embedding = OpenAIEmbeddings()
#initiaize pinecon
pinecone.init(
    api_key=secret['PINCONE_API_KEY'],
    environment=secret['PINCONE_ENV']
    
    
)
index_name= 'multi-source-qna'


# The OpenAI embedding model `text-embedding-ada-002 uses 1536 dimensions`
vectordb = Pinecone.from_documents(splits, embedding, index_name=index_name)

# if you already have an index, you can load it like this
# docsearch = Pinecone.from_existing_index(index_name, embeddings)

In [28]:
#Test
query=" What is linear regression ?"
docs=vectordb.similarity_search(query,k=5)
for doc in docs:
    print(doc.metadata)

{'page': 77.0, 'source': 'data/ISLP_website.pdf'}
{'page': 82.0, 'source': 'data/ISLP_website.pdf'}
{'page': 142.0, 'source': 'data/ISLP_website.pdf'}
{'page': 126.0, 'source': 'data/ISLP_website.pdf'}
{'page': 98.0, 'source': 'data/ISLP_website.pdf'}


In [37]:
# use max_marginal_relevance_search directly:
# found_docs = vectordb.max_marginal_relevance_search(query, k=2, fetch_k=10)
# for i, doc in enumerate(found_docs):
#     print(f"{i + 1}.", doc.page_content, "\n")

# Using Compression retrivevel technique
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))


# Wrap our vectorstore
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type = "mmr")
)

query=" What is xgboost tips and tricks ?"
compressed_docs = compression_retriever.get_relevant_documents(query)
pretty_print_docs(compressed_docs)

Document 1:

An introduction to XGBoost parameters and heuristics for good parameter values. How to tune the number and size of trees in a model. How to tune the learning rate and number of trees in a model. How to tune the sampling rates in stochastic variation of the algorithm.
----------------------------------------------------------------------------------------------------
Document 2:

"XGBoost is an algorithm that has recently been dominating applied machine learning and Kaggle competitions for structured or tabular data. XGBoost is an implementation of gradient boosted decision trees designed for speed and performance."
