# Indexing data

In [None]:
#!pip install langchain pypdf faiss-cpu openai tiktoken pinecone-client newsapi-python chromadb apify GitPython

Upload the PDF file and set the file path.

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Location of the uploaded PDF that will be indexed.
file_path = "/content/ESLII_print12_toc.pdf"

# Splitter configured for chunks of size 500 with no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

# Load the PDF and split its text with the splitter above.
loader = PyPDFLoader(file_path=file_path)
data = loader.load_and_split(text_splitter=text_splitter)
data

In [None]:
# Text of the first chunk in `data`.
data[0].page_content

Provide the OpenAI API key

In [None]:
import os
from getpass import getpass

# Keep the secret out of the notebook: reuse the key when the variable is already
# set, otherwise ask for it interactively (getpass does not echo the input).
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding client used by every vector store in this notebook.
embeddings = OpenAIEmbeddings(show_progress_bar=True)

# Embed one sample sentence.
vector1 = embeddings.embed_query('How are you?')

# Number of components in the embedding vector.
len(vector1)

In [None]:
# Show how the embedding client is configured. The instance dict may include the
# OpenAI API key, so every attribute whose name contains "key" is masked to keep
# the secret out of the saved cell output.
{
    name: ("***" if "key" in name.lower() else value)
    for name, value in vars(embeddings).items()
}

In [None]:
import numpy as np
from numpy.linalg import norm

def get_cosine(vec1, vec2):
    """Return the cosine similarity between two vectors.

    The value is the dot product of ``vec1`` and ``vec2`` divided by the
    product of their norms.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = norm(vec1) * norm(vec2)
    return dot_product / norm_product

# Embed two phrases (in this order) and measure how close their vectors are.
vector1, vector2 = (
    embeddings.embed_query(phrase)
    for phrase in ('machine learning', 'artificial intelligence')
)
cosine = get_cosine(vector1, vector2)
cosine

In [None]:
# Embed a third phrase and compare it with `vector2`.
vector3 = embeddings.embed_query('peperoni pizza')
cosine = get_cosine(vector2, vector3)
cosine

In [None]:
from langchain.vectorstores import FAISS

# Build a FAISS vector store from the PDF chunks using the embedding client.
index = FAISS.from_documents(data, embeddings)

In [None]:
# Query the FAISS store directly; the method name indicates that each hit comes
# back together with a relevance score.
index.similarity_search_with_relevance_scores(
    "What is machine learning?"
)

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import StdOutCallbackHandler

# Retrieve with maximal marginal relevance (MMR): consider the 20 nearest chunks
# and keep 10 of them. MMR has to be requested through `search_type`; a
# 'maximal_marginal_relevance' entry in `search_kwargs` is not an option the
# FAISS retriever acts on, so it does not enable MMR.
retriever = index.as_retriever(
    search_type='mmr',
    search_kwargs={'k': 10, 'fetch_k': 20},
)

# Chat model created with the library defaults; reused by the later chains.
llm = ChatOpenAI()

# Question-answering chain that answers from the chunks the retriever returns.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    verbose=True
)

# Callback handler passed to the chain runs below and in later cells.
handler = StdOutCallbackHandler()

chain.run(
    'What is machine learning?',
    callbacks=[handler]
)

# Loading data into a Vector Database

You need a Pinecone API key and the environment details.

Create a Pinecone account, get an API key, and create a new index: give it a name (the index name), set Dimensions = 1536, and set Metric = cosine.

In [None]:
import os
from getpass import getpass

# Keep the secret out of the notebook: reuse the key when the variable is already
# set, otherwise ask for it interactively (getpass does not echo the input).
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("Pinecone API key: ")

# The environment name is not a secret; keep a value that is already configured
# and fall back to the placeholder otherwise.
os.environ.setdefault("PINECONE_ENV", 'YOUR ENV')


In [None]:


import os
from pinecone import Pinecone, ServerlessSpec

# Read the key from PINECONE_API_KEY, the variable this notebook stores it in.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

# Placeholder: replace with the name of your index.
pinecone_index_name = 'index name'

# Create the index only when it is missing, so the cell can be re-run safely.
if pinecone_index_name not in pc.list_indexes().names():
    pc.create_index(
        name=pinecone_index_name,
        dimension=1536,  # has to match the size of the embedding vectors stored in the index
        metric='cosine',
        # ServerlessSpec expects the provider code and the bare region id,
        # not the display names shown in the Pinecone console.
        spec=ServerlessSpec(
            cloud='gcp',
            region='us-central1'
        )
    )



In [None]:
from langchain.chains import RetrievalQA
import pinecone
from langchain.vectorstores import Pinecone



In [None]:
# Name of the Pinecone index that receives the PDF chunks (placeholder value).
index_name = "index name"

# Build the Pinecone-backed vector store from the chunks and the embedding client.
db = Pinecone.from_documents(data, embeddings, index_name=index_name)

In [None]:
# Question-answering chain whose retriever is the Pinecone store `db`.
chain = RetrievalQA.from_chain_type(llm=llm, retriever=db.as_retriever(), verbose=True)

chain.run('What is machine learning?', callbacks=[handler])

# Providing sources

Create an account at https://newsapi.org/ and get an API key.

In [None]:
from datetime import date, timedelta
from newsapi import NewsApiClient

import os
from getpass import getpass

# Keep the NewsAPI key out of the notebook: use the NEWS_API_KEY environment
# variable when it is set, otherwise prompt for the key without echoing it.
newsapi = NewsApiClient(
    api_key=os.environ.get("NEWS_API_KEY") or getpass("NewsAPI key: ")
)

# Query window: the seven days up to and including today.
today = date.today()
last_week = today - timedelta(days=7)

latest_news = newsapi.get_everything(
    q='artificial intelligence',
    from_param=last_week.strftime("%Y-%m-%d"),
    to=today.strftime("%Y-%m-%d"),
    sort_by='relevancy',
    language='en'
)

In [None]:
# Articles returned by the NewsAPI query.
latest_news['articles']

In [None]:
from langchain.docstore.document import Document

# Convert every article into a Document. The text is the title followed by a
# blank line and then the description; a part is left out when the article has
# no value for it. The article URL is recorded as the document's source.
docs = []
for article in latest_news['articles']:
    title = article['title']
    description = article['description']
    page_content = ((title + '\n\n') if title else "") + (description or "")
    docs.append(
        Document(page_content=page_content, metadata={'source': article['url']})
    )

# Quick look at the first converted article.
first_doc = docs[0]
print(first_doc.page_content)
print(first_doc.metadata)

In [None]:
from langchain.chains import create_qa_with_sources_chain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate

# LLM chain produced by `create_qa_with_sources_chain` for the shared chat model.
qa_chain = create_qa_with_sources_chain(llm)

# Template used to render each retrieved document before it enters the prompt.
doc_prompt = PromptTemplate(
    input_variables=["page_content", "source"],
    template="Content: {page_content}\nSource: {source}",
)

# Combine the rendered documents and hand them to the QA chain as `context`.
final_qa_chain = StuffDocumentsChain(
    document_prompt=doc_prompt,
    document_variable_name="context",
    llm_chain=qa_chain,
)

# Vector store over the news documents; this rebinds `index`.
index = FAISS.from_documents(docs, embedding=embeddings)

chain = RetrievalQA(
    combine_documents_chain=final_qa_chain,
    retriever=index.as_retriever(),
)


In [None]:
# The question text starts and ends with a newline.
question = "\nWhat is the most important news about artificial intelligence from last week?\n"

answer = chain.run(question)
print(answer)

# Indexing a website

In [None]:
import os
from getpass import getpass

# Keep the secret out of the notebook: reuse the token when the variable is already
# set, otherwise ask for it interactively (getpass does not echo the input).
if not os.environ.get('APIFY_API_TOKEN'):
    os.environ['APIFY_API_TOKEN'] = getpass('Apify API token: ')

In [None]:
from langchain.utilities import ApifyWrapper
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
def _item_to_document(item):
    """Map one dataset item to a Document: its text as content, its URL as source."""
    return Document(page_content=item["text"] or "", metadata={"source": item["url"]})


apify = ApifyWrapper()

# Input for the website-content-crawler actor: a single start URL.
crawler_input = {
    "startUrls": [{"url": "https://vingyani.com/"}],
    "aggressivePrune": True,
}

loader = apify.call_actor(
    actor_id="apify/website-content-crawler",
    run_input=crawler_input,
    dataset_mapping_function=_item_to_document,
)

In [None]:
from langchain.indexes import VectorstoreIndexCreator

# Splitter configured for chunks of size 500 with no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=500)

# Build the index from the crawler's loader; this rebinds `index`.
index_creator = VectorstoreIndexCreator(text_splitter=text_splitter)
index = index_creator.from_loaders([loader])
index

In [None]:
# Ask the website index a question through `query_with_sources`.
query = "What is the main subject of the vingyani?"
index.query_with_sources(query)

In [None]:
# Use the index's underlying vector store as the retriever of a QA chain.
retriever = index.vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

query = "What is the most recent article of the vingyani?"
qa.run(query, callbacks=[handler])

# Indexing a GitHub repo

In [None]:
from langchain.document_loaders import GitLoader

def _is_python_source(path):
    """Return True when the path ends with ``.py``."""
    return path.endswith(".py")


# Loader for the `master` branch of the repository, using ./data/repo/ as the
# local repository path; only files accepted by the filter are loaded.
loader = GitLoader(
    repo_path="./data/repo/",
    clone_url="https://github.com/langchain-ai/langchain",
    branch="master",
    file_filter=_is_python_source,
)
documents = loader.load()

In [None]:
# Print the content of the first loaded document.
print(documents[0].page_content)

In [None]:
# Number of documents returned by the loader.
len(documents)

In [None]:
from langchain.text_splitter import Language

# Splitter created for Python source: chunk size 1000 with an overlap of 200.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, **splitter_options
)

# NOTE: `documents` is rebound to the split result, so re-running this cell
# feeds the already-split chunks back into the splitter.
documents = python_splitter.split_documents(documents)

In [None]:
# First element of `documents` after splitting.
documents[0]

In [None]:
# Number of elements in `documents` after splitting.
len(documents)

In [None]:
# Build a FAISS store over the code chunks (rebinding `index`) and wrap it in a
# QA chain that uses the store's default retriever settings.
index = FAISS.from_documents(documents, embeddings)
retriever = index.as_retriever()
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

query = "What is a stuff chain?"
qa.run(query, callbacks=[handler])

In [None]:
# Retrieve with maximal marginal relevance (MMR): consider the 200 nearest chunks
# and keep 20 of them. MMR is selected with `search_type`; 'distance_metric' and
# 'maximal_marginal_relevance' entries in `search_kwargs` are not options the
# FAISS retriever acts on, so they are not passed.
# NOTE(review): the FAISS distance strategy appears to be chosen when the index
# is built — confirm there if cosine distance is required.
retriever = index.as_retriever(
    search_type='mmr',
    search_kwargs={'k': 20, 'fetch_k': 200},
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
)

query = "When should I use a map reduce chain?"

qa.run(query, callbacks=[handler])

In [None]:
# Ask a follow-up question with the same QA chain and callback handler.
query = "When should I use a map rank chain?"

qa.run(query, callbacks=[handler])