<a href="https://colab.research.google.com/github/priyariyyer/AIML_Projects/blob/main/ChatoBot_GitBook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Understanding Problem and Objective

In [None]:
# Objective: build a chatbot that answers questions about the topics covered in the GitBook documentation.
# The chatbot summarizes the relevant topics so that users do not have to read every page of the documentation.

# Data Gathering

In [None]:
! pip install langchain_community



# Data Preparation

In [None]:
# import required libraries
import langchain
from langchain_community.document_loaders import GitbookLoader



In [None]:
# Load every page of the GitBook documentation, not only the landing page.
# load_all_paths=True asks the loader to discover the site's pages and fetch each of them;
# without it only the single URL passed in is loaded.
loader = GitbookLoader("https://docs.gitbook.com/", load_all_paths=True)
all_docs = loader.load_and_split()
# Report the count and show a small sample instead of dumping every page into the cell output.
print(f"Loaded {len(all_docs)} documents")
all_docs[:3]

[Document(metadata={'source': 'https://docs.gitbook.com/', 'title': 'GitBook Documentation'}, page_content='GitBook Assistant\nAsk\nGetting Started\nGitBook Documentation\nCreate and publish beautiful documentation your users will love. GitBook has all the tools you need to create everything from product guides to API references and beyond.\nAt GitBook, our mission is to provide a \nuser-friendly\n and \ncollaborative\n solution for creating, editing, and sharing product and API documentation.\nSign up\n \nQuickstart\nDiscover GitBook \nCreating content\nEdit pages, content and more in GitBook.\nPublishing documentation\nPublish your docs site to share with others.\nCollaboration\nInvite your team and collaborate in GitBook.\nGit Sync\nSync with a Git repository to enable advanced workflows.\nIntegrations\nIntegrate with your favorite tools and applications.\nAccount management\nManage your organization, account, and billing.\nLast updated \n3 months ago\nWas this helpful?')]

In [None]:
#install transformer for tokenization
! pip install transformers



## Tokenize Text

In [None]:
from transformers import AutoTokenizer

# Hugging Face Hub repository whose tokenizer is loaded below.
model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"
# Not referenced by any call in this cell; kept because other cells may read it.
model_basename = "gptq_model-4bit-128g"

# Request the fast tokenizer implementation for the repository above.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [None]:
# Count the tokens in each loaded document.
# The tokenizer's plain-Python output (a list of token ids) is enough for counting,
# so no tensors are created and no GPU is needed.
token_counts = [len(tokenizer(doc.page_content).input_ids) for doc in all_docs]

In [None]:
# Summarize the per-document token counts: smallest, largest and arithmetic mean.
lowest_count = min(token_counts)
highest_count = max(token_counts)
mean_count = sum(token_counts)/len(token_counts)

print(f"Min Count: {lowest_count}")
print(f"Max Count: {highest_count}")
print(f"Avg Count: {mean_count}")

Min Count: 208
Max Count: 208
Avg Count: 208.0


In [None]:
# Display the per-document token counts computed above.
token_counts

[208]

In [None]:
# Get len of tokenizer for given text
def tokenizer_len(text):
  """Return the number of tokens the notebook-level `tokenizer` produces for `text`.

  Counting uses the tokenizer's plain list of token ids, so no tensors are
  created and the function also works on machines without a GPU.

  Args:
    text: The string to tokenize.

  Returns:
    The token count as an int.
  """
  return len(tokenizer(text).input_ids)

## Create Embeddings

In [None]:
# Create the text splitter used to cut documents into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk length is measured with tokenizer_len (token count), with no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tokenizer_len,
    chunk_size=400,
    chunk_overlap=0,
)

In [None]:
import hashlib
# A single SHA-256 hasher object shared by later cells.
# NOTE(review): every update() call on this object accumulates, so a digest read from it
# reflects all data fed so far, not only the most recent input.
hasher = hashlib.sha256()

In [None]:
#Use TQDM for progress bar
from tqdm.auto import tqdm
documents = []

In [None]:
#extract the chunks from all documents
# Start from an empty list so that re-running this cell does not append duplicate chunks.
documents = []
# Next free chunk number for each source URL. Several entries of all_docs may share one
# source URL, and their chunk ids must not collide.
next_chunk_index = {}

for doc in tqdm(all_docs):
  url = doc.metadata['source']
  # Hash each URL on its own: a shared hasher accumulates every update() call, which would
  # make the id depend on the URLs processed before it and on how often the cell was run.
  uid = hashlib.sha256(url.encode('utf-8')).hexdigest()[:12]

  chunks = text_splitter.split_text(doc.page_content)
  first_index = next_chunk_index.get(url, 0)
  for i, chunk in enumerate(chunks, start=first_index):
    documents.append({
        'id': f"{uid}-{i}",
        'url': url,
        'text': chunk
    })
  next_chunk_index[url] = first_index + len(chunks)

# Outside the loop, so the total number of chunks is displayed as the cell result.
len(documents)

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Build a DataFrame with one row per chunk record (keys: id, url, text)
import pandas as pd
df = pd.DataFrame(documents)
df.head()

Unnamed: 0,id,url,text
0,1da01c62de55-0,https://docs.gitbook.com/,GitBook Assistant\nAsk\nGetting Started\nGitBo...


In [None]:
from sentence_transformers import SentenceTransformer
# Do not hard-code device='cuda': with no device given, SentenceTransformer picks a GPU
# when one is available and falls back to the CPU otherwise, so the cell runs on any runtime.
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
# Display the text of the first chunk (row label 0 of the 'text' column).
df['text'][0]

'GitBook Assistant\nAsk\nGetting Started\nGitBook Documentation\nCreate and publish beautiful documentation your users will love. GitBook has all the tools you need to create everything from product guides to API references and beyond.\nAt GitBook, our mission is to provide a \nuser-friendly\n and \ncollaborative\n solution for creating, editing, and sharing product and API documentation.\nSign up\n \nQuickstart\nDiscover GitBook \nCreating content\nEdit pages, content and more in GitBook.\nPublishing documentation\nPublish your docs site to share with others.\nCollaboration\nInvite your team and collaborate in GitBook.\nGit Sync\nSync with a Git repository to enable advanced workflows.\nIntegrations\nIntegrate with your favorite tools and applications.\nAccount management\nManage your organization, account, and billing.\nLast updated \n3 months ago\nWas this helpful?'

In [None]:
# Embed the first chunk's text with the sentence-embedding model.
embeds = embedding_model.encode(df['text'][0])

In [None]:
# Length of the embedding produced for a single text.
len(embeds)

384

In [None]:
#Store Knowledge Base content in a vector db
# !pip install pinecone-client==3.0.0



In [None]:
# !pip install langchain-pinecone==0.0.3

Collecting langchain-core<0.2,>=0.1 (from langchain-pinecone==0.0.3)
  Using cached langchain_core-0.1.53-py3-none-any.whl.metadata (5.9 kB)
Collecting langsmith<0.2.0,>=0.1.0 (from langchain-core<0.2,>=0.1->langchain-pinecone==0.0.3)
  Using cached langsmith-0.1.147-py3-none-any.whl.metadata (14 kB)
Using cached langchain_core-0.1.53-py3-none-any.whl (303 kB)
Using cached langsmith-0.1.147-py3-none-any.whl (311 kB)
Installing collected packages: langsmith, langchain-core
  Attempting uninstall: langsmith
    Found existing installation: langsmith 0.4.16
    Uninstalling langsmith-0.4.16:
      Successfully uninstalled langsmith-0.4.16
  Attempting uninstall: langchain-core
    Found existing installation: langchain-core 0.3.74
    Uninstalling langchain-core-0.3.74:
      Successfully uninstalled langchain-core-0.3.74
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency 

In [None]:
import os
from getpass import getpass

from pinecone import Pinecone, ServerlessSpec

# Never hard-code the API key in the notebook: read it from the environment and fall back
# to an interactive prompt. Any key that was ever committed to a notebook should be revoked.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
# Expose the key through the environment as well, for libraries that look it up there.
os.environ.setdefault("PINECONE_API_KEY", pinecone_api_key)

pc = Pinecone(api_key=pinecone_api_key)
indexname = "gitbookindex"

# Create the index only when it does not exist yet, so the cell can be re-run safely.
# The dimension comes from the embedding model itself rather than from a leftover variable.
if indexname not in pc.list_indexes().names():
    pc.create_index(
        name=indexname,
        dimension=embedding_model.get_sentence_embedding_dimension(),
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

In [None]:
# Get a handle to the index named by `indexname` and display its current statistics.
index = pc.Index(indexname)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1}},
 'total_vector_count': 1}

In [None]:
from tqdm.auto import tqdm
from uuid import uuid4

# Number of chunks embedded and upserted per request.
batch_size = 10

# Embed the chunks batch by batch and upsert them into the index.
for start in tqdm(range(0, len(df), batch_size)):
  # iloc slicing clamps at the end of the frame, so the last batch may be shorter.
  batch = df.iloc[start:start + batch_size]
  batch_texts = batch['text'].tolist()

  # Metadata stored with each vector: the source URL and the chunk text itself.
  batch_metadata = [
      {'source': url, 'text': text}
      for url, text in zip(batch['url'], batch_texts)
  ]

  # Use a batch-local name so the notebook-level `embeds` variable is not overwritten.
  batch_embeddings = embedding_model.encode(batch_texts).tolist()

  # Each vector is an (id, values, metadata) tuple; pass a concrete list, not a one-shot iterator.
  vectors = list(zip(batch['id'].tolist(), batch_embeddings, batch_metadata))
  index.upsert(vectors=vectors)

  0%|          | 0/1 [00:00<?, ?it/s]

1


In [None]:
# Display the index statistics again after the upsert.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1}},
 'total_vector_count': 1}

In [None]:
def embed_query(text):
  """Encode `text` with the notebook-level `embedding_model` and return the result via `.tolist()`."""
  return embedding_model.encode(text).tolist()

In [None]:
# NOTE(review): this import rebinds the name `Pinecone`, which an earlier cell imported from
# the `pinecone` client package — confirm nothing later still needs the client class.
from langchain.vectorstores import Pinecone
# Passed as the third positional argument below; presumably the metadata key that holds
# the chunk text — TODO confirm against the installed LangChain version.
text_field = "text"

# Positional arguments: the index handle, the query-embedding function and the text field name.
vectorstore = Pinecone(
    index, embed_query, text_field)

TypeError: `localns` arguments are not longer accepted.

In [None]:
#create vectorstore
from langchain.vectorstores import Pinecone
from langchain_community.embeddings import HuggingFaceEmbeddings

# from_existing_index expects a LangChain Embeddings object (embed_documents / embed_query),
# not a raw SentenceTransformer instance, so wrap the same model name in HuggingFaceEmbeddings.
query_embeddings = HuggingFaceEmbeddings(model_name='paraphrase-MiniLM-L6-v2')

# text_key names the metadata field that holds the chunk text written during the upsert.
# NOTE(review): a `localns` TypeError raised here would point at the installed package
# versions rather than at this call — confirm the environment before debugging the code.
vectorstore = Pinecone.from_existing_index(indexname, query_embeddings, text_key="text")



TypeError: `localns` arguments are not longer accepted.