<a href="https://colab.research.google.com/github/samtru99/PDF-Chatbot/blob/main/pdf_chatbot_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [None]:
!pip install -qU langchain pinecone-client tiktoken openai faiss-cpu PyPDF2

In [None]:
#LangChain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Pinecone
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)
import langchain
#OpenAI
import openai
from openai import OpenAI
#Others
import os
import re
import pinecone
import tiktoken
from PyPDF2 import PdfReader

# Phase 1 - Extracting data from the PDF

***Link to Google Drive***

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem so the PDF can be read from it.
drive.mount('/content/gdrive', force_remount=True)
# Absolute path (leading slash) so it matches the mount point above no matter
# what the notebook's current working directory is.
root_dir = "/content/gdrive/My Drive"

# Replace the placeholder with the path to the PDF to index.
reader = PdfReader('/Path/Here/To/PDF')

Mounted at /content/gdrive


***Extract raw Text from PDF***

In [None]:
# Pull the text out of every page lazily, then concatenate the non-empty
# results; pages for which extract_text() returns nothing are skipped.
page_texts = (pdf_page.extract_text() for pdf_page in reader.pages)
raw_text = ''.join(page_text for page_text in page_texts if page_text)

# Phase 2 - Data Preparation/Preprocessing

Tokenizer function that counts how many tokens a piece of text requires

In [None]:
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
  '''Return the number of cl100k_base tokens that `text` encodes to.'''
  # An empty disallowed_special set is passed so that no special-token
  # strings are treated as disallowed while encoding.
  return len(tokenizer.encode(text, disallowed_special=()))

Utilize a text splitter function to break down the raw text into chunks

In [None]:
# Splitter used to cut the raw PDF text into chunks.
# Lengths are measured with tiktoken_len, so chunk_size and chunk_overlap are
# expressed in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 400,
    chunk_overlap = 20,
    length_function = tiktoken_len,
    # Listed from coarsest (blank line) to finest (empty string).
    separators = ['\n\n', '\n', ' ', '']
)

Break down the raw text into chunks of at most 400 tokens

In [None]:
# Split the text extracted from the PDF into a list of chunk strings.
chunks = text_splitter.split_text(raw_text)

Clean up text to reduce the number of tokens needed

In [None]:
def clean_text_func(text):
    '''
      Collapse whitespace in `text` to reduce the number of tokens needed.

      Every run of spaces, tabs and line breaks (\\r\\n, \\r or \\n) is replaced
      by a single space. Line breaks are turned into a space rather than
      deleted so that words on adjacent lines are not fused together
      (e.g. "hello\\nworld" -> "hello world", not "helloworld").

      Returns the cleaned string; an empty string is returned unchanged.
    '''
    # One pass handles mixed runs such as " \n " without leaving double spaces.
    return re.sub(r'[ \t\r\n]+', ' ', text)

# Phase 3 - Embed Text and Store Vectors in Pinecone

Utilize OpenAI Embedding

In [None]:
# Embedding model used both for the stored chunks and for the HyDE queries.
# NOTE(review): replace the placeholder with a real key before running, and
# avoid committing a real key to the notebook.
model = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key = "ENTER YOUR KEY HERE"
)

Create a Pinecone index (only if it does not already exist)

In [None]:
# Connect the Pinecone client to the project/environment.
pinecone.init(api_key='ENTER YOUR KEY HERE', environment='gcp-starter')

# Create the 'csedb' index only when it is not already present, so re-running
# the cell reuses the existing index.
existing_indexes = pinecone.list_indexes()
if 'csedb' not in existing_indexes:
  # 1536 is the vector dimension of the index; it must match the length of the
  # vectors produced by the embedding model.
  pinecone.create_index('csedb', dimension=1536)

# Handle used for all later upserts and queries.
index = pinecone.Index('csedb')


Embed and store all chunks



In [None]:
# Clean, embed and store every chunk under a stable id ("chunk_<position>").
for i, chunk in enumerate(chunks):
  clean_txt = clean_text_func(chunk)
  # embed_documents expects a LIST of texts and returns one vector per text.
  # Passing the bare string would not yield a single embedding for the chunk,
  # so wrap it in a list and take the first (only) vector.
  em_vector = model.embed_documents([clean_txt])[0]
  index.upsert(
      vectors=[
          {
              'id': f"chunk_{i}",
              'values': em_vector,
              # The cleaned text is stored as metadata so it can be returned
              # as context at query time.
              'metadata': {'text': clean_txt}
          }
      ]
  )

# Phase 4 - Query User Questions

Initialize ChatGPT and Pinecone

In [None]:
# Metadata key under which each chunk's text was stored at upsert time.
text_field = "text"
# LangChain wrapper around the Pinecone index and the embedding model.
vectorstore = Pinecone(
    index, model, text_field
)

# Chat model used to answer questions (and to drive HyDE below).
# NOTE(review): replace the placeholder with a real key before running.
chat = ChatOpenAI(
    openai_api_key="ENTER YOUR KEY HERE",
    model="gpt-3.5-turbo"
)

Initialize HyDE to be used in semantic searches

In [None]:
# HyDE embedder built from the chat model and the base embedding model.
# NOTE(review): prompt_key="web_search" presumably selects one of LangChain's
# built-in HyDE prompt templates - confirm against the LangChain docs.
hyDE_embedding = HypotheticalDocumentEmbedder.from_llm(
    chat, model, prompt_key="web_search"
)

In [None]:
def end_convo():
  '''Menu action for ending the conversation; returns False so the main loop stops.'''
  return False


'''
  Main portion of the program
'''
def continue_convo(messages):
  '''
    Ask the user one question and answer it with retrieved context.

    The question is embedded with HyDE, the 3 closest chunks are fetched from
    the Pinecone index, and their text is placed in the prompt sent to the
    chat model. Both the augmented prompt and the model's reply are appended
    to `messages` (mutated in place). Always returns True so the main loop
    keeps running.
  '''

  def build_prompt(query: str):
    # Embed the query via HyDE, then look up the nearest stored chunks.
    query_vector = hyDE_embedding.embed_query(query)
    matches = index.query(top_k=3,vector = query_vector, include_metadata=True)['matches']
    source_knowledge = "\n".join(match['metadata']['text'] for match in matches)
    return f"""Using the contexts below, answer the query.

    Contexts:
    {source_knowledge}

    Query: {query}"""

  question = input("Enter in Question")
  messages.append(HumanMessage(content = build_prompt(question)))

  # Send the whole history so the model sees earlier turns as well.
  ai_response = chat(messages)
  print(f"AI: \n{ai_response.content}")
  messages.append(ai_response)
  return True

def new_convo():
  '''
    Menu action for starting over: empties the module-level `messages` list in
    place and re-seeds it with the system message. Returns True so the main
    loop keeps running.
  '''
  del messages[:]
  system_message = SystemMessage(content="You are a helpful assistant")
  messages.append(system_message)
  return True

def default_case():
  '''Fallback menu action for unrecognized input; reports it and returns True so the main loop keeps running.'''
  print("Invalid Input")
  return True


# Loop flag: each menu action returns True to keep going or False to stop.
convo = True
# Menu letter -> action. Keys are upper-case; user input is normalized to match.
switch_dict = {
      'E': end_convo,
      'C': continue_convo,
      'N': new_convo
  }

# Conversation history; a single entry (the system message) means that no
# question has been asked yet in the current conversation.
messages = [
    SystemMessage(content="You are a helpful assistant")
]
while convo:
  if len(messages) == 1:
    # Fresh (or freshly reset) conversation: go straight to the first question.
    print("Please ask a question to begin the conversation")
    continue_convo(messages)
  else:
    user_input = input("End Conversation (E)\nContinue Conversation(C)\nNew Conversation(N)")
    # Ignore surrounding whitespace and letter case so that "e" or " C " are
    # accepted instead of being reported as invalid input.
    action = switch_dict.get(user_input.strip().upper(), default_case)

    # continue_convo is the only action that needs the message history.
    if action is continue_convo:
      convo = action(messages)
    else:
      convo = action()




print("convo over")