# Requirements

In [1]:
# RUN THIS CELL FIRST!
!pip install -q langchain pypdf2 tiktoken textract openai faiss-cpu huggingface_hub pypdfium2 InstructorEmbedding sentence-transformers googletrans==3.1.0a0 python-docx contractions -q

# Importing Libraries

In [1]:
# Importing Libraries
from langchain import LLMChain
from langchain.docstore.document import Document
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.llms import HuggingFaceHub
from langchain import OpenAI
from langchain.prompts import ChatPromptTemplate
import os
import contractions
import pypdfium2 as pdfium
import re
import glob
import unicodedata
from typing import List
from googletrans import Translator
import time
import json


## Download documents to the file section of the notebook

In [3]:
# Google Drive folder ID; the download cell below fetches every non-trashed file
# that has this folder as its parent.
folder_id = '1O4AO1HxRGbL-IfG__UpJORo50Hl0oN9z'

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and hand the resulting application-default
# credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

folders = [folder_id]

# Download every non-trashed file found directly inside each folder.
# Files are written to the current working directory under their Drive title.
for folder in folders:
  file_list = drive.ListFile({'q': f"'{folder}' in parents and trashed=false"}).GetList()
  for file in file_list:
    downloaded = drive.CreateFile({'id': file['id']})
    downloaded.FetchMetadata(fetch_all=True)
    downloaded.GetContentFile(downloaded.metadata['title'])


In [None]:
# Mount Google Drive at /content/drive; the FAISS index is saved to and loaded
# from a path under this mount point.
# NOTE: this import rebinds the name `drive`, replacing the GoogleDrive client
# created in the download cell above.
from google.colab import drive
drive.mount('/content/drive')

# Helper functions

In [2]:
# Shared googletrans client; detect_and_translate below uses it both to detect
# the language of a text and to translate it.

translator = Translator()

def detect_and_translate(text, answer=None):
    """Translate between the language of ``text`` and English.

    The function has two modes, selected by ``answer``:

    * ``answer is None``: ``text`` is returned in English (unchanged when it
      is already detected as English).
    * ``answer`` given: ``text`` is used only to detect the target language,
      and ``answer`` is returned translated into that language (unchanged when
      ``text`` is detected as English).

    Args:
        text: Text whose language is detected (e.g. the user's question).
        answer: Optional reply to hand back in the language of ``text``.
            Presumably produced in English -- it is returned untranslated
            when ``text`` is English.

    Returns:
        The translated (or untouched) string as described above.
    """
    detected_lang = translator.detect(text).lang

    if detected_lang == "en":
        # Nothing to translate in either mode.
        return text if answer is None else answer

    if answer is None:
        source_text, dest_lang = text, "en"
    else:
        # Translate the *answer* into the question's language. Translating
        # ``text`` here would merely echo the question back in its own language.
        source_text, dest_lang = answer, detected_lang

    return translator.translate(source_text, src='auto', dest=dest_lang).text



#convert text to document objects
def text_to_docs(text: "str | List[str]") -> "Document | List[Document]":
    """Wrap raw text in ``Document`` objects tagged with 1-based page numbers.

    Args:
        text: Either a single string (treated as one page) or an iterable of
            page strings.

    Returns:
        * For a single string: the one ``Document`` built from it, with
          ``metadata["page"] == 1``. The PDF-processing cell appends this
          result directly to a flat list of documents, so this shape is kept.
        * For a sequence of pages: a list with one ``Document`` per page, in
          order, numbered from 1 (empty input gives an empty list).
    """
    single_page = isinstance(text, str)
    pages = [text] if single_page else text

    page_docs = [
        Document(page_content=page, metadata={"page": number})
        for number, page in enumerate(pages, start=1)
    ]

    # Return every page for multi-page input; previously only the last page
    # survived and an empty list raised UnboundLocalError.
    return page_docs[0] if single_page else page_docs

#clean text data
def text_clean(text: str) -> str:
    """Normalise text extracted from a PDF page.

    Steps, in order:
      1. Re-join words that were hyphenated across a line break.
      2. Expand contractions via ``contractions.fix`` and strip outer whitespace.
      3. Replace line breaks with a space, except where the lookarounds find
         a neighbouring whitespace-separated line break.
      4. Collapse newline / optional whitespace / newline runs to two newlines.
      5. Remove http(s) URLs.
      6. Apply Unicode NFKD normalisation.
    """
    dehyphenated = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
    cleaned = contractions.fix(dehyphenated).strip()

    substitutions = (
        (r"(?<!\n\s)\n(?!\s\n)|(?<!\r\s)\r(?!\s\r)", " "),
        (r"\n\s*\n", "\n\n"),
        (r'https?://\S+', ''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return unicodedata.normalize('NFKD', cleaned)



#convert pdf to text
def pdfium_get_text(data: bytes) -> str:
    """Extract and clean the text of every page of a PDF.

    Args:
        data: Raw bytes of the PDF file.

    Returns:
        The cleaned text of all pages in page order, each page followed by a
        newline. An empty string if the document has no pages.
    """
    from contextlib import closing

    page_texts = []
    # closing() releases the pdfium handles deterministically (text page
    # first, then page, then document) even if extraction raises.
    with closing(pdfium.PdfDocument(data)) as pdf:
        for index in range(len(pdf)):
            with closing(pdf.get_page(index)) as page, \
                    closing(page.get_textpage()) as textpage:
                raw_text = textpage.get_text_range()
            page_texts.append(text_clean(raw_text) + "\n")

    # Join once instead of growing a string inside the loop.
    return "".join(page_texts)


# Processing all PDF files

In [3]:
# Convert every PDF under /content into one Document and collect them in `docs`.
docs=[]
# glob returns matches in arbitrary order; sort so the documents (and the index
# built from them) are produced in the same order on every run.
pdf_files = sorted(glob.glob('/content/*.pdf'))
for pdf in pdf_files:
  with open(pdf,"rb") as f:
    data = f.read()
  # The file is closed at this point; parsing only needs the bytes.
  entire_text=pdfium_get_text(data)
  mydoc=text_to_docs(entire_text)
  docs.append(mydoc)



# Splitting docs into chunks

In [4]:
#split documents
# Chunks of at most 500 characters (length measured with len) that overlap
# their neighbours by 80 characters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=80,
    length_function=len,
)
chunked_docs = splitter.split_documents(docs)

## Embedding

In [5]:
# Read the Hugging Face API token from a local JSON file with a "key" entry,
# so the secret is not hardcoded in the notebook. The context manager closes
# the file handle as soon as the JSON has been parsed.
with open("hf_credential.json") as credential_file:
    key_hf = json.load(credential_file)
HUGGINGFACEHUB_API_TOKEN=key_hf['key']

In [6]:
# Instructor embedding model (hkunlp/instructor-xl). The same model object is
# passed to FAISS both when the index is built and when it is loaded.
embedding_model = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


In [None]:
#create embedding out of content and save it
# Builds a FAISS vector store from the chunked documents and saves it under
# the mounted Google Drive path, where the load cell reads it back.
embedding = FAISS.from_documents(chunked_docs,embedding_model)
embedding.save_local("/content/drive/MyDrive/Chatbot/data/index/")

In [7]:
#load embedding
# Loads the saved FAISS vector store from the mounted Google Drive path.
# `embedding` is the vector store itself; qa_chatbot runs similarity_search on it.
embedding = FAISS.load_local("/content/drive/MyDrive/Chatbot/data/index",embedding_model)

In [8]:
def doc_to_rscope(docs):
  """Concatenate the ``page_content`` of each document, each followed by a blank line.

  Returns an empty string when ``docs`` is empty.
  """
  return "".join(doc.page_content + "\n\n" for doc in docs)

## Load OpenAI credentials

In [9]:
import openai
# Read the OpenAI credentials from a local JSON file so the secret is not
# hardcoded in the notebook; the context manager closes the file handle.
with open("openai_credential.json") as credential_file:
    key_openai = json.load(credential_file)
# NOTE(review): the first element of "key" is used -- presumably the file
# stores a list of keys; confirm against the credential file format.
os.environ['OPENAI_API_KEY'] = key_openai['key'][0]

## QA chatbot

In [10]:
def qa_chatbot(query):
  """Answer ``query`` from passages retrieved out of the FAISS index.

  The passages most similar to the question are fetched from ``embedding``,
  joined into one context string with ``doc_to_rscope`` and sent, together
  with the question, to gpt-3.5-turbo through a prompt that tells the model
  to answer only from that context.

  Each call is independent: no conversation history is kept between calls.

  Args:
    query: The user's question.

  Returns:
    The model's answer as a string.
  """
  # Retrieve the relevant passages and flatten them into a single string.
  scope_doc = embedding.similarity_search(query=query)
  context = doc_to_rscope(scope_doc)
  chat_text = """
  Given the following context information about statistics knowledge

  Act as a frequently asked questions chatbot and answer any question
  asked factually with the right information.

  If the answer cannot be found in the context, answer 'The question asked
  is beyond the scope of this textbook'

  Context: {context}
  Question: {query}
  """
  llm = ChatOpenAI(model='gpt-3.5-turbo',temperature = 0.1)
  prompt_template = ChatPromptTemplate.from_template(chat_text)
  chatgpt_llm_chain = LLMChain(prompt=prompt_template, llm=llm)
  # Fill {context} with the retrieved passages. Passing the template text
  # itself here would leave the model without any of the indexed content.
  answer = chatgpt_llm_chain.run(context=context,
                            query=query)
  return answer

In [11]:
# Example query; the cell output below is the chatbot's answer.
qa_chatbot(query = 'what is a random variable?')

'A random variable is a variable that can take on different values based on the outcome of a random event. It represents a numerical quantity that is determined by chance. Random variables can be discrete, meaning they can only take on specific values, or continuous, meaning they can take on any value within a certain range. Random variables are a fundamental concept in statistics and probability theory.'

In [12]:
# Example query; the cell output below is the chatbot's answer.
qa_chatbot(query = 'what is the central limit theorem')

"The central limit theorem states that when independent random variables are added, their sum tends toward a normal distribution, regardless of the shape of the original variables' distribution. This theorem is important in statistics because it allows us to make inferences about a population based on a sample."