In [None]:
import sys
import os
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
import chromadb

In [None]:
# Find and Parse Sitemaps to Create List of all website's pages
import usp
from usp.tree import sitemap_tree_for_homepage

def getPagesFromSitemap(fullDomain):
    """Return the URLs of all pages found in the sitemap tree of ``fullDomain``.

    The tree is obtained from ``sitemap_tree_for_homepage`` and the ``url``
    attribute of every entry yielded by ``all_pages()`` is collected in
    iteration order. Nothing is de-duplicated here.
    """
    sitemap_tree = sitemap_tree_for_homepage(fullDomain)
    return [entry.url for entry in sitemap_tree.all_pages()]

# Go through the raw list of pages and output a list of unique page links
def getListUniquePages(listPagesRaw):
    """Return the items of ``listPagesRaw`` with duplicates removed.

    The first occurrence of every item is kept and the original order is
    preserved. The input is not modified; a new list is always returned.
    """
    pages = list(listPagesRaw)
    try:
        # dict keys are unique and keep insertion order, so this removes
        # duplicates in a single linear pass instead of re-scanning the
        # result list for every item.
        return list(dict.fromkeys(pages))
    except TypeError:
        # Unhashable items cannot be dict keys; fall back to the
        # equality-based scan so such inputs keep working.
        listPages = []
        for page in pages:
            if page not in listPages:
                listPages.append(page)
        return listPages


In [None]:
# Take the OpenAI API key from the environment when it is already set;
# otherwise ask for it without echoing the input, so the secret is never
# stored in the notebook source.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Please enter your OpenAI API key: ")

In [None]:
# Ask how the documents are supplied and keep asking until the answer is
# exactly "1" (upload files) or "2" (fetch the files from a url).
while True:
    user_input = input('Please enter "1" in case you want to upload the files or "2" if you want to fetch the files from the url'"\n")
    if user_input in ("1", "2"):
        break

In [None]:
if user_input=="1":
    # Keep asking until one of the two supported file types is entered.
    lan_input = ''
    while lan_input not in ("html", "pdf"):
        lan_input = input('Please enter "html" in case you want to upload the html files or "pdf" if you want to upload PDF files'"\n")

    # Directory holding the files to load -- replace this placeholder before running.
    path_input = "enter your path"

    # Choose the loader class and the progress label for the selected file type.
    if lan_input == 'pdf':
        loader_cls, processed_label = PyPDFLoader, ' PDFs processed'
    else:
        loader_cls, processed_label = UnstructuredHTMLLoader, ' html files processed'

    # Created for both file types; when only the PDF branch creates the list,
    # choosing "html" fails with a NameError on the first extend().
    documents = []
    # List the directory once rather than once per processed file.
    file_names = os.listdir(path_input)
    for j, file in enumerate(file_names, start=1):
        # os.path.join adds the separator when path_input has no trailing one.
        loader = loader_cls(os.path.join(path_input, file))
        documents.extend(loader.load())
        print(j,' of ', len(file_names), processed_label)

In [None]:
if user_input=="2":
    url_input = input('Please enter the url that will be crawled: ')
    # Collect every page url listed in the site's sitemaps, then drop duplicates.
    test = getPagesFromSitemap(url_input)
    urls = getListUniquePages(test)
    path_input = input('Please enter the path were the files will be stored: ')
    # Make sure the target directory exists so the files can be written later.
    # An empty answer means "current directory", which needs no creation.
    if path_input:
        os.makedirs(path_input, exist_ok=True)
    # One output file per url, named by the url's position: 0.html, 1.html, ...
    # os.path.join adds the separator when path_input has no trailing one.
    path = [os.path.join(path_input, str(i) + '.html') for i in range(len(urls))]

In [None]:
# Save the HTML source of each page to its own .html file
import requests
if user_input=="2":
    # `path` holds one file name per entry of `urls`, so the two lists are
    # walked in step.
    for url, html_path in zip(urls, path):
        # Download before opening the file so a failed request does not leave
        # an empty file behind; the timeout keeps an unresponsive server from
        # blocking the notebook indefinitely.
        page_html = requests.get(url = url, timeout=30).text
        # The with-block closes the file even if the write raises.
        with open(html_path, "w", encoding="utf-8") as text_file:
            text_file.write(page_html)

In [None]:
if user_input=="2":
    documents = []
    # Load each page from the file names stored in `path`, the list the pages
    # were saved under. Building the name from a quoted placeholder followed by
    # `{i}.html` is not valid Python and fails with a SyntaxError.
    for html_path in path:
        loader = UnstructuredHTMLLoader(html_path)
        documents.extend(loader.load())

In [None]:
# Split the loaded documents with a CharacterTextSplitter configured with
# chunk_size=500 and chunk_overlap=100; `documents` is replaced by the result.
text_splitter = CharacterTextSplitter(chunk_overlap=100, chunk_size=500)
documents = text_splitter.split_documents(documents)
# Put every chunk on a single line by turning newlines into spaces.
for chunk in documents:
    chunk.page_content = chunk.page_content.replace('\n',' ')

In [None]:
# Build the FAISS vector store from the split documents, using an
# OpenAIEmbeddings instance as the embedding.
embedding_model = OpenAIEmbeddings()
vectordb = FAISS.from_documents(documents, embedding=embedding_model)

In [None]:
# Prompt text for the answering step. It has two placeholders, {question} and
# {context}, which are declared as the template's input variables below.
template = """You are an AI assistant for answering questions about the university of limassol website.
You are given the following extracted parts of a long document and a question. Provide a conversational answer.
If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
Question: {question}
=========
{context}
=========
Answer in Markdown:"""
QA_PROMPT = PromptTemplate(
    input_variables=["question", "context"],
    template=template,
)

In [None]:
# Assemble the conversational retrieval chain: a gpt-3.5-turbo chat model at
# temperature 0, a retriever over `vectordb` created with search_kwargs k=5,
# and QA_PROMPT passed to the combine-documents chain.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
retriever = vectordb.as_retriever(search_kwargs={'k': 5})
pdf_qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever,
    combine_docs_chain_kwargs={"prompt": QA_PROMPT},
    max_tokens_limit=2000,
    return_source_documents=True,
    verbose=False,
)

In [None]:
# ANSI escape sequences used to colour the console output.
yellow = "\033[0;33m"
green = "\033[0;32m"
white = "\033[0;39m"

# (question, answer) pairs of the current session, passed back to the chain on
# every call.
chat_history = []
print(f"{yellow}---------------------------------------------------------------------------------")
print('Welcome to the AI Chabot. Ask me questions about your documents')
print('---------------------------------------------------------------------------------')
while True:
    # Strip surrounding whitespace so blank-looking input is skipped and
    # "exit " still exits.
    query = input(f"{green}Prompt: ").strip()
    if query in ("exit", "quit", "q", "f"):
        print('Exiting')
        # Leave the loop instead of calling sys.exit(): inside a notebook cell
        # sys.exit() raises SystemExit rather than just ending the chat.
        break
    if not query:
        continue
    result = pdf_qa({"question": query, "chat_history": chat_history})
    print(f"{white}Answer: " + result["answer"])
    chat_history.append((query, result["answer"]))