## Load the Data from the Webpages

In [None]:
from langchain.document_loaders import WebBaseLoader
import json
from bs4 import BeautifulSoup
# Read the notebook's source configuration from disk.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of relying on the platform default (which is not UTF-8 on e.g. Windows).
with open('ressources/sources.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

def clean_document(document):
    """Return the document's text with markup removed and whitespace collapsed.

    The ``page_content`` of *document* is parsed with BeautifulSoup's
    ``html.parser``. The extracted text is split on whitespace and re-joined
    with single spaces, so runs of newlines, tabs and blanks become one space
    and leading/trailing whitespace is dropped. *document* is not modified.
    """
    parsed = BeautifulSoup(document.page_content, 'html.parser')
    words = parsed.get_text().split()
    return ' '.join(words)

# Pull the list of page URLs out of the configuration (empty list if the key
# is missing).
urls = config.get('urls', [])

# Load all pages, then replace each document's text with its cleaned version.
loader = WebBaseLoader(urls)
data = loader.load()
for doc in data:
    doc.page_content = clean_document(doc)

# Last expression of the cell: shows the loaded documents in the notebook.
data


In [None]:
import os
from langchain.document_loaders import PyPDFLoader

# Directory (relative to the current working directory) that holds the PDFs.
# Built with os.path.join rather than a hand-written backslash literal: the
# former literal relied on invalid escape sequences ("\d", "\C") and only
# worked with Windows path separators.
pdf_dir = os.path.join(os.getcwd(), "docs", "Chatverlauf")

# Full paths of all files directly inside pdf_dir whose name ends in '.pdf'.
pdf_files = [
    os.path.join(pdf_dir, filename)
    for filename in os.listdir(pdf_dir)
    if filename.endswith('.pdf')
]

# Last expression of the cell: shows the collected paths in the notebook.
pdf_files

## Split the loaded text into chunks, then use Chroma to store all documents as vectors created with OpenAI embeddings.

In [None]:
from ressources import config
from langchain.text_splitter import RecursiveCharacterTextSplitter

# One splitter is shared by the web pages and the PDFs: chunks of size 500
# with no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)

# Chunk the cleaned web documents loaded earlier.
data_splits = text_splitter.split_documents(data)

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# The OpenAI key is read from the imported `config` module.
api_key = config.OPENAI_API_KEY
openai_embeddings = OpenAIEmbeddings(openai_api_key=api_key)

# Create the Chroma store from the web-page chunks.
vectorstore = Chroma.from_documents(
    documents=data_splits,
    embedding=openai_embeddings,
)

# Load each PDF, chunk it with the same splitter and add it to the store.
for pdf in pdf_files:
    loader = PyPDFLoader(pdf)
    page_splits = text_splitter.split_documents(loader.load())
    vectorstore.add_documents(page_splits)

#### Use the vector store as a retriever. When a question is passed to the chain, the vector store supplies the chunks whose vectors best match the question as context for the OpenAI model. The memory is passed as well, so the model "remembers" the entire conversation.

In [None]:
from langchain.memory import ConversationSummaryMemory
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

# Chat model used both for answering and for summarising the conversation.
openai_api_key = api_key
llm = ChatOpenAI(openai_api_key=openai_api_key)

# Conversation memory that stores the history under "chat_history" and returns
# it as message objects. (A second, default-configured memory used to be
# created first and was overwritten immediately; it was never used and has
# been removed.)
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

# Question-answering chain: retrieves matching chunks from the vector store
# and passes them, together with the memory, to the chat model.
retriever = vectorstore.as_retriever()
qa = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)

In [None]:
# Ask the chain a question; the memory supplies the chat history, so
# "question" is the only input that has to be given explicitly.
qa({"question": "Welche verschiedenen Studengänge gibt es"})
