In [1]:
import os
# Set LANGCHAIN_PROJECT for this kernel process before any LangChain object is created.
# NOTE(review): presumably this names the LangSmith tracing project — confirm.
os.environ["LANGCHAIN_PROJECT"] = "fyp-rag-experiments"

In [2]:
from dotenv import load_dotenv
# Read the .env file that sits one directory above the working directory.
# NOTE(review): presumably this supplies the API keys the OpenAI clients below rely on — confirm.
load_dotenv("../.env")

True

In [3]:
#### INDEXING ####
from bs4 import BeautifulSoup
from markdown import markdown
import re

def markdown_to_text(markdown_string):
    """Render markdown to HTML and return its plain text with code removed.

    Args:
        markdown_string: Markdown source text.

    Returns:
        A string made of every text node left in the rendered HTML after each
        ``<pre>`` and ``<code>`` element has been replaced by a single space.
    """
    html = markdown(markdown_string)
    # re.DOTALL lets '.' cross line breaks, so multi-line code blocks are
    # removed as well; without it only single-line elements ever matched.
    html = re.sub(r'<pre\b[^>]*>.*?</pre\s*>', ' ', html, flags=re.DOTALL)
    # The closing tag used to be spelled '</code >' (with a space), which the
    # renderer never emits, so inline code was silently left in the text.
    html = re.sub(r'<code\b[^>]*>.*?</code\s*>', ' ', html, flags=re.DOTALL)
    soup = BeautifulSoup(html, "html.parser")
    # find_all is the current name of the legacy findAll alias.
    return ''.join(soup.find_all(string=True))

# Heading levels handed to MarkdownHeaderTextSplitter in the split cell.
# Each pair is (markdown heading marker, label for that heading level);
# NOTE(review): the label is presumably the metadata key attached to each split — confirm.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

# Load every cleaned markdown file into memory, one stripped string per file.
texts = list()
# sorted(): os.listdir returns entries in arbitrary order, so sorting keeps the
# order of `texts` (and of everything derived from it) reproducible across runs.
for file_path in sorted(os.listdir("../data/cleaned")):
    if not file_path.endswith(".md"):
        continue

    # Explicit UTF-8 so decoding does not depend on the platform's default locale.
    with open(os.path.join("../data/cleaned", file_path), "r", encoding="utf-8") as file:
        text = file.read().strip()
        texts.append(text)

print(len(texts))

8


In [60]:
# Split
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

# Build both splitters once. Neither depends on the text being split, so creating
# them (and a fresh OpenAIEmbeddings client) inside the loop was repeated work.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
text_splitter = SemanticChunker(
    OpenAIEmbeddings(), breakpoint_threshold_type="interquartile"
)

document_chunks = list()
for text in texts:
    # Stage 1: split the file at its markdown headings.
    md_header_splits = markdown_splitter.split_text(text)

    # Strip markup from each section; metadatas stays index-aligned with the strings.
    list_of_strings = [markdown_to_text(split.page_content) for split in md_header_splits]
    metadatas = [split.metadata for split in md_header_splits]

    # Stage 2: semantic chunking of each section, carrying the section metadata along.
    document_chunks.extend(text_splitter.create_documents(list_of_strings, metadatas=metadatas))

In [3]:
# Index

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Reuse the persisted index when it exists; otherwise build it from document_chunks.
# Previously the build step was commented out, so on a machine without ./slp_vectordb
# this cell opened an empty store and every retrieval silently returned nothing.
PERSIST_DIRECTORY = "./slp_vectordb"
embeddings = OpenAIEmbeddings()

if os.path.isdir(PERSIST_DIRECTORY) and os.listdir(PERSIST_DIRECTORY):
    vectorstore = Chroma(
        embedding_function=embeddings,
        persist_directory=PERSIST_DIRECTORY
    )
else:
    # Requires the split cell to have run first (document_chunks must be defined).
    vectorstore = Chroma.from_documents(
        document_chunks,
        embedding=embeddings,
        persist_directory=PERSIST_DIRECTORY
    )
retriever = vectorstore.as_retriever()

In [4]:
# Smoke test: fetch the documents retrieved for a single query term and show
# how many came back, followed by the documents themselves.
test_retrieval = retriever.get_relevant_documents("Lemmatization")
print(len(test_retrieval), test_retrieval, sep="\n")

4
[Document(page_content="Some languages, like Japanese, don't have spaces between words, so word tokenization becomes more difficult. Another part of text normalization is lemmatization, the task of determining lemmatization that two words have the same root, despite their surface differences. For example, the words sang, sung, and sings are forms of the verb sing. The word sing is the common lemma of these words, and a lemmatizer maps from all of these to sing. Lemmatization is essential for processing morphologically complex languages like Arabic. Stemming refers to a simpler version of lemmatization in which we mainly stemming just strip suffixes from the end of the word. Text normalization also includes sentence segmentation: breaking up a text into individual sentences, using cues like sentence segmentation periods or exclamation points."), Document(page_content='For other natural language processing situations we also want two morphologically different forms of a word to behave 

In [5]:
# Read the user and system templates used to build the main chain's prompt.
# Explicit UTF-8 keeps decoding independent of the platform's default locale;
# the former `str()` placeholders were overwritten immediately and are dropped.
with open("../prompts/rag-user-prompt.txt", "r", encoding="utf-8") as file:
    user_prompt = file.read()

with open("../prompts/rag-system-prompt.txt", "r", encoding="utf-8") as file:
    system_prompt = file.read()

In [17]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_openai import ChatOpenAI
from langchain.load import dumps, loads

# Chat model that both prepare_context and main_chain pipe their prompts into.
llm = ChatOpenAI(model_name="gpt-4-0125-preview")

# Ollama-served "gemma" chat model; num_ctx, top_p and top_k are passed through as given.
# NOTE(review): local_llm is not referenced by any other visible cell — confirm it is still needed.
local_llm = ChatOllama(
    model="gemma",
    num_ctx=4096,
    top_p=0.9,
    top_k=100
)

def get_unique_union(documents):
    """Flatten lists of documents and drop duplicates, keeping first-seen order.

    Args:
        documents: An iterable of iterables of serialisable documents.

    Returns:
        A list of deserialised documents in which each distinct serialised
        form appears once, at the position of its first occurrence.
    """
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # dict.fromkeys deduplicates while preserving insertion order. The previous
    # list(set(...)) returned an arbitrary order that could differ between
    # kernels, discarding the order in which the documents were supplied.
    unique_docs = list(dict.fromkeys(flattened_docs))
    return [loads(doc) for doc in unique_docs]

def prepare_context(docs):
    """Deduplicate retrieved documents and ask the LLM to clean them into one text.

    Args:
        docs: Documents exposing ``page_content`` and ``metadata``.

    Returns:
        The cleaning chain's response, converted with ``str``.
    """
    def _render(doc):
        # One "key value" line per metadata entry, placed under the document text.
        reference_lines = "\n".join(
            f"{key} {value}" for key, value in doc.metadata.items()
        )
        # Spelled out line by line so the significant whitespace stays visible.
        return (
            "\n"
            f"    {doc.page_content} \n"
            "    \n"
            "    References from the textbook: \n"
            f"    {reference_lines}\n"
            "    "
        )

    unique_docs = get_unique_union([docs])
    formatted_content = "\n\n".join(_render(doc) for doc in unique_docs)

    # Named differently from the notebook-level prompts to avoid shadowing them.
    cleaner_system_prompt = """
    You are a professional english data cleaner. Your role is to read the documents extracted from a book,
    clean it without degrading the quality of the text. 

    You should follow these guidelines: 
    - Pay utmost emphasis on preserving the actual ground truth of the text. It is completely fine to repeat
      the given sentences verbatim if they fit. 
    - The given text is in markdown. Don't omit any necessary tables, inline equations or block equations. 
    - Keep the references format as is. Right after the document's content.  
    - Your response should be formatted as follows:
      ## Context 1
         ...
         References: ...
      ## Context 2
         ...
         References: ...
      ## Context 3
         ...
         References: ...
    """

    cleaner_user_prompt = """
    Clean the following text without compromising on truth: 

    {context}
    """

    clean_text_prompt = ChatPromptTemplate.from_messages([
        ("system", cleaner_system_prompt),
        ("human", cleaner_user_prompt),
    ])

    # The invoked string is passed through unchanged as the "context" variable.
    cleaning_chain = (
        {"context": RunnablePassthrough()}
        | clean_text_prompt
        | llm
        | StrOutputParser()
    )

    return str(cleaning_chain.invoke(formatted_content))

# Main prompt, built from the system and user templates read from ../prompts earlier.
# NOTE(review): the mapping below supplies "topics" and "context", so the template
# files presumably contain {topics} and {context} placeholders — confirm.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", user_prompt)
])

# The invoked string is forwarded unchanged as "topics"; the same string is also sent
# to the retriever, and prepare_context turns the retrieved documents into "context".
main_chain = (
    { "topics": RunnablePassthrough(), "context": retriever | prepare_context } 
    | prompt 
    | llm 
    | StrOutputParser()
)

In [18]:
# Run the full pipeline for one topic, show the result and keep a copy on disk.
response = str(main_chain.invoke("""
Kneser-Ney Smoothing
"""))
print(response)
# Explicit UTF-8: model output may contain characters that the platform's
# default encoding cannot represent, which would make the write fail.
with open("../responsebuffers/test.md", "w", encoding="utf-8") as file:
    file.write(response)

## Question
In the context of Kneser-Ney smoothing, especially the modified Kneser-Ney version, consider the scenario where you are tasked with estimating the probabilities of various bigrams in a large corpus. Knowing that modified Kneser-Ney smoothing uses different discounts for n-grams with counts of 1, 2, and three or more, which of the following best describes the purpose of using three different discounts ($d_1$, $d_2$, and $d_{3+}$) in the algorithm?

1. To ensure that all n-grams with a count higher than three contribute equally to the probability estimation.
2. To increase the computational efficiency of the algorithm by reducing the complexity of calculations for frequent n-grams.
3. To provide a more nuanced approach to discounting, reflecting the varying degrees of reliability of n-grams based on their frequency.
4. To allocate more probability mass to unigrams, thereby simplifying the model to behave more like a unigram model.
5. To guarantee that the probabilities of all