In [1]:
import os
# Set the project name in the process environment before any LangChain object is created.
# NOTE(review): presumably this selects the LangSmith tracing project — confirm.
os.environ["LANGCHAIN_PROJECT"] = "fyp-rag-experiments"

In [2]:
from dotenv import load_dotenv
# Load environment variables from the .env file one directory above this notebook.
# NOTE(review): presumably this supplies the API keys used by the OpenAI clients below — verify.
load_dotenv("../.env")

True

In [7]:
#### INDEXING ####
from bs4 import BeautifulSoup
from markdown import markdown
import re

def markdown_to_text(markdown_string):
    """Convert a Markdown string to plain text with code stripped out.

    The Markdown is rendered to HTML, every ``<pre>...</pre>`` and
    ``<code>...</code>`` element is replaced by a single space, and the
    remaining text nodes are concatenated.

    Args:
        markdown_string: Markdown source text.

    Returns:
        The plain-text content of the document as a single string.
    """
    html = markdown(markdown_string)
    # re.DOTALL lets `.` cross newlines; without it a multi-line code block
    # is never matched and its contents leak into the extracted text.
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html, flags=re.DOTALL)
    # The closing tag is written without a space ("</code>"); the previous
    # pattern "</code >" could never match, so inline code was never removed.
    html = re.sub(r'<code>(.*?)</code>', ' ', html, flags=re.DOTALL)
    soup = BeautifulSoup(html, "html.parser")
    # find_all is the current spelling; findAll is a deprecated alias.
    text = ''.join(soup.find_all(string=True))
    return text

# Heading markers for levels 1-3, each paired with the metadata key under
# which the matching heading text is stored by the header splitter.
headers_to_split_on = [
    ("#" * depth, f"Header {depth}") for depth in range(1, 4)
]

# Read every cleaned Markdown file from ../data/cleaned into `texts`
# (one stripped string per file) and report how many were loaded.
texts = list()
# os.listdir returns entries in arbitrary order; sorting keeps the order of
# `texts` (and of everything derived from it) the same on every run.
for file_path in sorted(os.listdir("../data/cleaned")):
    if not file_path.endswith(".md"):
        continue

    # Explicit UTF-8: the platform default encoding is not guaranteed to
    # decode non-ASCII characters in the source files.
    with open(os.path.join("../data/cleaned", file_path), "r", encoding="utf-8") as file:
        text = file.read()
        text = text.strip()
        texts.append(text)

print(len(texts))

19


In [8]:
# Split
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

# Two-stage chunking: split each file on its Markdown headers, convert every
# section to plain text, then split the sections further with SemanticChunker.
# Header metadata of each section is passed along to the resulting documents.

# Both splitters are configured identically for every file, so they are built
# once here instead of once per loop iteration (which also re-created the
# embeddings client each time).
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
text_splitter = SemanticChunker(
    OpenAIEmbeddings(), breakpoint_threshold_type="interquartile"
)

document_chunks = list()
for text in texts:
    md_header_splits = markdown_splitter.split_text(text)

    # Parallel lists: list_of_strings[i] is the plain text of section i and
    # metadatas[i] is that section's header metadata.
    list_of_strings = [markdown_to_text(split.page_content) for split in md_header_splits]
    metadatas = [split.metadata for split in md_header_splits]

    document_chunks.extend(text_splitter.create_documents(list_of_strings, metadatas=metadatas))

In [3]:
# Index

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# One-off index build, kept commented out: it embeds `document_chunks` and
# writes the collection to ./slp_vectordb.
# NOTE(review): this cell's execution count (3) is lower than the indexing
# cells above (7, 8), so it was run without them in the recorded session; on a
# machine without ./slp_vectordb the block below has to be re-enabled first.
# vectorstore = Chroma.from_documents(
#     document_chunks, 
#     embedding=OpenAIEmbeddings(), 
#     persist_directory="./slp_vectordb"
# )

# Open the store at ./slp_vectordb, the same directory the commented-out build
# writes to, instead of rebuilding it.
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(), 
    persist_directory="./slp_vectordb"
)
# Retriever with default search settings; the recorded smoke test below
# returned 4 documents for one query.
retriever = vectorstore.as_retriever()

In [4]:
# Smoke test: fetch the documents retrieved for a sample query and show how
# many came back along with their contents.
# `invoke` is the Runnable entry point used everywhere else in this notebook;
# `get_relevant_documents` is deprecated in current LangChain releases.
test_retrieval = retriever.invoke("POS Tagging")
print(len(test_retrieval))
print(test_retrieval)

4
[Document(page_content='Sequence labelling tasks, such as part-of-speech tagging or BIO-based named entity recognition, follow the same basic classification approach. Here, the final output vector corresponding to each input token is passed to a classifier that produces a softmax distribution over the possible set of tags. Again, assuming a simple classifier consisting of a single feedforward layer followed by a softmax, the set of weights to be learned for this additional layer is WK ∈ Rk×dh, where k is the number of possible tags for the task. As with RNNs, a greedy approach, where the argmax tag for each token is taken as a likely answer, can be used to generate the final output tag sequence. Fig. 11.11 illustrates an example of this approach. yi = softmax(WKzi)\n(11.14)\nti = argmaxk(yi)\n(11.15)\nAlternatively, the distribution over labels provided by the softmax for each input token can be passed to a conditional random field (CRF) layer which can take global tag-level transiti

In [22]:
# Load the prompt templates for the main chain from ../prompts.
# The names are bound only by a successful read, so a missing file fails here
# instead of leaving an empty prompt behind.
# Explicit UTF-8 keeps the read independent of the platform default encoding.
with open("../prompts/rag-user-prompt.txt", "r", encoding="utf-8") as file:
    user_prompt = file.read()

with open("../prompts/rag-system-prompt.txt", "r", encoding="utf-8") as file:
    system_prompt = file.read()

In [23]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_openai import ChatOpenAI
from langchain.load import dumps, loads

# Hosted chat model; this is the model used by both chains defined below.
llm = ChatOpenAI(model_name="gpt-4-0125-preview")

# Local Ollama-served alternative. It is constructed here but not referenced by
# any chain in the cells shown.
# NOTE(review): num_ctx / top_p / top_k are presumably the context window size
# and sampling parameters — confirm against the ChatOllama documentation.
local_llm = ChatOllama(
    model="gemma",
    num_ctx=4096,
    top_p=0.9,
    top_k=100
)

def get_unique_union(documents):
    """Flatten a list of document lists and drop duplicates, keeping order.

    Documents are compared by their serialized form (``dumps``), so two
    documents are duplicates when they serialize to the same string. The
    first occurrence of each document is kept, in its original position.

    Args:
        documents: An iterable of iterables of serializable documents.

    Returns:
        A list of deserialized documents (``loads``), each appearing once, in
        first-seen order.
    """
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # dict.fromkeys de-duplicates while preserving insertion order. A set would
    # reorder the documents, and differently on each run because string hashes
    # are randomized per process.
    unique_docs = list(dict.fromkeys(flattened_docs))
    return [loads(doc) for doc in unique_docs]

def prepare_context(docs):
    """De-duplicate retrieved documents and have the LLM tidy them into one context string.

    Each document is rendered as its content followed by its metadata (one
    "key value" pair per line). The renderings are joined with blank lines and
    sent through a cleaning prompt to ``llm``.

    Args:
        docs: Documents exposing ``page_content`` and ``metadata``.

    Returns:
        The cleaning chain's response as a string.
    """
    unique_docs = get_unique_union([docs])

    rendered_docs = []
    for doc in unique_docs:
        reference_lines = "\n".join(f"{k} {v}" for k, v in doc.metadata.items())
        # The four-space indent inside this literal is part of the text sent to the model.
        rendered_docs.append(f"""
    {doc.page_content} 
    
    References from the textbook: 
    {reference_lines}
    """)
    formatted_content = "\n\n".join(rendered_docs)

    # Prompts local to the cleaning step; they are separate from the
    # module-level prompts used by the main chain.
    cleaner_system_prompt = """
    You are a professional english data cleaner. Your role is to read the documents extracted from a book,
    clean it without degrading the quality of the text. 

    You should follow these guidelines: 
    - Pay utmost emphasis on preserving the actual ground truth of the text. It is completely fine to repeat
      the given sentences verbatim if they fit. 
    - The given text is in markdown. Don't omit any necessary tables, inline equations or block equations. 
    - Keep the references format as is. Right after the document's content.  
    - Your response should be formatted as follows:
      ## Context 1
         ...
         References: ...
      ## Context 2
         ...
         References: ...
      ## Context 3
         ...
         References: ...
    """

    cleaner_user_prompt = """
    Clean the following text without compromising on truth: 

    {context}
    """

    cleaning_prompt = ChatPromptTemplate.from_messages([
        ("system", cleaner_system_prompt),
        ("human", cleaner_user_prompt)
    ])

    # The whole chain input (the formatted string) is bound to {context}.
    cleaning_chain = (
        {"context": RunnablePassthrough()}
        | cleaning_prompt
        | llm
        | StrOutputParser()
    )

    return str(cleaning_chain.invoke(formatted_content))

# Prompt for the main chain, built from the system/user prompt texts read from
# ../prompts in the earlier cell.
# NOTE(review): the templates presumably reference {topics} and {context}, the
# two keys supplied below — confirm against the prompt files.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", user_prompt)
])

# Main pipeline: the input string is passed through unchanged as "topics" and
# is also sent to the retriever, whose documents go through prepare_context to
# become "context". prepare_context itself calls `llm`, so one invocation of
# this chain makes two model calls.
main_chain = (
    { "topics": RunnablePassthrough(), "context": retriever | prepare_context } 
    | prompt 
    | llm 
    | StrOutputParser()
)

In [25]:
# Run the main chain for one topic, show the result, and save it as Markdown.
response = str(main_chain.invoke("""
Context-Free Grammars
"""))
print(response)
# Explicit UTF-8: model output may contain non-ASCII characters that the
# platform default encoding cannot write.
with open("../responsebuffers/test.md", "w", encoding="utf-8") as file:
    file.write(response)

## Question
Given the definitions and concepts outlined in the provided textbook content, which of the following statements best illustrates the concept of "structural ambiguity" in the context of natural language processing and context-free grammars (CFGs)?

1. Structural ambiguity occurs when a grammar cannot generate any string of terminal symbols from the start symbol.
2. It happens when a single string of terminal symbols can be derived using different sequences of rule applications, leading to multiple possible parse trees.
3. Structural ambiguity arises solely from the limitations of the CKY algorithm when parsing sentences in natural language.
4. It is the result of having too many non-terminal symbols in a CFG, leading to unclear derivation paths for strings.
5. Structural ambiguity is related to the inability of a context-free grammar to be converted into Chomsky Normal Form (CNF).

## Solution
The correct answer is: 2. It happens when a single string of terminal symbols can 