In [1]:
USER_PROMPT = """
The dialogue above is from **ELIZA**, an early natural language processing system
ELIZA
that could carry on a limited conversation with a user by imitating the responses of a Rogerian psychotherapist (Weizenbaum, 1966). ELIZA is a surprisingly simple program that uses pattern matching to recognize phrases like "I need X" and translate them into suitable outputs like "What would it mean to you if you got X?". This simple technique succeeds in this domain because ELIZA doesn't actually need to know anything to mimic a Rogerian psychotherapist. As Weizenbaum notes, this is one of the few dialogue genres where listeners can act as if they know nothing of the world. ELIZA's mimicry of human conversation was remarkably successful: many people who interacted with ELIZA came to believe that it really *understood* them and their problems, many continued to believe in ELIZA's abilities even after the program's operation was explained to them (Weizenbaum, 1976), and even today
such **chatbots** are a fun diversion.
chatbots
Of course modern conversational agents are much more than a diversion; they can answer questions, book flights, or find restaurants, functions for which they rely on a much more sophisticated understanding of the user's intent, as we will see in Chapter 15. Nonetheless, the simple pattern-based methods that powered ELIZA and other chatbots play a crucial role in natural language processing.
"""

In [8]:
import re
from bs4 import BeautifulSoup
from markdown import markdown

def markdown_to_text(markdown_string):
    """ Converts a markdown string to plaintext """

    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # remove code snippets
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html)
    html = re.sub(r'<code>(.*?)</code >', ' ', html)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    text = ''.join(soup.findAll(text=True))

    return text

In [15]:
from rich import print
print(markdown_to_text(USER_PROMPT).replace("\n", " "))

  text = ''.join(soup.findAll(text=True))


In [10]:
from transformers import AutoTokenizer
from gector import GECToR, predict, load_verb_dict

model_id = 'gotutiyan/gector-roberta-base-5k'
model = GECToR.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
encode, decode = load_verb_dict('data/verb-form-vocab.txt')
srcs = [
    USER_PROMPT
]d
corrected = predict(
    model, tokenizer, srcs,
    encode, decode,
    keep_confidence=0.0,
    min_error_prob=0.0,
    n_iteration=5,
    batch_size=2,
)
print(corrected)

ModuleNotFoundError: No module named 'gector'

In [20]:
from rich import print
from langchain.docstore.document import Document
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import MarkdownHeaderTextSplitter

from langchain import hub
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from dotenv import load_dotenv

from bs4 import BeautifulSoup
from markdown import markdown
import re

def markdown_to_text(markdown_string):
    html = markdown(markdown_string)
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html)
    html = re.sub(r'<code>(.*?)</code >', ' ', html)
    soup = BeautifulSoup(html, "html.parser")
    text = ''.join(soup.findAll(string=True))
    return text

load_dotenv("../.env")

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

text = str()
with open("../data/cleaned/2.md", "r") as file:
    text = file.read()
    text = text.strip()
    paragraphs = text.split("\n\n")
    paragraphs = [markdown_to_text(para).replace("\n", " ") for para in paragraphs]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(text)

text_splitter = SemanticChunker(
    OpenAIEmbeddings(), breakpoint_threshold_type="interquartile"
)

list_of_strings = [markdown_to_text(split.page_content) for split in md_header_splits]
metadatas = [split.metadata for split in md_header_splits]

docs = text_splitter.create_documents(list_of_strings, metadatas=metadatas)
print(docs)

In [38]:
vectorstore = Chroma.from_documents(documents=docs,
                                    embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

user_prompt = """
Create a multiple choice question (MCQ) and solution that covers the following topic: {topic}

Only make use of the textbook content below: {context} 
"""

system_prompt = """
You are a lecturer for an advanced undergraduate natural language processing course.
Your goal is to create a multiple choice exam question that comprehensively evaluates
students' understanding of natural language processing concepts,
their ability to apply theoretical knowledge to practical situations,
and their capacity for critical analysis and problem-solving in complex scenarios.

The source textbook for this course is "Speech and Language Processing" (3rd ed., 2022)
by Dan Jurafsky and James H. Martin. The questions should be constructed from the content 
retrieved from the textbook. 

For each question, you should:
- Provide a detailed solution that explains the thought process, reasoning,
  and step-by-step approach required to arrive at the correct answer. 
  
- The solution should demonstrate a deep understanding of the underlying
  concepts and their practical applications. The solution must be deduced from the knowledge 
  obtained from the textbook and you should explain how. 

The question itself should meet the following criteria:
- Be a multiple choice question (MCQ) with 5 choices in markdown format:
  1. Choice 1
  2. Choice 2
  3. Choice 3
  4. Choice 4
  5. Choice 5

- Should utilize the content given to you, which includes relevant textbook material. 

- A student reading the textbook should be able to figure out the answer for the question. Don't go 
  beyond the materials of the textbook at all. 

- Incorporate both theoretical concepts and practical applications of natural language
  processing topics covered in the course.

- Be of a high difficulty level, challenging students to apply their knowledge in novel
  and complex scenarios, rather than relying on rote memorization or simple recall.

- Require a unique synthesis of ideas from multiple topics, concepts, and sources,
  going beyond questions commonly found in standard textbooks. 

- Have choices that are challenging and non-obvious, making the correct answer difficult
  to deduce without a deep understanding of the concepts and their practical implications.

- Your output should only be in markdown format, with the following headers:
  ## Question
  ## Solution
  ## Reasoning

- Inline equations should use the markdown format: $a = b + c$
- Block equations should use the markdown format: $$a = b + c$$
"""

llm = ChatOpenAI(model_name="gpt-4-0125-preview")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", user_prompt)
])

chain = (
    { "topic": RunnablePassthrough(), "context": retriever | format_docs } | prompt | llm | StrOutputParser()
)

In [39]:
response = str(chain.invoke("Text Normalization for Lemmatization"))
with open("../responsebuffers/test.md", "w") as file:
    file.write(response)

In [14]:
import os 
from itertools import accumulate

DIRECTORY = "../responsebuffers/zeroshot-vanilla-gpt4"

testfilter = lambda x: x != "test"
directories = list(filter(testfilter, os.listdir(DIRECTORY)))

for directory in directories:
    print(f"{directory} - {len(os.listdir(os.path.join(DIRECTORY, directory)))}")

07 - 10
10 - 10
16 - 10
06 - 10
17 - 10
02 - 10
05 - 10
18 - 10
19 - 9
04 - 10
09 - 20
21 - 10
03 - 10
08 - 11
20 - 10
15 - 10
01 - 10
