In [1]:
from datasets import load_from_disk
# Load the generated-questions dataset from its on-disk directory.
dataset = load_from_disk("./src/generated_questions_dataset")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Show the dataset summary (column names and row count).
dataset

Dataset({
    features: ['id', 'created_at', 'updated_at', 'deleted_at', 'knowledge_base_id', 'answer_id', 'data_source', 'question_type', 'question', 'label_method_type', 'section_id', 'hash', 'language'],
    num_rows: 44964
})

In [3]:
# Peek at the question text of the first row only (nothing is printed for an empty dataset).
first_row = next(iter(dataset), None)
if first_row is not None:
    print(first_row['question'])

hvordan håndterer man jord og grundvand


In [4]:
import aiofiles
import aiohttp
from aiohttp import ClientSession
from datasets import Dataset
from typing import IO
import json
import pydantic
import asyncio
import itertools
import more_itertools




class MistypeQuestionsRequest(pydantic.BaseModel):
    """Questions typo request."""

    # Language of the questions; required.
    language: str
    # Question texts to be mistyped; required.
    questions: list[str]
    # Sent to the service as `model_card`; "azure" when the caller omits it.
    model_card: str = pydantic.Field(default="azure")
    # Optional; semantics are defined by the service (presumably a count of questions — confirm).
    num_questions: int | None = pydantic.Field(default=None)

class MistypedQuestion(pydantic.BaseModel):
    """Mistyped question for the mistyping task."""

    # Single required string field; the description is attached as pydantic field metadata.
    mistyped_question: str = pydantic.Field(..., description="A mistyped question.")
class MistypeQuestionsResponse(pydantic.BaseModel):
    """Questions typo response."""

    # Every field is optional and falls back to None when absent from the reply.
    language: str | None = pydantic.Field(default=None)
    # Parsed into MistypedQuestion models when present.
    mistyped_questions: list[MistypedQuestion] | None = pydantic.Field(default=None)
    num_questions: int | None = pydantic.Field(default=None)
    message: str | None = pydantic.Field(default=None)
    # Per-item status code; mistype_questions() treats 429 as "rate limited".
    code: int | None = pydantic.Field(default=None)




In [5]:
# Iterable view of the dataset; later cells stream rows from it with islice/grouper.
itdataset = dataset.to_iterable_dataset()

In [67]:
# Progress counters and resume state that the async helpers update through `global`.

# Rows per request batch; same value as the group size used when batching the dataset.
BATCH_SIZE = 25
# Denominator shown in the progress messages. Derived from the loaded dataset instead of
# a hard-coded row count; floor division counts full batches only.
n = len(dataset) // BATCH_SIZE
requests_sent = 0
requests_received = 0

writes_started = 0
written_batches = 0

# Index of the first batch to process; updated on failure so a retry can resume there.
starting_batch = 0

In [68]:
from itertools import islice
from more_itertools import grouper
async def mistype_questions(
    mistyped_questions_request: list[MistypeQuestionsRequest],
    session: ClientSession,
    retry_delay: float = 10,
    max_retries: int | None = None,
) -> list[MistypeQuestionsResponse]:
    """Send one batch of mistype requests to the local service and parse the reply.

    The whole batch is re-sent for as long as any item of the reply carries
    ``code == 429`` (rate limited), sleeping ``retry_delay`` seconds between
    attempts. Retries run in a loop rather than by recursion, so a long
    rate-limit episode cannot exhaust the recursion limit.

    Args:
        mistyped_questions_request: Requests serialised into the JSON body.
        session: Open aiohttp session used for the POST.
        retry_delay: Seconds to wait before re-sending after a rate-limit reply.
        max_retries: Maximum number of re-sends; ``None`` (default) retries
            indefinitely, as before.

    Returns:
        One ``MistypeQuestionsResponse`` per item of the service's JSON reply.

    Raises:
        aiohttp.ClientResponseError: If ``raise_for_status`` rejects the HTTP status.
        RuntimeError: If the reply is still rate limited after ``max_retries`` re-sends.
    """
    # The payload does not change between attempts, so serialise it once.
    payload = [request.model_dump() for request in mistyped_questions_request]
    retries = 0
    while True:
        # The context manager releases the connection even when the status check raises.
        async with session.post(
            "http://localhost:8070/generate/mistype-questions",
            json=payload,
        ) as resp:
            resp.raise_for_status()
            raw_batch = await resp.json()
        mistyped_questions_batch = [MistypeQuestionsResponse(**question) for question in raw_batch]
        print(raw_batch)
        print(mistyped_questions_batch)
        if not any(question.code == 429 for question in mistyped_questions_batch):
            return mistyped_questions_batch
        if max_retries is not None and retries >= max_retries:
            raise RuntimeError(f"Still rate limited after {retries} retries")
        retries += 1
        print(f"Rate limit hit, waiting for {retry_delay} seconds")
        await asyncio.sleep(retry_delay)
        print("Finished waiting, retrying")


async def write_mistyped_questions(group_of_questions: list[dict[str, str]], outpath: str, session: ClientSession):
    """Mistype one batch of question rows and append them to ``outpath`` as JSON lines.

    Each row is written with its ``"question"`` value replaced by the first
    mistyped variant returned for it. The caller's row dicts are left untouched.

    Args:
        group_of_questions: Rows of one batch; ``None`` entries (padding added
            when the dataset is grouped) are ignored.
        outpath: Path of the file to append to.
        session: Open aiohttp session forwarded to ``mistype_questions``.

    Returns:
        ``None``. Nothing is written when the service reply is empty.

    Raises:
        ValueError: If a reply item carries no mistyped question (for example an
            error reply), instead of failing with an opaque TypeError/IndexError.
    """
    global requests_sent
    global requests_received
    global writes_started
    global written_batches

    # Drop the padding once so that requests, replies and source rows stay aligned.
    source_questions = [question for question in group_of_questions if question is not None]
    mistype_questions_request = [
        MistypeQuestionsRequest(language="da", model_card="azure", questions=[question.get("question")])
        for question in source_questions
    ]
    print(f"Request to be sent: {requests_sent}/{n}")
    requests_sent += 1
    res = await mistype_questions(mistype_questions_request, session=session)
    print(f"Request received: {requests_received}/{n}")
    requests_received += 1
    if not res:
        print("Request - None")
        return None

    questions = []
    for question, mistyped_question in zip(source_questions, res):
        variants = mistyped_question.mistyped_questions
        if not variants:
            raise ValueError(
                f"No mistyped question returned for {question.get('question')!r} "
                f"(code={mistyped_question.code}, message={mistyped_question.message})"
            )
        # Build a new dict rather than mutating the input row; key order is preserved.
        questions.append(json.dumps({**question, "question": variants[0].mistyped_question}))

    print(f"Writing batch: {writes_started}/{n}")
    writes_started += 1
    # The whole batch goes out in a single write call.
    async with aiofiles.open(outpath, mode="a") as a_outfile:
        await a_outfile.write("\n".join(questions) + "\n")
    print(f"Written batch: {written_batches}/{n}")
    written_batches += 1


async def _run_write_tasks(write_tasks: list, first_batch: int, last_batch: int) -> None:
    """Await one group of pending write coroutines and empty the list.

    On failure the global ``starting_batch`` is set to ``first_batch`` so that a
    retry resumes with this group, and the first error is re-raised.
    """
    global starting_batch
    print("running batches from", first_batch, "to", last_batch)
    # return_exceptions=True lets every task of the group settle before an error is
    # reported, so none keeps running against a session that is about to be closed.
    results = await asyncio.gather(*write_tasks, return_exceptions=True)
    write_tasks.clear()
    failures = [result for result in results if isinstance(result, BaseException)]
    if failures:
        print(failures[0])
        print("ERROR, we finished at the batch number", last_batch)
        starting_batch = first_batch
        raise failures[0]


async def mistype_and_write_questions(
    questions_dataset: Dataset,
    outpath: str,
    batch_size: int = 25,
    concurrency: int = 4,
):
    """Mistype every question of ``questions_dataset`` and append the results to ``outpath``.

    Rows are grouped into batches of ``batch_size``; ``concurrency`` batches are
    processed at a time. Batches before the global ``starting_batch`` are skipped,
    which is how a retry resumes after a failure. A final group smaller than
    ``concurrency`` is processed too instead of being dropped un-awaited.

    NOTE: when a group fails, its batches that had already been written are
    appended again on retry, because the output is opened in append mode and the
    retry resumes at the group's first batch.

    Args:
        questions_dataset: Iterable of question rows.
        outpath: Path of the JSON-lines file to append to.
        batch_size: Number of rows per request batch.
        concurrency: Number of batches awaited together.

    Raises:
        ValueError: If ``batch_size`` or ``concurrency`` is smaller than 1.
        Exception: The first error raised while processing a group of batches.
    """
    if batch_size < 1 or concurrency < 1:
        raise ValueError("batch_size and concurrency must both be at least 1")
    async with ClientSession() as session:
        write_tasks = []
        numbered_batches = islice(
            enumerate(grouper(questions_dataset, batch_size)), starting_batch, None
        )
        for i, batch in numbered_batches:
            print(i, "batches of questions gathered")
            write_tasks.append(
                write_mistyped_questions(
                    group_of_questions=batch,
                    outpath=outpath,
                    session=session,
                )
            )
            if len(write_tasks) == concurrency:
                await _run_write_tasks(write_tasks, first_batch=i - concurrency + 1, last_batch=i)
                print("Next batches", i + 1, "to", i + concurrency)
        if write_tasks:
            # Trailing group: the batch count was not a multiple of `concurrency`.
            await _run_write_tasks(write_tasks, first_batch=i - len(write_tasks) + 1, last_batch=i)
# run the above function until it finishes without exceptions

In [69]:
start,end = 0, None
while True:
    try:
        global starting_batch
        print("Starting batch:", starting_batch)
        await mistype_and_write_questions(islice(itdataset,start,end), outpath = "mistype_questions.jsonl")
        break
    except Exception as e:
        print(e)
        print("ERROOOOOR")
        print("We will try again")
        print("Starting batch:", starting_batch)
        print("Requests sent:", requests_sent)
        print("Requests received:", requests_received)
        print("Writes started:", writes_started)
        print("Written batches:", written_batches)
        continue

Starting batch: 0
0 batches of questions gathered
1 batches of questions gathered
2 batches of questions gathered
3 batches of questions gathered
running batches from 0 to 3
Request to be sent: 0/1798
Request to be sent: 1/1798
Request to be sent: 2/1798
Request to be sent: 3/1798
[{'language': 'Danish', 'mistyped_questions': [{'mistyped_question': 'hvordan håndltere man jord og grundnand'}], 'num_questions': 1, 'message': None, 'code': None}, {'language': 'Danish', 'mistyped_questions': [{'mistyped_question': 'hvad er en byo mmdannelse'}], 'num_questions': 1, 'message': None, 'code': None}, {'language': 'Danish', 'mistyped_questions': [{'mistyped_question': 'hvad er den økonomoske krisre'}], 'num_questions': 1, 'message': None, 'code': None}, {'language': 'Danish', 'mistyped_questions': [{'mistyped_question': 'huor mange nye tilfacede af nyrekraeft'}], 'num_questions': 1, 'message': None, 'code': None}, {'language': 'Danish', 'mistyped_questions': [{'mistyped_question': 'hvad er jeres

In [None]:
# Debug aid: walk the selected rows in groups of three, printing each group and its size.
start, end = 0, None
for el in more_itertools.grouper(islice(itdataset, start, end), 3):
    print(el, len(el), sep="\n")

In [29]:
# Sample reply items (same shape as the service output printed by mistype_questions),
# kept as (mistyped_question, code) pairs because every item shares the other fields.
# This replaces one very long literal that had been wrapped in the middle of a string.
_captured_items = [
    ('hvordan håndeter man jord og grundvand', 408),
    ('hvad sr en byomdannelse', None),
    ('hvad er den økobomiske kriise', None),
    ('hvor har mange nye tilf%C3%A6lde av nyrekr%C3%A6ft', None),
    ('hvad er jeres vedt?gter for en byggeforening', None),
    ('hvad et reglerne for at undgå støv', None),
    ('hvoran implementerer megh masterplanen pæ ældreområdet', None),
    ('jeg har ikke fqt et erklæring med til min grundovsceremoni', None),
    ('hvrodan dimensiones en faskine', None),
    ('hvad gøe jer hvad min støtte ophører', None),
    ('jeg har fået ny ovrenskomst', None),
    ('hvrdan samarger i om kriminolitetsforebngelse', None),
    ('hvad er hjernens hjertet', None),
    ('er der en risiko for vardifuldt grundvand', None),
    ('hvilke arbejdmijøgrupper er med i aftalen', None),
    ('hvad er en frivilliglounge', None),
    ('hvad beuder korttidesvanndkvalitetskriteriet', None),
    ('røntgen af tæender', None),
    ('hvikle aktivtiteter er der i legen', None),
    ('hovdan får jeg en el-ladestanderr', None),
    ('hvor kan jeg see natur guide sam', None),
    ('revisors opgaver', None),
    ('hvad er roglerne for låån til beboerindsekud', None),
    ('hvad e jeg dansk ejendoms indhojd', None),
    ('vade er en væsentlig afvejelse mellem det korrigerede budget o regnskabet', None),
]
l = [
    {'language': 'Danish', 'mistyped_questions': [{'mistyped_question': text}], 'num_questions': 1, 'message': None, 'code': code}
    for text, code in _captured_items
]
# Check that the raw dicts validate against the response model.
o = [MistypeQuestionsResponse(**question) for question in l]
o[0].mistyped_questions[0].mistyped_question

MistypeQuestionsResponse(language='Danish', mistyped_questions=[MistypedQuestion(mistyped_question='hvordan håndeter man jord og grundvand')], num_questions=1, message=None, code=408)

In [None]:
# test mistype_and_write_questions
outfile_path = "mistyped_questions.json"
with open(outfile_path, "w") as outfile:
    pass
asyncio.run(mistype_and_write_questions(dataset[:1], outfile))



In [None]:
# test mistype_questions
mistype_questions_request1 = MistypeQuestionsRequest(
    language="da", model_card = "azure", questions=["Hvad er dit navn?"]
)
mistype_questions_request2 = MistypeQuestionsRequest(
    language="da", model_card = "azure", questions=["Hvad er din alder?"]
)
async with ClientSession() as session:
    res = await mistype_questions([mistype_questions_request1, mistype_questions_request2], session)
    print(res)

In [None]:
# test write_mistyped_questions
async with ClientSession() as session:
    question = dataset[0]
    with open("mistyped_questions.json", "w") as outfile:
        await write_mistyped_questions(question, outfile, session)

In [None]:
with open("./src/test.jsonl") as outfile:
    asyncio.run(mistype_and_write_questions(questions_dataset=dataset, write_file=outfile))

In [None]:
%%bash
curl -N --location 'http://localhost:8070/generate/mistype-questions' --header 'Content-Type: application/json' --data '[{
  "questions": [
    "What are the basic principles of photosynthesis?",
    "How does blockchain technology work, and what are its potential applications?",
    "What were the main causes of World War II?",
    "Where is the Great Barrier Reef located, and why is it important?",
    "What are the major themes in Shakespeares play Hamlet?",
    "How do you calculate the area of a triangle given its base and height?",
    "What are the benefits of regular exercise for cardiovascular health?",
    "What factors contribute to inflation, and how does it affect the economy?",
    "What is the role of the United Nations in promoting international peace and security?",
    "How does traditional Japanese tea ceremony reflect Japanese cultural values?"
],
  "model_card":"azure",
  "language":"en"
}]'

In [None]:
session = ClientSession()
resp = await session.post(
    "http://localhost:8070/generate/mistype-questions",
    json=[{
        "questions": [
            "What are the basic principles of photosynthesis?",
            "How does blockchain technology work, and what are its potential applications?",
            "What were the main causes of World War II?",
            "Where is the Great Barrier Reef located, and why is it important?",
            "What are the major themes in Shakespeares play Hamlet?",
            "How do you calculate the area of a triangle given its base and height?",
            "What are the benefits of regular exercise for cardiovascular health?",
            "What factors contribute to inflation, and how does it affect the economy?",
            "What is the role of the United Nations in promoting international peace and security?",
            "How does traditional Japanese tea ceremony reflect Japanese cultural values?"
        ],
        "model_card": "azure",
        "language": "en"
    }]
)

In [None]:
await resp.text()