In [None]:
import google.generativeai as genai
import os
import typing_extensions as typing
import json
from datasets import load_dataset

In [23]:
# Request the "train" split directly so load_dataset returns the Dataset
# itself rather than a DatasetDict that then has to be indexed.
dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")

In [24]:
# Shape of one generated item; generate_questions_answers() passes
# list[QAPairs] to Gemini as the response schema.
# NOTE(review): the captured response output in this notebook shows items with
# "question" and "wrong_answers" but no "answer" key, so the fields may not be
# enforced as required by the API. Confirm, and validate downstream if needed.
class QAPairs(typing.TypedDict):
    question : str  # the question text
    answer: str  # the correct answer to the question
    wrong_answers : list[str]  # incorrect alternatives for the same question

In [None]:
# Read the API key from the environment instead of embedding it in the
# notebook, so the secret is never saved or shared with the file.
# A missing GOOGLE_API_KEY fails here with KeyError rather than later at the
# first API call.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Shared model handle used by generate_questions_answers().
model = genai.GenerativeModel('gemini-1.5-flash')

# Prompt prefix for generate_questions_answers(), which appends the article
# chunk after the trailing "Text:" marker (few_shot + text_chunk).
# NOTE(review): the worked example is laid out as "Question N:/Answer N:"
# plain text, while the request asks for JSON output against list[QAPairs].
# Confirm the mismatch does not hurt output quality.
few_shot = """
Base on the text, generate as many question answer and wrong answers pairs as possible but in Khmer

Example:

Text:

The tomb of Philippe Pot is a life-sized funerary monument, now on display in the Louvre, Paris. It was commissioned by the military leader and diplomat Philippe Pot around the year 1480, to be used for his burial at the chapel of Saint-Jean-Baptiste in Cîteaux Abbey, Dijon, France. His effigy shows him recumbent on a slab, his hands raised in prayer, and wearing armour and a heraldic tunic. The eight mourners (pleurants) are dressed in black hoods, and act as pallbearers carrying him towards his grave. Pot commissioned the tomb when he was around 52 years old, 13 years before his death in 1493. The detailed inscriptions written on the sides of the slab emphasise his achievements and social standing.

Data generated:

Question 1: អ្នកណាជាអ្នកគាំទ្រនៃសិលាចារឹក Philippe Pot?
Answer 1: ក្រុមនិយាយទោស ៨ នាក់ដែលស្លៀកពាក់ខោមួកពណ៌ខ្មៅ
Wrong answer 1.1: ឈ្មោះចម្លាក់របស់ Louvre
Wrong answer 1.2: ព្រះសង្ឃនៅ Abbey Citeaux

Question 2: តើអ្នកចាំកំព្រាអ្នកស្លាប់នៅពេលណានៅក្នុងការគាំទ្រដល់ Philippe Pot?
Answer 2: ក្នុងឆ្នាំ ១៤៨០
Wrong answer 2.1: ក្នុងឆ្នាំ ១៤៩៣
Wrong answer 2.2: ក្នុងឆ្នាំ ១៤៥០

---------------------------------------------------------------------------------------------------------------------------

Text:



"""

In [None]:
def generate_questions_answers(text_chunk):
    """Ask Gemini for Khmer question/answer items about ``text_chunk``.

    The chunk is appended to the module-level ``few_shot`` prompt and sent
    through ``model.generate_content`` with a JSON response MIME type and
    ``list[QAPairs]`` as the response schema.

    Args:
        text_chunk: Source text to generate questions from.

    Returns:
        The decoded JSON payload, or ``None`` when the chunk is empty or
        whitespace-only, when ``response.text`` is unavailable, or when the
        payload is not valid JSON.
    """
    # Nothing to ask about: skip the API call rather than sending the bare
    # few-shot prompt with no article text behind it.
    if not text_chunk or not text_chunk.strip():
        return None

    response = model.generate_content(
        few_shot + text_chunk,
        generation_config=genai.GenerationConfig(
            response_mime_type="application/json", response_schema=list[QAPairs]
        ),
    )

    try:
        # ``response.text`` raises ValueError when no valid part came back
        # (e.g. finish_reason 4); json.JSONDecodeError is a ValueError
        # subclass, so a single handler covers both failure modes.
        return json.loads(response.text)
    except ValueError:
        return None
    
    

In [None]:

def process_text(text, i, chunk_size=4000):
    """Generate Q&A items for the start of ``text`` and save them as JSON.

    Only the first ``chunk_size`` characters of ``text`` are sent to
    ``generate_questions_answers``. When that returns ``None`` nothing is
    written; otherwise the result is dumped to ``responses/<i>.json``.

    Args:
        text: Full source text.
        i: Identifier used as the output file's base name.
        chunk_size: Maximum number of leading characters of ``text`` to use.
    """
    text_chunk = text[:chunk_size]
    response = generate_questions_answers(text_chunk)

    if response is None:
        return

    # open() does not create missing parent directories, so make sure the
    # output folder exists before writing.
    os.makedirs('responses', exist_ok=True)

    # json.dump keeps its default ensure_ascii=True, so non-ASCII text is
    # stored as \uXXXX escapes (still valid JSON).
    with open('responses/' + str(i) + '.json', 'w') as f:
        json.dump(response, f, indent=2)
    
    

In [None]:
# Walk the dataset in order; the enumeration index names each output file.
for i, data in enumerate(dataset):
    process_text(text=data['text'], i=i)

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "[{\"question\": \"\u178f\u17be\u1796\u17b6\u1780\u17d2\u1799\u17a2\u1793\u17b6\u1792\u17b7\u1794\u178f\u17c1\u1799\u17d2\u1799\u1798\u1780\u1796\u17b8\u178e\u17b6?\", \"wrong_answers\": [\"\u1796\u17b6\u1780\u17d2\u1799\u1793\u17c1\u17c7\u1798\u1780\u1796\u17b8\u17a1\u17b6\u178f\u17b6\u17c6\u1784\", \"\u1796\u17b6\u1780\u17d2\u1799\u1793\u17c1\u17c7\u1798\u1780\u1796\u17b8\u1794\u17b6\u179a\u17b6\u17c6\u1784\", \"\u1796\u17b6\u1780\u17d2\u1799\u1793\u17c1\u17c7\u1798\u1780\u1796\u17b8\u17a2\u17b6\u179a\u17c9\u17b6\u1794\u17cb \"  ]}, {\"question\": \"\u178f\u17be\u17a2\u1793\u17b6\u1792\u17b7\u1794\u178f\u17c1\u1799\u17d2\u1799\u1782\u17ba\u1787\u17b6\u17a2\u17d2\u179c\u17b8?\", \"wrong_answers\": [\"\u179c\u17b6\u1782\u17ba\u1787\u17b6\u1782\u17c6\u1793\u17

ValueError: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.