In [None]:
!pip install datasets
!pip install openai
!pip install wikipedia
!pip install wikipedia-api
!pip install feedparser



In [None]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import re
import time
import pandas as pd
import numpy as np
from datasets import load_dataset
import feedparser
import datetime
import time
#from data.prompts import prompts
import requests
import wikipediaapi
import re


from openai import OpenAI
# Shared OpenAI client used by the later cells that call the chat-completions API.
# NOTE(review): no credentials are passed here, so they presumably come from the
# environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

In [None]:
import os
from data.prompts import prompts
# Directory handed to Hugging Face `datasets` as its cache location.
# It is an empty string here; NOTE(review): presumably a real path is meant to
# be filled in before running — confirm.
cache_dir = ""
os.environ.update(HF_DATASETS_CACHE=cache_dir)

# Load the 2023-11-01 English Wikipedia dump, passing the same cache directory
# explicitly to the loader.
wiki = load_dataset(
    path="wikimedia/wikipedia",
    name="20231101.en",
    cache_dir=cache_dir,
)

In [None]:
# User-agent string passed to the Wikipedia API client.
_WIKI_USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/121.0.0.0 Safari/537.36'
)

# Module-level client used by `get_articles_on_topic` to look pages up.
wiki_api = wikipediaapi.Wikipedia(_WIKI_USER_AGENT)

def get_articles_on_topic(topic):
    """Return the titles of all pages linked from the Wikipedia page `topic`.

    The page is looked up through the module-level `wiki_api` client.

    Args:
        topic: Title of the Wikipedia page whose outgoing links are wanted.

    Returns:
        A list with the keys of the page's `links` mapping, or an empty list
        when the page does not exist.
    """
    topic_page = wiki_api.page(topic)
    if topic_page.exists():
        # The keys of `links` are the linked page titles.
        return list(topic_page.links.keys())
    return []

In [None]:
# Lower-cased titles of every page linked from the 'History' article, skipping
# titles that contain ':' (presumably namespaced pages such as 'Category:...'
# — confirm).
historical_article_names = list(
    map(
        str.lower,
        filter(lambda title: ":" not in title, get_articles_on_topic('History')),
    )
)

In [None]:
# Keep only the dump articles whose lower-cased title is one of the pages linked
# from 'History'. The titles are put in a set first so that each of the
# millions of per-row membership tests is a hash lookup rather than a linear
# scan over the list.
historical_title_set = set(historical_article_names)
filtered = wiki['train'].filter(lambda x: x['title'].lower() in historical_title_set)

Filter:   0%|          | 0/6407814 [00:00<?, ? examples/s]

In [None]:
# Draw a reproducible random sample of up to 2000 distinct articles.
# - A seeded Generator makes Restart & Run All pick the same rows every time.
# - replace=False guarantees no article is sampled twice (duplicates would
#   otherwise produce duplicate QA pairs downstream).
# - The population size comes from the dataset itself, not a hard-coded count.
n_articles = len(wiki['train'])
sample_rng = np.random.default_rng(0)
subset = wiki['train'].select(
    sample_rng.choice(n_articles, size=min(2000, n_articles), replace=False)
)

In [None]:
# Materialize the sampled articles as a pandas DataFrame (one row per article).
# `to_pandas()` converts the underlying table directly instead of iterating the
# dataset example by example as `pd.DataFrame(subset)` does.
wiki_df = subset.to_pandas()

In [None]:
# Show the 'years' question-generator prompt that is sent as the system message
# in the chat-completion calls below.
print(prompts.QUESTION_GENERATOR['years'])


        Given the following document, please generate a question and answer based on the document.
    
        The question MUST contain all information and context necessary to answer without the document.
    
        In your output, include the phrase from the document that contains the answer to the question as 'context'.
        This phrase MUST be copied verbatim, word for word, from the document. 
        You must produce the context phrase exactly from the text, with no modifications or truncations.
        This phrase should be short (one sentence).
    
        You must obey the following criteria:
        - The question MUST ask the year of the occurrence of an event (such as a birthday, war, founding, etc.). 
          Do not produce a question that does not ask for the year.
        - The year MUST adhere to YYYY format.
        - The question MUST be detailed and be based explicitly on information in the document.
        - The answer should only be mentioned sparingly 

In [None]:
# Use the first sampled article as a single worked example for the prompt:
# `row` is that article's record and `context` is its full text.
row = wiki_df.iloc[0]
context = row.loc['text']

In [None]:
# One-off request: ask the model for a year question/answer pair about
# `context`. The generator prompt is the system message and the document,
# wrapped in begin/end markers, is the user message. temperature=0 and seed=0
# are set on the request; the reply is requested in JSON-object format.
response = client.chat.completions.create(
    model="gpt-4o",
    temperature=0,
    seed=0,
    response_format=dict(type='json_object'),
    messages=[
        dict(role="system", content=prompts.QUESTION_GENERATOR['years']),
        dict(role="user", content=f"<Begin Document>\n{context}\n<End Document>"),
    ],
)

In [None]:
# Decode the JSON text of the first choice and display the parsed result.
raw_reply = response.choices[0].message.content
qa_dict = json.loads(raw_reply)
qa_dict

{'Question': 'In which year did Subhash Sureshchandra Deshmukh take oath as a Maharashtra Cabinet Minister?',
 'Answer': '2016',
 'Context': 'He took oath as Maharashtra Cabinet Minister on 7 July 2016.'}

In [None]:
# Collect up to `target_pairs` validated year QA pairs from the sampled articles
# and write them to a JSON file.
target_pairs = 600
qa_pairs = []
num_attempts = 0

# Output file; the name encodes the target number of pairs.
save_path = f"data/dataset/year_qa_pairs_{target_pairs}.json"
# Create the output directory before the loop so that a missing folder cannot
# throw away the results of the whole collection run at the final write.
os.makedirs(os.path.dirname(save_path), exist_ok=True)


def _clean_year_qa(candidate):
    """Validate one parsed model reply and normalize its answer.

    Args:
        candidate: The object decoded from the model's JSON reply.

    Returns:
        A copy of `candidate` whose 'Answer' is a stripped 4-digit string, or
        None when the reply is not a JSON object, its 'Question' is missing or
        "None", or its 'Answer' is not exactly four digits.
    """
    if not isinstance(candidate, dict):
        return None

    # str() keeps a JSON null or number from crashing on `.strip()`; a missing
    # or null question becomes "None" and is rejected like an explicit "None".
    question = str(candidate.get("Question", "None")).strip()
    if question.lower() == "none":
        return None

    # The model may return the year as a JSON number (2016) rather than a
    # string ("2016"); both are accepted and stored as a string.
    answer = str(candidate.get("Answer", "")).strip()
    if not re.fullmatch(r"\d{4}", answer):
        return None

    return {**candidate, "Answer": answer}


# Loop over rows from wiki_df until we have `target_pairs` valid pairs.
for idx, row in wiki_df.iterrows():
    if len(qa_pairs) >= target_pairs:
        break
    num_attempts += 1
    context = row['text']
    document_str = f"<Begin Document>\n{context}\n<End Document>"

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": prompts.QUESTION_GENERATOR['years']},
                {"role": "user", "content": document_str}
            ],
            temperature=0,
            seed=0,
        )
    except Exception as e:
        print(f"Error at index {idx}: {e}")
        response = None

    # Pause after every request, including failed ones, so errors and rejected
    # replies do not lead to back-to-back API calls.
    time.sleep(0.5)
    if response is None:
        continue

    try:
        qa_dict = json.loads(response.choices[0].message.content)
    except Exception as e:
        print(f"JSON parse error at index {idx}: {e}")
        continue

    cleaned_pair = _clean_year_qa(qa_dict)
    if cleaned_pair is None:
        continue

    qa_pairs.append(cleaned_pair)
    print(f"Collected pair {len(qa_pairs)} (attempt {num_attempts})")

print(f"Collected {len(qa_pairs)} QA pairs after {num_attempts} attempts.")

# Save the collected QA pairs to the JSON file.
with open(save_path, "w") as f:
    json.dump(qa_pairs, f, indent=2)

Collected pair 1 (attempt 1)
Collected pair 2 (attempt 3)
Collected pair 3 (attempt 4)
Collected pair 4 (attempt 5)
Collected pair 5 (attempt 6)
Collected pair 6 (attempt 8)
Collected pair 7 (attempt 9)
Collected pair 8 (attempt 10)
Collected pair 9 (attempt 14)
Collected pair 10 (attempt 15)
Collected pair 11 (attempt 16)
Collected pair 12 (attempt 19)
Collected pair 13 (attempt 20)
Collected pair 14 (attempt 21)
Collected pair 15 (attempt 22)
Collected pair 16 (attempt 24)
Collected pair 17 (attempt 25)
Collected pair 18 (attempt 26)
Collected pair 19 (attempt 27)
Collected pair 20 (attempt 28)
Collected pair 21 (attempt 30)
Collected pair 22 (attempt 31)
Collected pair 23 (attempt 32)
Collected pair 24 (attempt 34)
Collected pair 25 (attempt 35)
Collected pair 26 (attempt 36)
Collected pair 27 (attempt 37)
Collected pair 28 (attempt 38)
Collected pair 29 (attempt 39)
Collected pair 30 (attempt 40)
Collected pair 31 (attempt 41)
Collected pair 32 (attempt 42)
Collected pair 33 (attem