In [38]:
import requests
import pandas as pd
import html
import random
import time
from requests.exceptions import RequestException

question_counter = 1  # Next question_id to hand out; shared by every fetch_questions call


def fetch_questions(api_url: str, category_id: int, master_df: pd.DataFrame = None, max_retries: int = 5) -> pd.DataFrame:
    """Fetch one batch of trivia questions and merge it into ``master_df``.

    The endpoint is expected to answer with a JSON object holding a
    ``response_code`` (0 means success) and a ``results`` list whose items
    carry ``question``, ``correct_answer``, ``incorrect_answers`` (exactly
    three) and ``difficulty``.

    Args:
        api_url: Full URL to request.
        category_id: Value written to the ``category_id`` column of every
            row built from this batch.
        master_df: Previously collected questions, or ``None`` to start fresh.
        max_retries: Maximum number of attempts when the server replies with
            HTTP 429; the wait starts at 5 seconds and doubles each time.

    Returns:
        A DataFrame with the columns ``question_id``, ``question_text``,
        ``correct_answer``, ``option1``..``option3``, ``category_id`` and
        ``difficulty``, de-duplicated on ``question_text`` (first occurrence
        wins). On any request/parse failure ``master_df`` is returned
        unchanged (or an empty DataFrame when ``master_df`` is ``None``).

    Side effects:
        Advances the module-level ``question_counter`` and prints progress /
        error messages.
    """
    global question_counter

    # Every failure path hands this back so callers can keep chaining calls.
    fallback = master_df if master_df is not None else pd.DataFrame()

    wait_time = 5  # seconds; doubled after each 429
    data = None
    for _ in range(max_retries):
        try:
            response = requests.get(api_url, timeout=10)
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.HTTPError as http_err:
            # Read the status from the exception itself rather than relying on
            # the `response` local still being bound.
            status_code = getattr(http_err.response, 'status_code', None)
            if status_code == 429:
                print(f"[WARNING] Rate limit hit (429). Waiting {wait_time} seconds before retrying...")
                time.sleep(wait_time)
                wait_time *= 2  # exponential backoff
                continue
            print(f"[ERROR] HTTP error occurred: {http_err}")
            return fallback
        except RequestException as req_err:
            print(f"[ERROR] Request failed: {req_err}")
            return fallback
        except ValueError:
            print("[ERROR] Failed to parse JSON.")
            return fallback
        break  # Success, stop retrying
    else:
        print("[ERROR] Max retries exceeded.")
        return fallback

    if not isinstance(data, dict):
        print("[ERROR] Unexpected JSON payload (expected an object).")
        return fallback
    if data.get('response_code') != 0:
        print(f"[WARNING] API returned response_code {data.get('response_code')}. Skipping.")
        return fallback

    # If the defining cell was re-run, the counter is back at 1 while master_df
    # may still hold older ids; continue after the highest id already in use so
    # new rows never collide with existing ones.
    if master_df is not None and 'question_id' in master_df.columns:
        highest_id = pd.to_numeric(master_df['question_id'], errors='coerce').max()
        if pd.notna(highest_id):
            question_counter = max(question_counter, int(highest_id) + 1)

    rows = []
    # `or []` also covers a payload where "results" is present but null.
    for i, item in enumerate(data.get('results') or []):
        try:
            question_text = html.unescape(item['question']).strip()
            correct_answer = html.unescape(item['correct_answer']).strip()
            incorrect_answers = [html.unescape(ans).strip() for ans in item['incorrect_answers']]
            difficulty = item['difficulty']
        except (KeyError, TypeError, AttributeError) as e:
            # Malformed item (missing key / wrong type): skip it, keep the batch.
            print(f"[ERROR] Failed to process question at index {i}: {e}")
            continue

        if not question_text or not correct_answer or len(incorrect_answers) != 3:
            print(f"[SKIP] Invalid question at index {i}")
            continue

        # The row schema has only three option columns. Store the correct
        # answer plus two randomly chosen distractors in random order; the
        # previous 4-item list was truncated to three, which silently dropped
        # the correct answer whenever it landed in the last slot.
        options = random.sample(incorrect_answers, 2) + [correct_answer]
        random.shuffle(options)

        rows.append({
            'question_id': question_counter,
            'question_text': question_text,
            'correct_answer': correct_answer,
            'option1': options[0],
            'option2': options[1],
            'option3': options[2],
            'category_id': category_id,
            'difficulty': difficulty
        })
        question_counter += 1

    new_df = pd.DataFrame(rows)

    # Leave empty frames out of the concat: pandas deprecates letting them take
    # part in dtype resolution, and two column-less frames would make the
    # de-duplication below fail with a KeyError.
    frames = [frame for frame in (master_df, new_df) if frame is not None and not frame.empty]
    if not frames:
        return master_df if master_df is not None else new_df

    combined_df = pd.concat(frames, ignore_index=True)
    return combined_df.drop_duplicates(subset=['question_text'], ignore_index=True)


In [39]:
# Start with an empty master table that already carries the question schema,
# so the first fetched batch can be merged into it like any later one.
master_questions_df = pd.DataFrame(
    columns=[
        'question_id',
        'question_text',
        'correct_answer',
        'option1',
        'option2',
        'option3',
        'category_id',
        'difficulty',
    ]
)




In [62]:
# Pull several batches from the same endpoint and fold each one into the
# master table (fetch_questions de-duplicates on question_text).
# NOTE(review): the URL requests category=11 while rows are tagged
# category_id=4 -- presumably an internal category mapping; confirm.
url = "https://opentdb.com/api.php?amount=45&category=11&type=multiple"

N_BATCHES = 3
REQUEST_INTERVAL_S = 5  # OpenTDB rate-limits each IP to one request per 5 seconds

for batch_idx in range(N_BATCHES):
    if batch_idx:
        # Pace the requests up front instead of provoking a 429 and relying on
        # the retry/backoff inside fetch_questions.
        time.sleep(REQUEST_INTERVAL_S)
    master_questions_df = fetch_questions(url, category_id=4, master_df=master_questions_df)

master_questions_df.shape



(1259, 8)

In [63]:
# Persist the collected questions as UTF-8 CSV, leaving out the DataFrame index.
master_questions_df.to_csv(
    path_or_buf='questions.csv',
    encoding='utf-8',
    index=False,
)
