In [None]:
import json
import os
import random
import re
import shutil
import time

import ollama
from google.auth.exceptions import DefaultCredentialsError
from google.cloud import storage
from tqdm.notebook import tqdm

In [None]:
# Download the Wikipedia page dumps from the Google Cloud Storage bucket to local files
def download_from_gcs(bucket_name, blob_name, destination_file_name):
    """Download one object from a Google Cloud Storage bucket to a local file.

    Args:
        bucket_name: Name of the bucket to read from.
        blob_name: Path of the object (blob) inside the bucket.
        destination_file_name: Local path the object is written to. Missing
            parent directories are created before the download starts.

    Raises:
        DefaultCredentialsError: Re-raised, after printing a hint, when no
            Google Cloud credentials can be found.
    """
    # The download writes straight to this path, so the parent folder
    # (e.g. ../data on a fresh checkout) has to exist beforehand.
    parent_dir = os.path.dirname(destination_file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    try:
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(blob_name)
        blob.download_to_filename(destination_file_name)
        print(f"Downloaded {blob_name} to {destination_file_name}")
    except DefaultCredentialsError:
        print("Google Cloud credentials not found. Please set GOOGLE_APPLICATION_CREDENTIALS.")
        # Bare raise keeps the original traceback.
        raise

BUCKET_NAME = "group-1-landing-lets-talk"

# One bucket object per category, each saved next to the others under ../data.
for blob_name, destination in (
    ("wikipedia/film_pages.json", "../data/wikipedia_film_pages.json"),
    ("wikipedia/sport_pages.json", "../data/wikipedia_sport_pages.json"),
    ("wikipedia/tech_pages.json", "../data/wikipedia_tech_pages.json"),
):
    download_from_gcs(BUCKET_NAME, blob_name, destination)

Downloaded wikipedia/film_pages.json to ../data/wikipedia_film_pages.json
Downloaded wikipedia/sport_pages.json to ../data/wikipedia_sport_pages.json
Downloaded wikipedia/tech_pages.json to ../data/wikipedia_tech_pages.json


In [5]:
# Load the Wikipedia data files
def load_json_data(filename):
    """Read the UTF-8 encoded JSON file at ``filename`` and return the decoded object."""
    with open(filename, mode='r', encoding='utf-8') as json_file:
        parsed = json.load(json_file)
    return parsed

In [3]:
# Checkpoint management functions
def save_checkpoint(processed_data, output_qa_pairs, checkpoint_file='qa_generation_checkpoint.json'):
    """Save progress to a checkpoint file.

    The checkpoint is first written to a temporary sibling file and then moved
    over ``checkpoint_file``, so an interruption or a serialization error while
    writing does not leave a truncated checkpoint behind.

    Args:
        processed_data: Identifiers of the articles processed so far.
        output_qa_pairs: QA records generated so far.
        checkpoint_file: Path of the JSON checkpoint to create or overwrite.
    """
    checkpoint = {
        'processed_data': processed_data,
        'qa_pairs': output_qa_pairs
    }
    temp_file = f"{checkpoint_file}.tmp"
    try:
        with open(temp_file, 'w', encoding='utf-8') as f:
            json.dump(checkpoint, f, indent=4)
        # Swap the finished file in; the previous checkpoint stays untouched
        # until the new one has been written completely.
        os.replace(temp_file, checkpoint_file)
    except BaseException:
        # BaseException so that Ctrl-C during a save is covered as well:
        # drop the partial temp file, then let the error propagate.
        if os.path.exists(temp_file):
            os.remove(temp_file)
        raise
    print(f"Checkpoint saved: {len(processed_data)} articles processed, {len(output_qa_pairs)} QA pairs generated.")

def load_checkpoint(checkpoint_file='qa_generation_checkpoint.json'):
    """Restore saved progress.

    Returns:
        A ``(processed_data, qa_pairs)`` pair read from ``checkpoint_file``,
        or two empty lists when that file does not exist.
    """
    if not os.path.exists(checkpoint_file):
        print("No checkpoint found. Starting from scratch.")
        return [], []

    with open(checkpoint_file, 'r', encoding='utf-8') as checkpoint_handle:
        saved_state = json.load(checkpoint_handle)

    done_articles = saved_state['processed_data']
    stored_pairs = saved_state['qa_pairs']
    print(f"Checkpoint loaded: {len(done_articles)} articles already processed, {len(stored_pairs)} QA pairs already generated.")
    return done_articles, stored_pairs
 

In [4]:
def count_articles_in_json_files(data_dir='../data'):
    """Count and display the number of articles in each JSON file.

    Args:
        data_dir: Folder holding the three ``wikipedia_*_pages.json`` dumps.
            Defaults to ``../data``, the folder the download cell writes to
            and ``generate_qa_batch`` reads from.

    Returns:
        The summed number of top-level entries of all files that could be
        read. A file that cannot be read is reported and counted as zero.
    """
    json_files = {
        'Film': 'wikipedia_film_pages.json',
        'Sport': 'wikipedia_sport_pages.json',
        'Tech': 'wikipedia_tech_pages.json'
    }

    print("===== ARTICLE COUNT IN JSON FILES =====")
    total_count = 0

    for category, filename in json_files.items():
        path = os.path.join(data_dir, filename)
        try:
            with open(path, 'r', encoding='utf-8') as f:
                article_count = len(json.load(f))
        except Exception as e:
            # Best effort: report the unreadable file and keep counting the rest.
            print(f"Error reading {path}: {e}")
            continue
        total_count += article_count
        print(f"{category}: {article_count} articles")

    print(f"Total articles across all files: {total_count}")
    print("========================================")

    return total_count


In [7]:
# Create a function to generate Q&A pairs using Ollama with Gemma 3
def generate_qa_pairs(article, category, num_pairs, model='gemma3:latest', max_content_chars=3000):
    """Generate Reddit-style Q&A pairs based on Wikipedia article content.

    Args:
        article: Dict with at least the keys ``'title'`` and ``'content'``.
        category: Category name; used in the prompt and stored in the result.
        num_pairs: Number of Q&A pairs requested from the model.
        model: Name of the Ollama model passed to ``ollama.chat``.
        max_content_chars: Number of leading characters of the article
            content that are included in the prompt.

    Returns:
        A dict with ``'article_title'``, ``'article_content'`` (the first 500
        characters followed by ``'...'``), ``'category'`` and ``'qa_pairs'``
        (a list of ``{'q': ..., 'a': ...}`` dicts). ``None`` when the response
        contains no pairs, when questions and answers do not line up, or when
        every attempt to reach the model failed.
    """
    prompt = _build_qa_prompt(article, category, num_pairs, max_content_chars)

    # Retry failed model calls with exponential backoff (1, 2, 4, 8 seconds).
    max_retries = 5
    base_wait = 2

    for attempt in range(max_retries):
        try:
            response = ollama.chat(
                model=model,
                messages=[{'role': 'user', 'content': prompt}]
            )
            # Treat a missing/empty content field as an empty response.
            qa_text = response['message']['content'] or ''
        except Exception as e:
            if attempt == max_retries - 1:
                # No further attempt follows, so waiting again would be wasted time.
                print(f"Error: {e}. Giving up after {max_retries} attempts.")
                break
            wait_time = base_wait ** attempt
            print(f"Error: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
            continue

        questions, answers = _parse_qa_text(qa_text)

        if len(questions) == len(answers) and len(questions) > 0:
            return {
                'article_title': article['title'],
                'article_content': article['content'][:500] + '...',  # Store a preview of the content
                'category': category,
                'qa_pairs': [
                    {'q': q, 'a': a} for q, a in zip(questions, answers)
                ]
            }

        print(f"Warning: Mismatch in questions and answers or empty response for {article['title']}")
        return None

    # If all retries fail
    return None


def _build_qa_prompt(article, category, num_pairs, max_content_chars):
    """Build the prompt; its format example lists exactly ``num_pairs`` slots."""
    format_example = "\n\n".join(
        f"Q{i}: [Question {i}]\nA{i}: [Answer {i}]" for i in range(1, num_pairs + 1)
    )
    # Limit the content length for LLM processing. The slice is taken here so
    # that no code comment ends up inside the prompt text itself.
    content_excerpt = article['content'][:max_content_chars]

    return f"""You are an expert in {category} and a highly knowledgeable Wikipedia editor. 
    
Based on the following Wikipedia article about "{article['title']}", generate {num_pairs} pairs of questions and answers that might appear on r/Ask{category.capitalize()}.

The questions should:
- Be naturally curious and conversational in tone (like genuine Reddit questions)
- Focus on interesting facts or concepts mentioned in the article
- Range from beginner to more advanced knowledge levels
- Be diverse in their focus (not all about the same subtopic)

The answers should:
- Be informative and factually accurate based EXCLUSIVELY on the Wikipedia content
- Include specific details from the article (not general knowledge)
- Have a helpful, somewhat casual tone like a knowledgeable Reddit user
- Be 3-5 sentences in length
- NOT include any personal opinions or claims not supported by the article

Here's the article content:
{content_excerpt}

Format your response exactly as shown below:
```
{format_example}
```
ONLY return the formatted Q&A pairs, no introduction or additional text."""


# A label is "Q" or "A", an optional number, then a colon; markdown bold
# markers around the label are tolerated. Ordinary sentences that merely start
# with Q or A ("Quite famously: ...") do not match.
_QA_LABEL_RE = re.compile(r'^\**\s*([QA])\s*\d*\s*\**\s*:\s*\**\s*(.*)$')


def _parse_qa_text(qa_text):
    """Split a model response into parallel lists of questions and answers."""
    questions = []
    answers = []
    open_field = None         # list whose last entry receives continuation lines
    awaiting_answer = False   # True between a question and its answer

    for raw_line in qa_text.strip().split('\n'):
        line = raw_line.strip()

        # Blank lines and code fences end the current question/answer text.
        if not line or line.startswith('```'):
            open_field = None
            continue

        label = _QA_LABEL_RE.match(line)
        if label is None:
            # Unlabelled text directly below a question/answer continues it.
            if open_field is not None:
                open_field[-1] = f"{open_field[-1]} {line}".strip()
            continue

        kind, text = label.group(1), label.group(2).strip()
        if kind == 'Q':
            questions.append(text)
            open_field = questions
            awaiting_answer = True
        elif awaiting_answer:
            answers.append(text)
            open_field = answers
            awaiting_answer = False
        else:
            # An answer without a pending question is ignored.
            open_field = None

    return questions, answers

In [None]:

# Function to generate a batch of QA pairs from each category
def generate_qa_batch(num_articles_per_category=3, pairs_per_article=3, checkpoint_interval=5):
    """Generate QA pairs for randomly chosen articles of every category.

    Progress is resumed from the checkpoint returned by ``load_checkpoint``
    and written back with ``save_checkpoint``: every ``checkpoint_interval``
    processed articles, once at the end, and also right before an
    interruption or error is re-raised.

    Args:
        num_articles_per_category: Target number of processed articles per
            category, counting the ones already listed in the checkpoint.
        pairs_per_article: Number of Q&A pairs requested for each article.
        checkpoint_interval: Save a checkpoint whenever the number of
            processed articles is a multiple of this value. ``0`` or ``None``
            turns the periodic saves off.

    Returns:
        The list of QA records: those loaded from the checkpoint plus the ones
        generated during this call.
    """
    # Load checkpoint if exists
    processed_article_ids, all_qa_pairs = load_checkpoint()

    # Track which articles have been processed (article titles act as ids)
    processed_ids_set = set(processed_article_ids)

    film_data = load_json_data('../data/wikipedia_film_pages.json')
    sport_data = load_json_data('../data/wikipedia_sport_pages.json')
    tech_data = load_json_data('../data/wikipedia_tech_pages.json')

    categories = {
        'film': film_data,
        'sport': sport_data,
        'tech': tech_data
    }

    total_articles = sum(min(num_articles_per_category, len(data)) for data in categories.values())
    total_processed = len(processed_ids_set)

    # Create overall progress bar
    main_pbar = tqdm(total=total_articles, initial=total_processed,
                     desc="Overall Progress", position=0)

    try:
        for category, data in categories.items():
            print(f"\nGenerating QA pairs for {category} category...")

            # Number of this category's titles that are already processed;
            # a set intersection avoids rescanning the data for every title.
            category_titles = {article['title'] for article in data}
            already_processed = len(processed_ids_set & category_titles)

            # Filter out already processed articles
            unprocessed_data = [article for article in data
                                if article['title'] not in processed_ids_set]

            # Only top up to the requested number of articles per category.
            num_to_select = min(num_articles_per_category - already_processed,
                                len(unprocessed_data))

            if num_to_select <= 0:
                print(f"All requested {category} articles already processed. Skipping.")
                continue

            selected_articles = random.sample(unprocessed_data, num_to_select)

            for article in selected_articles:
                print(f"\nProcessing: {article['title']} ({category})")

                qa_result = generate_qa_pairs(article, category, num_pairs=pairs_per_article)
                if qa_result:
                    all_qa_pairs.append(qa_result)

                # The article is marked as processed even when no QA record
                # was produced, so it is not picked again on a later run.
                processed_ids_set.add(article['title'])
                processed_article_ids.append(article['title'])

                # Update progress
                main_pbar.update(1)

                # Save checkpoint periodically; a falsy interval disables
                # this instead of failing on the modulo.
                if checkpoint_interval and len(processed_article_ids) % checkpoint_interval == 0:
                    save_checkpoint(processed_article_ids, all_qa_pairs)

                # Add a small delay to avoid overwhelming the API
                time.sleep(1)

        # Final checkpoint save
        save_checkpoint(processed_article_ids, all_qa_pairs)

    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving checkpoint...")
        save_checkpoint(processed_article_ids, all_qa_pairs)
        raise
    except Exception as e:
        print(f"\nError occurred: {e}. Saving checkpoint...")
        save_checkpoint(processed_article_ids, all_qa_pairs)
        raise
    finally:
        main_pbar.close()

    return all_qa_pairs

In [None]:
# Main execution block
if __name__ == "__main__":
    # Run configuration
    ARTICLES_PER_CATEGORY = 500  # articles to process from each category
    PAIRS_PER_ARTICLE = 2        # Q&A pairs requested per article
    CHECKPOINT_INTERVAL = 2      # checkpoint after this many processed articles

    # Report the dataset sizes before the long-running generation starts
    count_articles_in_json_files()

    print("\nStarting Wikipedia Q&A generation process...")

    qa_pairs = generate_qa_batch(
        num_articles_per_category=ARTICLES_PER_CATEGORY,
        pairs_per_article=PAIRS_PER_ARTICLE,
        checkpoint_interval=CHECKPOINT_INTERVAL,
    )

    # Persist every generated record
    output_file = 'wikipedia_qa_pairs.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(qa_pairs, f, indent=4)

    # Run summary
    total_qa_count = sum(map(len, (record['qa_pairs'] for record in qa_pairs)))
    print("\nProcess complete!")
    print(f"- Processed {len(qa_pairs)} articles")
    print(f"- Generated {total_qa_count} Q&A pairs total")
    print(f"- Saved to {output_file}")

    # Show one randomly picked Q&A pair for each category that has results
    print("\n===== SAMPLE OUTPUTS =====")
    categories = ['film', 'sport', 'tech']
    for category in categories:
        category_pairs = list(filter(lambda record: record['category'] == category, qa_pairs))
        if not category_pairs:
            continue

        sample = random.choice(category_pairs)
        print(f"\n--- Sample {category.upper()} Q&A ---")
        print(f"Article: {sample['article_title']}")
        if sample['qa_pairs']:
            rand_pair = random.choice(sample['qa_pairs'])
            print(f"Q: {rand_pair['q']}")
            print(f"A: {rand_pair['a']}")
        print("-" * 50)

In [None]:
# Function to replace truncated content with full content
def update_json_with_full_content(qa_pairs_file='wikipedia_qa_pairs.json', data_dir='../data'):
    """Replace truncated article previews in a QA pairs file with the full text.

    Every record whose ``'article_content'`` ends with ``'...'`` and whose
    ``'article_title'`` occurs in one of the Wikipedia dumps gets the full
    article content. The input file is copied to ``<qa_pairs_file>.backup``
    and the result is written to a sibling file whose name carries a
    ``_full_content`` suffix in front of the extension; the input file itself
    is not modified.

    Args:
        qa_pairs_file: Path of the JSON file holding the QA records.
        data_dir: Folder holding the three ``wikipedia_*_pages.json`` dumps.
            Defaults to ``../data``, the folder the download cell writes to.

    Returns:
        None. When duplicate titles exist and the user does not answer ``y``
        at the prompt, the function returns before reading the QA file.
    """
    # First, check title uniqueness
    print("Checking title uniqueness across datasets...")
    datasets = [
        load_json_data(os.path.join(data_dir, dump_name))
        for dump_name in (
            'wikipedia_film_pages.json',
            'wikipedia_sport_pages.json',
            'wikipedia_tech_pages.json',
        )
    ]

    # One pass collects both the title -> content mapping and the title counts.
    title_to_content = {}
    title_counts = {}
    for dataset in datasets:
        for article in dataset:
            title = article['title']
            title_counts[title] = title_counts.get(title, 0) + 1
            if title not in title_to_content:  # In case of duplicates, take the first one
                title_to_content[title] = article['content']

    total_titles = sum(title_counts.values())
    duplicates = [title for title, count in title_counts.items() if count > 1]
    if duplicates:
        print(f" Warning: Found {total_titles - len(title_counts)} duplicate titles across datasets.")
        print(f"Duplicate titles: {', '.join(duplicates)}")
        decision = input("Duplicates found. Continue anyway? (y/n): ")
        if decision.lower() != 'y':
            print("Operation cancelled.")
            return
    else:
        print(f" All {total_titles} article titles are unique across datasets.")

    # Load the QA pairs file
    print(f"Loading QA pairs from {qa_pairs_file}...")
    with open(qa_pairs_file, 'r', encoding='utf-8') as f:
        qa_pairs = json.load(f)

    # Replace truncated content with full content
    updates = 0
    for article in qa_pairs:
        full_content = title_to_content.get(article['article_title'])
        # Previews are recognised by their trailing "..."
        if full_content is not None and article['article_content'].endswith('...'):
            article['article_content'] = full_content
            updates += 1

    # Save the updated file
    backup_file = qa_pairs_file + '.backup'
    print(f"Creating backup of original file as {backup_file}")
    shutil.copy2(qa_pairs_file, backup_file)

    # Insert the suffix in front of the extension only; a plain str.replace
    # would also rewrite ".json" inside folder names and would leave names
    # without ".json" unchanged, i.e. pointing at the input file.
    path_root, path_ext = os.path.splitext(qa_pairs_file)
    updated_file = f"{path_root}_full_content{path_ext}"
    print(f"Saving updated content to {updated_file}")
    with open(updated_file, 'w', encoding='utf-8') as f:
        json.dump(qa_pairs, f, indent=4)

    print(f" Updated {updates} articles with full content")
    print(f"Original file backed up to: {backup_file}")
    print(f"Updated file saved to: {updated_file}")

# Update the existing QA pairs JSON (default file: wikipedia_qa_pairs.json) with
# the full article content. Note that this call runs whenever the cell is executed.
update_json_with_full_content()

Checking title uniqueness across datasets...
✅ All 1500 article titles are unique across datasets.
Loading QA pairs from wikipedia_qa_pairs.json...
Creating backup of original file as wikipedia_qa_pairs.json.backup
Saving updated content to wikipedia_qa_pairs_full_content.json
✅ Updated 1500 articles with full content
Original file backed up to: wikipedia_qa_pairs.json.backup
Updated file saved to: wikipedia_qa_pairs_full_content.json
