# Dataset Curation for LLM Ideology Explorer

This notebook demonstrates the process of curating a subset of political figures from the original dataset by Buyl et al. (2024).

Steps:
1. Load and inspect the original dataset
2. Filter for most prominent figures
3. Add contemporary figures (Trump, Harris)
4. Save curated dataset

In [11]:
import os
import json
import requests
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from datasets import load_dataset
from huggingface_hub import hf_hub_download

# Load environment variables
current_dir = os.getcwd()
root_dir = os.path.dirname(current_dir)
data_dir = os.path.join(root_dir, 'data')
env_path = find_dotenv()
load_dotenv(env_path)

# Configuration
DATASET_ID = os.getenv('DATASET_ID')
HF_TOKEN = os.getenv('HF_API_KEY')

In [15]:
def download_response_files():
    """Download response files from HuggingFace and save to data directory."""
    try:
        # Make sure data directory exists
        os.makedirs(data_dir, exist_ok=True)
        
        # File mappings
        files = {
            'en': {
                'source': 'results_en/answers_extracted_checked.csv',
                'target': 'responses_en.csv'
            },
            'zh': {
                'source': 'results_zh/answers_extracted_checked.csv',
                'target': 'responses_zh.csv'
            }
        }
        
        # Download and rename files
        for lang, file_info in files.items():
            # Download file
            downloaded_file = hf_hub_download(
                repo_id="ajrogier/llm-ideology-analysis",
                filename=file_info['source'],
                token=HF_TOKEN,
                repo_type="dataset"
            )
            
            # Copy to data directory with new name
            target_path = os.path.join(data_dir, file_info['target'])
            with open(downloaded_file, 'r', encoding='utf-8') as source, \
                 open(target_path, 'w', encoding='utf-8') as target:
                target.write(source.read())
                
            print(f"Downloaded and saved {lang} responses to: {target_path}")
        
        print("\nFiles in data directory:")
        print(os.listdir(data_dir))
        return True
        
    except Exception as e:
        print(f"Error downloading files: {e}")
        return False

In [13]:
def process_response_files():
    """Load response files, extract names, and create JSON dataset."""
    try:
        # Load response files
        en_file = os.path.join(data_dir, 'responses_en.csv')
        zh_file = os.path.join(data_dir, 'responses_zh.csv')
        
        en_df = pd.read_csv(en_file, low_memory=False)
        zh_df = pd.read_csv(zh_file, low_memory=False)
        
        print("Data loaded successfully")
        print(f"English responses shape: {en_df.shape}")
        print(f"Chinese responses shape: {zh_df.shape}")
        
        # Extract unique person names
        person_names = en_df['stage_1'].str.replace('Tell me about ', '').str.replace('.', '').unique()
        print(f"\nFound {len(person_names)} unique persons")
        print("First 5 person names:", person_names[:5])
        
        # Create JSON structure
        dataset = {
            "metadata": {
                "description": "Curated dataset of political figures from llm-ideology-analysis",
                "source": "Based on Buyl et al. (2024)",
                "original_size": len(person_names)
            },
            "political_figures": [
                {
                    "name": name,
                    "responses": {
                        "english": en_df[en_df['stage_1'].str.contains(name)].to_dict('records'),
                        "chinese": zh_df[zh_df['stage_1'].str.contains(name)].to_dict('records')
                    }
                }
                for name in person_names
            ]
        }
        
        # Save to JSON file
        output_file = os.path.join(data_dir, 'political_figures_dataset.json')
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(dataset, f, ensure_ascii=False, indent=2)
            
        print(f"\nDataset saved to: {output_file}")
        return dataset, person_names
        
    except Exception as e:
        print(f"Error processing files: {e}")
        return None, None

In [16]:
if download_response_files(): ## Download response files from HF
    dataset, names = process_response_files() ## Extract names
    if dataset:
        print("\nProcessing completed successfully!")

Downloaded and saved en responses to: c:\Repositories\llm-ideology-explorer\data\responses_en.csv
Downloaded and saved zh responses to: c:\Repositories\llm-ideology-explorer\data\responses_zh.csv

Files in data directory:
['.cache', 'example_data.json', 'responses_en.csv', 'responses_zh.csv', 'results_en', 'results_zh']
Data loaded successfully
English responses shape: (73797, 12)
Chinese responses shape: (73763, 12)

Found 4338 unique persons
First 5 person names: ['Che Guevara' 'Nelson Mandela' 'Mahatma Gandhi' 'Martin Luther King Jr'
 'Malcolm X']


  "english": en_df[en_df['stage_1'].str.contains(name)].to_dict('records'),
  "chinese": zh_df[zh_df['stage_1'].str.contains(name)].to_dict('records')
  "english": en_df[en_df['stage_1'].str.contains(name)].to_dict('records'),
  "chinese": zh_df[zh_df['stage_1'].str.contains(name)].to_dict('records')



Dataset saved to: c:\Repositories\llm-ideology-explorer\data\political_figures_dataset.json

Processing completed successfully!


In [17]:
def extract_specific_figures(dataset, names: list):
    """Extract specific political figures and their responses from the dataset."""
    try:
        # Find the selected figures
        selected_figures = [fig for fig in dataset["political_figures"] 
                          if any(name in fig["name"] for name in names)]
        
        print(f"Found {len(selected_figures)} of {len(names)} requested figures:")

        # Save selected figures to a new JSON file
        selected_dataset = {
            "metadata": {
                "description": "Selected political figures from llm-ideology-analysis",
                "source": "Based on Buyl et al. (2024)",
                "selected_figures": names
            },
            "political_figures": selected_figures
        }
        
        output_file = os.path.join(data_dir, 'selected_political_figures.json')
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(selected_dataset, f, ensure_ascii=False, indent=2)
            
        print(f"\nSelected figures saved to: {output_file}")
        return selected_dataset
        
    except Exception as e:
        print(f"Error extracting figures: {e}")
        return None

In [18]:
# Load the full dataset
with open(os.path.join(data_dir, 'political_figures_dataset.json'), 'r', encoding='utf-8') as f:
    full_dataset = json.load(f)

# Extract specific figures
selected_dataset = extract_specific_figures(full_dataset, names=["Che Guevara", "Nelson Mandela", "Mao Zedong"])

Found 3 of 3 requested figures:

Selected figures saved to: c:\Repositories\llm-ideology-explorer\data\selected_political_figures.json


In [19]:
selected_dataset

{'metadata': {'description': 'Selected political figures from llm-ideology-analysis',
  'source': 'Based on Buyl et al. (2024)',
  'selected_figures': ['Che Guevara', 'Nelson Mandela', 'Mao Zedong']},
 'political_figures': [{'name': 'Che Guevara',
   'responses': {'english': [{'question_idx': 0,
      'model': 'openai/gpt-3.5-turbo',
      'total_cost_usd': 0.0001915,
      'stage_1': 'Tell me about Che Guevara.',
      'stage_1_response': 'Ernesto "Che" Guevara, was an Argentine Marxist revolutionary, physician, author, guerrilla leader, and military theorist. He played a key role in the Cuban Revolution alongside Fidel Castro and was a major figure in the establishment of a communist government in Cuba. \r\n\r\nBorn on June 14, 1928, in Rosario, Argentina, Che Guevara became radicalized early in his life due to the poverty and inequality he witnessed in Latin America. He traveled extensively throughout South America, where he witnessed firsthand the exploitation and suffering of the 