# Step 1: Import Libraries and Set Up Paths

In [1]:
import itertools
import json
import os
from pathlib import Path

import pandas as pd

# Input JSON and the Parquet file that will be written from it
references_path = "../data/internal-references-pdftotext.json"
output_path = "../data/internal-references-pdftotext.parquet"

print(f"Reading file: {references_path}")

# os.path.getsize reports bytes; convert to mebibytes for display
bytes_per_mb = 1024 * 1024
size_mb = os.path.getsize(references_path) / bytes_per_mb
print(f"File size: {size_mb:.2f} MB")

Reading file: ../data/internal-references-pdftotext.json
File size: 125.57 MB


## Step 2: Examine the JSON Structure
Let's first look at the structure of the JSON file to understand what we're working with.

In [2]:
# Peek at the start of the file to understand its structure without parsing it.
# The encoding is pinned to UTF-8 (the encoding JSON is specified to use) so the
# preview does not depend on the platform's locale default.
with open(references_path, 'r', encoding='utf-8') as f:
    first_chars = f.read(1000)  # Read first 1000 characters

print("First 1000 characters of the file:")
print(first_chars)

First 1000 characters of the file:
{"plasm-ph/9607002": [], "plasm-ph/9607001": [], "plasm-ph/9512001": [], "plasm-ph/9512002": [], "plasm-ph/9503001": [], "plasm-ph/9503002": [], "plasm-ph/9604002": [], "plasm-ph/9604001": [], "plasm-ph/9604003": [], "plasm-ph/9507002": [], "plasm-ph/9507001": [], "plasm-ph/9602001": [], "plasm-ph/9602003": [], "plasm-ph/9602002": [], "plasm-ph/9608002": [], "plasm-ph/9608001": [], "plasm-ph/9609002": [], "plasm-ph/9609001": [], "plasm-ph/9511002": [], "plasm-ph/9511001": [], "plasm-ph/9502001": [], "plasm-ph/9502002": [], "plasm-ph/9502003": [], "plasm-ph/9506003": [], "plasm-ph/9506004": [], "plasm-ph/9506001": [], "alg-geom/9211001": [], "alg-geom/9209001": [], "alg-geom/9412013": [], "alg-geom/9412011": [], "alg-geom/9412002": [], "alg-geom/9412007": [], "alg-geom/9412010": ["alg-geom/9509005"], "alg-geom/9412020": ["alg-geom/9505009"], "alg-geom/9412012": ["alg-geom/9407002"], "alg-geom/9412009": [], "alg-geom/9412022": [], "alg-geom/9412005": []

## Step 3: Load the JSON Data
Now let's load the entire JSON file into memory in one pass (it is about 125 MB). If the JSON fails to parse, `data` is set to `None` and the later steps skip their work.

In [3]:
try:
    # Read the entire file as a single JSON object.
    # The encoding is explicit so decoding does not depend on the platform's
    # locale default.
    with open(references_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Successfully loaded JSON with {len(data)} entries")

    # Preview the first five entries lazily; slicing list(data.items()) would
    # copy every item into a temporary list just to keep five of them.
    sample_items = list(itertools.islice(data.items(), 5))
    print("\nSample data (first 5 items):")
    for source_id, target_ids in sample_items:
        print(f"{source_id}: {target_ids}")

except json.JSONDecodeError as e:
    # Leave a None sentinel so later cells can detect the failed load.
    print(f"JSON decode error: {e}")
    data = None

Successfully loaded JSON with 1354753 entries

Sample data (first 5 items):
plasm-ph/9607002: []
plasm-ph/9607001: []
plasm-ph/9512001: []
plasm-ph/9512002: []
plasm-ph/9503001: []


## Step 4: Convert to DataFrame
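If the data loaded successfully, convert it to a DataFrame with one row per source ID: a `source_id` column and a `target_ids` column holding that entry's list of referenced IDs.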

In [4]:
if data is None:
    print("Skipping DataFrame creation as data loading failed")
else:
    # One record per top-level JSON key: the key becomes 'source_id' and its
    # value (already a list) is stored unchanged in 'target_ids'.
    rows = [
        {'source_id': source_id, 'target_ids': target_ids}
        for source_id, target_ids in data.items()
    ]
    references_df = pd.DataFrame(rows)

    print(f"Created DataFrame with shape: {references_df.shape}")
    print("\nSample data:")
    print(references_df.head())

Created DataFrame with shape: (1354753, 2)

Sample data:
          source_id target_ids
0  plasm-ph/9607002         []
1  plasm-ph/9607001         []
2  plasm-ph/9512001         []
3  plasm-ph/9512002         []
4  plasm-ph/9503001         []


## Step 5: Export to Parquet Format
Now let's export the DataFrame to Parquet format.

In [5]:
# Guard on the load result rather than on whether `references_df` happens to
# exist in the kernel namespace: a frame left over from an earlier run would
# otherwise be exported even though the current load failed.
if data is not None:
    # Export to Parquet without writing the positional index as a column
    references_df.to_parquet(output_path, index=False)
    print(f"\nSuccessfully exported to {output_path}")
else:
    print("Skipping Parquet export as DataFrame creation failed")


Successfully exported to ../data/internal-references-pdftotext.parquet


## Step 6: Create Edge List Format (Optional)
For graph analysis, an edge list format can be more useful. Let's create that as well.

In [6]:
if data is not None:
    # Flatten the mapping into one (source_id, target_id) pair per reference.
    # Plain tuples are used instead of one dict per row: with millions of edges
    # the per-row dicts dominate transient memory, and the resulting DataFrame
    # is the same once column names are supplied explicitly.
    edge_pairs = [
        (source_id, target_id)
        for source_id, target_ids in data.items()
        for target_id in target_ids
    ]

    if edge_pairs:
        edge_df = pd.DataFrame(edge_pairs, columns=['source_id', 'target_id'])
        edge_output_path = "../data/internal-references-edges.parquet"
        edge_df.to_parquet(edge_output_path, index=False)
        print(f"Also created edge list format at {edge_output_path}")
        print(f"Edge list shape: {edge_df.shape}")
    else:
        # Every source had an empty target list, so there is nothing to write.
        print("No edges found in the data")
else:
    print("Skipping edge list creation as data loading failed")

Also created edge list format at ../data/internal-references-edges.parquet
Edge list shape: (6849633, 2)
