In [1]:
import requests
import pandas as pd

API_KEY = "YOUR_API_KEY"  
BASE_URL = "https://search.libraryofleaks.org/api/2"
HEADERS = {"Authorization": f"ApiKey {API_KEY}"}



In [2]:
def fetch_collections():
    """Fetch all available collections."""
    url = f"{BASE_URL}/collections"
    response = requests.get(url, headers=HEADERS)
    response.raise_for_status()
    return response.json()["results"]

def fetch_schemas(collection_id):
    """Fetch schemas available in a specific collection."""
    url = f"{BASE_URL}/collections/{collection_id}"
    response = requests.get(url, headers=HEADERS)
    response.raise_for_status()
    collection_data = response.json()
    schema_values = collection_data.get("statistics", {}).get("schema", {}).get("values", {})
    print("Available schemas in collection:")
    for schema, count in schema_values.items():
        print(f"- {schema} ({count} occurrences)")
    return [{"key": schema, "count": count} for schema, count in schema_values.items()]

def fetch_entities(collection_id, schema, limit=100, offset=0):
    """Fetch entities from a specific collection and schema."""
    url = f"{BASE_URL}/entities"
    params = {
        "collection_id": collection_id,
        "schema": schema,
        "filter:schemata": schema,
        "limit": limit,
        "offset": offset
    }
    response = requests.get(url, headers=HEADERS, params=params)
    response.raise_for_status()
    return response.json()["results"], response.json().get("next")

def fetch_entities(collection_id, schema, limit=100, offset=0, additional_filters=None):
    """Fetch entities from a specific collection with optional filters."""
    url = f"{BASE_URL}/entities"
    params = {
        "collection_id": collection_id,
        "schema": schema,
        "filter:schemata": schema,
        "limit": limit,
        "offset": offset
    }
    if additional_filters:
        params.update(additional_filters)

    response = requests.get(url, headers=HEADERS, params=params)
    response.raise_for_status()
    return response.json()["results"], response.json().get("next")


def save_entities_to_csv(entities, filename):
    """Save entities to a CSV file."""
    rows = []
    for entity in entities:
        row = {
            "id": entity.get("id"),
            "schema": entity.get("schema"),
            "bodyText": entity["properties"].get("bodyText", [""])[0],  # First text block
            "fileName": entity["properties"].get("fileName", [""])[0],  # First filename
            "mimeType": entity["properties"].get("mimeType", [""])[0],  # First MIME type
        }
        rows.append(row)

    df = pd.DataFrame(rows)
    df.to_csv(filename, index=False)
    print(f"Saved {len(rows)} entities to {filename}")


In [3]:
collections = fetch_collections()
print("Available collections:")
for col in collections:
    print(f"- {col['id']}: {col['label']}")

Available collections:
- 13: Afghanistan Papiere
- 18: Airman Teixeira Leaks
- 35: Bahamas Registry
- 1: BlueLeaks
- 29: Chinga La Migra
- 14: Constellis
- 25: Cryptome Archive (2024)
- 15: Documents from US Espionage Den
- 19: FBI’s Secret Rules
- 21: Fraternal Order of Police
- 31: Fuck FBI Friday
- 9: Hillary Clinton emails
- 6: Hunter Biden emails
- 3: Israel Defense Forces (Anonymous For Justice)
- 33: Israel Ministry of Justice
- 4: Jones Day
- 34: Kazakhstan Ministry of Energy
- 11: Metropolitan Police Department D.C.
- 16: MilicoLeaks
- 10: Paramilitary Election Interference


In [4]:
collection_id = input("Enter the ID of the collection you want to explore: ")
print("Selected: ", collection_id)

Selected:  18


In [5]:
schemas = fetch_schemas(collection_id)
for schema in schemas:
    print(f"- {schema}")

schema = input("Enter the schema to use (or press Enter to skip): ")
print("Selected: ", schema)


Available schemas in collection:
- Image (64 occurrences)
- {'key': 'Image', 'count': 64}
Selected:  Image


In [6]:
entities, next_url = fetch_entities(collection_id, schema, limit=50)

In [7]:
save_entities_to_csv(entities, "entities_page.csv")

Saved 50 entities to entities_page.csv


### Build Relations

In [8]:
def extract_relations(entities):
    """Extract relationships from entities for network visualization."""
    relations = []
    
    for entity in entities:
        source = entity.get("id")  

        for person in entity["properties"].get("peopleMentioned", []):
            relations.append({"source": source, "target": person, "relationship": "mentioned"})

        for company in entity["properties"].get("companiesMentioned", []):
            relations.append({"source": source, "target": company, "relationship": "mentions company"})

        for email in entity["properties"].get("emailMentioned", []):
            relations.append({"source": source, "target": email, "relationship": "mentions email"})

        parents = entity["properties"].get("parent", [])
        if isinstance(parents, list):
            for parent in parents:
                if isinstance(parent, dict):
                    target_id = parent.get("id")
                    target_schema = parent.get("schema", "Unknown")
                    if target_id:
                        relations.append({
                            "source": source,
                            "target": target_id,
                            "relationship": f"is child of"
                        })

        ancestors = entity["properties"].get("ancestors", [])
        if isinstance(ancestors, list):
            for ancestor in ancestors:
                if isinstance(ancestor, dict):
                    target_id = ancestor.get("id")
                    target_schema = ancestor.get("schema", "Unknown")
                    if target_id:
                        relations.append({
                            "source": source,
                            "target": target_id,
                            "relationship": f"has ancestor"
                        })

        nested_target = entity.get("target")
        if nested_target:
            target_id = nested_target.get("id")
            target_schema = nested_target.get("schema")
            if target_id and target_schema:
                relations.append({
                    "source": source,
                    "target": target_id,
                    "relationship": f"linked to {target_schema}"
                })

    return pd.DataFrame(relations)


def save_relations_to_csv(relations, filename="relations.csv"):
    """Save extracted relationships to a CSV file."""
    relations.to_csv(filename, index=False)
    print(f"Saved {len(relations)} relations to {filename}")


In [9]:
relations = extract_relations(entities)

In [10]:
save_relations_to_csv(relations, "relations.csv")

Saved 203 relations to relations.csv
