# Exporting Structured Data from search.libraryofleaks.org

### Setup

In [8]:
import requests
import pandas as pd
import networkx as nx
import time



In [9]:
API_KEY = "API"
BASE_URL = "https://search.libraryofleaks.org/api/2"
HEADERS = {"Authorization": f"ApiKey {API_KEY}"}

In [10]:
def fetch_collections():
    """Fetch all available collections."""
    url = f"{BASE_URL}/collections"
    response = requests.get(url, headers=HEADERS)
    response.raise_for_status()
    return response.json()["results"]

def fetch_schemas(collection_id):
    """Fetch schemas available in a specific collection."""
    url = f"{BASE_URL}/collections/{collection_id}"
    response = requests.get(url, headers=HEADERS)
    response.raise_for_status()
    collection_data = response.json()
    schema_values = collection_data.get("statistics", {}).get("schema", {}).get("values", {})
    return list(schema_values.keys())

def fetch_all_entities_by_schemas(collection_id, limit=100, max_offset=9900):
    """
    Fetch all entities across all schemas for a collection.
    """
    all_entities = []
    schemas = fetch_schemas(collection_id)

    for schema in schemas:
        print(f"Fetching entities for schema: {schema}")
        schema_entities = fetch_entities_with_offset(collection_id, schema, limit=limit, max_offset=max_offset)
        all_entities.extend(schema_entities)

    return all_entities

def fetch_entities_with_offset(collection_id, schema, limit=100, max_offset=9900, retries=3):
    """Fetch entities with automatic retries on rate limit errors."""
    entities = []
    for offset in range(0, max_offset, limit):
        params = {
            "collection_id": collection_id,
            "schema": schema,
            "filter:schemata": schema,
            "limit": limit,
            "offset": offset,
        }

        url = f"{BASE_URL}/entities"
        attempt = 0
        while attempt < retries:
            response = requests.get(url, headers=HEADERS, params=params)
            
            if response.status_code == 429:
                print("Rate limit hit. Retrying in 30 seconds...")
                time.sleep(30)
                attempt += 1
                continue
            
            response.raise_for_status()
            data = response.json()
            entities.extend(data.get("results", []))
            print(f"Fetched {len(data.get('results', []))} entities from offset {offset}")
            break

        else:
            print(f"Failed after {retries} retries for offset {offset}. Skipping this range.")

        time.sleep(1)

    return entities


def fetch_entities_with_pagination(collection_id, schema, limit=100):
    """Fetch all entities for a specific collection and schema with pagination."""
    entities = []
    params = {"collection_id": collection_id, "schema": schema, "filter:schemata": schema, "limit": limit}
    url = f"{BASE_URL}/entities"

    while url:
        response = requests.get(url, headers=HEADERS, params=params if url == f"{BASE_URL}/entities" else None)
        response.raise_for_status()
        data = response.json()
        entities.extend(data.get("results", []))
        url = data.get("next")  

    return entities

### Query Data

In [11]:
collections = fetch_collections()
print("Available collections:")
for col in collections:
    print(f"- {col['id']}: {col['label']}")

Available collections:
- 50: Aban Offshore
- 13: Afghanistan Papiere
- 51: Agencia Nacional de Hidrocarburos
- 18: Airman Teixeira Leaks
- 53: Alliance Coal
- 35: Bahamas Registry
- 1: BlueLeaks
- 29: Chinga La Migra
- 14: Constellis
- 25: Cryptome Archive (2024)
- 43: DJC Accountants
- 15: Documents from US Espionage Den
- 54: ENAMI EP
- 39: Ethiopia Financial Intelligence Service
- 19: FBI’s Secret Rules
- 56: Forest
- 21: Fraternal Order of Police
- 31: Fuck FBI Friday
- 52: GorraLeaks
- 55: Gulf Copper


In [12]:
collection_id = input("Enter the ID of the collection you want to explore: ")
print("Selected: ", collection_id)

Selected:  33


In [13]:
schemas = fetch_schemas(collection_id)
print(schemas)

['Page', 'HyperText', 'Image', 'Pages', 'Table', 'Person', 'PlainText', 'Workbook', 'Email', 'Folder', 'Document', 'Event', 'Package', 'Video', 'Audio']


In [198]:
all_entities = fetch_all_entities_by_schemas(collection_id, max_offset=1000)

Fetching entities for schema: Page
Fetched 100 entities from offset 0
Fetched 100 entities from offset 100
Fetched 100 entities from offset 200
Fetched 100 entities from offset 300
Fetched 100 entities from offset 400
Fetched 100 entities from offset 500
Fetched 100 entities from offset 600
Fetched 100 entities from offset 700
Fetched 100 entities from offset 800
Fetched 100 entities from offset 900
Fetching entities for schema: HyperText
Fetched 100 entities from offset 0
Fetched 100 entities from offset 100
Fetched 100 entities from offset 200
Fetched 100 entities from offset 300
Fetched 100 entities from offset 400
Fetched 100 entities from offset 500
Fetched 100 entities from offset 600
Fetched 100 entities from offset 700
Fetched 100 entities from offset 800
Fetched 100 entities from offset 900
Fetching entities for schema: Image
Fetched 100 entities from offset 0
Fetched 100 entities from offset 100
Fetched 100 entities from offset 200
Fetched 100 entities from offset 300
Fetched

### Save Data

In [207]:
print(f"Total entities fetched: {len(all_entities)}")

Total entities fetched: 15000


In [200]:
def extract_metadata(entities, remove_orphans=True):
    """Extract nodes and relationships from entities, handling orphan nodes."""
    nodes = []
    relations = []
    node_ids = set() 

    for entity in entities:
        source = entity.get("id", "")
        schema = entity.get("schema", "")
        properties = entity.get("properties", {})

        if source not in node_ids:
            node_data = {
                "id": source,
                "schema": schema,
                "type": schema,  
                "created_at": entity.get("created_at", ""),
                "updated_at": entity.get("updated_at", ""),
            }
            nodes.append(node_data)
            node_ids.add(source)

        
        documents = properties.get("document", [])
        if isinstance(documents, list):
            for doc in documents:
                if isinstance(doc, dict):  
                    doc_id = doc.get("id", "")
                    doc_schema = doc.get("schema", "")
                    doc_properties = doc.get("properties", {})

                    
                    if doc_id not in node_ids:
                        doc_node_data = {
                            "id": doc_id,
                            "schema": doc_schema,
                            "type": doc_schema,  
                            "created_at": doc.get("created_at", ""),
                            "updated_at": doc.get("updated_at", ""),
                        }
                        nodes.append(doc_node_data)
                        node_ids.add(doc_id)

                    
                    for rel_key, rel_values in doc_properties.items():
                        if isinstance(rel_values, list):
                            for rel_value in rel_values:
                                if isinstance(rel_value, dict):
                                    target_id = rel_value.get("id", "")
                                    if target_id and target_id not in node_ids:
                                        
                                        target_node_data = {
                                            "id": target_id,
                                            "schema": rel_key,  
                                            "type": rel_key,  
                                        }
                                        nodes.append(target_node_data)
                                        node_ids.add(target_id)

                                    
                                    relations.append({
                                        "source": doc_id,
                                        "target": target_id,
                                        "relationship": f"has {rel_key}",
                                    })
                                else:
                                    target_id = str(rel_value)
                                    if target_id not in node_ids:
                                        
                                        target_node_data = {
                                            "id": target_id,
                                            "schema": rel_key,  
                                            "type": rel_key,  
                                        }
                                        nodes.append(target_node_data)
                                        node_ids.add(target_id)

                                    
                                    relations.append({
                                        "source": doc_id,
                                        "target": target_id,
                                        "relationship": f"has {rel_key}",
                                    })

    nodes_df = pd.DataFrame(nodes).fillna("")
    relations_df = pd.DataFrame(relations).fillna("")

    if remove_orphans:
        connected_nodes = set(relations_df["source"]).union(set(relations_df["target"]))
        nodes_df = nodes_df[nodes_df["id"].isin(connected_nodes)]

    return nodes_df, relations_df


In [208]:
nodes, relations = extract_metadata(all_entities)

In [209]:
def save_csv(nodes, relations, collection_id):
    """Save nodes and relations to CSV for Gephi."""
    nodes = nodes.fillna("")
    relations = relations.fillna("")

    nodes_file = f"collection_{collection_id}_nodes.csv"
    relations_file = f"collection_{collection_id}_relations.csv"
    nodes.to_csv(nodes_file, index=False)
    relations.to_csv(relations_file, index=False)
    print(f"Nodes saved to {nodes_file}")
    print(f"Relations saved to {relations_file}")

In [204]:
def save_graphml(relations, nodes, output_file="network.graphml"):
    for col in ["source", "target", "relationship", "source_type", "target_type"]:
        if col not in relations.columns:
            relations[col] = ""

    relations["source"] = relations["source"].apply(lambda x: str(x) if isinstance(x, dict) else x)
    relations["target"] = relations["target"].apply(lambda x: str(x) if isinstance(x, dict) else x)

    G = nx.DiGraph()

    for _, node in nodes.iterrows():
        node_id = node["id"]
  
        node_attributes = {
            "schema": node.get("schema", ""),  
            "fileName": node.get("fileName", ""),
            "fileSize": int(node["fileSize"]) if str(node.get("fileSize", "")).isdigit() else 0,
            "mimeType": node.get("mimeType", ""),
            "language": node.get("language", ""),
            "created_at": node.get("created_at", ""),  
            "updated_at": node.get("updated_at", ""),  
            "contentHash": node.get("contentHash", "")
        }
  
        G.add_node(node_id, **node_attributes)

    
    for _, row in relations.iterrows():
        source = row["source"]
        target = row["target"]
        relationship = row["relationship"]
        source_type = row["source_type"]
        target_type = row["target_type"]

        G.add_edge(source, target, relationship=relationship, source_type=source_type, target_type=target_type)

    try:
        nx.write_graphml(G, output_file)
        print(f"GraphML saved to {output_file}")
    except Exception as e:
        print(f"Failed to save GraphML: {e}")


In [210]:
save_graphml(relations, nodes, output_file=f"collection_{collection_id}.graphml")

GraphML saved to collection_33.graphml


In [206]:
save_csv(nodes, relations, collection_id)

Nodes saved to collection_33_nodes.csv
Relations saved to collection_33_relations.csv
