# Exporting Structured Data from search.libraryofleaks.org

### Setup

In [281]:
import os
import time

import networkx as nx
import pandas as pd
import requests

In [None]:
# Optional API key, read from the LIBRARYOFLEAKS_API_KEY environment variable so
# the secret never has to be written into the notebook. Leave the variable unset
# for anonymous access.
API_KEY = os.environ.get("LIBRARYOFLEAKS_API_KEY", "")
BASE_URL = "https://search.libraryofleaks.org/api/2"
# Request headers: carry the key when one is configured, otherwise stay empty.
HEADERS = {"Authorization": f"ApiKey {API_KEY}"} if API_KEY else {}

In [297]:
def fetch_collections(timeout=60):
    """Fetch all collections across paginated results.

    Starts at ``{BASE_URL}/collections`` and keeps following the ``next`` URL
    found in each JSON response until it is missing or empty.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list with the items of every page's ``results`` list, in page order.

    Raises:
        requests.exceptions.HTTPError: If any page answers with an error status.
        requests.exceptions.RequestException: On connection problems or timeouts.
    """
    url = f"{BASE_URL}/collections"
    all_collections = []

    while url:
        # Send the configured HEADERS so an API key (if any) is actually used,
        # and bound the wait so an unresponsive server cannot hang the notebook.
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        all_collections.extend(data.get("results", []))
        url = data.get("next")

    return all_collections


def fetch_schemas(collection_id, timeout=60):
    """Fetch schemas available in a specific collection.

    Reads ``statistics.schema.values`` from the collection's JSON document and
    returns its keys; any missing level is treated as empty.

    Args:
        collection_id: Identifier interpolated into the collection URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of schema names (possibly empty).

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error status.
        requests.exceptions.RequestException: On connection problems or timeouts.
    """
    url = f"{BASE_URL}/collections/{collection_id}"
    # Use the configured HEADERS and a bounded wait, like the other fetch helpers.
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    collection_data = response.json()
    schema_values = collection_data.get("statistics", {}).get("schema", {}).get("values", {})
    return list(schema_values.keys())

def fetch_all_entities_by_schemas(collection_id, limit=100, max_offset=9900):
    """
    Collect the entities of every schema reported for a collection.

    The schema names come from fetch_schemas(); each schema is then downloaded
    with fetch_entities_with_offset() using the given ``limit`` and
    ``max_offset``, and the per-schema results are concatenated in schema order.
    """
    collected = []

    for schema_name in fetch_schemas(collection_id):
        print(f"Fetching entities for schema: {schema_name}")
        collected += fetch_entities_with_offset(
            collection_id, schema_name, limit=limit, max_offset=max_offset
        )

    return collected

def _retry_after_seconds(header_value, default):
    """Turn a Retry-After header into a wait in seconds, falling back to `default`.

    The header may be absent or hold an HTTP date instead of a number; in both
    cases the supplied default is used rather than raising.
    """
    try:
        return max(0, int(header_value))
    except (TypeError, ValueError):
        return default


def fetch_entities_with_offset(collection_id, schema, limit=100, max_offset=9900, retries=5, backoff_factor=2, timeout=60):
    """Fetch entities with retries and exponential backoff for rate-limiting (429) errors.

    Pages through ``{BASE_URL}/entities`` with offsets ``0, limit, 2*limit, ...``
    below ``max_offset`` and stops early once a page comes back empty.

    Args:
        collection_id: Collection to query.
        schema: Schema name, sent both as ``schema`` and ``filter:schemata``.
        limit: Page size.
        max_offset: Exclusive upper bound for the offsets that are requested.
        retries: Maximum attempts per page.
        backoff_factor: Base of the exponential wait (``backoff_factor ** attempt``).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list with the ``results`` of every page that could be fetched. A page
        that still fails after ``retries`` attempts is reported and skipped.

    Raises:
        requests.exceptions.HTTPError: For HTTP error statuses other than 429.
    """
    entities = []
    url = f"{BASE_URL}/entities"

    for offset in range(0, max_offset, limit):
        params = {
            "collection_id": collection_id,
            "schema": schema,
            "filter:schemata": schema,
            "limit": limit,
            "offset": offset,
        }

        page = None  # stays None when every attempt for this offset fails
        for attempt in range(retries):
            try:
                response = requests.get(url, params=params, headers=HEADERS, timeout=timeout)
                response.raise_for_status()
                page = response.json().get("results", [])
                break
            except requests.exceptions.HTTPError as e:
                status = e.response.status_code if e.response is not None else None
                if status != 429:
                    raise
                retry_after = _retry_after_seconds(
                    e.response.headers.get("Retry-After"), backoff_factor ** attempt
                )
                print(f"429 Too Many Requests: Retrying in {retry_after} seconds...")
                time.sleep(retry_after)
            except Exception as e:
                # Connection errors, timeouts, invalid JSON: back off and retry.
                print(f"Error: {e}. Retrying in {backoff_factor ** attempt} seconds...")
                time.sleep(backoff_factor ** attempt)

        if page is None:
            print(f"Failed to fetch data after {retries} retries for offset {offset}. Skipping this range.")
            continue

        entities.extend(page)
        print(f"Fetched {len(page)} entities from offset {offset}")
        if not page:
            # An empty page means the results are exhausted; further offsets
            # would only produce more empty responses.
            break

    return entities




def fetch_entities_with_pagination(collection_id, schema, limit=100, timeout=60):
    """Fetch all entities for a specific collection and schema with pagination.

    The first request carries the query parameters; later requests follow the
    ``next`` URL from the previous response as-is until it is missing or empty.

    Args:
        collection_id: Collection to query.
        schema: Schema name, sent both as ``schema`` and ``filter:schemata``.
        limit: Page size requested on the first call.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list with the ``results`` of every page, in page order.

    Raises:
        requests.exceptions.HTTPError: If any page answers with an error status.
        requests.exceptions.RequestException: On connection problems or timeouts.
    """
    entities = []
    first_url = f"{BASE_URL}/entities"
    params = {"collection_id": collection_id, "schema": schema, "filter:schemata": schema, "limit": limit}
    url = first_url

    while url:
        response = requests.get(
            url,
            # Only the initial URL needs the query parameters added.
            params=params if url == first_url else None,
            headers=HEADERS,
            timeout=timeout,
        )
        response.raise_for_status()
        data = response.json()
        entities.extend(data.get("results", []))
        url = data.get("next")

    return entities

### Query Data

In [288]:
collections = fetch_collections()

# Alphabetical copy (by label) so the listing below is easy to scan.
sorted_collections = list(collections)
sorted_collections.sort(key=lambda entry: entry['label'])

print("Available collections:")
for col in sorted_collections:
    print(f"- {col['id']}: {col['label']}")

Available collections:
- 50: Aban Offshore
- 13: Afghanistan Papiere
- 51: Agencia Nacional de Hidrocarburos
- 18: Airman Teixeira Leaks
- 53: Alliance Coal
- 35: Bahamas Registry
- 1: BlueLeaks
- 29: Chinga La Migra
- 14: Constellis
- 25: Cryptome Archive (2024)
- 43: DJC Accountants
- 15: Documents from US Espionage Den
- 54: ENAMI EP
- 39: Ethiopia Financial Intelligence Service
- 19: FBI’s Secret Rules
- 56: Forest
- 21: Fraternal Order of Police
- 31: Fuck FBI Friday
- 52: GorraLeaks
- 55: Gulf Copper
- 45: HBGary
- 9: Hillary Clinton emails
- 6: Hunter Biden emails
- 57: INAFOR
- 3: Israel Defense Forces (Anonymous For Justice)
- 33: Israel Ministry of Justice
- 4: Jones Day
- 42: Kallias and Associates
- 34: Kazakhstan Ministry of Energy
- 38: LAPD Headshots
- 40: LLC Capital
- 11: Metropolitan Police Department D.C.
- 16: MilicoLeaks
- 47: Nauru Police Force
- 41: Office of Industrial Economics, Thailand
- 10: Paramilitary Election Interference
- 23: Patron Papers
- 32: Shootin

In [298]:
# Strip surrounding whitespace: the ID is interpolated verbatim into request
# URLs and output file names further down.
collection_id = input("Enter the ID of the collection you want to explore: ").strip()
print("Selected: ", collection_id)

Selected:  33


In [299]:
# List the schema names reported in the selected collection's statistics.
schemas = fetch_schemas(collection_id)
print(schemas)

HTTPError: 500 Server Error: Internal Server Error for url: https://search.libraryofleaks.org/api/2/collections/33

In [253]:
# Start from an empty list so a failed fetch cannot leave `all_entities`
# undefined, or still holding the result of an earlier run, for the cells below.
all_entities = []
try:
    all_entities = fetch_all_entities_by_schemas(collection_id, max_offset=5000)
except requests.exceptions.HTTPError as e:
    print(f"HTTPError occurred: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Fetching entities for schema: Page
Fetched 100 entities from offset 0
Fetched 100 entities from offset 100
Fetched 100 entities from offset 200
Fetched 100 entities from offset 300
Fetched 100 entities from offset 400
Fetched 100 entities from offset 500
Fetched 100 entities from offset 600
Fetched 100 entities from offset 700
Fetched 100 entities from offset 800
Fetched 100 entities from offset 900
Fetched 100 entities from offset 1000
Fetched 100 entities from offset 1100
Fetched 100 entities from offset 1200
Fetched 100 entities from offset 1300
Fetched 100 entities from offset 1400
Fetched 100 entities from offset 1500
Fetched 100 entities from offset 1600
Fetched 100 entities from offset 1700
Fetched 100 entities from offset 1800
Fetched 100 entities from offset 1900
Fetched 100 entities from offset 2000
Fetched 100 entities from offset 2100
Fetched 100 entities from offset 2200
Fetched 100 entities from offset 2300
Fetched 100 entities from offset 2400
Fetched 100 entities from o

### Save Data

In [254]:
# Sanity check: how many entities the fetch step collected across all schemas.
print(f"Total entities fetched: {len(all_entities)}")

Total entities fetched: 15000


In [255]:
def _first_value(properties, key):
    """Return the first value stored under `key`, or None when missing or empty."""
    values = properties.get(key)
    if isinstance(values, (list, tuple)):
        return values[0] if values else None
    return values


def _record_node(record):
    """Build the node row for an entity-like dict (top-level entity or nested document)."""
    properties = record.get("properties") or {}
    schema = record.get("schema", "")
    return {
        "id": record.get("id", ""),
        "schema": schema,
        "type": schema,
        "created_at": record.get("created_at", ""),
        "updated_at": record.get("updated_at", ""),
        "fileSize": _first_value(properties, "fileSize"),
        "fileName": _first_value(properties, "fileName"),
        "mimeType": _first_value(properties, "mimeType"),
    }


def _register_record(record, nodes, node_types, body_text_data):
    """Add the record's node (once per id) and its bodyText row, if it has any."""
    record_id = record.get("id", "")
    if record_id not in node_types:
        nodes.append(_record_node(record))
        node_types[record_id] = record.get("schema", "")

    body_text = (record.get("properties") or {}).get("bodyText", [])
    if isinstance(body_text, list) and body_text:
        body_text_data.append({
            "id": record_id,
            "bodyText": " ".join(map(str, body_text)),
        })


def extract_metadata(entities, remove_orphans=True):
    """Extract nodes, relationships, and bodyText from entities.

    Each entity is expected to be a dict with ``id``, ``schema``,
    ``created_at``, ``updated_at`` and a ``properties`` dict whose values are
    lists. Every entity becomes a node. Every dict found in its ``document``
    property also becomes a node, and each value of that document's list-valued
    properties becomes a target node plus a ``has <property>`` relation from
    the document. Dict values are identified by their ``id``; other values by
    ``str(value)``.

    Args:
        entities: Iterable of entity dicts.
        remove_orphans: When True, keep only nodes that appear as source or
            target of at least one relation.

    Returns:
        ``(nodes_df, relations_df, body_text_df)`` as DataFrames with missing
        values replaced by ``""``. The column sets are fixed, so empty input
        yields empty frames that still carry the expected columns.
    """
    node_columns = ["id", "schema", "type", "created_at", "updated_at",
                    "fileSize", "fileName", "mimeType"]
    relation_columns = ["source", "target", "relationship", "source_type", "target_type"]
    body_text_columns = ["id", "bodyText"]

    nodes = []
    relations = []
    body_text_data = []
    node_types = {}  # node id -> type; doubles as the "already seen" set

    for entity in entities:
        _register_record(entity, nodes, node_types, body_text_data)

        documents = (entity.get("properties") or {}).get("document", [])
        if not isinstance(documents, list):
            continue

        for doc in documents:
            if not isinstance(doc, dict):
                continue

            _register_record(doc, nodes, node_types, body_text_data)
            doc_id = doc.get("id", "")
            doc_schema = doc.get("schema", "")

            for rel_key, rel_values in (doc.get("properties") or {}).items():
                if not isinstance(rel_values, list):
                    continue

                for rel_value in rel_values:
                    if isinstance(rel_value, dict):
                        target_id = rel_value.get("id", "")
                        # A dict without an id gets no node of its own.
                        may_add_node = bool(target_id)
                    else:
                        target_id = str(rel_value)
                        may_add_node = True

                    if may_add_node and target_id not in node_types:
                        nodes.append({
                            "id": target_id,
                            "schema": doc_schema,
                            "type": rel_key,
                        })
                        node_types[target_id] = rel_key

                    relations.append({
                        "source": doc_id,
                        "target": target_id,
                        "relationship": f"has {rel_key}",
                        "source_type": node_types.get(doc_id, ""),
                        "target_type": node_types.get(target_id, ""),
                    })

    # Explicit columns keep the frames usable (and the orphan filter working)
    # even when no nodes or relations were found.
    nodes_df = pd.DataFrame(nodes, columns=node_columns).fillna("")
    relations_df = pd.DataFrame(relations, columns=relation_columns).fillna("")
    body_text_df = pd.DataFrame(body_text_data, columns=body_text_columns).fillna("")

    if remove_orphans:
        connected_nodes = set(relations_df["source"]) | set(relations_df["target"])
        nodes_df = nodes_df[nodes_df["id"].isin(connected_nodes)]

    return nodes_df, relations_df, body_text_df

def save_body_text(body_text_df, output_file="body_text.csv"):
    """Save bodyText data to a separate file.

    Args:
        body_text_df: DataFrame to write as CSV, without the index.
        output_file: Destination path. Missing parent directories are created,
            so targets such as ``data/...`` work on a fresh checkout.
    """
    if isinstance(output_file, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(output_file))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    body_text_df.to_csv(output_file, index=False)
    print(f"BodyText data saved to {output_file}")

In [256]:
# Build the node, relation and body-text tables from the fetched entities
# (remove_orphans is left at its default of True).
nodes, relations, body_text = extract_metadata(all_entities)

In [257]:
def save_csv(nodes, relations, collection_id):
    """Write the node and relation tables as two CSV files for Gephi.

    Missing values are replaced by empty strings before writing. The files are
    named ``collection_<id>_nodes.csv`` and ``collection_<id>_relations.csv``
    and are written relative to the current working directory.
    """
    exports = (
        ("Nodes", nodes.fillna(""), f"collection_{collection_id}_nodes.csv"),
        ("Relations", relations.fillna(""), f"collection_{collection_id}_relations.csv"),
    )

    for _, frame, path in exports:
        frame.to_csv(path, index=False)

    for label, _, path in exports:
        print(f"{label} saved to {path}")

### Save cleaned nodes

I'm adding specific filtering to ignore the processing information, language, etc., which would otherwise link everything together.

In [258]:
def save_graph(nodes_df, relations_df, output_file="graph.graphml"):
    """Build a directed graph from the two tables and write it as GraphML.

    Every row of ``nodes_df`` becomes a node keyed by its ``id``, with the
    remaining columns as node attributes. Every row of ``relations_df`` becomes
    an edge from ``source`` to ``target`` carrying a ``relationship`` attribute
    (empty string when that column is absent).
    """
    graph = nx.DiGraph()

    for _, row in nodes_df.iterrows():
        graph.add_node(row["id"], **row.drop("id").to_dict())

    for _, row in relations_df.iterrows():
        graph.add_edge(
            row["source"],
            row["target"],
            relationship=row.get("relationship", ""),
        )

    nx.write_graphml(graph, output_file)
    print(f"Graph saved to {output_file}")


In [259]:
# Property types that attach to almost every document and would otherwise
# connect the whole graph through a handful of hub nodes.
unwanted_types = [
    'detectedLanguage',
    'mimeType',
    # 'ancestors', 
    'processingAgent',
    'processingStatus',
    'processedAt'
]

# Non-numeric or empty sizes become 0 so the column has a single integer type.
nodes["fileSize"] = pd.to_numeric(nodes["fileSize"], errors="coerce").fillna(0).astype(int)

# The writers below do not create missing directories themselves.
os.makedirs("data", exist_ok=True)

nodes_cleaned = nodes[~nodes['type'].isin(unwanted_types)]
nodes_cleaned.to_csv(f"data/{collection_id}_cleaned_nodes.csv", index=False)

relations_cleaned = relations[~relations['target_type'].isin(unwanted_types)]
relations_cleaned.to_csv(f"data/{collection_id}_cleaned_relations.csv", index=False)

save_graph(nodes_cleaned, relations_cleaned, output_file=f"data/{collection_id}_cleaned_graph.graphml")



Graph saved to data/9_cleaned_graph.graphml


In [261]:
# Store the extracted body text next to the cleaned node/relation exports.
save_body_text(body_text, output_file=f"data/{collection_id}_body_text.csv")

BodyText data saved to data/9_body_text.csv


---
### Save all nodes

In [228]:
def save_graphml(relations, nodes, output_file="network.graphml"):
    """Build a directed graph from the relation and node tables and write GraphML.

    Note the argument order: relations first, then nodes.

    Args:
        relations: DataFrame of edges. The columns ``source``, ``target``,
            ``relationship``, ``source_type`` and ``target_type`` are used;
            missing ones are treated as empty strings. The caller's frame is
            not modified.
        nodes: DataFrame of nodes with an ``id`` column. A fixed set of
            attributes is copied per node; absent columns default to ``""``
            and a non-digit ``fileSize`` becomes 0.
        output_file: Destination path of the GraphML file.

    A failure while writing the file is reported with a message instead of
    being raised.
    """
    # Work on a copy so filling in columns / stringifying ids below does not
    # leak back into the DataFrame owned by the caller.
    relations = relations.copy()

    for col in ["source", "target", "relationship", "source_type", "target_type"]:
        if col not in relations.columns:
            relations[col] = ""

    # Dict-valued endpoints are replaced by their string form so they can be
    # used as node keys.
    for col in ("source", "target"):
        relations[col] = relations[col].apply(lambda x: str(x) if isinstance(x, dict) else x)

    G = nx.DiGraph()

    for _, node in nodes.iterrows():
        node_id = node["id"]

        node_attributes = {
            "schema": node.get("schema", ""),
            "fileName": node.get("fileName", ""),
            "fileSize": int(node["fileSize"]) if str(node.get("fileSize", "")).isdigit() else 0,
            "mimeType": node.get("mimeType", ""),
            "language": node.get("language", ""),
            "created_at": node.get("created_at", ""),
            "updated_at": node.get("updated_at", ""),
            "contentHash": node.get("contentHash", "")
        }

        G.add_node(node_id, **node_attributes)

    for _, row in relations.iterrows():
        G.add_edge(
            row["source"],
            row["target"],
            relationship=row["relationship"],
            source_type=row["source_type"],
            target_type=row["target_type"],
        )

    try:
        nx.write_graphml(G, output_file)
        print(f"GraphML saved to {output_file}")
    except Exception as e:
        print(f"Failed to save GraphML: {e}")


In [229]:
# Export the node and relation tables (without the unwanted-type filtering) as GraphML.
save_graphml(relations, nodes, output_file=f"data/collection_{collection_id}.graphml")

GraphML saved to collection_33.graphml


In [230]:
# Same tables as CSV; save_csv writes them into the current working directory.
save_csv(nodes, relations, collection_id)

Nodes saved to collection_33_nodes.csv
Relations saved to collection_33_relations.csv


In [231]:
# Write the body text again under the naming scheme used in this section.
save_body_text(body_text, output_file=f"data/body_text_{collection_id}.csv")

BodyText data saved to body_text_33.csv


## Topics analysis

In [None]:
# Topic extraction: TF-IDF features of each bodyText, factorised with NMF.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import pandas as pd

body_texts = body_text['bodyText']

# Vocabulary capped at 1000 terms; English stop words are removed.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(body_texts)

num_topics = 5  
# Fixed random_state so repeated runs give the same factorisation.
nmf_model = NMF(n_components=num_topics, random_state=42)
topic_matrix = nmf_model.fit_transform(tfidf_matrix)

feature_names = vectorizer.get_feature_names_out()
topics = []

# Describe each topic by its 10 highest-weighted terms, joined into one string.
for topic_idx, topic in enumerate(nmf_model.components_):
    top_keywords = [feature_names[i] for i in topic.argsort()[:-11:-1]] 
    topics.append(" ".join(top_keywords)) 

# Label every document with the keyword string of its strongest topic.
document_topics = topic_matrix.argmax(axis=1) 
# NOTE: this adds a 'topic' column to the shared `body_text` frame in place.
body_text['topic'] = [topics[topic_idx] for topic_idx in document_topics]

body_text.to_csv("body_text_with_topics.csv", index=False)
print("Topics extracted and saved to 'body_text_with_topics.csv'")


Topics extracted and saved to 'body_text_with_topics.csv'
