In [1]:
import sqlite3
import json

In [3]:
# Define the database file
db_file = "all_collections.db"

# Connect to the SQLite database (creates the file if it doesn't exist)
conn = sqlite3.connect(db_file)
try:
    cursor = conn.cursor()

    # Create the unified table that the marketplace import cells write into.
    # collection_id is UNIQUE but nullable; SQLite allows any number of NULLs
    # in a UNIQUE column, so rows without an identifier can still be stored.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS all_collections (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            collection_id TEXT UNIQUE,
            marketplace TEXT NOT NULL,
            slug_name TEXT NOT NULL,
            full_name TEXT NOT NULL,
            description TEXT,
            category TEXT,
            token_standard TEXT,
            created_time TEXT,
            image_url TEXT,
            project_url TEXT,
            twitter_url TEXT,
            instagram_url TEXT,
            facebook_url TEXT,
            discord_url TEXT,
            telegram_url TEXT,
            marketplace_fee REAL,
            royalty_fee REAL
        );
    """)

    conn.commit()
finally:
    # Always release the database handle, even if table creation fails
    conn.close()

Input Rarible

In [4]:
# File paths
source_db = "collections_rarible.db"
target_db = "all_collections.db"


def _rarible_collection_id(raw_id):
    """Move the prefix before the first ":" to the end, lower-cased.

    "<blockchain>:<contract_info>" becomes "<contract_info>:<blockchain>";
    ids without a ":" are returned unchanged.
    """
    if ":" in raw_id:
        blockchain, contract_info = raw_id.split(":", 1)
        return f"{contract_info}:{blockchain.lower()}"
    return raw_id


def _rarible_target_rows(source_rows):
    """Yield one all_collections parameter tuple per Rarible source row."""
    for collection_id, name, meta_name, meta_description, collection_type in source_rows:
        yield (
            _rarible_collection_id(collection_id), "Rarible", name, meta_name, meta_description, "",
            collection_type, "", "", "", "", "", "", "", "", None, None,
        )


# Connect to the source database
source_conn = sqlite3.connect(source_db)
try:
    source_cursor = source_conn.cursor()

    # Fetch data from collections_rarible.db
    source_cursor.execute("SELECT collection_id, name, meta_name, meta_description, collection_type FROM rarible_collections")
    rows = source_cursor.fetchall()

    target_conn = sqlite3.connect(target_db)
    try:
        target_cursor = target_conn.cursor()

        # The id column is left to SQLite (INTEGER PRIMARY KEY AUTOINCREMENT).
        # Supplying our own running index together with OR IGNORE would silently
        # drop any row whose index collides with an id already in the table, so
        # OR IGNORE is now triggered only by real constraint conflicts such as a
        # duplicate collection_id.
        target_cursor.executemany("""
            INSERT OR IGNORE INTO all_collections (
                collection_id, marketplace, slug_name, full_name, description, category,
                token_standard, created_time, image_url, project_url, twitter_url, instagram_url,
                facebook_url, discord_url, telegram_url, marketplace_fee, royalty_fee
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, _rarible_target_rows(rows))

        target_conn.commit()
    finally:
        target_conn.close()
finally:
    source_conn.close()

print("Data successfully transferred from collections_rarible.db to all_collections.db.")


Data successfully transferred from collections_rarible.db to all_collections.db.


Input OpenSea

In [5]:
# File paths
opensea_db = "collections_opensea.db"
target_db = "all_collections.db"

# Statement for rows that already exist: category is set to the value chosen
# in the loop, and each URL column is replaced only when the incoming value is
# neither NULL nor '' (COALESCE/NULLIF keeps the stored value otherwise).
update_query = """
    UPDATE all_collections
    SET category = ?,
        image_url = COALESCE(NULLIF(?, ''), image_url),
        project_url = COALESCE(NULLIF(?, ''), project_url),
        twitter_url = COALESCE(NULLIF(?, ''), twitter_url),
        instagram_url = COALESCE(NULLIF(?, ''), instagram_url),
        facebook_url = COALESCE(NULLIF(?, ''), facebook_url),
        discord_url = COALESCE(NULLIF(?, ''), discord_url),
        telegram_url = COALESCE(NULLIF(?, ''), telegram_url)
    WHERE collection_id = ?
"""

# Statement for collections that are not in the table yet
insert_query = """
    INSERT INTO all_collections (
        id, collection_id, marketplace, slug_name, full_name, description, category,
        token_standard, created_time, image_url, project_url, twitter_url, instagram_url,
        facebook_url, discord_url, telegram_url, marketplace_fee, royalty_fee
    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""

# Connect to the all_collections database
target_conn = sqlite3.connect(target_db)
try:
    target_cursor = target_conn.cursor()

    # Snapshot {collection_id: category} of the rows present before this merge
    target_cursor.execute("SELECT collection_id, category FROM all_collections")
    existing_categories = dict(target_cursor.fetchall())

    # Highest id in use (0 for an empty table); new rows continue from max_id + 1
    target_cursor.execute("SELECT MAX(id) FROM all_collections")
    max_id = target_cursor.fetchone()[0] or 0

    # Read the OpenSea rows up front so the source connection can be released early.
    # NOTE(review): the source columns are named twitter_username / instagram_username
    # but are stored unchanged in twitter_url / instagram_url - confirm that is intended.
    opensea_conn = sqlite3.connect(opensea_db)
    try:
        opensea_cursor = opensea_conn.cursor()
        opensea_cursor.execute("""
            SELECT contract_info, collection_slug, name, description, category, image_url, project_url,
                   twitter_username, instagram_username, discord_url, telegram_url
            FROM collections_opensea_0803
        """)
        rows = opensea_cursor.fetchall()
    finally:
        opensea_conn.close()

    # Process and insert/update data
    for (contract_info, collection_slug, name, description, category, image_url, project_url,
         twitter, instagram, discord, telegram) in rows:

        collection_id = contract_info  # collection_id matches contract_info exactly

        if collection_id in existing_categories:
            # Keep a category that is already set; otherwise take OpenSea's
            category_to_update = existing_categories[collection_id] or category

            target_cursor.execute(update_query, (
                category_to_update, image_url, project_url, twitter, instagram, "",
                discord, telegram, collection_id
            ))
        else:
            # The snapshot above does not include rows inserted by this loop, so ask
            # the table itself: a contract_info repeated in the source is inserted once.
            target_cursor.execute("SELECT id FROM all_collections WHERE collection_id = ?", (collection_id,))
            if target_cursor.fetchone() is None:
                max_id += 1
                target_cursor.execute(insert_query, (
                    max_id, collection_id, "OpenSea", collection_slug, name, description, category,
                    "", "", image_url, project_url, twitter, instagram, "", discord, telegram, None, None
                ))

    target_conn.commit()
finally:
    # Close the target database whether or not the merge succeeded
    target_conn.close()

print("Merge from collections_opensea.db to all_collections.db completed successfully.")

Merge from collections_opensea.db to all_collections.db completed successfully.


Input OpenSea top collections

In [7]:
# File paths
top_opensea_db = "top_collections_opensea.db"
target_db = "all_collections.db"

# Statement used when the collection_id is already present.
# NOTE(review): token_standard, image_url and facebook_url are overwritten with ''
# because this source does not provide them, which discards values written by
# earlier imports - confirm that a full overwrite is really wanted.
update_query = """
    UPDATE all_collections
    SET marketplace = ?, slug_name = ?, full_name = ?, description = ?, category = ?,
        token_standard = ?, created_time = ?, image_url = ?, project_url = ?, twitter_url = ?,
        instagram_url = ?, facebook_url = ?, discord_url = ?, telegram_url = ?,
        marketplace_fee = ?, royalty_fee = ?
    WHERE collection_id = ?
"""

# Statement used for collections that are not in the table yet
insert_query = """
    INSERT INTO all_collections (
        id, collection_id, marketplace, slug_name, full_name, description, category,
        token_standard, created_time, image_url, project_url, twitter_url, instagram_url,
        facebook_url, discord_url, telegram_url, marketplace_fee, royalty_fee
    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""

# Connect to the all_collections database
target_conn = sqlite3.connect(target_db)
try:
    target_cursor = target_conn.cursor()

    # Highest id in use (0 for an empty table); new rows continue from max_id + 1.
    # Existence is checked per row against the table itself, so the full table is
    # no longer loaded into memory beforehand.
    target_cursor.execute("SELECT MAX(id) FROM all_collections")
    max_id = target_cursor.fetchone()[0] or 0

    # Read the top-collection rows up front so the source connection can be released early
    top_conn = sqlite3.connect(top_opensea_db)
    try:
        top_cursor = top_conn.cursor()
        top_cursor.execute("""
            SELECT contracts_address, contracts_chain, collection_slug, name, description, category,
                   created_date, project_url, twitter_username, instagram_username,
                   discord_url, telegram_url, opensea_fee, royalty
            FROM collections
        """)
        rows = top_cursor.fetchall()
    finally:
        top_conn.close()

    # Process and insert/update data
    for (contracts_address, contracts_chain, collection_slug, name, description, category,
         created_time, project_url, twitter, instagram, discord, telegram,
         marketplace_fee, royalty_fee) in rows:

        # Create collection_id by combining contracts_address and contracts_chain
        collection_id = f"{contracts_address}:{contracts_chain}"

        # Look the id up in the table so rows inserted earlier in this loop are seen too
        target_cursor.execute("SELECT id FROM all_collections WHERE collection_id = ?", (collection_id,))
        existing_entry = target_cursor.fetchone()

        if existing_entry:
            # collection_id exists: overwrite every column except id and collection_id
            target_cursor.execute(update_query, (
                "OpenSea", collection_slug, name, description, category,
                "", created_time, "", project_url, twitter, instagram, "", discord, telegram,
                marketplace_fee, royalty_fee, collection_id
            ))
        else:
            # Insert as a new row
            max_id += 1
            target_cursor.execute(insert_query, (
                max_id, collection_id, "OpenSea", collection_slug, name, description, category,
                "", created_time, "", project_url, twitter, instagram, "", discord, telegram,
                marketplace_fee, royalty_fee
            ))

    target_conn.commit()
finally:
    # Close the target database whether or not the merge succeeded
    target_conn.close()

print("Merge from top_collections_opensea.db to all_collections.db completed successfully.")

Merge from top_collections_opensea.db to all_collections.db completed successfully.


Input MagicEden

In [8]:
# File paths
magiceden_db = "collections_magiceden.db"
target_db = "all_collections.db"

# MagicEden rows are stored without a collection_id (NULL)
insert_query = """
    INSERT INTO all_collections (
        id, collection_id, marketplace, slug_name, full_name, description, category,
        token_standard, created_time, image_url, project_url, twitter_url, instagram_url,
        facebook_url, discord_url, telegram_url, marketplace_fee, royalty_fee
    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""

# Connect to the all_collections database
target_conn = sqlite3.connect(target_db)
try:
    target_cursor = target_conn.cursor()

    # Highest id in use (0 for an empty table); new rows continue from max_id + 1
    target_cursor.execute("SELECT MAX(id) FROM all_collections")
    max_id = target_cursor.fetchone()[0] or 0

    # Load existing slug_names from all_collections to avoid duplicates.
    # NOTE(review): the set covers every marketplace, so a MagicEden symbol that equals
    # another marketplace's slug_name is skipped as well - confirm that is intended.
    target_cursor.execute("SELECT slug_name FROM all_collections")
    existing_slugs = {row[0] for row in target_cursor.fetchall()}

    # Read the MagicEden rows up front so the source connection can be released early
    magiceden_conn = sqlite3.connect(magiceden_db)
    try:
        magiceden_cursor = magiceden_conn.cursor()
        magiceden_cursor.execute("""
            SELECT symbol, name, description, categories, image, website, twitter, discord
            FROM magiceden_collections
        """)
        rows = magiceden_cursor.fetchall()
    finally:
        magiceden_conn.close()

    # Process and insert data
    for symbol, name, description, categories, image, website, twitter, discord in rows:

        if symbol in existing_slugs:
            print(f"Skipping duplicate entry for slug_name: {symbol}")  # Log if a duplicate is found
            continue  # Skip if slug_name already exists

        max_id += 1  # Increment ID for new entry

        target_cursor.execute(insert_query, (
            max_id, None, "MagicEden", symbol, name, description, categories,
            "", "", image, website, twitter, "", "", discord, "", None, None
        ))

        # Remember the slug so a symbol repeated later in the source is skipped too
        existing_slugs.add(symbol)

    target_conn.commit()
finally:
    # Close the target database whether or not the merge succeeded
    target_conn.close()

print("Merge from collections_magiceden.db to all_collections.db completed successfully.")

Skipping duplicate entry for slug_name: sandwatch
Skipping duplicate entry for slug_name: motivated
Skipping duplicate entry for slug_name: lazzzy
Skipping duplicate entry for slug_name: seasons
Skipping duplicate entry for slug_name: inkstorm
Skipping duplicate entry for slug_name: geometric
Skipping duplicate entry for slug_name: kumis
Skipping duplicate entry for slug_name: owls
Skipping duplicate entry for slug_name: gb
Skipping duplicate entry for slug_name: moli
Skipping duplicate entry for slug_name: asr
Skipping duplicate entry for slug_name: notes
Skipping duplicate entry for slug_name: 01
Skipping duplicate entry for slug_name: snackiverse
Skipping duplicate entry for slug_name: turbokids
Skipping duplicate entry for slug_name: beramilio
Skipping duplicate entry for slug_name: moods
Skipping duplicate entry for slug_name: eliza
Skipping duplicate entry for slug_name: tetra
Skipping duplicate entry for slug_name: pieces
Skipping duplicate entry for slug_name: dana
Skipping dup

Input Atomic

In [9]:
# File paths
atomic_db = "collections_atomic.db"
target_db = "all_collections.db"

# Keys read from the "socials" object, in the order of the target URL columns
_ATOMIC_SOCIAL_KEYS = ("twitter", "instagram", "facebook", "discord", "telegram")


def _atomic_links(collection_name, data_json):
    """Extract (project_url, twitter, instagram, facebook, discord, telegram) from data_json.

    Every element defaults to None. Malformed JSON prints a warning and keeps
    whatever was extracted before the failure. A "socials" entry may be a
    JSON-encoded string or an already decoded object; any other shape
    (null, list, number) is ignored instead of raising.
    """
    project_url = None
    socials = {}

    if data_json:
        try:
            json_data = json.loads(data_json)
            if isinstance(json_data, dict):
                project_url = json_data.get("url")

                raw_socials = json_data.get("socials", "{}")
                if isinstance(raw_socials, str):
                    raw_socials = json.loads(raw_socials)  # Convert string to dictionary
                if isinstance(raw_socials, dict):
                    socials = raw_socials
        except json.JSONDecodeError:
            print(f"Warning: Failed to parse JSON for collection {collection_name}")

    return (project_url, *(socials.get(key) for key in _ATOMIC_SOCIAL_KEYS))


# Atomic rows are stored without a collection_id (NULL)
insert_query = """
    INSERT INTO all_collections (
        id, collection_id, marketplace, slug_name, full_name, description, category,
        token_standard, created_time, image_url, project_url, twitter_url, instagram_url,
        facebook_url, discord_url, telegram_url, marketplace_fee, royalty_fee
    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""

# Connect to the all_collections database
target_conn = sqlite3.connect(target_db)
try:
    target_cursor = target_conn.cursor()

    # Highest id in use (0 for an empty table); new rows continue from max_id + 1
    target_cursor.execute("SELECT MAX(id) FROM all_collections")
    max_id = target_cursor.fetchone()[0] or 0

    # Read the Atomic rows up front so the source connection can be released early
    atomic_conn = sqlite3.connect(atomic_db)
    try:
        atomic_cursor = atomic_conn.cursor()
        atomic_cursor.execute("""
            SELECT collection_name, name, created_at_time, market_fee, data_json
            FROM atomic_collections
        """)
        rows = atomic_cursor.fetchall()
    finally:
        atomic_conn.close()

    # Process and insert data
    for collection_name, name, created_at_time, market_fee, data_json in rows:

        # full_name is NOT NULL in the target table: fall back to collection_name
        # when name is missing or blank
        full_name = name if name and name.strip() else collection_name

        (project_url, twitter_url, instagram_url,
         facebook_url, discord_url, telegram_url) = _atomic_links(collection_name, data_json)

        # Increment ID for new entry
        max_id += 1

        target_cursor.execute(insert_query, (
            max_id, None, "Atomic", collection_name, full_name, "", "",
            "", created_at_time, "", project_url, twitter_url, instagram_url, facebook_url,
            discord_url, telegram_url, market_fee, None
        ))

    target_conn.commit()
finally:
    # Close the target database whether or not the merge succeeded
    target_conn.close()

print("Merge from collections_atomic.db to all_collections.db completed successfully.")

Merge from collections_atomic.db to all_collections.db completed successfully.


Data Analysis

In [1]:
import pandas as pd

def descriptive_stats(db_path="all_collections.db"):
    """Print summary statistics for the all_collections table.

    Reports the DataFrame info, row counts per marketplace, the 20 most
    frequent category and token_standard values, and a chain breakdown
    derived from collection_id.

    Args:
        db_path: Path of the SQLite database that contains all_collections.

    Returns:
        None. Everything is written to standard output.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Reads the entire table into memory, which can be large for millions of rows
        df = pd.read_sql("SELECT * FROM all_collections", conn)
    finally:
        conn.close()

    print("=== Basic Info ===")
    # info() writes its report itself and returns None, so it must not be wrapped
    # in print() (that would add a stray "None" line to the output)
    df.info()

    print("\n=== Marketplace Counts ===")
    print(df['marketplace'].value_counts(dropna=False))

    # If 'category' is stored in a single column
    print("\n=== Category Counts (top 20) ===")
    print(df['category'].value_counts(dropna=True).head(20))

    print("\n=== Distinct Chains ===")
    # The chain is taken as the text after the last ':' of collection_id
    # (e.g. '0x123abc:ethereum' -> 'ethereum'); ids without ':' and missing
    # ids are reported as 'unknown'.
    if 'collection_id' in df.columns:
        df['chain'] = df['collection_id'].apply(lambda x: x.split(':')[-1] if ':' in str(x) else 'unknown')
        print(df['chain'].value_counts())

    print("\n=== Token Standard Counts (top 20) ===")
    print(df['token_standard'].value_counts(dropna=True).head(20))

In [4]:
# Run the summary against the merged database. Inside a notebook cell __name__ is
# "__main__", so this guard passes and the report is printed when the cell runs.
if __name__ == "__main__":
    descriptive_stats("all_collections.db")

=== Basic Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5547023 entries, 0 to 5547022
Data columns (total 18 columns):
 #   Column           Dtype  
---  ------           -----  
 0   id               int64  
 1   collection_id    object 
 2   marketplace      object 
 3   slug_name        object 
 4   full_name        object 
 5   description      object 
 6   category         object 
 7   token_standard   object 
 8   created_time     object 
 9   image_url        object 
 10  project_url      object 
 11  twitter_url      object 
 12  instagram_url    object 
 13  facebook_url     object 
 14  discord_url      object 
 15  telegram_url     object 
 16  marketplace_fee  float64
 17  royalty_fee      float64
dtypes: float64(2), int64(1), object(15)
memory usage: 761.8+ MB
None

=== Marketplace Counts ===
marketplace
Rarible      4201341
OpenSea      1207071
Atomic        110142
MagicEden      28469
Name: count, dtype: int64

=== Category Counts (top 20) ===
category
     

In [2]:
from collections import Counter
import pandas as pd

def _split_categories(raw_value):
    """Return the category names contained in one stored category value.

    A value that looks like a JSON list (e.g. '["art", "pfps"]') is decoded and
    its string items are returned; null and other non-string items are dropped.
    Any other value, or a list-like value that is not valid JSON, is returned
    as a single category.
    """
    text = raw_value.strip()
    if text.startswith("[") and text.endswith("]"):
        try:
            parsed = json.loads(text)
        except ValueError:  # json.JSONDecodeError is a ValueError
            return [text]  # Fallback if parsing fails
        return [item for item in parsed if isinstance(item, str)]
    return [text]


def descriptive_category_counts(db_path="all_collections.db"):
    """Count how many collections carry each category.

    Category names are stripped and lower-cased before counting, a collection
    with several categories counts once for each of them, and NULL or empty
    categories are ignored.

    Args:
        db_path: Path of the SQLite database that contains all_collections.

    Returns:
        DataFrame with columns "Category" and "Collection Count", ordered by
        descending count. The same table is also printed.
    """
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql("SELECT category FROM all_collections", conn)
    finally:
        conn.close()

    category_counter = Counter()

    for cat_val in df['category'].dropna():
        for cat in _split_categories(cat_val):
            normalized = cat.strip().lower()
            if normalized:  # Skip empty and whitespace-only names
                category_counter[normalized] += 1

    # most_common() already yields (category, count) pairs in descending count order
    cat_df = pd.DataFrame(category_counter.most_common(), columns=["Category", "Collection Count"])

    print("\n=== Aggregated Category Counts ===")
    print(cat_df)

    return cat_df

# Run the aggregation on the default database. The function prints the table, and
# because the call is the last expression of the cell the returned DataFrame is
# displayed a second time as the cell output.
descriptive_category_counts()


=== Aggregated Category Counts ===
               Category  Collection Count
0                   art             54813
1                  pfps             32175
2           photography              5259
3           memberships              3831
4                gaming              3415
5                 games              3086
6        virtual-worlds              2773
7        virtual_worlds              2055
8                 music              1137
9          domain-names               585
10  sports-collectibles               500
11               sports               254
12            launchpad               121
13         unclassified                46
14                  pfp                 5


Unnamed: 0,Category,Collection Count
0,art,54813
1,pfps,32175
2,photography,5259
3,memberships,3831
4,gaming,3415
5,games,3086
6,virtual-worlds,2773
7,virtual_worlds,2055
8,music,1137
9,domain-names,585
