In [10]:
import pandas as pd
from IPython.display import display

df = pd.read_csv("../data/metazoa_nodes.csv", dtype={"ott_id": str, "parent_ott_id": str})

print("Total rows:", len(df))
display(df.head())

# Named nodes missing OTT IDs
named = df[df['name'].notnull() & (df['name'] != '')]
missing_ott = named[named['ott_id'].isna()]
print("Named nodes without ott_id:", len(missing_ott))
if len(missing_ott) > 0:
    display(missing_ott.sample(min(5, len(missing_ott))))


# Non-root nodes missing parent IDs
nonroot = df[df['ott_id'].notna() & (df['ott_id'] != "691846")]
missing_parent = nonroot[nonroot['parent_ott_id'].isna()]
print("Non-root nodes without parent_ott_id:", len(missing_parent))
if len(missing_parent) > 0:
    display(missing_parent.sample(min(5, len(missing_parent))))


# Invalid parent IDs
valid_ids = set(df['ott_id'].dropna())
bad_refs = df[df['parent_ott_id'].notna() & ~df['parent_ott_id'].isin(valid_ids)]
print("Invalid parent_ott_id references:", len(bad_refs))
if len(bad_refs) > 0:
    display(bad_refs.sample(min(5, len(bad_refs))))


print("\nTaxonomic rank distribution:")
df['rank_depth'] = df['name'].str.count(' ')
depth_counts = df['rank_depth'].value_counts().sort_index()
display(depth_counts)


Total rows: 2870658


Unnamed: 0,ott_id,name,parent_ott_id
0,691846,Metazoa,
1,67819,Porifera,691846.0
2,4939695,Clathrinidae environmental samples,67819.0
3,5260763,unclassified Clathrinidae,67819.0
4,5261574,Clathrinidae sp. BC12,67819.0


Named nodes without ott_id: 0
Non-root nodes without parent_ott_id: 0
Invalid parent_ott_id references: 0

Taxonomic rank distribution:


rank_depth
0      259177
1     1907433
2      603566
3       69287
4       23795
5        5478
6        1419
7         400
8          62
9          19
10         11
11          4
12          6
13          1
Name: count, dtype: int64

In [7]:
import sqlite3
import pandas as pd
from IPython.display import display

# Connect to the enriched metadata database
db_path = '../data/metazoa.db'
conn = sqlite3.connect(db_path)

# Load nodes and metadata tables
nodes_df = pd.read_sql_query('SELECT ott_id, name, parent_ott_id FROM nodes', conn, dtype={'ott_id': str, 'parent_ott_id': str})
metadata_df = pd.read_sql_query('SELECT * FROM metadata', conn, dtype={'ott_id': str})

# Merge into a single dataframe
combined = nodes_df.merge(metadata_df, on='ott_id', how='inner')  # only rows with metadata

# Preview counts
print(f"Nodes with metadata: {len(combined)}/{len(nodes_df)} ({len(combined)/len(nodes_df):.2%})")

# Display a sample with inline images in a single HTML table
from IPython.display import HTML

def make_image_html(url):
    return f'<img src="{url}" width="100"></img>' if pd.notna(url) else ''

# Prepare sample subset
sample_df = combined.sample(min(10, len(combined))).copy()
# Create an HTML column for the images
sample_df['image'] = sample_df['image_url'].apply(make_image_html)
# Reorder columns for display
display_columns = ['ott_id', 'name', 'common_name', 'description', 'image']
# Render as HTML
display(HTML(sample_df[display_columns].to_html(escape=False, index=False)))

conn.close()


Nodes with metadata: 44/2870658 (0.00%)


ott_id,name,common_name,description,image
3655279,Ophiopthalmus,Ophiopthalmus,genus of echinoderms,
7802569,Oedichirus procerus,Oedichirus procerus,species of beetle,
3058699,Peridaedala triangulosa,Peridaedala triangulosa,species of insect,
1047152,Stylochaeta scirtetica,Stylochaeta scirtetica,species of gastrotrichs,
5025975,Oxylipeurus tetraonis,Oxylipeurus tetraonis,species of insect,
3552560,Sanogasta rufithorax,Sanogasta rufithorax,species of arachnid,
3678150,Zignisis alternata,Zignisis alternata,species of cnidarian,
7965841,Limicolaria cailliaudi,Limicolaria cailliauda,species of gastropod,
418328,Xenodon neuwiedii,Xenodon neuwiedii,species of reptile,
4393300,Phytoliriomyza clara,Phytoliriomyza clara,species of insect,


TypeError: 'NoneType' object is not callable