In [1]:
import sqlite3
import pandas as pd

In [2]:
df = pd.read_excel('decoded_final.xlsx')

In [3]:
df.head(2)

Unnamed: 0,id,slug,Ethereum Mainnet,Solana Mainnet,Polygon Sidechain,Base Layer 2,Arbitrum Layer 2,Others,ERC-721,ERC-1155,...,Multiple 1:n,Price Appreciation,Staking,Borrowing,Rights,Social Status,Sales Price,Royalties,Loyalty,if_royalty:fee
0,1,zora-posts-21385,0,0,0,0,0,1,0,1,...,0,1,0,0,0,0,1,1,0,0.05
1,2,zora-posts-19156,0,0,0,0,0,1,0,1,...,0,1,0,0,0,0,1,1,0,0.05


In [4]:
# Drop Binary columns for clustering to reduce dimensionality
columns_to_drop = [
    "Other ERC standards", 
    "Top to bottom", 
    "Bottom to top",
    "Bridgeable",
    "Non-Bridgeable",
    "Real Estate",
    "Unlimited",
    "Limited",
    "Digital",
    "Fractional n:1",
    "Borrowing"
]

df_reduced = df.drop(columns=columns_to_drop, errors="ignore")

In [5]:
df_reduced.head(2)

Unnamed: 0,id,slug,Ethereum Mainnet,Solana Mainnet,Polygon Sidechain,Base Layer 2,Arbitrum Layer 2,Others,ERC-721,ERC-1155,...,Single 1:1,Multiple 1:n,Price Appreciation,Staking,Rights,Social Status,Sales Price,Royalties,Loyalty,if_royalty:fee
0,1,zora-posts-21385,0,0,0,0,0,1,0,1,...,1,0,1,0,0,0,1,1,0,0.05
1,2,zora-posts-19156,0,0,0,0,0,1,0,1,...,1,0,1,0,0,0,1,1,0,0.05


Determine number of clusters

In [6]:
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
from math import comb
import warnings

In [7]:
def c_index_precomputed(dist_matrix, labels):
    """
    C-index (Hubert & Levin, 1976) using a precomputed distance matrix.
    dist_matrix: a 2D np.array of shape (n_samples, n_samples)
    labels: cluster labels for each row in dist_matrix
    Returns a float, lower = better.
    """
    n_samples = dist_matrix.shape[0]
    
    # Distances for pairs in the same cluster
    same_cluster_dists = []
    for i in range(n_samples):
        for j in range(i+1, n_samples):
            if labels[i] == labels[j]:
                same_cluster_dists.append(dist_matrix[i, j])
    
    S = np.sum(same_cluster_dists)
    m = len(same_cluster_dists)
    if m == 0:
        return np.nan  # can't compute if all items in different clusters
    
    # All pairwise distances
    all_dists = []
    for i in range(n_samples):
        for j in range(i+1, n_samples):
            all_dists.append(dist_matrix[i,j])
    all_dists_sorted = np.sort(all_dists)
    
    S_min = np.sum(all_dists_sorted[:m])
    S_max = np.sum(all_dists_sorted[-m:])
    
    c_idx = (S - S_min) / (S_max - S_min + 1e-12)
    return c_idx


def point_biserial_score_precomputed(dist_matrix, labels):
    """
    Point Biserial Correlation (Milligan & Cooper, 1985) with a precomputed distance matrix.
    Higher = better (better separation).
    """
    n_samples = dist_matrix.shape[0]
    
    same_dists = []
    diff_dists = []
    
    # Partition distances by same-cluster vs different-cluster
    for i in range(n_samples):
        for j in range(i+1, n_samples):
            if labels[i] == labels[j]:
                same_dists.append(dist_matrix[i, j])
            else:
                diff_dists.append(dist_matrix[i, j])

    n_w = len(same_dists)  # number of within-cluster pairs
    n_b = len(diff_dists)  # number of between-cluster pairs
    if n_w == 0 or n_b == 0:
        return np.nan

    d_w = np.mean(same_dists)
    d_b = np.mean(diff_dists)

    all_dists = same_dists + diff_dists
    s = np.std(all_dists, ddof=1)

    # formula variant:
    # r = ((d_b - d_w)/s) * sqrt( n_w * n_b ) / comb(n_samples,2)
    # Some references adjust the sqrt(...) factor, but the main idea is that
    # higher r indicates better separation.
    numerator = (d_b - d_w)
    factor = np.sqrt((n_w * n_b) / comb(n_samples,2)**2)
    r = (numerator/(s + 1e-12)) * factor
    
    return r


def evaluate_clusters_jaccard(df, k_range=range(2, 11)):
    """
    1. Computes a Jaccard distance matrix for the DataFrame (all numeric/binary columns).
    2. For each k in k_range, runs AgglomerativeClustering with 'complete' linkage
       and 'precomputed' affinity.
    3. Computes c-index and point biserial correlation on the resulting labels.
    4. Returns a DataFrame of results.
    """
    # Build the Jaccard distance matrix
    X = df.values
    dist_matrix = pairwise_distances(X, metric='jaccard')
    n_samples = len(df)
    
    results = []
    for k in k_range:
        # We can't use 'ward' with precomputed distance => use 'complete' (or 'average')
        clusterer = AgglomerativeClustering(
            n_clusters=k,
            affinity='precomputed',
            linkage='complete'
        )
        labels = clusterer.fit_predict(dist_matrix)

        # Evaluate c-index, point biserial
        c_idx = c_index_precomputed(dist_matrix, labels)
        pb = point_biserial_score_precomputed(dist_matrix, labels)

        results.append({
            'k': k,
            'c_index': c_idx,            # lower = better
            'point_biserial': pb         # higher = better
        })
    return pd.DataFrame(results)

In [8]:
if __name__ == "__main__":
    
    df_reduced

    columns_to_exclude = ["id", "slug", "if_royalty:fee"]
    df_clust = df_reduced.drop(columns=columns_to_exclude, errors="ignore")

    # 3. Evaluate clusters with Jaccard + complete linkage
    eval_df = evaluate_clusters_jaccard(df_clust, k_range=range(2, 11))
    print(eval_df)



    k   c_index  point_biserial
0   2  0.365265        0.167452
1   3  0.245171        0.415023
2   4  0.187846        0.512578
3   5  0.186506        0.518100
4   6  0.196151        0.491088
5   7  0.193513        0.494465
6   8  0.203006        0.478998
7   9  0.195023        0.480971
8  10  0.189838        0.483606




Clustering

In [9]:
def cluster_into_5_jaccard_bool(df, columns_to_exclude=None):
    """
    Clusters the given DataFrame (with 0/1 columns) into 6 clusters
    using Jaccard distance + complete linkage. Ensures columns are bool.
    """
    if columns_to_exclude is None:
        columns_to_exclude = ["id", "slug", "if_royalty:fee"]

    # 1) Drop non-feature columns
    df_clust = df.drop(columns=columns_to_exclude, errors="ignore").copy()

    # 2) Convert 0/1 to bool (True/False)
    df_clust = df_clust.apply(pd.to_numeric, errors='coerce')  # handle any str
    df_clust = (df_clust == 1)  # 1->True, 0->False

    # 3) Convert DataFrame to a NumPy array of bools
    X = df_clust.values  # shape (n_samples, n_features), dtype=bool

    # 4) Compute Jaccard distance
    dist_matrix = pairwise_distances(X, metric="jaccard")

    # 5) Cluster with precomputed distance
    clusterer = AgglomerativeClustering(
        n_clusters=5,
        affinity='precomputed',
        linkage='complete'
    )
    labels = clusterer.fit_predict(dist_matrix)

    # 6) Append labels to the original DataFrame
    df_result = df.copy()
    df_result["cluster_label"] = labels

    return df_result

In [10]:
if __name__ == "__main__":
    columns_to_exclude = ["id", "slug", "if_royalty:fee"]
    df_clustered = cluster_into_5_jaccard_bool(df_reduced, columns_to_exclude=columns_to_exclude)

    df_clustered.to_excel("clustered_6.xlsx", index=False)
    print("Clustering done and 'cluster_label' column appended.")



Clustering done and 'cluster_label' column appended.


In [11]:
df_clustered.head(3)

Unnamed: 0,id,slug,Ethereum Mainnet,Solana Mainnet,Polygon Sidechain,Base Layer 2,Arbitrum Layer 2,Others,ERC-721,ERC-1155,...,Multiple 1:n,Price Appreciation,Staking,Rights,Social Status,Sales Price,Royalties,Loyalty,if_royalty:fee,cluster_label
0,1,zora-posts-21385,0,0,0,0,0,1,0,1,...,0,1,0,0,0,1,1,0,0.05,4
1,2,zora-posts-19156,0,0,0,0,0,1,0,1,...,0,1,0,0,0,1,1,0,0.05,4
2,3,zora-1531,0,0,0,0,0,1,1,0,...,0,1,0,0,0,1,0,0,,0


Evaluating the clusters

In [12]:
from scipy.stats import chi2_contingency

def chi_square_cluster_test(df, cluster_col="cluster_label", exclude_cols=None):
    """
    For each (binary) dimension, build a 6x2 contingency table,
    perform a chi-square test. If the table is degenerate (row or col is all zero),
    skip the test.
    """
    if exclude_cols is None:
        exclude_cols = []

    # Identify possible binary columns
    candidate_cols = [
        c for c in df.columns 
        if c not in exclude_cols and c != cluster_col
    ]

    # Gather results
    results = []
    
    # Distinct cluster labels (e.g. 0..5)
    cluster_values = sorted(df[cluster_col].unique())

    for col in candidate_cols:
        # Build 6x2 table
        contingency = [[0, 0] for _ in cluster_values]

        for i in range(len(df)):
            cluster_val = df.iloc[i][cluster_col]
            feature_val = df.iloc[i][col]
            # Convert bool -> int; skip unexpected values
            if isinstance(feature_val, bool):
                feature_val = int(feature_val)
            if feature_val not in [0, 1]:
                feature_val = 0

            row_idx = cluster_values.index(cluster_val)
            contingency[row_idx][feature_val] += 1

        # Check row sums & column sums
        row_sums = [sum(row) for row in contingency]          # length 6
        col_sums = [sum(contingency[r][c] for r in range(len(contingency))) for c in [0,1]]

        # If any row_sums or col_sums is 0 => skip
        if 0 in row_sums or 0 in col_sums:
            # This dimension is degenerate in at least one cluster (or one column).
            # We'll skip or note it as 'no variation'.
            results.append({
                "dimension": col,
                "chi2": None,
                "p_value": None,
                "dof": None,
                "note": "Skipped due to zero row/col"
            })
            continue

        # Now safe to do chi2
        chi2, p, dof, expected = chi2_contingency(contingency)
        results.append({
            "dimension": col,
            "chi2": chi2,
            "p_value": p,
            "dof": dof,
            "note": ""
        })

    res_df = pd.DataFrame(results).sort_values("p_value", na_position="last").reset_index(drop=True)
    return res_df

In [13]:
if __name__ == "__main__":

    df = pd.read_excel("clustered_6.xlsx")

    # Columns we want to exclude from the test (ID, textual columns, etc.)
    exclude_cols = ["id", "slug", "if_royalty:fee"]  

    # Run the chi-square test
    result_df = chi_square_cluster_test(df, cluster_col="cluster_label", exclude_cols=exclude_cols)

    # Print or save the results
    print(result_df.head(40))  # see dimensions by significance
    result_df.to_excel("chi_square_results.xlsx", index=False)

                  dimension        chi2       p_value  dof note
0            Solana Mainnet  185.577258  4.726729e-39    4     
1              Unstructured  179.921579  7.751394e-38    4     
2              Social Media  162.684834  3.882046e-34    4     
3    Non Ethereum Standards  137.119890  1.167222e-28    4     
4             Decentralized  127.042561  1.670173e-26    4     
5                  On Chain  116.709418  2.693563e-24    4     
6                   Mutable  100.985944  6.066368e-21    4     
7                 Immutable  100.985944  6.066368e-21    4     
8                     Cloud  100.378626  8.170291e-21    4     
9                    Rights   75.441381  1.607146e-15    4     
10                    Brand   74.953510  2.038217e-15    4     
11                     Open   67.151294  9.058413e-14    4     
12                   Events   64.037778  4.103322e-13    4     
13  Gaming & Virtual Worlds   59.379178  3.917115e-12    4     
14                  ERC-721   53.445876 

In [14]:
def cluster_dimension_proportions(df, cluster_col="cluster_label", exclude_cols=None):
    """
    Group the DataFrame by 'cluster_label' and calculate the mean (proportion of 1s)
    for each binary dimension. Returns a DataFrame where each row is a cluster,
    and each column is a dimension's proportion of 1s in that cluster.
    """
    if exclude_cols is None:
        exclude_cols = ["id", "slug", "if_royalty:fee", cluster_col]

    # Identify the binary columns to analyze
    dimensions = [c for c in df.columns if c not in exclude_cols]

    # Group by cluster and compute the mean for each dimension
    cluster_means = df.groupby(cluster_col)[dimensions].mean()

    # cluster_means is shape (n_clusters, n_dimensions)
    return cluster_means


if __name__ == "__main__":
    df_clustered = pd.read_excel("clustered_6.xlsx")
    
    # We'll skip some columns that are not binary features
    exclude = ["id", "slug", "if_royalty:fee", "cluster_label"]
    
    props_df = cluster_dimension_proportions(df_clustered, cluster_col="cluster_label", exclude_cols=exclude)
    print("\nProportions of 1 in each dimension, by cluster:\n", props_df)


Proportions of 1 in each dimension, by cluster:
                Ethereum Mainnet  Solana Mainnet  Polygon Sidechain  \
cluster_label                                                        
0                      0.100000        0.000000           0.176923   
1                      0.071429        0.000000           0.285714   
2                      0.021739        0.913043           0.021739   
3                      0.000000        0.250000           0.250000   
4                      0.000000        0.000000           0.000000   

               Base Layer 2  Arbitrum Layer 2    Others   ERC-721  ERC-1155  \
cluster_label                                                                 
0                  0.269231          0.023077  0.453846  0.538462  0.376923   
1                  0.071429          0.071429  0.642857  0.285714  0.214286   
2                  0.000000          0.000000  0.043478  0.043478  0.021739   
3                  0.031250          0.000000  0.531250  0.18750

In [15]:
def highlight_top_features(props_df):
    """
    Given a DataFrame where rows=clusters and columns=dimensions
    (with proportions of 1), compute the global proportion and difference.
    Returns two DataFrames:
      1) global_props: a single row with each dimension's global average
      2) diffs_df: same shape as props_df, where each cell = cluster_prop - global_prop
    """
    # 1) Global proportion = column means
    global_props = props_df.mean(axis=0)

    # 2) Each cell's difference from the global
    diffs_df = props_df.sub(global_props, axis='columns')  # cluster_prop - global_prop

    return global_props.to_frame("global_prop").T, diffs_df

if __name__ == "__main__":
    # Suppose we already computed props_df in the previous snippet
    global_prop, diffs = highlight_top_features(props_df)

    # diffs: each row=cluster, each col=dimension, value = difference
    # If diffs.loc[cluster, dimension] is large +, that dimension is highly characteristic of that cluster.

    # We can see top 5 features for each cluster:
    for cluster_label in diffs.index:
        row = diffs.loc[cluster_label]
        # Sort by largest positive difference
        top_dims = row.sort_values(ascending=False).head(10)
        print(f"\nCluster {cluster_label} top 10 features (above global proportion):")
        print(top_dims)


Cluster 0 top 10 features (above global proportion):
Decentralized    0.401908
Unstructured     0.347512
ERC-721          0.318735
ERC-1155         0.198355
Art              0.182250
Mutable          0.181491
Base Layer 2     0.177458
One-of-One       0.121760
Artist           0.115327
Others.1         0.065008
Name: 0, dtype: float64

Cluster 1 top 10 features (above global proportion):
Rights          0.487629
Fixed           0.439870
Brand           0.365571
Events          0.354983
Social Media    0.327605
Artist          0.195547
Real            0.167081
Membership      0.164901
Physical        0.154037
Collectible     0.153489
Name: 1, dtype: float64

Cluster 2 top 10 features (above global proportion):
Solana Mainnet             0.680435
Social Media               0.532574
Decentralized              0.473815
Open                       0.391681
Collectible                0.364669
Gaming & Virtual Worlds    0.326056
Non Ethereum Standards     0.324381
Royalties                  0

decode most common features to taxonomy

In [16]:
import pandas as pd

In [18]:
DIMENSION_MAP = {
    # ─ Technology layer ──────────────────────────────────────────
    "Blockchain Env": ["Ethereum Mainnet", "Solana Mainnet", "Polygon Sidechain",
                       "Base Layer 2", "Arbitrum Layer 2", "Others"],
    "Token Standard": ["ERC-721", "ERC-1155", "Non Ethereum Standards"],
    "Storage"       : ["On Chain", "Decentralized", "Cloud", "Physical"],
    "Mutability"    : ["Mutable", "Immutable"],
    "Transferability": ["Unlocked", "Locked"],

    # ─ Collection layer ─────────────────────────────────────────
    "Creator"       : ["Artist", "Brand", "Influencer", "Others.1"],
    "Utility"       : ["Art", "Collectible", "Membership", "Gaming & Virtual Worlds",
                       "Domain Names", "Others.2"],
    "Community Eng.": ["Unstructured", "Social Media", "Events"],
    "Supply"        : ["One-of-One", "Fixed", "Open"],

    # ─ Asset layer ──────────────────────────────────────────────
    "Asset Type"    : ["Real"],                 # ‘Digital’ always 1 → dropped
    "Ownership Form": ["Single 1:1", "Multiple 1:n"],
    "Owner Benefits": ["Price Appreciation", "Staking", "Rights", "Social Status"],
    "Creator Benefits": ["Sales Price", "Royalties", "Loyalty"],
}

# -----------------------------------------------------------------------------
# ❷  Drop non‑binary helper columns
# -----------------------------------------------------------------------------
META_COLS = {"id", "slug", "if_royalty:fee", "cluster_label"}
bin_cols  = [c for c in df_clustered.columns if c not in META_COLS]

# Ensure we really have numeric 0/1 (occasionally they arrive as strings)
df_bin = df_clustered[bin_cols].apply(pd.to_numeric, errors="coerce").fillna(0).astype(int)
df_bin["cluster_label"] = df_clustered["cluster_label"]

# -----------------------------------------------------------------------------
# ❸  Compute per‑cluster proportions
# -----------------------------------------------------------------------------
prop = (
    df_bin
      .groupby("cluster_label")         # rows = clusters
      .mean()
)

# -----------------------------------------------------------------------------
# ❹  For every cluster pick the winner(s) inside each dimension
# -----------------------------------------------------------------------------
records = []
for cl, row in prop.iterrows():
    rec = {"Cluster": int(cl)}
    for dim, cols in DIMENSION_MAP.items():
        # skip dimensions that have no valid cols in the dataframe
        valid = [c for c in cols if c in row.index]
        if not valid:
            continue

        top_val  = row[valid].max()
        winners  = [c for c in valid if row[c] == top_val and top_val > 0]
        rec[dim] = ", ".join(winners) if winners else "–"
    records.append(rec)

archetype_summary = (
    pd.DataFrame(records)
      .set_index("Cluster")
      .sort_index()
)

print(archetype_summary.to_markdown())

|   Cluster | Blockchain Env   | Token Standard         | Storage       | Mutability   | Transferability   | Creator   | Utility     | Community Eng.   | Supply     | Asset Type   | Ownership Form   | Owner Benefits     | Creator Benefits   |
|----------:|:-----------------|:-----------------------|:--------------|:-------------|:------------------|:----------|:------------|:-----------------|:-----------|:-------------|:-----------------|:-------------------|:-------------------|
|         0 | Others           | ERC-721                | Decentralized | Mutable      | Unlocked          | Others.1  | Art         | Unstructured     | Fixed      | –            | Single 1:1       | Price Appreciation | Sales Price        |
|         1 | Others           | Non Ethereum Standards | Cloud         | Mutable      | Unlocked          | Brand     | Collectible | Social Media     | Fixed      | Real         | Single 1:1       | Price Appreciation | Sales Price        |
|         2 | Solana Mainnet

In [20]:
def dominant_characteristics(df, cluster_col="cluster_label"):
    """
    Returns a DataFrame where rows = clusters and columns = dimensions.
    Each cell contains the characteristic (column name) with the highest
    proportion of 1‑values inside that cluster.
    """
    clusters = sorted(df[cluster_col].unique())
    result   = pd.DataFrame(index=clusters)

    # pre‑compute per‑cluster proportions
    prop = (
        df.groupby(cluster_col)
          .mean(numeric_only=True)           # share of 1s for every column
    )

    for dim, cols in DIMENSION_MAP.items():
        # keep only columns that actually exist in the dataframe
        valid_cols = [c for c in cols if c in prop.columns]
        if not valid_cols:
            continue

        # argmax per cluster → column name of the dominant characteristic
        dom = prop[valid_cols].idxmax(axis=1)

        result[dim] = dom

    return result

In [22]:
dominant_characteristics(df_clustered)

Unnamed: 0,Blockchain Env,Token Standard,Storage,Mutability,Transferability,Creator,Utility,Community Eng.,Supply,Asset Type,Ownership Form,Owner Benefits,Creator Benefits
0,Others,ERC-721,Decentralized,Mutable,Unlocked,Others.1,Art,Unstructured,Fixed,Real,Single 1:1,Price Appreciation,Sales Price
1,Others,Non Ethereum Standards,Cloud,Mutable,Unlocked,Brand,Collectible,Social Media,Fixed,Real,Single 1:1,Price Appreciation,Sales Price
2,Solana Mainnet,Non Ethereum Standards,Decentralized,Mutable,Unlocked,Others.1,Collectible,Social Media,Open,Real,Single 1:1,Price Appreciation,Sales Price
3,Others,Non Ethereum Standards,Cloud,Mutable,Unlocked,Others.1,Others.2,Unstructured,One-of-One,Real,Single 1:1,Price Appreciation,Sales Price
4,Others,Non Ethereum Standards,On Chain,Immutable,Unlocked,Others.1,Others.2,Unstructured,Fixed,Real,Single 1:1,Price Appreciation,Sales Price
