# UniswapV3 retail vs sophisticated participants

In [1]:
import os

import numpy as np
import pandas as pd
from sqlalchemy import create_engine


In [3]:
# Take the connection string from the DATABASE_URL environment variable so that
# real credentials never have to be written into the notebook. The literal is
# kept only as a fallback, so the cell behaves as before when the variable is unset.
conn = create_engine(
    os.environ.get('DATABASE_URL', 'postgresql://user:pass@localhost:5432/db')
)

# Load every deal joined to its blockchain event, excluding events of type 'Swap'.
# `SELECT *` returns the columns of both joined tables.
# NOTE(review): if the two tables share column names the frame will contain
# duplicate labels -- confirm against the schema.
df = pd.read_sql("""SELECT * 
FROM uniswap_v3_deals uvd 
INNER JOIN uniswap_v3_events uve ON uvd.blockchain_event_id = uve.id
WHERE uve.type != 'Swap'""", conn)

Let's look at the dataset we have.

In [None]:
df.head()

In [None]:
df.columns

## Phase 1: Sophisticated criteria

We use several criteria to decide whether a wallet is sophisticated:

- the wallet is in the top 5% (at or above the 95th percentile) by `max(mint_tx)` per wallet
- the wallet has at least one position with 1 million USD or more minted
- the wallet is in the top 5% by `count(mint_tx)` per wallet
- the wallet is in the top 5% by `count(distinct(pool_address))` per wallet
- the wallet is in the top 5% by `count(liquidity_tx)` per wallet

### Criterion 1: Max mint volume in one tx per wallet

In [None]:
# Largest single Mint (token A + token B volume, in USD) for each wallet.
wallet_max_mint_volume_usd = (
    df[df['type'] == 'Mint']
    .assign(max_mint_volume_usd=lambda x: x['volume_tokens_a_in_usd'] + x['volume_tokens_b_in_usd'])
    .groupby('wallet_address')['max_mint_volume_usd']
    .max()
    .reset_index()
)

# The criterion is "top 5%", so the cut-off is the 95th percentile (q=0.95),
# matching the variable name and the label printed below.
wallet_max_mint_volume_usd_percentile_95 = wallet_max_mint_volume_usd['max_mint_volume_usd'].quantile(0.95)
print(f"95th percentile threshold: ${wallet_max_mint_volume_usd_percentile_95:,.2f}")

# Flag wallets at or above the threshold.
wallet_max_mint_volume_usd['is_sophisticated_by_max_mint_volume_usd'] = np.where(
    wallet_max_mint_volume_usd['max_mint_volume_usd'] >= wallet_max_mint_volume_usd_percentile_95,
    1,  # Top 5% (sophisticated)
    0   # Bottom 95% (retail)
)
wallet_max_mint_volume_usd.sort_values("max_mint_volume_usd")

Binary criterion `is_sophisticated_by_max_mint_volume_usd`:
- 0 is retail
- 1 is sophisticated

### Criterion 2: wallet has at least one position with 1 million usd minted

In [None]:
# Mint events only, each annotated with its combined USD volume (token A + token B).
mints_with_volume = df.loc[df['type'] == 'Mint'].assign(
    mint_volume=lambda mints: mints['volume_tokens_a_in_usd'] + mints['volume_tokens_b_in_usd']
)

# Per wallet: the largest single mint, and a 0/1 flag that is 1 when at least
# one mint reached 1,000,000 USD.
wallet_mint_volume_1_mln_usd = (
    mints_with_volume
    .groupby('wallet_address')
    .agg(
        max_mint_volume_usd_for_criterion_2=('mint_volume', 'max'),
        is_sophisticated_by_mint_volume_1_mln_usd=(
            'mint_volume',
            lambda volumes: int(volumes.ge(1_000_000).any()),
        ),
    )
    .reset_index()
)
wallet_mint_volume_1_mln_usd.sort_values(by="max_mint_volume_usd_for_criterion_2")

Binary criterion `is_sophisticated_by_mint_volume_1_mln_usd`:
- 0 is retail
- 1 is sophisticated

### Criterion 3: top 5% of wallets by number of mint transactions

In [None]:
# Number of Mint events recorded for each wallet.
mint_events = df.loc[df['type'] == 'Mint']
wallet_mint_txes_count = (
    mint_events
    .groupby('wallet_address')
    .size()
    .rename('mint_tx_count')
    .reset_index()
)

# 95th-percentile cut-off of the per-wallet mint counts.
wallet_mint_txes_count_percentile_95 = wallet_mint_txes_count['mint_tx_count'].quantile(0.95)
print(f"95th percentile threshold: {wallet_mint_txes_count_percentile_95:,.2f}")

# 1 = at or above the cut-off (sophisticated), 0 = below it (retail).
wallet_mint_txes_count['is_sophisticated_by_mint_txes_count'] = (
    wallet_mint_txes_count['mint_tx_count']
    .ge(wallet_mint_txes_count_percentile_95)
    .astype(int)
)
wallet_mint_txes_count.head()

Binary criterion `is_sophisticated_by_mint_txes_count`:
- 0 is retail
- 1 is sophisticated

### Criterion 4: top 5% of wallets by number of distinct pools the wallet participated in


In [None]:
# Number of distinct pools each wallet appears with (all loaded event types).
distinct_pool_interacted_count = (
    df.groupby('wallet_address')['pool_address']
    .nunique()
    .rename('distinct_pool_interacted_count')
    .reset_index()
)

# 95th-percentile cut-off of the per-wallet distinct-pool counts.
distinct_pool_interacted_count_percentile_95 = (
    distinct_pool_interacted_count["distinct_pool_interacted_count"].quantile(0.95)
)
print(f"95th percentile threshold: {distinct_pool_interacted_count_percentile_95:,.2f}")

# 1 = at or above the cut-off (sophisticated), 0 = below it (retail).
distinct_pool_interacted_count['is_sophisticated_by_distinct_pools_count'] = (
    distinct_pool_interacted_count['distinct_pool_interacted_count']
    .ge(distinct_pool_interacted_count_percentile_95)
    .astype(int)
)

distinct_pool_interacted_count



### Criterion 5: top 5th percentile of count of interactions with UniswapV3 pools

In [None]:
# Number of loaded events (of any type) for each wallet.
wallet_all_txes_count = (
    df.groupby('wallet_address')
    .size()
    .rename('tx_count')
    .reset_index()
)

# 95th-percentile cut-off of the per-wallet event counts.
wallet_all_txes_count_percentile_95 = wallet_all_txes_count['tx_count'].quantile(0.95)
print(f"95th percentile threshold: {wallet_all_txes_count_percentile_95:,.2f}")

# 1 = at or above the cut-off (sophisticated), 0 = below it (retail).
wallet_all_txes_count['is_sophisticated_by_all_txes_count'] = (
    wallet_all_txes_count['tx_count']
    .ge(wallet_all_txes_count_percentile_95)
    .astype(int)
)
wallet_all_txes_count

## Sum up: who is sophisticated

A wallet is classified as sophisticated if it meets two or more of the criteria above.

In [None]:
# Combine the five per-wallet criterion tables on wallet_address. Outer joins
# keep every wallet that appears in any table; missing entries become NaN.
merged = (
    wallet_max_mint_volume_usd
    .merge(wallet_mint_volume_1_mln_usd, how="outer", on="wallet_address")
    .merge(wallet_mint_txes_count, how="outer", on="wallet_address")
    .merge(wallet_all_txes_count, how="outer", on="wallet_address")
    .merge(distinct_pool_interacted_count, how="outer", on="wallet_address")
)

# Keep only wallets that have a value for every column.
cleaned = merged.dropna()

cleaned

In [None]:
# The five binary criterion flags computed earlier in the notebook.
criteria_flag_columns = [
    'is_sophisticated_by_max_mint_volume_usd',
    'is_sophisticated_by_mint_volume_1_mln_usd',
    'is_sophisticated_by_mint_txes_count',
    'is_sophisticated_by_all_txes_count',
    'is_sophisticated_by_distinct_pools_count',
]

# A wallet is sophisticated when it satisfies two or more criteria, as the
# notebook text states; the threshold is therefore 2, not 1.
is_sophisticated_bool = cleaned.assign(
    is_sophisticated_bool=lambda x: x[criteria_flag_columns].sum(axis=1) >= 2
)

# Integer (0/1) version of the boolean label.
is_sophisticated = is_sophisticated_bool.assign(
    is_sophisticated=lambda x: x.is_sophisticated_bool.astype(int)
)

# is_retail is the complement of is_sophisticated (1 <-> 0).
is_retail = is_sophisticated.assign(is_retail=lambda x: 1 - x.is_sophisticated)
is_retail

In [None]:
# Drop the intermediate criterion columns that are no longer needed.
# NOTE(review): 'is_sophisticated_by_distinct_pools_count' and
# 'is_sophisticated_bool' are not in this list and therefore stay in the frame
# -- confirm whether that is intentional.
prettified = is_retail.drop(
    columns=[
        'max_mint_volume_usd_for_criterion_2',
        'is_sophisticated_by_max_mint_volume_usd',
        'is_sophisticated_by_mint_volume_1_mln_usd',
        'is_sophisticated_by_mint_txes_count',
        'is_sophisticated_by_all_txes_count',
    ]
)
classified = prettified

# Count wallets (rows). DataFrame.size would return rows * columns, which
# overstates both numbers, so use len() on the filtered frames.
sophisticated_count = len(classified[classified['is_retail'] == 0])
retail_count = len(classified[classified['is_retail'] == 1])
f"There is {sophisticated_count} sophisticated and {retail_count} retail"

### Before building model: tickrange spread

We have already classified participants with a binary predictor, but for a complete picture we also need the average tick-range spread of each wallet's positions.


In [None]:
# Scaling factor applied to the raw spread (1 = no scaling).
normalize = 1

# Per-wallet spread between the upper and lower price bounds of Mint events.
# Only the price_spread column is aggregated, instead of taking the max of
# every column in df, since nothing else is used afterwards.
# NOTE(review): the accompanying text speaks of an average spread, while this
# takes the per-wallet maximum -- confirm which is intended.
price_spread = (
    df[df["type"] == "Mint"]
    .assign(price_spread=lambda x: normalize * abs(x.price_upper - x.price_lower))
    .groupby("wallet_address")["price_spread"]
    .max()
    .reset_index()
)

# Merge from `prettified` (the frame `classified` was created from) so that
# re-running this cell does not pile up price_spread_x / price_spread_y columns.
classified = prettified.merge(price_spread, on="wallet_address")

In [None]:
classified

In [None]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# For each metric below, regress the metric on the binary is_retail label with
# OLS, print the regression summary, and draw the data with the fitted line.

# List of criteria to iterate through
criteria_list = ['max_mint_volume_usd', "mint_tx_count", "tx_count", "price_spread"]

# Calculate subplot layout: at most two columns, as many rows as needed
n_plots = len(criteria_list)
n_cols = min(2, n_plots)
n_rows = (n_plots + n_cols - 1) // n_cols

# squeeze=False always returns a 2-D array of axes, so flattening works for
# any grid size, including a single plot.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10 * n_cols, 6 * n_rows), squeeze=False)
axes = axes.flatten()

# Iterate through criteria
for idx, criteria in enumerate(criteria_list):
    # Rows with a missing label or metric are excluded from the fit and plot
    plot_data = classified[['is_retail', criteria]].dropna()
    x = plot_data[['is_retail']]
    y = plot_data[criteria]

    # Add constant for intercept
    X = sm.add_constant(x)

    # Fit OLS model
    model = sm.OLS(y, X).fit()
    print(f"Regression summary for {criteria}:")
    print(model.summary())

    # Plot on the corresponding subplot
    ax = axes[idx]

    # model.params is a label-indexed Series, so read the coefficients by
    # position with .iloc; plain params[0] / params[1] relies on deprecated
    # positional fallback in pandas.
    x_line = np.linspace(0, 1, 100)
    b = model.params.iloc[0]  # first coefficient: the added constant (intercept)
    k = model.params.iloc[1]  # second coefficient: slope on is_retail
    y_line = k * x_line + b

    ax.scatter(x, y, alpha=0.5, label='Data')
    ax.plot(x_line, y_line, 'r-')
    ax.set_xlabel('Is Retail (0 = Sophisticated, 1 = Retail)')
    ax.set_title(f'Linear Regression: {criteria} Retail Classification')

    # Apply log scale to y-axis
    ax.set_yscale('log')
    ax.set_ylabel(f'{criteria} (log scale)')

    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide any unused subplots
for idx in range(n_plots, len(axes)):
    axes[idx].set_visible(False)

plt.tight_layout()
plt.show()