## Change these settings to analyze a collection other than BAYC (Bored Ape Yacht Club)

In [None]:
# Collection identifier sent to OpenSea as the `collection` query parameter.
COLLECTION = "boredapeyachtclub"
# Upper bound on the offsets requested while downloading the collection.
COLLECTION_SIZE = 10_000

# One JSON document per line, one line per downloaded asset.
collection_file = COLLECTION + ".ndjson"
# Tab-separated "<trait name>\t<trait count>" rows.
traits_file = COLLECTION + "_traits.tsv"

# Load Collection Data (only needs to run once per collection)

In [None]:
import json
import requests
import time

MAX_RES = 50  # page size, sent as the `limit` query parameter


def fetch_page(offset, timeout=30):
    """Fetch a single page of results from OpenSea and return the parsed JSON.

    Args:
        offset: Value of the `offset` query parameter (index of the first
            asset on the page).
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the download forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Failing here is clearer than a later KeyError on a body that has
            no 'assets' entry.
        requests.Timeout: If no response arrives within `timeout` seconds.
    """
    p = {
        'order_direction': 'asc',
        'offset': offset,
        'limit': MAX_RES,
        'collection': COLLECTION,
    }
    r = requests.get(
        'https://api.opensea.io/api/v1/assets',
        params=p,
        timeout=timeout,
    )
    r.raise_for_status()
    return r.json()


In [None]:
# Page through the collection from offset 0 up to COLLECTION_SIZE.
cur_offset = 0
end_offset = COLLECTION_SIZE
# Append mode keeps whatever an earlier (partial) run already wrote,
# which makes retries easier.
with open(collection_file, 'a') as of:
    while cur_offset < end_offset:
        page = fetch_page(cur_offset)
        # One JSON document per line (ndjson).
        of.writelines(json.dumps(asset) + '\n' for asset in page['assets'])
        # Short pause between consecutive requests.
        time.sleep(0.25)
        cur_offset += MAX_RES

# Get Traits

In [None]:
import json

def trait_name(trait_json):
    """Build the feature name "<trait_type>_<value>" for one trait entry.

    Spaces in the value are replaced with dashes. The value is converted to
    a string first so that non-string values (e.g. numeric traits) do not
    raise AttributeError on `.replace`.

    Args:
        trait_json: Mapping with at least 'trait_type' and 'value' keys.

    Returns:
        The feature name as a string.
    """
    value = str(trait_json['value']).replace(' ', '-')
    return f"{trait_json['trait_type']}_{value}"

# Maps feature name -> trait_count as reported on the asset's trait entry.
traits = {}

# Read every downloaded asset and record the count of each trait it carries.
with open(collection_file, 'r') as f:
    for line in f:
        for t in json.loads(line)['traits']:
            traits[trait_name(t)] = t['trait_count']

# Write the traits sorted by name, one "<name>\t<count>" row per trait.
with open(traits_file, 'w') as of:
    of.writelines(f"{name}\t{traits[name]}\n" for name in sorted(traits))

# Analysis

## Setup

In [None]:
from datetime import datetime
from datetime import timedelta
import json
from scipy.stats import pearsonr
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

In [None]:
def trait_name(trait_json):
    """Build the feature name "<trait_type>_<value>" for one trait entry.

    Spaces in the value are replaced with dashes. The value is converted to
    a string first so that non-string values (e.g. numeric traits) do not
    raise AttributeError on `.replace`.

    Args:
        trait_json: Mapping with at least 'trait_type' and 'value' keys.

    Returns:
        The feature name as a string.
    """
    value = str(trait_json['value']).replace(' ', '-')
    return f"{trait_json['trait_type']}_{value}"

In [None]:
# load traits and make helper dicts

trait_to_count = {}  # feature name -> count (kept as the string read from the file)
trait_to_idx = {}    # feature name -> column index in the feature vector
idx_to_trait = {}    # column index -> feature name

with open(traits_file, 'r') as f:
    for idx, l in enumerate(f):
        # The file is tab-delimited with the count as the last field. Splitting
        # on the last tab (instead of on any whitespace) keeps trait names that
        # contain spaces intact rather than failing to unpack.
        trait, count = l.rstrip('\n').rsplit('\t', 1)
        trait_to_count[trait] = count
        trait_to_idx[trait] = idx
        idx_to_trait[idx] = trait

In [None]:
def json_to_trait_features(in_json):
    """Turn the traits of an OpenSea asset JSON into a 0/1 feature vector.

    The vector has one slot per entry of the module-level `trait_to_idx`;
    a slot is 1 when the asset lists that trait and 0 otherwise.
    """
    features = [0 for _ in trait_to_idx]
    for column in (trait_to_idx[trait_name(t)] for t in in_json['traits']):
        features[column] = 1
    return features

def json_to_sell_order_price(in_json):
    """Return the price of the asset's first sell order, or None.

    Prices are based on the current sell orders, which probably can be
    improved. Denominated in ETH. (Double check these, OpenSea API is a bit
    weird here.)

    Args:
        in_json: Asset JSON from OpenSea.

    Returns:
        `current_price / 1e18 * eth_price` of the first sell order as a float,
        or None when the asset has no sell orders (missing key, None, or an
        empty list) or the order is not priced in ETH.
    """
    sell_orders = in_json.get('sell_orders')
    # Covers a missing key, None and [] alike; indexing [0] on an empty list
    # would otherwise raise IndexError.
    if not sell_orders:
        return None
    so = sell_orders[0]
    token = so['payment_token_contract']
    if token['symbol'] != 'ETH':
        return None  # need to understand better, OpenSea seems to be doing weird stuff w/USDC sales.
    eth_price = float(token['eth_price'])
    # current_price is scaled down by 1e18 — presumably from the token's
    # smallest unit; confirm against the OpenSea API.
    return (float(so['current_price']) / 1000000000000000000.0) * eth_price

def convert_ts_string(ts_str):
    """Parse a 'YYYY-MM-DDTHH:MM:SS[.fraction]' string into a datetime.

    Everything from the first '.' onwards (the fractional seconds) is dropped
    before parsing.
    """
    whole_seconds, _, _ = ts_str.partition('.')
    return datetime.strptime(whole_seconds, '%Y-%m-%dT%H:%M:%S')

def json_to_last_sale_price(in_json, in_last_days=30):
    """Return the asset's last sale price, or None if it is not usable.

    Prices are based on prior sales. Denominated in ETH.

    None is returned when the asset has no last sale, when the sale is older
    than `in_last_days` days, or when it was not paid in ETH/WETH.

    NOTE(review): the cutoff is computed from the naive local
    `datetime.now()`; confirm which timezone `event_timestamp` uses.
    """
    sale = in_json['last_sale']
    if sale is None:
        return None

    sold_at = convert_ts_string(sale['event_timestamp'])
    oldest_allowed = datetime.now() - timedelta(days=in_last_days)
    if sold_at < oldest_allowed:
        return None

    token = sale['payment_token']
    if token['symbol'] not in ('ETH', 'WETH'):
        # need to understand better, OpenSea seems to be doing weird stuff w/USDC sales.
        return None

    eth_price = float(token['eth_price'])
    return (float(sale['total_price']) / 1e18) * eth_price

In [None]:
def trait_weights(model, unseen_traits):
    """Return (trait, coefficient) pairs sorted by ascending coefficient.

    Traits listed in `unseen_traits` are left out. Coefficients are read from
    `model.coef_` at the column given by the module-level `trait_to_idx`.
    """
    pairs = [
        (name, model.coef_[column])
        for name, column in trait_to_idx.items()
        if name not in unseen_traits
    ]
    return sorted(pairs, key=lambda pair: pair[1])

def price_delta(in_model, asset):
    """Return the model's predicted price minus the asset's actual price.

    Positive means 'undervalued', negative means 'overvalued' -- assuming the
    model is correct (it's not). `asset` is indexed as (features, price, ...);
    None is returned when the price is None.
    """
    actual_price = asset[1]
    if actual_price is None:
        return None
    predicted_price = in_model.predict([asset[0]])[0]
    return predicted_price - actual_price

def price_premium_percentage(in_model, asset):
    """Return the model's predicted price divided by the asset's actual price.

    A ratio > 1 means 'undervalued', < 1 means 'overvalued' -- assuming the
    model is correct (it's not). `asset` is indexed as (features, price, ...);
    None is returned when the price is None.
    """
    actual_price = asset[1]
    if actual_price is None:
        return None
    predicted_price = in_model.predict([asset[0]])[0]
    return predicted_price / actual_price

def feature_vector_to_traits(fv):
    """Return the trait names for every truthy slot of a feature vector.

    Names come from the module-level `idx_to_trait`, in column order.
    """
    return [idx_to_trait[column] for column, flag in enumerate(fv) if flag]

def good_asset_price(price, min_price=10):
    """Tell whether a price is usable for training and evaluation.

    Args:
        price: Price to check, or None when no price is known.
        min_price: Exclusive lower bound a price must exceed. The default of
            10 is the threshold originally used for BAYC; pass a different
            value for collections trading at other price levels.

    Returns:
        True when `price` is not None and strictly greater than `min_price`.
    """
    return price is not None and price > min_price

def explained_score(eval_model, asset):
    """Print how the model's prediction for one asset is composed.

    Prints the model intercept, then one line per truthy feature of
    `asset[0]` with its trait name and coefficient, then the prediction.
    """
    print(f"Average price: {eval_model.intercept_}")
    active_columns = [idx for idx, flag in enumerate(asset[0]) if flag]
    for idx in active_columns:
        print(f"\t{idx_to_trait[idx]}\t{eval_model.coef_[idx]}")
    print(f"Total: {eval_model.predict([asset[0]])[0]}")

def eval_model(eval_model, X, Y):
    """Print a diagnostic report for a fitted model.

    Besides its arguments this reads the module-level `assets`,
    `seen_feature_idx`, `trait_to_count`, `trait_to_idx` and `idx_to_trait`,
    so the cells defining them must have run first.

    Args:
        eval_model: Fitted estimator providing `score`, `predict` and `coef_`.
        X: Feature rows passed to `score`.
        Y: Target prices passed to `score`, one per row of `X`.
    """
    print(f"Score: {eval_model.score(X,Y)}")
    # len(Y) is the number of rows (samples), not the number of feature
    # columns, so it is labelled as such.
    print(f"Num Samples: {len(Y)}")

    # Only assets whose last sale price passes good_asset_price are ranked.
    priced = [k for k in assets if good_asset_price(assets[k][1])]

    # Absolute difference: prediction - actual (most negative first).
    deltas = [(k, price_delta(eval_model, assets[k])) for k in priced]
    deltas.sort(key=lambda x: x[1])
    print(f"Overvalued sales (absolute): {deltas[:3]}")
    print(f"Undervalued sales (absolute): {deltas[-3:]}")

    # Relative difference: prediction / actual (smallest ratio first).
    deltas = [(k, price_premium_percentage(eval_model, assets[k])) for k in priced]
    deltas.sort(key=lambda x: x[1])
    print(f"Overvalued sales (relative): {deltas[:3]}")
    print(f"Undervalued sales (relative): {deltas[-3:]}")

    # correlation between trait count and trait coefficient (i.e. rarity impact)
    trait_counts = [float(trait_to_count[idx_to_trait[idx]]) for idx in range(len(trait_to_count))]
    print(f"Trait count / price impact correlation: {pearsonr(eval_model.coef_, trait_counts)}")

    # Traits whose column never appears in seen_feature_idx.
    unseen_traits = [t for t in trait_to_idx if trait_to_idx[t] not in seen_feature_idx]
    unseen_traits.sort()
    print("\nUnseen traits:")
    for t in unseen_traits:
        print(f"\t{t}")

    print("\nTrait weights:")
    for tw in trait_weights(eval_model, unseen_traits):
        # Zero coefficients are skipped.
        if tw[1]:
            print(f"\t{tw[0]}\t{tw[1]}")
            
def find_deals(model, assets, top_n=5):
    """Print listed assets whose predicted price exceeds their list price.

    Only assets with a truthy list price (`asset[2]`) whose active features
    all appear in the module-level `seen_feature_idx` are considered. The
    `top_n` deals with the largest (predicted - list) gap are printed.
    """
    def all_traits_seen(asset):
        return all(
            idx in seen_feature_idx
            for idx, flag in enumerate(asset[0])
            if flag
        )

    sale_deltas = []
    for aid in assets:
        asset = assets[aid]
        if asset[2] and all_traits_seen(asset):
            sale_deltas.append((aid, model.predict([asset[0]])[0], asset[2]))

    deals = [entry for entry in sale_deltas if entry[1] - entry[2] > 0]
    # Ascending (list - predicted) puts the biggest bargains first.
    deals.sort(key=lambda entry: entry[2] - entry[1])
    print(f"Deals (total {len(deals)}/{len(sale_deltas)})\t(id, predicted price, sale price, delta)")
    for entry in deals[:top_n]:
        print(f"\t{entry[0]}\t{entry[1]}\t{entry[2]}\t{entry[1]-entry[2]}")

In [None]:
# dict from asset id to (features, last sale price, current list price)
assets = {}

with open(collection_file, 'r') as f:
    for line in f:
        json_asset = json.loads(line)
        assets[json_asset['token_id']] = (
            json_to_trait_features(json_asset),
            json_to_last_sale_price(json_asset, in_last_days=30),
            json_to_sell_order_price(json_asset),
        )

In [None]:
# Training rows (X) and their last sale prices (Y).
X = []
Y = []

# Feature columns that are set on at least one training row.
seen_feature_idx = set()

for asset in assets.values():
    # Skip assets without a usable last sale price.
    if not good_asset_price(asset[1]):
        continue
    X.append(asset[0])
    Y.append(asset[1])
    seen_feature_idx.update(i for i, flag in enumerate(asset[0]) if flag)

## Linear Regression

In [None]:
# Plain least-squares fit without an intercept term.
model = LinearRegression(fit_intercept=False)
model.fit(X, Y)
eval_model(model, X, Y)
find_deals(model, assets)

## Lasso

In [None]:
# Lasso with scikit-learn's default settings.
lmodel = Lasso()
lmodel.fit(X, Y)
eval_model(lmodel, X, Y)

## Elastic Net

In [None]:
# Elastic net with scikit-learn's default settings.
elmodel = ElasticNet()
elmodel.fit(X, Y)
eval_model(elmodel, X, Y)

## Ridge

In [None]:
# Ridge regression with scikit-learn's default settings.
rmodel = Ridge().fit(X, Y)
eval_model(rmodel, X, Y)
# Search for deals with the ridge model fitted in this cell; the call
# previously passed `model`, the linear regression from an earlier cell.
find_deals(rmodel, assets, top_n=50)
