In [2]:
from pathlib import Path
import polars as pl
import numpy as np
from tqdm import tqdm



In [7]:
# Paths
base_path = "/Users/brunostordeur/Docs/GitHub/Memecoins/src/"
history_folder = Path(f"{base_path}daily_data_1000/history/")
output_path = Path(f"{base_path}daily_data_1000/memecoins_features.parquet")
files = list(history_folder.glob("*.parquet"))



# --- Feature Extraction Function ---
def extract_features(df: pl.DataFrame, early_days: int = 3, full_days: list = [30, 90, 180, 365]) -> dict:
    df = df.sort("timestamp")
    if df.height < early_days:
        return None

    prices = df["price"].to_numpy()
    market_caps = df["market_cap"].to_numpy()
    volumes = df["volume"].to_numpy()

    result = {"symbol": None}
    eps = 1e-9  # To avoid divide-by-zero

    # --- Early features ---
    early_df = df.head(early_days)
    if early_df.height >= early_days:
        early_prices = early_df["price"].to_numpy()
        early_volumes = early_df["volume"].to_numpy()
        early_market_caps = early_df["market_cap"].to_numpy()
        early_log_returns = np.diff(np.log(early_prices))
        early_returns = np.diff(early_prices) / early_prices[:-1]

        result.update({
            "early_return": (early_prices[-1] - early_prices[0]) / (early_prices[0] + eps),
            "early_volatility": np.std(early_log_returns),
            "early_sharpe": np.mean(early_log_returns) / (np.std(early_log_returns) + eps),
            "early_marketcap_growth": (early_market_caps[-1] - early_market_caps[0]) / (early_market_caps[0] + eps),
            "early_volume_growth": (early_volumes[-1] - early_volumes[0]) / (early_volumes[0] + eps),
            "early_avg_volume": np.mean(early_volumes),
            "early_positive_days": np.sum(early_returns > 0),
        })

    # --- Full period features ---
    for d in full_days:
        if df.height >= d:
            d_prices = df.head(d)["price"].to_numpy()
            d_log_returns = np.diff(np.log(d_prices))
            d_returns = np.diff(d_prices) / d_prices[:-1]
            d_volumes = df.head(d)["volume"].to_numpy()
            d_market_caps = df.head(d)["market_cap"].to_numpy()

            running_max = np.maximum.accumulate(d_prices)
            drawdowns = (d_prices - running_max) / running_max
            max_dd = drawdowns.min()

            result.update({
                f"return_{d}d": (d_prices[-1] - d_prices[0]) / (d_prices[0] + eps),
                f"volatility_{d}d": np.std(d_log_returns),
                f"sharpe_{d}d": np.mean(d_log_returns) / (np.std(d_log_returns) + eps),
                f"max_drawdown_{d}d": max_dd,
                f"volume_growth_{d}d": (d_volumes[-1] - d_volumes[0]) / (d_volumes[0] + eps),
                f"marketcap_growth_{d}d": (d_market_caps[-1] - d_market_caps[0]) / (d_market_caps[0] + eps),
            })

    # --- Clean and fill problematic values ---
    clean_result = {}
    for k, v in result.items():
        if isinstance(v, float):
            if not np.isfinite(v) or np.isnan(v):
                clean_result[k] = 0.0  # or -999 if you want sentinel values
            else:
                clean_result[k] = v
        else:
            clean_result[k] = v  # keep symbol or other non-floats

    return clean_result


# --- Process all memecoins ---




In [8]:
all_features = []
processed = []
failed = []

for file in tqdm(files):
    symbol = file.stem
    try:
        df = pl.read_parquet(file)
        features = extract_features(df)
        if features:
            features["symbol"] = symbol
            all_features.append(features)
            processed.append(symbol)
        else:
            failed.append(symbol)  # too short or empty data
    except Exception:
        failed.append(symbol) 



100%|██████████| 761/761 [00:00<00:00, 1147.29it/s]


In [22]:
# Load expected list of symbols
memecoins_list = pl.read_parquet(f"{base_path}daily_data_1000/memecoins_list.parquet")
expected_symbols = set(memecoins_list["id"])

# Find missing
actual_symbols = set([f.stem.replace("_daily", "") for f in all_files])
missing_symbols = expected_symbols - actual_symbols

print(f"Missing history files: {len(missing_symbols)}")


Missing history files: 239


In [23]:
missing_symbols

{'abe-cto',
 'agent-s',
 'ai-inu',
 'ai16zeliza',
 'airene',
 'anarchy-2',
 'andy-4',
 'andyerc',
 'andyman',
 'ape-man',
 'apu-apustaja-base',
 'aquarius-2',
 'aries',
 'avarik-saga',
 'baby-grok',
 'baby-wen',
 'barry-the-badger',
 'based',
 'beamcat',
 'beevo',
 'biao-on-sol',
 'big',
 'big-tony',
 'bitcat-3',
 'bitcone',
 'blackrack',
 'blinks-gg',
 'bm',
 'book-of-dyor',
 'book-of-miggles',
 'booshi',
 'bop-cat',
 'bored-2',
 'boys-club',
 'briun-armstrung',
 'broke-again',
 'bsop',
 'bully-3',
 'cancer',
 'capricorn-2',
 'catalorian',
 'catbat',
 'catgpt',
 'cats-in-the-sats',
 'chain-talk-daily',
 'cheems',
 'chelon',
 'chonk-2',
 'chuck',
 'clank-fun',
 'clapcat',
 'coal-2',
 'coconut-chicken',
 'coin-on-base',
 'coin6900',
 'coinye-west',
 'coq-ai',
 'costco-hot-dog',
 'craze',
 'crt-ai-network',
 'cryptoboomcoin-official',
 'cute-asian-girl',
 'cyberdoge-3',
 'cz-on-hyperliquid',
 'dank-memes',
 'dark-maga',
 'deep-whales-ai',
 'defrogs',
 'deli-fm',
 'department-of-gov-effic

In [33]:
import sys
from pathlib import Path

# Add the src directory to the Python path
sys.path.append(str(Path.cwd().parent.parent))  # This goes up two levels from /notebooks/explorations to the root

from src.memecoin_utils import get_memecoins, get_coin_snapshot, get_coin_history

df = get_coin_history("airene", frequency='daily')
print(df.head())



   timestamp     price    market_cap        volume
0 2024-11-12  0.003422  0.000000e+00  1.073076e+06
1 2024-11-13  0.003422  0.000000e+00  1.073076e+06
2 2024-11-14  0.003541  3.539393e+06  1.085324e+06
3 2024-11-15  0.003223  3.218287e+06  1.146213e+06
4 2024-11-16  0.003605  3.605915e+06  1.294451e+06


In [32]:
import os
os.getcwd()

'/Users/brunostordeur/Docs/GitHub/Memecoins/notebooks/exploration'

In [34]:
# --- Save to disk ---
import os
from pathlib import Path

small_files = [f for f in Path("daily_data_1000/history").glob("*_daily.parquet") if os.path.getsize(f) < 5000]
print(f"Very small (probably empty) files: {len(small_files)}")

Very small (probably empty) files: 0


In [None]:
# --- Save to disk ---
features_df.write_parquet(output_path)