# Stablecoin Data Collection

This notebook collects data from multiple sources for stablecoin depeg prediction:

| Source | Data | API Key Required |
|--------|------|------------------|
| **Binance** | Trading data (OHLCV, spread, buy pressure) | No |
| **DefiLlama** | Supply metrics, chain distribution | No |
| **CoinGecko** | Direct USD prices, market cap | Yes (free tier) |
| **Fear & Greed** | Market sentiment index | No |
| **Etherscan** | On-chain transfers, whale movements | Yes (free tier) |

---
## Setup

In [None]:
import os
import sys
from pathlib import Path
from datetime import datetime
import pandas as pd

# Make the project's packages importable from this notebook.
# The root is taken to be the parent of the current working directory
# (NOTE(review): presumably the notebook is launched from a sub-folder of
# the repository - confirm).
PROJECT_ROOT = Path.cwd().resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))

from config.settings import RAW_DATA_DIR, PROCESSED_DATA_DIR

# Echo the resolved locations so a wrong working directory is easy to spot.
print(
    f"Project root: {PROJECT_ROOT}",
    f"Raw data dir: {RAW_DATA_DIR}",
    f"Processed data dir: {PROCESSED_DATA_DIR}",
    sep="\n",
)

In [None]:
# API keys - configure these before running the collection cells.
# Each key is read from the environment and falls back to an empty string;
# the CoinGecko and Etherscan cells skip collection when their key is empty.

COINGECKO_API_KEY = os.environ.get("COINGECKO_API_KEY", "")  # Get free key at https://www.coingecko.com/en/api
ETHERSCAN_API_KEY = os.environ.get("ETHERSCAN_API_KEY", "")  # Get free key at https://etherscan.io/apis

# Alternatively, uncomment and fill in the keys here instead of using
# environment variables:
# COINGECKO_API_KEY = "your-coingecko-key-here"
# ETHERSCAN_API_KEY = "your-etherscan-key-here"

# Report only whether each key is present, never the key itself.
print(
    f"CoinGecko API key: {'✓ Set' if COINGECKO_API_KEY else '✗ Not set'}",
    f"Etherscan API key: {'✓ Set' if ETHERSCAN_API_KEY else '✗ Not set'}",
    sep="\n",
)

In [None]:
# Configuration shared by every collection cell below.
COINS = ["usdt", "usdc"]  # Stablecoins to collect
START_DATE = datetime(year=2020, month=1, day=1)  # Start date for historical data

print(
    f"Collecting data for: {COINS}",
    f"Start date: {START_DATE.date()}",
    sep="\n",
)

---
## 1. Binance Trading Data

Collects hourly OHLCV data from Binance for BTC/stablecoin pairs.

**Features collected:**
- Open, High, Low, Close prices
- Volume and quote volume
- Number of trades
- Spread proxy ((high - low) / close)
- Buy ratio (taker buy volume / total volume)

In [None]:
from src.data.collect_binance import BinanceCollector

# A single collector instance is reused for every coin.
binance = BinanceCollector()

print("=" * 60, "BINANCE DATA COLLECTION", "=" * 60, sep="\n")

# coin key -> result of collect_stablecoin_trading_data (iterated below as
# pair -> frame). A coin whose collection raises is reported and left out.
binance_results = {}

for coin in COINS:
    print(f"\n--- {coin.upper()} ---")
    try:
        pair_frames = binance.collect_stablecoin_trading_data(
            coin_key=coin,
            start_date=START_DATE,
        )

        # Persist first, then summarise what was written.
        binance.save_data(pair_frames, coin)

        for pair, frame in pair_frames.items():
            print(f"  {pair}: {len(frame):,} records")
            timestamps = frame['timestamp']
            print(f"    Date range: {timestamps.min()} to {timestamps.max()}")

        binance_results[coin] = pair_frames

    except Exception as exc:
        # Best effort: one failing coin must not stop the remaining ones.
        print(f"  Error: {exc}")

print("\n✓ Binance collection complete")

In [None]:
# Preview Binance data
# The collection loop only stores coins that succeeded, so binance_results
# can be non-empty without containing COINS[0]. Preview the first coin (in
# COINS order) that actually has at least one pair instead of indexing
# COINS[0] unconditionally, which raised KeyError/IndexError in that case.
preview_coin = next(
    (c for c in COINS if c in binance_results and len(binance_results[c]) > 0),
    None,
)
if preview_coin is not None:
    # First pair in the order the collector returned them.
    pair = next(iter(binance_results[preview_coin]))
    print(f"Sample: {pair}")
    display(binance_results[preview_coin][pair].tail())

---
## 2. DefiLlama Supply Data

Collects stablecoin supply metrics from DefiLlama (free API, no key needed).

**Features collected:**
- Total circulating supply
- Implied price (USD value / token count)
- Daily supply change %
- 7-day supply change %

In [None]:
from src.data.collect_defillama import DefiLlamaCollector

# A single collector instance is reused for every coin.
defillama = DefiLlamaCollector()

print("=" * 60, "DEFILLAMA DATA COLLECTION", "=" * 60, sep="\n")

# coin key -> metrics frame; a coin whose main collection fails is left out.
defillama_results = {}

for coin in COINS:
    print(f"\n--- {coin.upper()} ---")
    try:
        # Main supply metrics: collect, persist, then summarise.
        df = defillama.collect_stablecoin_data(coin)
        defillama.save_data(df, coin, "_metrics")

        print(f"  Metrics: {len(df):,} records")
        print(f"  Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
        latest_supply_usd = df['total_circulating_usd'].iloc[-1]
        print(f"  Latest supply: ${latest_supply_usd/1e9:.2f}B")

        defillama_results[coin] = df

        # Chain distribution is optional: a failure here is reported as
        # "skipped" and does not discard the metrics stored above.
        try:
            chain_df = defillama.collect_chain_distribution(coin)
            defillama.save_data(chain_df, coin, "_chains")
            chain_count = chain_df['chain'].nunique()
            print(f"  Chain data: {len(chain_df):,} records ({chain_count} chains)")
        except Exception as chain_exc:
            print(f"  Chain data: skipped ({chain_exc})")

    except Exception as exc:
        # Best effort: one failing coin must not stop the remaining ones.
        print(f"  Error: {exc}")

print("\n✓ DefiLlama collection complete")

In [None]:
# Preview DefiLlama data
# Only coins whose collection succeeded are stored, so defillama_results may
# lack COINS[0] even when it is non-empty. Preview the first collected coin
# (in COINS order) rather than indexing COINS[0], which raised KeyError.
preview_coin = next((c for c in COINS if c in defillama_results), None)
if preview_coin is not None:
    print(f"Sample: {preview_coin.upper()} DefiLlama metrics")
    display(defillama_results[preview_coin].tail())

---
## 3. CoinGecko Price Data

Collects direct USD prices from CoinGecko (requires free API key).

**Features collected:**
- Direct USD price (not derived from BTC pair)
- Market cap
- Trading volume
- Price deviation from peg

**Note:** The demo API is limited to 365 days of history.

In [None]:
from src.data.collect_prices import CoinGeckoCollector

print("=" * 60, "COINGECKO DATA COLLECTION", "=" * 60, sep="\n")

if COINGECKO_API_KEY:
    # Export the key before constructing the collector
    # (presumably the collector reads it from the environment - confirm).
    os.environ["COINGECKO_API_KEY"] = COINGECKO_API_KEY
    coingecko = CoinGeckoCollector()

    # coin key -> price frame; a coin whose collection fails is left out.
    coingecko_results = {}

    for coin in COINS:
        print(f"\n--- {coin.upper()} ---")
        try:
            df = coingecko.collect_stablecoin_data(coin, days=365)
            coingecko.save_data(df, coin)

            print(f"  Records: {len(df):,}")
            print(f"  Date range: {df['timestamp'].min().date()} to {df['timestamp'].max().date()}")
            print(f"  Price range: ${df['price'].min():.4f} to ${df['price'].max():.4f}")
            worst_deviation = df['abs_deviation'].max()
            print(f"  Max deviation: {worst_deviation*100:.4f}%")

            coingecko_results[coin] = df

        except Exception as exc:
            # Best effort: one failing coin must not stop the remaining ones.
            print(f"  Error: {exc}")

    print("\n✓ CoinGecko collection complete")
else:
    # No key configured: skip, but still define an empty result dict so the
    # preview cell can run.
    print("\n⚠ COINGECKO_API_KEY not set. Skipping CoinGecko collection.")
    print("  Get a free key at: https://www.coingecko.com/en/api")
    coingecko_results = {}

In [None]:
# Preview CoinGecko data
# Only coins whose collection succeeded are stored, so coingecko_results may
# lack COINS[0] even when it is non-empty. Preview the first collected coin
# (in COINS order) rather than indexing COINS[0], which raised KeyError.
preview_coin = next((c for c in COINS if c in coingecko_results), None)
if preview_coin is not None:
    print(f"Sample: {preview_coin.upper()} CoinGecko prices")
    display(coingecko_results[preview_coin].tail())

---
## 4. Fear & Greed Index

Collects the Crypto Fear & Greed Index (free API, no key needed).

**Features collected:**
- Fear & Greed value (0-100)
- Classification (Extreme Fear, Fear, Neutral, Greed, Extreme Greed)

**Coverage:** Data available since February 2018.

In [None]:
from src.data.collect_market import MarketDataCollector

# This collector is also used by the market-share cell further down.
market = MarketDataCollector()

print("=" * 60, "FEAR & GREED INDEX COLLECTION", "=" * 60, sep="\n")

try:
    fng_df = market.get_fear_greed_index(limit=0)  # 0 = all available
    market.save_data(fng_df, "market_fear_greed.csv")

    print(f"\nRecords: {len(fng_df):,}")

    fng_dates = fng_df['timestamp']
    print(f"Date range: {fng_dates.min().date()} to {fng_dates.max().date()}")

    fng_values = fng_df['fear_greed_value']
    print(f"Value range: {fng_values.min()} to {fng_values.max()}")
    print(f"Mean: {fng_values.mean():.1f}")

    print("\nClass distribution:")
    print(fng_df['fear_greed_class'].value_counts().to_string())

    print("\n✓ Fear & Greed collection complete")

except Exception as exc:
    # Any failure (fetch, save or summary) is reported, and fng_df is reset
    # to None so the preview cell knows there is nothing to show.
    print(f"Error: {exc}")
    fng_df = None

In [None]:
# Show the first ten rows of the Fear & Greed frame.
# fng_df is None when the collection cell above failed.
if fng_df is not None:
    print("Sample: Fear & Greed Index (recent)")
    display(fng_df.head(n=10))

---
## 5. Stablecoin Market Share

Collects a snapshot of the current market share of all stablecoins.

In [None]:
print("=" * 60, "STABLECOIN MARKET SHARE", "=" * 60, sep="\n")

try:
    market_df = market.get_stablecoin_market_share()
    market.save_data(market_df, "market_stablecoin_share.csv")

    print(f"\nStablecoins tracked: {len(market_df)}")
    print(f"\nTop 10 by market cap:")

    # Format a copy for display so market_df keeps its numeric columns.
    # NOTE(review): "top 10" relies on market_df already being ordered by
    # market cap - confirm against the collector.
    preview_cols = ['symbol', 'name', 'current_price', 'market_cap']
    top10 = market_df.head(10)[preview_cols].copy()
    top10['market_cap'] = top10['market_cap'].apply(lambda cap: f"${cap/1e9:.2f}B")
    top10['current_price'] = top10['current_price'].apply(lambda price: f"${price:.4f}")
    display(top10)

    print("\n✓ Market share collection complete")

except Exception as exc:
    # Any failure (fetch, save or formatting) is reported and market_df is
    # reset to None.
    print(f"Error: {exc}")
    market_df = None

---
## 6. Etherscan On-Chain Data

Collects on-chain transfer data from Etherscan (requires free API key).

**Features collected:**
- Recent token transfers
- Transfer values and gas prices
- Whale transactions (>$1M)
- Unique senders/receivers

**Note:** The free tier is limited to recent transactions only.

In [None]:
from src.data.collect_onchain import EtherscanCollector

print("=" * 60, "ETHERSCAN ON-CHAIN DATA COLLECTION", "=" * 60, sep="\n")

if ETHERSCAN_API_KEY:
    # Export the key before constructing the collector
    # (presumably the collector reads it from the environment - confirm).
    os.environ["ETHERSCAN_API_KEY"] = ETHERSCAN_API_KEY
    etherscan = EtherscanCollector()

    # coin key -> raw transfer frame; coins that fail or return no rows are
    # left out.
    etherscan_results = {}

    for coin in COINS:
        print(f"\n--- {coin.upper()} ---")
        try:
            df = etherscan.collect_transfer_metrics(coin, num_pages=10)

            if df.empty:
                print("  No data returned")
                continue

            etherscan.save_data(df, coin, "_transfers")

            # Aggregate to hourly
            agg_df = etherscan.aggregate_transfer_metrics(df)
            etherscan.save_data(agg_df, coin, "_hourly")

            # Count transfers whose 'value' exceeds one million.
            whale_count = (df['value'] > 1_000_000).sum()

            print(f"  Transfers: {len(df):,}")
            print(f"  Time span: {df['timestamp'].min()} to {df['timestamp'].max()}")
            print(f"  Total volume: ${df['value'].sum():,.0f}")
            print(f"  Whale transfers (>$1M): {whale_count}")

            etherscan_results[coin] = df

        except Exception as exc:
            # Best effort: one failing coin must not stop the remaining ones.
            print(f"  Error: {exc}")

    print("\n✓ Etherscan collection complete")
else:
    # No key configured: skip, but still define an empty result dict so the
    # preview cell can run.
    print("\n⚠ ETHERSCAN_API_KEY not set. Skipping on-chain collection.")
    print("  Get a free key at: https://etherscan.io/apis")
    etherscan_results = {}

In [None]:
# Preview Etherscan data
if etherscan_results:
    coin = COINS[0]
    print(f"Sample: {coin.upper()} recent transfers")
    display(etherscan_results[coin].head())

---
## 7. Merge All Sources

Combine all data sources into processed files for modeling.

In [None]:
from src.data.merge_sources import create_processed_files

print("=" * 60, "MERGING ALL DATA SOURCES", "=" * 60, sep="\n")

# The returned frame is inspected by the preview and quality-check cells.
combined_df = create_processed_files()

In [None]:
# Preview merged data
if combined_df is not None and not combined_df.empty:
    print("\nMerged data sample:")
    display(combined_df.tail())
    
    print(f"\nColumns: {list(combined_df.columns)}")

---
## 8. Collection Summary

In [None]:
import os

print("=" * 60)
print("DATA COLLECTION SUMMARY")
print("=" * 60)


def _format_file_size(num_bytes):
    """Return *num_bytes* as text with one decimal place.

    Sizes are shown in KB, switching to MB once they exceed 1024 KB.
    """
    size_kb = num_bytes / 1024
    if size_kb > 1024:
        return f"{size_kb/1024:.1f} MB"
    return f"{size_kb:.1f} KB"


def _print_csv_listing(heading, directory):
    """Print *heading*, a rule, and one line per ``*.csv`` file in *directory*.

    Files are listed in sorted path order with their name and size. When
    *directory* does not exist, a short notice is printed instead of
    silently printing nothing.
    """
    print(f"\n{heading}")
    print("-" * 40)
    if not directory.exists():
        print("  (directory not found)")
        return
    for csv_path in sorted(directory.glob("*.csv")):
        size_str = _format_file_size(csv_path.stat().st_size)
        print(f"  {csv_path.name:40s} {size_str:>10s}")


# List all collected files; one helper replaces the two duplicated loops.
_print_csv_listing("Raw data files:", RAW_DATA_DIR)
_print_csv_listing("Processed data files:", PROCESSED_DATA_DIR)

In [None]:
# Final data quality check on the merged frame.
# Skipped when the merge returned None or an empty frame.
if combined_df is not None and not combined_df.empty:
    total_rows = len(combined_df)

    print("\nDATA QUALITY CHECK")
    print("=" * 60)
    print(f"Total records: {total_rows:,}")
    print(f"Date range: {combined_df['date'].min()} to {combined_df['date'].max()}")
    print(f"Coins: {combined_df['coin'].unique().tolist()}")

    print("\nMissing values:")
    # Per-column NaN counts, keeping only columns that have at least one.
    na_counts = combined_df.isna().sum()
    na_counts = na_counts[na_counts > 0]
    if na_counts.empty:
        print("  None!")
    else:
        for col, count in na_counts.items():
            pct = count / total_rows * 100
            print(f"  {col}: {count:,} ({pct:.1f}%)")

    print("\n✓ Data collection complete!")
    print("\nNext steps:")
    print("  1. Run descriptive_analytics.ipynb for EDA")
    print("  2. Run feature engineering and modeling")

---
## Appendix: Manual Data Collection

Use these cells to collect data from individual sources manually.

In [None]:
# Manual: Collect single coin from Binance
# Uncomment to run:

# from src.data.collect_binance import BinanceCollector
# collector = BinanceCollector()
# data = collector.collect_stablecoin_trading_data('usdt', start_date=datetime(2023, 1, 1))
# collector.save_data(data, 'usdt')

In [None]:
# Manual: Update only CoinGecko data (last 365 days)
# Uncomment to run:

# from src.data.collect_prices import CoinGeckoCollector
# os.environ["COINGECKO_API_KEY"] = "your-key-here"
# collector = CoinGeckoCollector()
# for coin in ['usdt', 'usdc']:
#     df = collector.collect_stablecoin_data(coin, days=365)
#     collector.save_data(df, coin)

In [None]:
# Manual: Re-merge all sources
# Uncomment to run:

# from src.data.merge_sources import create_processed_files
# df = create_processed_files()