# 01 — Data Download
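Download BTC/USDT OHLCV data (15m, 1H, 4H) via CCXT. The exchange comes from `config.yaml`; if it is Binance USD-M (`binanceusdm`), this notebook switches to the Bybit linear perpetual instead, because Colab often gets HTTP 451 from Binance.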
Caches to parquet files on Google Drive for persistence.

In [None]:
# Install dependencies: only the extra third-party packages listed below
# (`-q` keeps pip's output quiet).
# torch, numpy, pandas are pre-installed by Colab — do NOT reinstall them.
# Versions are deliberately left unpinned: pinning fights Colab's environment
# and causes resolver conflicts.
!pip install -q xgboost ccxt PyWavelets hmmlearn numba scikit-learn pyyaml \
    tensorboard tqdm pyarrow

In [None]:
# Mount Google Drive and clone/pull repo
from google.colab import drive
drive.mount('/content/drive')

import os
REPO_DIR = '/content/scalp2'
if not os.path.exists(REPO_DIR):
    !git clone https://github.com/<YOUR_USERNAME>/scalp2.git {REPO_DIR}
else:
    !cd {REPO_DIR} && git pull

import sys
sys.path.insert(0, REPO_DIR)

In [None]:
import logging

# Set up root logging first, ahead of the project import below.
_LOG_FORMAT = '%(asctime)s %(name)s %(levelname)s: %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

from scalp2.config import load_config

config = load_config(f'{REPO_DIR}/config.yaml')

# Binance USD-M endpoints frequently answer HTTP 451 from Colab, so when the
# config targets them, fall back to the Bybit linear perpetual instead.
if config.data.exchange == 'binanceusdm':
    config.data.exchange, config.data.symbol = 'bybit', 'BTC/USDT:USDT'

# Keep raw and processed data on Google Drive so they persist, and make sure
# both directories exist.
config.data.cache_dir = '/content/drive/MyDrive/scalp2/data/raw'
config.data.processed_dir = '/content/drive/MyDrive/scalp2/data/processed'
for _data_dir in (config.data.cache_dir, config.data.processed_dir):
    os.makedirs(_data_dir, exist_ok=True)

# Echo the effective settings.
print(f'Symbol: {config.data.symbol}')
print(f'Date range: {config.data.date_range.start} to {config.data.date_range.end}')
print(f'Timeframes: {config.data.timeframes.primary}, {config.data.timeframes.mtf}')

In [None]:
import pandas as pd

from scalp2.data.downloader import OHLCVDownloader


def _bar_span(df):
    """Return ``(first, last)`` timestamps of *df*, or ``None`` if it has no rows.

    The index is used when it is a ``DatetimeIndex``. Otherwise a
    ``timestamp`` column is preferred when present, and the index is the last
    resort. (The previous ``hasattr(df, "index")`` check was always true for a
    DataFrame, so the ``timestamp`` fallback could never run.)
    """
    if len(df) == 0:
        return None
    has_time_index = isinstance(df.index, pd.DatetimeIndex)
    if not has_time_index and 'timestamp' in getattr(df, 'columns', ()):
        stamps = df['timestamp']
        return stamps.iloc[0], stamps.iloc[-1]
    return df.index[0], df.index[-1]


# Fetch every timeframe, with the downloader's cache enabled via use_cache=True.
downloader = OHLCVDownloader(config.data)
data = downloader.fetch_all(use_cache=True)

# Summarise what was fetched; an empty frame is reported instead of raising
# IndexError on `df.index[0]`.
for tf, df in data.items():
    span = _bar_span(df)
    if span is None:
        print(f'{tf}: 0 bars (no data)')
    else:
        print(f'{tf}: {len(df)} bars, {span[0]} → {span[1]}')

In [None]:
# Download funding rate data
# Uses the `downloader` instance created in the OHLCV download cell, so that
# cell must have been run first. `use_cache=True` is passed through to the
# downloader — presumably it reuses a cached copy when one exists; confirm in
# OHLCVDownloader.fetch_funding_rate.
funding_df = downloader.fetch_funding_rate(use_cache=True)
print(f'Funding rates: {len(funding_df)} records')

In [None]:
# Clean and preprocess
from scalp2.data.preprocessing import clean_ohlcv

# First pass: replace each timeframe's frame in `data` with its cleaned
# version and report the resulting bar count.
for tf, raw_df in list(data.items()):
    data[tf] = clean_ohlcv(raw_df, tf)
    print(f'{tf} after cleaning: {len(data[tf])} bars')

# Second pass: write every cleaned frame to the processed directory as parquet.
for tf in data:
    df = data[tf]
    path = f'{config.data.processed_dir}/BTC_USDT_{tf}_clean.parquet'
    df.to_parquet(path)
    print(f'Saved {path}')