## Fetch OHLCV Data

In [1]:
import csv
import os
import sys
from pathlib import Path

import ccxt
from dotenv import load_dotenv

# Add the parent of the current working directory to the import path so that
# the `utils` package below can be resolved. The membership check keeps
# sys.path from accumulating the same entry each time this cell is re-run.
parent_dir = Path().resolve().parent
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))

from utils.general import (check_missing_timestamps,
                           get_top_symbol_by_volume)

# Load environment variables
load_dotenv()

# Print the ccxt library version
print(f"ccxt version: {ccxt.__version__}")

ccxt version: 4.4.40


In [2]:
def retry_fetch_ohlcv(exchange, max_retries, symbol, timeframe, since, limit):
    """Fetch one page of OHLCV candles, retrying when the request raises.

    Args:
        exchange: Object exposing ``fetch_ohlcv(symbol, timeframe, since, limit)``.
        max_retries: Number of extra attempts allowed after a failure; the
            request is tried at most ``max_retries + 1`` times.
        symbol: Market symbol forwarded to ``fetch_ohlcv``.
        timeframe: Timeframe string forwarded to ``fetch_ohlcv``.
        since: Start value forwarded to ``fetch_ohlcv``.
        limit: Page size forwarded to ``fetch_ohlcv``.

    Returns:
        Whatever ``exchange.fetch_ohlcv`` returns on the first successful attempt.

    Raises:
        Exception: The error raised by the final attempt, re-raised unchanged
            once the retry budget is exhausted.
    """
    num_retries = 0
    # Loop until a call succeeds; without the loop a failed first attempt was
    # silently swallowed and the function fell through returning None.
    while True:
        try:
            num_retries += 1
            return exchange.fetch_ohlcv(symbol, timeframe, since, limit)
        except Exception:
            if num_retries > max_retries:
                raise


def scrape_ohlcv(exchange, max_retries, symbol, timeframe, since, limit):
    """Collect OHLCV candles by paging backwards from the current time.

    Each iteration requests ``limit`` candles starting one page-width before
    the oldest candle collected so far and prepends the new ones, until a page
    brings nothing older or the page start falls before ``since``.

    Args:
        exchange: Object providing ``milliseconds()``, ``parse_timeframe()``,
            ``iso8601()`` and the ``fetch_ohlcv`` used by ``retry_fetch_ohlcv``.
        max_retries: Retry budget forwarded to ``retry_fetch_ohlcv``.
        symbol: Market symbol to fetch.
        timeframe: Timeframe string understood by ``exchange.parse_timeframe``.
        since: Checkpoint in milliseconds; paging stops after the first page
            whose start lies before it, so the result may begin earlier
            than ``since``.
        limit: Number of candles requested per page.

    Returns:
        list: Candle rows (timestamp first), oldest first, without repeated
        timestamps across pages.

    Raises:
        RuntimeError: If a page fetch yields ``None`` instead of a list.
    """
    earliest_timestamp = exchange.milliseconds()
    timeframe_duration_in_ms = exchange.parse_timeframe(timeframe) * 1000
    timedelta = limit * timeframe_duration_in_ms
    all_ohlcv = []
    while True:
        fetch_since = earliest_timestamp - timedelta
        ohlcv = retry_fetch_ohlcv(exchange, max_retries, symbol, timeframe, fetch_since, limit)
        if ohlcv is None:
            # A None page means the fetch failed without raising; fail loudly
            # rather than with a TypeError from len()/iteration below.
            raise RuntimeError(f"Failed to fetch {timeframe} {symbol} OHLCV starting at {fetch_since}")
        # Keep only candles older than what is already collected. When the
        # requested window has gaps, the exchange can fill the page with rows
        # at or after earliest_timestamp, which previously were prepended
        # again as duplicates.
        new_candles = [candle for candle in ohlcv if candle[0] < earliest_timestamp]
        # if we have reached the beginning of history
        if not new_candles:
            break
        earliest_timestamp = new_candles[0][0]
        all_ohlcv = new_candles + all_ohlcv
        print(len(all_ohlcv), symbol, 'candles in total from', exchange.iso8601(all_ohlcv[0][0]), 'to', exchange.iso8601(all_ohlcv[-1][0]))
        # if we have reached the checkpoint
        if fetch_since < since:
            break
    return all_ohlcv


def write_to_csv(filename, path_save, data):
    """Write rows to ``path_save/filename`` as comma-separated values.

    Missing parent directories are created first, and an existing file at the
    target location is overwritten.

    Args:
        filename: Name of the CSV file, relative to ``path_save``.
        path_save: Directory under which the file is placed.
        data: Iterable of rows, each row an iterable of cell values.
    """
    target = Path(path_save).joinpath(filename)
    # Make sure the destination folder is there before opening the file
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'w+', newline='') as handle:
        writer = csv.writer(handle, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for row in data:
            writer.writerow(row)



def scrape_candles_to_csv(filename, exchange, max_retries, symbol, timeframe, since, limit, path_save):
    """Download candles for one symbol and store them in a CSV file.

    Args:
        filename: Name of the CSV file to create under ``path_save``.
        exchange: Exchange object providing ``parse8601``, ``load_markets``
            and ``iso8601``; it is also handed to ``scrape_ohlcv``.
        max_retries: Retry budget forwarded to ``scrape_ohlcv``.
        symbol: Market symbol to download.
        timeframe: Candle timeframe string.
        since: Start checkpoint, either in milliseconds or as a string that
            ``exchange.parse8601`` can convert.
        limit: Number of candles requested per page.
        path_save: Directory the CSV file is written to.

    Raises:
        ValueError: If ``since`` is a string that ``exchange.parse8601``
            cannot convert (it returned ``None``).
    """
    # Convert `since` to milliseconds if needed
    if isinstance(since, str):
        since_ms = exchange.parse8601(since)
        if since_ms is None:
            # Fail here with a clear message instead of a TypeError deep
            # inside the pagination loop when comparing against None.
            raise ValueError(f"Could not parse `since` value: {since!r}")
        since = since_ms
    # Preload all markets
    exchange.load_markets()
    # Fetch all candles
    ohlcv = scrape_ohlcv(exchange, max_retries, symbol, timeframe, since, limit)
    if not ohlcv:
        # Nothing came back: skip writing an empty file and avoid indexing
        # into an empty list in the summary message below.
        print(f"No candles returned for {symbol} ({timeframe}); nothing written to {filename}")
        return
    # Save them to CSV
    write_to_csv(filename, path_save, ohlcv)
    print(f"Saved {len(ohlcv)} candles from {exchange.iso8601(ohlcv[0][0])} to {exchange.iso8601(ohlcv[-1][0])} to {filename}")

## Fetch data from the futures market

In [3]:
# Which exchange class to instantiate and which market type to default to
EXCHANGE_ID = "bitget"
MARKET_TYPE = "future"

# Credentials are read from environment variables (load_dotenv() ran earlier)
API_KEY = os.getenv('BITGET_API_KEY')
SECRET_KEY = os.getenv('BITGET_SECRET_KEY')
PASSWORD = os.getenv('BITGET_PASSWORD')

# Look the exchange class up on the ccxt module by name and build the client
exchange = getattr(ccxt, EXCHANGE_ID)({
    'apiKey': API_KEY,
    'secret': SECRET_KEY,
    'password': PASSWORD,
    'options': {'defaultType': MARKET_TYPE},
    'enableRateLimit': True,
})

In [4]:
# Ask the helper for up to 100 symbols matching the "/USDT:USDT" filter,
# renumber the rows from zero, then preview the first ten.
df_symbols = get_top_symbol_by_volume(
    exchange=exchange,
    pair_filter="/USDT:USDT",
    top_n=100,
).reset_index(drop=True)
df_symbols.head(10)

Unnamed: 0,symbol,volume_24h,price
0,BTC/USDT:USDT,11286270000.0,98750.4
1,ETH/USDT:USDT,4311697000.0,3637.66
2,XRP/USDT:USDT,1293240000.0,2.3863
3,HIVE/USDT:USDT,301159900.0,0.546
4,SOL/USDT:USDT,212585600.0,214.87
5,DOGE/USDT:USDT,146370100.0,0.38507
6,MOCA/USDT:USDT,141865400.0,0.30361
7,UNI/USDT:USDT,97043630.0,15.004
8,ADA/USDT:USDT,78686020.0,1.0774
9,PEPE/USDT:USDT,77200250.0,2.1e-05


In [5]:
# Scrape settings for a single symbol. EXCHANGE_ID, MARKET_TYPE and `exchange`
# are taken from the exchange-setup cell; they are deliberately not redefined
# here so the output path cannot drift from the exchange actually queried, and
# the credentials are not re-read because nothing in this cell uses them.
PATH_SAVE = f"/home/ubuntu/project/finance/cex-market-analysis/src/data/{EXCHANGE_ID}/{MARKET_TYPE}/test/"
TIMEFRAME = "1m"
FROM_DATE_STR = "2025-01-01 00:00:00"
LIMIT = 200        # candles requested per page
MAX_RETRIES = 3    # retry budget handed to the fetch helper
SYMBOL = "MOCA/USDT:USDT"
# "/" cannot appear in a file name, so it is swapped for "_"
FILENAME = SYMBOL.replace("/", "_") + f"_{TIMEFRAME}.csv"

scrape_candles_to_csv(filename=FILENAME, exchange=exchange,
                      max_retries=MAX_RETRIES, symbol=SYMBOL,
                      timeframe=TIMEFRAME, since=FROM_DATE_STR,
                      limit=LIMIT, path_save=PATH_SAVE)

200 MOCA/USDT:USDT candles in total from 2025-01-06T07:38:00.000Z to 2025-01-06T10:57:00.000Z
400 MOCA/USDT:USDT candles in total from 2025-01-06T04:19:00.000Z to 2025-01-06T10:57:00.000Z
600 MOCA/USDT:USDT candles in total from 2025-01-06T01:00:00.000Z to 2025-01-06T10:57:00.000Z
800 MOCA/USDT:USDT candles in total from 2025-01-05T21:41:00.000Z to 2025-01-06T10:57:00.000Z
1000 MOCA/USDT:USDT candles in total from 2025-01-05T18:22:00.000Z to 2025-01-06T10:57:00.000Z
1200 MOCA/USDT:USDT candles in total from 2025-01-05T15:03:00.000Z to 2025-01-06T10:57:00.000Z
1400 MOCA/USDT:USDT candles in total from 2025-01-05T11:44:00.000Z to 2025-01-06T10:57:00.000Z
1600 MOCA/USDT:USDT candles in total from 2025-01-05T08:25:00.000Z to 2025-01-06T10:57:00.000Z
1800 MOCA/USDT:USDT candles in total from 2025-01-05T05:06:00.000Z to 2025-01-06T10:57:00.000Z
2000 MOCA/USDT:USDT candles in total from 2025-01-05T01:47:00.000Z to 2025-01-06T10:57:00.000Z
2200 MOCA/USDT:USDT candles in total from 2025-01-04T2

In [6]:
import pandas as pd

# Read back the file written by the scrape cell. FILENAME is reused instead of
# repeating the literal name, so changing SYMBOL or TIMEFRAME there is picked
# up here as well. The file has no header row, so column names are supplied.
df = pd.read_csv(
    Path(PATH_SAVE) / FILENAME,
    header=None,
    names=['date', 'open', 'high', 'low', 'close', 'volume'],
)
# The first column holds epoch milliseconds
df['date'] = pd.to_datetime(df['date'], unit='ms')
missing = check_missing_timestamps(df, freq='1min')
# Rebind instead of mutating in place so re-running the cell stays predictable
df = df.set_index('date')
df.head()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-12-31 22:17:00,0.2887,0.2887,0.2871,0.28718,125774.0
2024-12-31 22:18:00,0.28718,0.28799,0.2865,0.28651,89950.0
2024-12-31 22:19:00,0.28651,0.28867,0.28651,0.28835,118853.0
2024-12-31 22:20:00,0.28835,0.289,0.28822,0.28859,363589.0
2024-12-31 22:21:00,0.28859,0.28906,0.28786,0.28798,122497.0


## Check missing data

In [7]:
import glob

import natsort
import pandas as pd

PATH_SAVE = "/home/ubuntu/project/finance/cex-market-analysis/src/data/bitget/future"

# CSV files sitting directly in PATH_SAVE, ordered with natsort
files_path = natsort.natsorted(glob.glob(os.path.join(PATH_SAVE, "*.csv")))

# Report each file for which check_missing_timestamps returns a non-empty result
for file in files_path:
    df = pd.DataFrame(
        pd.read_csv(file, header=None).values,
        columns=['date', 'open', 'high', 'low', 'close', 'volume'],
    )
    df['date'] = pd.to_datetime(df['date'], unit='ms')
    missing = check_missing_timestamps(df, freq='1min')
    if missing.empty:
        continue
    print("Missing timestamps:")
    print(file)

## Resample data

In [8]:
# Load the header-less PNUT 1m CSV, label its six columns, convert the
# millisecond timestamps to datetimes and index the frame by them.
df = pd.DataFrame(
    pd.read_csv(
        "/home/ubuntu/project/finance/cex-market-analysis/src/data/bitget/future/PNUT_USDT:USDT_1m.csv",
        header=None,
    ).values,
    columns=['date', 'open', 'high', 'low', 'close', 'volume'],
)
df['date'] = pd.to_datetime(df['date'], unit='ms')
missing = check_missing_timestamps(df, freq='1min')
df = df.set_index('date')

In [9]:
# Collapse the rows into 1-hour bars: first open, highest high, lowest low,
# last close and summed volume within each hourly window.
df_future = (
    df
    .resample('1h')
    .agg(dict(open='first', high='max', low='min', close='last', volume='sum'))
)