In [1]:
import numpy as np
import pandas as pd
import requests
import datetime
import time
REQUEST_LIMIT_1MIN = 1200

In [4]:
def find_first_candlestick(symbol):
    
    info_url = f'https://api.binance.com/api/v3/klines?symbol={symbol}&interval=1m&limit=1&startTime=1'
    info_r = requests.get(info_url)
    info = info_r.json()
    df = pd.DataFrame(info)
    if not df.empty:
        df.columns =['timestamp', 'open', 'high', 'low', 'close', 'vol', 'ct', 'qat', 'not', 'basevol', 'quotevol', 'ignore']
        df = df.drop(columns=['ct', 'qat', 'not', 'basevol', 'quotevol', 'ignore'])
        ts = int(df.iloc[0].timestamp/1000)
        print(f"First Timestamp of {symbol} is at {datetime.datetime.fromtimestamp(ts)}")
        return ts
    return 0

def get_unix_ts(year, month, day, hour, minute):
    dt = datetime.datetime(year, month, day, hour, minute, tzinfo=datetime.timezone.utc)
    return int(datetime.datetime.timestamp(dt))

def get_binance_ohlc(symbol, start, limit=1000):
    start_ms = start * 1000
    info_url = f'https://api.binance.com/api/v3/klines?symbol={symbol}&interval=1m&limit={limit}&startTime={start_ms}'
    info_r = requests.get(info_url)
    info = info_r.json()
    current_limit = info_r.headers['x-mbx-used-weight-1m']
    
    
    data = pd.DataFrame(info)
    if not data.empty:
        data.columns =['timestamp', 'open', 'high', 'low', 'close', 'volume', 'ct', 'qat', 'not', 'basevol', 'quotevol', 'ignore']
        data = data.drop(columns=['ct', 'qat', 'not', 'basevol', 'quotevol', 'ignore'])    
        if len(data) > 0:
            data.timestamp = data.timestamp / 1000
            data['timestamp'] = data['timestamp'].astype(np.int)
            data['volume'] = data['volume'].astype(np.double)
            data['open'] = data['open'].astype(np.double)
            data['high'] = data['high'].astype(np.double)
            data['low'] = data['low'].astype(np.double)
            data['close'] = data['close'].astype(np.double)
            data = data.set_index('timestamp')
            return data, current_limit
    return pd.DataFrame(), current_limit

def append_data_to_file(file, df):
    df.to_csv(f"data/{file}", mode='a', header=False)
    
def load_year(symbol, year):
    print(f"{symbol} - {y}")
    start_time = get_unix_ts(year, 1, 1, 0, 0)
    if year == 2022:
        end_time = get_unix_ts(year,6,30,23,59)
    else:
        end_time = get_unix_ts(year,12,31,23,59)
    filename = f"binance_{symbol}_{year}.csv"
    load(symbol, start_time, end_time, filename)
    
    
# load specific dates
# load('btcusd', get_unix_ts(2011, 8, 10, 0, 0), get_unix_ts(2011,12,31,23,59), 'bitstamp_btcusd.csv')
def load(symbol, start, end, filename):
    current_start = start
    counter = 0
    skipped = 0
    added = 0
    last_time = time.time()
    total_start = time.time()
    while True:
        if current_start < end:
            new_data, current_limit = get_binance_ohlc(symbol, current_start)
            if new_data.empty:
                skipped += 1000
                current_start = current_start + (60*1000)
            else:                
                new_data = new_data[new_data.index <= end]
                last_timestamp = new_data.last_valid_index()
                if last_timestamp:
                    current_start = last_timestamp + 60                          
                    append_data_to_file(filename, new_data)
                    added += len(new_data)
                else:
                    skipped += 1000
                    current_start = current_start + (60*1000)
            counter += 1
            if counter % 43 == 0:
                current_time = time.time()
                time_diff = int(current_time - last_time)
                req_per_sec = int((43 / time_diff) * 10)/10
                limit_remaining = REQUEST_LIMIT_1MIN - int(current_limit)
                print(f"  iteration {counter:03} completed in {time_diff}s, avg {req_per_sec} r/sec, next={datetime.datetime.fromtimestamp(current_start)}, rate-limit={limit_remaining}")
                if limit_remaining < 50:
                    print("rate limit protection - sleep 30s")
                    time.sleep(30)
                last_time = current_time
        else:            
            break
    total_time = int(time.time() - total_start)
    print(f"{counter:03} iterations done in {total_time}s, skipped {skipped}, added {added}\n")

In [3]:
"""
SYMBOLS = ['ADAUSDT', 'AMBBTC','ARDRUSDT', 'ARDRBTC',
           'BNBUSDT','BNTUSDT', 'BNTBTC', 
           'DOGEUSDT', 'DOTUSDT', 
           'ETCUSDT', 'ETHUSDT', 
           'FUNBTC', 'FUNETH', 'FUNUSDT', 
           'GOBTC',
           'LINKUSDT',
           'MANAUSDT', 'MANABTC', 'MATICUSDT', 'MLNUSDT', 
           'NANOUSDT','NANOBTC',
           'SOLUSDT', 'SUSHIUSDT', 
           'XRPUSDT']
SYMBOLS = ['BTCUSDT', 'DOGEUSDT','FUNBTC','FUNUSDT']
"""
SYMBOLS = ['BTCUSDT', 'ETHUSDT', 'ADAUSDT', 'BNBUSDT', 'XRPUSDT', 'SOLUSDT', 'DOTUSDT', 'DOGEUSDT', 'AVAXUSDT', 'UNIUSDT']


# SYMBOLS = ['BTCUSDT']

startYear = None

for symbol in SYMBOLS:
    if startYear:
        first_year = startYear
    else:
        first_year = datetime.datetime.fromtimestamp(find_first_candlestick(symbol)).year
    print(f"Starting {symbol} in {first_year}")
    for y in range(first_year, 2022):
        load_year(symbol, y)

First Timestamp of BTCUSDT is at 2017-08-17 06:00:00
Starting BTCUSDT in 2017
BTCUSDT - 2017


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


  iteration 043 completed in 15s, avg 2.8 r/sec, next=2017-09-16 09:40:00, rate-limit=1187
  iteration 086 completed in 15s, avg 2.8 r/sec, next=2017-10-16 06:20:00, rate-limit=1144
  iteration 129 completed in 16s, avg 2.6 r/sec, next=2017-11-15 02:00:00, rate-limit=1101
  iteration 172 completed in 20s, avg 2.1 r/sec, next=2017-12-14 22:39:20, rate-limit=1058
197 iterations done in 80s, skipped 0, added 196544

BTCUSDT - 2018
  iteration 043 completed in 18s, avg 2.3 r/sec, next=2018-01-30 23:45:00, rate-limit=1137
  iteration 086 completed in 16s, avg 2.6 r/sec, next=2018-03-03 06:44:00, rate-limit=1094
  iteration 129 completed in 20s, avg 2.1 r/sec, next=2018-04-02 04:24:00, rate-limit=1194
  iteration 172 completed in 16s, avg 2.6 r/sec, next=2018-05-02 01:04:00, rate-limit=1151
  iteration 215 completed in 16s, avg 2.6 r/sec, next=2018-05-31 21:44:00, rate-limit=1108
  iteration 258 completed in 22s, avg 1.9 r/sec, next=2018-07-01 06:09:00, rate-limit=1065
  iteration 301 comple

In [None]:
SYMBOLS = ['BTCUSDT', 'ETHUSDT', 'ADAUSDT', 'BNBUSDT', 'XRPUSDT', 'SOLUSDT', 'DOTUSDT', 'DOGEUSDT', 'AVAXUSDT', 'UNIUSDT']
# SYMBOLS = ['BTCUSDT']
for symbol in SYMBOLS:
    # first_year = datetime.datetime.fromtimestamp(find_first_candlestick(symbol)).year
    for y in range(2022, 2023):
        load_year(symbol, y)

In [None]:
SYMBOLS = ['NEOUSDT', 'LTCUSDT', 'EOSUSDT', 'XLMUSDT', 'SHIBUSDT', 'IOTAUSDT', 'TRXUSDT', 'ETCUSDT', 'LINKUSDT', 'XMRUSDT', 'MATICUSDT', 'ALGOUSDT', 'MANAUSDT']

In [7]:
# SYMBOLS = ['BTCUSDT', 'ETHUSDT', 'ADAUSDT', 'BNBUSDT', 'XRPUSDT', 'SOLUSDT', 'DOTUSDT', 'DOGEUSDT', 'AVAXUSDT', 'UNIUSDT']
SYMBOLS = ['LTCUSDT', 'EOSUSDT', 'XLMUSDT', 'SHIBUSDT']
# SYMBOLS = ['BTCUSDT']
for symbol in SYMBOLS:
    first_year = datetime.datetime.fromtimestamp(find_first_candlestick(symbol)).year
    for y in range(first_year, 2023):
        load_year(symbol, y)

First Timestamp of LTCUSDT is at 2017-12-13 04:32:00
LTCUSDT - 2017


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


028 iterations done in 9s, skipped 0, added 27071

LTCUSDT - 2018
  iteration 043 completed in 20s, avg 2.1 r/sec, next=2018-01-30 23:45:00, rate-limit=1039
  iteration 086 completed in 16s, avg 2.6 r/sec, next=2018-03-03 06:45:00, rate-limit=906
  iteration 129 completed in 17s, avg 2.5 r/sec, next=2018-04-02 04:25:00, rate-limit=1142
  iteration 172 completed in 21s, avg 2.0 r/sec, next=2018-05-02 01:05:00, rate-limit=1000
  iteration 215 completed in 18s, avg 2.3 r/sec, next=2018-05-31 21:45:00, rate-limit=862
  iteration 258 completed in 17s, avg 2.5 r/sec, next=2018-07-01 06:10:00, rate-limit=1177
  iteration 301 completed in 20s, avg 2.1 r/sec, next=2018-07-31 10:27:00, rate-limit=1043
  iteration 344 completed in 16s, avg 2.6 r/sec, next=2018-08-30 07:07:00, rate-limit=923
  iteration 387 completed in 19s, avg 2.2 r/sec, next=2018-09-29 03:47:00, rate-limit=1198
  iteration 430 completed in 18s, avg 2.3 r/sec, next=2018-10-29 02:57:00, rate-limit=1051
  iteration 473 completed i