In [None]:
import datetime
import os
import time

import numpy as np
import pandas as pd
import requests
# Per-minute request-weight budget. load() subtracts the used weight reported in
# the API response headers from this value to decide when to pause.
REQUEST_LIMIT_1MIN = 1200

In [None]:
def find_first_candlestick(symbol):
    """Return the timestamp of the earliest 1-minute candle for ``symbol``.

    Requests a single candle from the Binance klines endpoint with
    ``startTime=1`` and takes the first field of the first row returned
    as the candle's timestamp in milliseconds.

    Args:
        symbol: Trading pair as used by the API, e.g. ``'BTCUSDT'``.

    Returns:
        Unix timestamp in seconds of the first candle, or ``0`` when the
        response contains no candles.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection problems or a timeout.
        ValueError: If the response body is not a list of candles.
    """
    info_r = requests.get(
        'https://api.binance.com/api/v3/klines',
        params={'symbol': symbol, 'interval': '1m', 'limit': 1, 'startTime': 1},
        timeout=30,  # never hang the notebook on a stalled connection
    )
    # Surface HTTP errors directly instead of failing later while parsing
    # an error payload.
    info_r.raise_for_status()
    info = info_r.json()
    if not isinstance(info, list):
        raise ValueError(f"Unexpected klines response for {symbol}: {info!r}")
    if not info:
        return 0
    # First field of a row is the timestamp in ms; callers work in seconds.
    ts = int(info[0][0] / 1000)
    print(f"First Timestamp of {symbol} is at {datetime.datetime.fromtimestamp(ts)}")
    return ts

def get_unix_ts(year, month, day, hour, minute):
    """Convert a UTC calendar date and time into whole Unix seconds."""
    utc = datetime.timezone.utc
    moment = datetime.datetime(year, month, day, hour, minute, tzinfo=utc)
    return int(moment.timestamp())

def get_binance_ohlc(symbol, start, limit=1000):
    """Download up to ``limit`` one-minute candles beginning at ``start``.

    Args:
        symbol: Trading pair as used by the API, e.g. ``'BTCUSDT'``.
        start: Start of the window as a Unix timestamp in seconds.
        limit: Maximum number of candles to request.

    Returns:
        A ``(data, current_limit)`` tuple. ``data`` is a DataFrame indexed by
        ``timestamp`` (Unix seconds) with float columns ``open``, ``high``,
        ``low``, ``close`` and ``volume``; it is an empty DataFrame when the
        response holds no candles. ``current_limit`` is the raw value of the
        ``x-mbx-used-weight-1m`` response header, or ``'0'`` if the header
        is absent.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection problems or a timeout.
        ValueError: If the response body is not a list of candles.
    """
    info_r = requests.get(
        'https://api.binance.com/api/v3/klines',
        params={
            'symbol': symbol,
            'interval': '1m',
            'limit': limit,
            'startTime': int(start * 1000),  # the endpoint takes milliseconds
        },
        timeout=30,  # never hang the download loop on a stalled connection
    )
    # Surface HTTP errors directly instead of failing later while parsing
    # an error payload.
    info_r.raise_for_status()
    info = info_r.json()
    # Missing header is reported as zero used weight rather than a KeyError.
    current_limit = info_r.headers.get('x-mbx-used-weight-1m', '0')

    if not isinstance(info, list):
        raise ValueError(f"Unexpected klines response for {symbol}: {info!r}")
    if not info:
        return pd.DataFrame(), current_limit

    raw_columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume',
                   'ct', 'qat', 'not', 'basevol', 'quotevol', 'ignore']
    value_columns = ['open', 'high', 'low', 'close', 'volume']
    data = (
        pd.DataFrame(info, columns=raw_columns)
        .drop(columns=['ct', 'qat', 'not', 'basevol', 'quotevol', 'ignore'])
        # Integer division keeps the timestamp integral (ms -> s).
        .assign(timestamp=lambda frame: frame['timestamp'] // 1000)
        # Values arrive as strings in the JSON payload.
        .astype({column: np.double for column in value_columns})
        .set_index('timestamp')
    )
    return data, current_limit

def append_data_to_file(file, df, directory="data/thesis-step0"):
    """Append ``df`` as header-less CSV rows to ``file`` inside ``directory``.

    Args:
        file: Name of the CSV file inside ``directory``.
        df: DataFrame to append. Written without a header row, so repeated
            calls extend the same file.
        directory: Target folder; defaults to ``data/thesis-step0``. It is
            created if it does not exist yet.
    """
    # Create the folder on first use so a fresh checkout does not fail on write.
    os.makedirs(directory, exist_ok=True)
    df.to_csv(os.path.join(directory, file), mode='a', header=False)
    
def load_year(symbol, year):
    """Download one calendar year of 1-minute candles for ``symbol``.

    The range runs from January 1st 00:00 to December 31st 23:59 of ``year``
    (as computed by ``get_unix_ts``) and is handed to ``load`` together with
    the target file name ``binance_{symbol}_{year}.csv``.

    Args:
        symbol: Trading pair as used by the API, e.g. ``'BTCUSDT'``.
        year: Calendar year to download.
    """
    # Report the ``year`` argument; a notebook-global ``y`` may be undefined
    # or hold a different value when this function is called directly.
    print(f"{symbol} - {year}")
    start_time = get_unix_ts(year, 1, 1, 0, 0)
    if year == 2022:
        # Hard-coded cut-off for 2022 (presumably when the data set was frozen).
        end_time = get_unix_ts(year, 12, 2, 5, 59)
    else:
        end_time = get_unix_ts(year, 12, 31, 23, 59)
    filename = f"binance_{symbol}_{year}.csv"
    load(symbol, start_time, end_time, filename)
    
    
# Load an arbitrary date range (start and end are Unix timestamps in seconds), e.g.:
# load('BTCUSDT', get_unix_ts(2021, 8, 10, 0, 0), get_unix_ts(2021, 12, 31, 23, 59), 'binance_BTCUSDT_2021.csv')
def load(symbol, start, end, filename):
    """Download 1-minute candles for ``symbol`` between ``start`` and ``end``.

    Calls ``get_binance_ohlc`` repeatedly, beginning at ``start``. Every
    batch is cut to timestamps ``<= end`` and appended to ``filename`` via
    ``append_data_to_file``; the next request starts 60 seconds after the
    last stored timestamp. When a request yields nothing usable the window
    is moved forward by 1000 minutes. Progress is printed every 43 requests,
    and the loop sleeps 30 seconds when fewer than 50 units of
    ``REQUEST_LIMIT_1MIN`` remain.

    Args:
        symbol: Trading pair as used by the API, e.g. ``'BTCUSDT'``.
        start: First timestamp to request, Unix seconds.
        end: Last timestamp to keep (inclusive), Unix seconds.
        filename: File name passed on to ``append_data_to_file``.
    """
    report_every = 43
    current_start = start
    counter = 0
    skipped = 0
    added = 0
    last_time = time.time()
    total_start = time.time()
    while current_start < end:
        new_data, current_limit = get_binance_ohlc(symbol, current_start)
        last_timestamp = None
        if not new_data.empty:
            new_data = new_data[new_data.index <= end]
            last_timestamp = new_data.last_valid_index()
        # Compare against None: a plain truth test would treat timestamp 0 as missing.
        if last_timestamp is not None:
            # Plain int keeps later arithmetic and formatting free of numpy scalars.
            current_start = int(last_timestamp) + 60
            append_data_to_file(filename, new_data)
            added += len(new_data)
        else:
            skipped += 1000
            current_start = current_start + (60 * 1000)
        counter += 1
        if counter % report_every == 0:
            current_time = time.time()
            elapsed = current_time - last_time
            time_diff = int(elapsed)
            # Use the un-truncated duration: a batch finishing in under one
            # second would otherwise divide by zero.
            if elapsed > 0:
                req_per_sec = int((report_every / elapsed) * 10) / 10
            else:
                req_per_sec = 0.0
            limit_remaining = REQUEST_LIMIT_1MIN - int(current_limit)
            print(f"  iteration {counter:03} completed in {time_diff}s, avg {req_per_sec} r/sec, next={datetime.datetime.fromtimestamp(current_start)}, rate-limit={limit_remaining}")
            if limit_remaining < 50:
                print("rate limit protection - sleep 30s")
                time.sleep(30)
            last_time = current_time
    total_time = int(time.time() - total_start)
    print(f"{counter:03} iterations done in {total_time}s, skipped {skipped}, added {added}\n")

In [None]:
# Look up the first available candle of every symbol; find_first_candlestick
# prints the timestamp it finds.
SYMBOLS = ['BTCUSDT', 'ETHUSDT', 'LTCUSDT', 'AVAXUSDT', 'LINKUSDT', 'UNIUSDT', 'TLMUSDT', 'AXSUSDT', 'DOGEUSDT', 'SHIBUSDT']
for symbol in SYMBOLS:
    # Derive the year in UTC, the timezone get_unix_ts uses for year boundaries;
    # fromtimestamp() without tz would use the machine's local timezone.
    first_year = datetime.datetime.fromtimestamp(find_first_candlestick(symbol), tz=datetime.timezone.utc).year
    # print(f"Starting {symbol} in {first_year}")

In [None]:
# Download the year 2022 for every symbol in SYMBOLS.
# load_year writes through append_data_to_file, which opens the CSV in append
# mode, so re-running this cell adds the same rows to the file again.
SYMBOLS = ['BTCUSDT', 'ETHUSDT', 'LTCUSDT', 'AVAXUSDT', 'LINKUSDT', 'UNIUSDT', 'TLMUSDT', 'AXSUSDT', 'DOGEUSDT', 'SHIBUSDT']
# SYMBOLS = ['LTCUSDT', 'AVAXUSDT', 'LINKUSDT', 'UNIUSDT']
y = 2022
for symbol in SYMBOLS:
    load_year(symbol, y)

In [None]:
# Download the year 2021 for a subset of the symbols.
# As with every load_year call, the target CSV is appended to, not replaced.
SYMBOLS = ['AVAXUSDT', 'LINKUSDT', 'UNIUSDT', 'TLMUSDT', 'AXSUSDT', 'DOGEUSDT', 'SHIBUSDT']
y = 2021
for symbol in SYMBOLS:
    load_year(symbol, y)

In [None]:
# Download every year from a chosen start year up to and including 2022.
SYMBOLS = ['BTCUSDT', 'ETHUSDT', 'LTCUSDT', 'AVAXUSDT', 'LINKUSDT', 'UNIUSDT', 'TLMUSDT', 'AXSUSDT', 'DOGEUSDT', 'SHIBUSDT']

# Year to start every symbol at; a falsy value (None or 0) starts each symbol
# at the year of its first candle instead.
startYear = 2021

for symbol in SYMBOLS:
    if startYear:
        first_year = startYear
    else:
        first_ts = find_first_candlestick(symbol)
        if not first_ts:
            # 0 means no candle was found; converting it would start the loop in 1970.
            print(f"No candles found for {symbol} - skipping")
            continue
        # Year in UTC, matching the UTC year boundaries built by get_unix_ts.
        first_year = datetime.datetime.fromtimestamp(first_ts, tz=datetime.timezone.utc).year
    print(f"Starting {symbol} in {first_year}")
    for y in range(first_year, 2023):
        load_year(symbol, y)

In [None]:
# Download 2022 for a different selection of symbols.
# range(2022, 2023) yields only 2022; the loop form makes it easy to widen the span.
SYMBOLS = ['BTCUSDT', 'ETHUSDT', 'ADAUSDT', 'BNBUSDT', 'XRPUSDT', 'SOLUSDT', 'DOTUSDT', 'DOGEUSDT', 'AVAXUSDT', 'UNIUSDT']
# SYMBOLS = ['BTCUSDT']
for symbol in SYMBOLS:
    # first_year = datetime.datetime.fromtimestamp(find_first_candlestick(symbol)).year
    for y in range(2022, 2023):
        load_year(symbol, y)

In [None]:
# Further symbol selection. This cell only rebinds SYMBOLS and downloads nothing;
# NOTE(review): the next cell assigns SYMBOLS again, so this list looks unused — confirm.
SYMBOLS = ['NEOUSDT', 'LTCUSDT', 'EOSUSDT', 'XLMUSDT', 'SHIBUSDT', 'IOTAUSDT', 'TRXUSDT', 'ETCUSDT', 'LINKUSDT', 'XMRUSDT', 'MATICUSDT', 'ALGOUSDT', 'MANAUSDT']

In [None]:
# Download the full history (first candle's year through 2022) for the selected symbols.
# SYMBOLS = ['BTCUSDT', 'ETHUSDT', 'ADAUSDT', 'BNBUSDT', 'XRPUSDT', 'SOLUSDT', 'DOTUSDT', 'DOGEUSDT', 'AVAXUSDT', 'UNIUSDT']
SYMBOLS = ['TLMUSDT', 'AXSUSDT']
# SYMBOLS = ['BTCUSDT']
for symbol in SYMBOLS:
    first_ts = find_first_candlestick(symbol)
    if not first_ts:
        # 0 means no candle was found; converting it would start the loop in 1970.
        print(f"No candles found for {symbol} - skipping")
        continue
    # Year in UTC, matching the UTC year boundaries built by get_unix_ts.
    first_year = datetime.datetime.fromtimestamp(first_ts, tz=datetime.timezone.utc).year
    for y in range(first_year, 2023):
        load_year(symbol, y)