In [1]:
from binance.spot import Spot
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import os
from dateutil import parser
from dotenv import load_dotenv
import math
from tqdm.notebook import tqdm
import time

In [2]:
# Fetch the most recent 1-minute BTCUSDT klines from the public (unauthenticated) Spot client.
res = Spot().klines(symbol="BTCUSDT", interval="1m")

In [3]:
# Label the positional kline fields returned by the API, then preview the frame.
res_df = pd.DataFrame(
    data=res,
    columns=[
        "open_time",
        "open",
        "high",
        "low",
        "close",
        "volume",
        "close_time",
        "q_volume",
        "num_trades",
        "taker_buy_a_vol",
        "taker_buy_q_vol",
        "ignore",
    ],
)
display(res_df.head(n=5))

Unnamed: 0,open_time,open,high,low,close,volume,close_time,q_volume,num_trades,taker_buy_a_vol,taker_buy_q_vol,ignore
0,1694240100000,25864.67,25864.73,25864.67,25864.73,3.64632,1694240159999,94310.9485678,267,2.35219,60838.6531177,0
1,1694240160000,25864.72,25864.73,25864.72,25864.72,2.3058,1694240219999,59638.8822863,235,1.09103,28219.1963719,0
2,1694240220000,25864.72,25864.73,25864.72,25864.73,1.57368,1694240279999,40702.8017944,195,0.92248,23859.6961304,0
3,1694240280000,25864.72,25864.73,25864.72,25864.72,1.1585,1694240339999,29964.2816857,174,0.35657,9222.5867761,0
4,1694240340000,25864.73,25864.73,25864.72,25864.73,0.81664,1694240399999,21122.1677415,115,0.28007,7243.9349311,0


In [4]:
# Build a converted copy instead of aliasing `res_df`: plain assignment would
# make both names point at the same frame, so converting the columns would
# silently rewrite `res_df` and make the raw preview above stale.
# `assign` returns a new frame, which keeps this cell safe to re-run.
df = res_df.assign(
    open_time=pd.to_datetime(res_df["open_time"], unit="ms"),
    close_time=pd.to_datetime(res_df["close_time"], unit="ms"),
)
display(df.head(5))

Unnamed: 0,open_time,open,high,low,close,volume,close_time,q_volume,num_trades,taker_buy_a_vol,taker_buy_q_vol,ignore
0,2023-09-09 06:15:00,25864.67,25864.73,25864.67,25864.73,3.64632,2023-09-09 06:15:59.999,94310.9485678,267,2.35219,60838.6531177,0
1,2023-09-09 06:16:00,25864.72,25864.73,25864.72,25864.72,2.3058,2023-09-09 06:16:59.999,59638.8822863,235,1.09103,28219.1963719,0
2,2023-09-09 06:17:00,25864.72,25864.73,25864.72,25864.73,1.57368,2023-09-09 06:17:59.999,40702.8017944,195,0.92248,23859.6961304,0
3,2023-09-09 06:18:00,25864.72,25864.73,25864.72,25864.72,1.1585,2023-09-09 06:18:59.999,29964.2816857,174,0.35657,9222.5867761,0
4,2023-09-09 06:19:00,25864.73,25864.73,25864.72,25864.73,0.81664,2023-09-09 06:19:59.999,21122.1677415,115,0.28007,7243.9349311,0


The functions below retrieve the full historical kline data from Binance; they are adapted from [this Medium article](https://medium.com/swlh/retrieving-full-historical-data-for-every-cryptocurrency-on-binance-bitmex-using-the-python-apis-27b47fd8137f).

In [5]:
# Length of each supported kline interval, in minutes. Used to work out how
# many candles lie between two timestamps and how far to advance per batch.
# The original four entries are unchanged; the rest are additional Binance
# intervals so that e.g. interval="15m" no longer fails with a KeyError.
BINSIZES = {
    "1m": 1,
    "3m": 3,
    "5m": 5,
    "15m": 15,
    "30m": 30,
    "1h": 60,
    "2h": 120,
    "4h": 240,
    "6h": 360,
    "8h": 480,
    "12h": 720,
    "1d": 1440,
    "3d": 4320,
    "1w": 10080,
}

# Number of klines requested per API call (passed as `limit`).
BATCH_SIZE = 1000
# Folder where the downloaded CSV files are stored.
DATA_FOLDER_PATH = "./data"
# Shared Spot client, created without credentials.
binance_getter = Spot()

In [6]:
def get_times(symbol: str, data: pd.DataFrame, interval: str = "1m", source: str = "binance"):
    """Return the time span that still has to be downloaded for ``symbol``.

    Parameters
    ----------
    symbol : str
        Trading pair, e.g. ``"BTCUSDT"``.
    data : pd.DataFrame
        Previously downloaded klines. When non-empty it must have a
        ``"timestamp"`` column; its last value is used as the start of the span.
    interval : str
        Kline interval forwarded to the exchange client.
    source : str
        Data source. Only ``"binance"`` is implemented.

    Returns
    -------
    tuple
        ``(old, new)``: ``old`` is the last stored timestamp (or 01-01-2017
        when ``data`` is empty) as a ``datetime.datetime``; ``new`` is the
        first field (open time) of the latest kline returned by the exchange,
        as a ``pd.Timestamp``.

    Raises
    ------
    ValueError
        If ``source`` is not supported.
    """
    # Fail early and explicitly: previously an unknown source left `old`/`new`
    # unassigned and surfaced as a confusing UnboundLocalError at `return`.
    if source != "binance":
        raise ValueError(f"Unsupported source: {source!r} (only 'binance' is implemented)")

    if len(data) > 0:
        # pd.to_datetime accepts both the strings read back from CSV and
        # already-parsed timestamps; convert to a plain datetime for callers.
        old = pd.to_datetime(data["timestamp"].iloc[-1]).to_pydatetime()
    else:
        old = datetime.datetime.strptime("01-01-2017", "%d-%m-%Y")

    latest_kline = binance_getter.klines(symbol, interval=interval)[-1]
    new = pd.to_datetime(latest_kline[0], unit="ms")
    return old, new

In [11]:
def get_binance_data(symbol: str, interval: str = "1m", save: bool = False):
    """Download klines for ``symbol`` and merge them with any stored CSV.

    If ``<DATA_FOLDER_PATH>/<symbol>-<interval>-data.csv`` exists, only the
    span between its last timestamp and the latest kline is requested;
    otherwise the download starts at 01-01-2017. Requests are issued in
    batches of ``BATCH_SIZE`` klines.

    Parameters
    ----------
    symbol : str
        Trading pair, e.g. ``"BTCUSDT"``.
    interval : str
        Kline interval; must be a key of ``BINSIZES``.
    save : bool
        When True, write the merged data back to the CSV file.

    Returns
    -------
    pd.DataFrame
        Stored and newly downloaded klines, one row per ``timestamp``,
        sorted by ``timestamp``.
    """
    kline_columns = [
        'timestamp', 'open', 'high', 'low', 'close', 'volume', 'close_time',
        'quote_av', 'trades', 'tb_base_av', 'tb_quote_av', 'ignore'
    ]

    filename = os.path.join(DATA_FOLDER_PATH, f"{symbol}-{interval}-data.csv")
    if os.path.isfile(filename):
        existing_df = pd.read_csv(filename)
        # Files saved with the index included come back with an extra
        # "Unnamed: 0" column; drop it so it does not accumulate on every run.
        existing_df = existing_df.loc[:, ~existing_df.columns.str.startswith("Unnamed:")]
    else:
        existing_df = pd.DataFrame()

    oldest, newest = get_times(symbol=symbol, data=existing_df, interval=interval, source="binance")
    delta_min = (newest - oldest).total_seconds() / 60
    available_data = math.ceil(delta_min / BINSIZES[interval])

    if oldest == datetime.datetime.strptime("01-01-2017", "%d-%m-%Y"):
        print("Downloading all available data (i.e., from 01-01-2017)")
    else:
        print(f"Downloading data from {datetime.datetime.strftime(oldest, '%d-%m-%Y %H:%M:%S')} to {datetime.datetime.strftime(newest, '%d-%m-%Y %H:%M:%S')}")

    num_iters = math.ceil(available_data / BATCH_SIZE)
    ls_data = []

    if num_iters > 0:
        print("Currently iterating through")
        epoch = datetime.datetime(1970, 1, 1)
        batch_minutes = BATCH_SIZE * BINSIZES[interval]
        for num in tqdm(range(num_iters)):
            new_time = oldest + datetime.timedelta(minutes=num * batch_minutes)
            cur_klines = binance_getter.klines(
                symbol=symbol,
                interval=interval,
                limit=BATCH_SIZE,
                # The API expects milliseconds since the Unix epoch.
                startTime=int((new_time - epoch).total_seconds()) * 1000
            )
            ls_data.append(pd.DataFrame(cur_klines, columns=kline_columns))

    frames = []
    if len(existing_df) > 0:
        # Timestamps come back from CSV as strings; parse them so they compare
        # equal to (and sort with) the freshly downloaded datetimes.
        frames.append(existing_df.assign(timestamp=pd.to_datetime(existing_df["timestamp"])))
    if ls_data:
        # Guarded: pd.concat raises on an empty list, which happens when the
        # stored data is already up to date (num_iters == 0).
        data = pd.concat(ls_data, ignore_index=True)
        data["timestamp"] = pd.to_datetime(data["timestamp"], unit="ms")
        frames.append(data)

    if frames:
        # The first request starts at the last stored timestamp, so that candle
        # is fetched again; keep the newest copy of each timestamp.
        combined = (
            pd.concat(frames, ignore_index=True)
            .drop_duplicates(subset="timestamp", keep="last")
            .sort_values("timestamp")
            .reset_index(drop=True)
        )
    else:
        combined = pd.DataFrame(columns=kline_columns)

    if save:
        os.makedirs(DATA_FOLDER_PATH, exist_ok=True)
        # index=False keeps the file free of a spurious index column.
        combined.to_csv(filename, index=False)
        print(f"Data saved to {filename}")
    return combined

In [12]:
# Download (or incrementally update) the 1-minute BTCUSDT history and persist it to CSV.
binance_data = get_binance_data(
    symbol="BTCUSDT",
    interval="1m",
    save=True,
)

Downloading all available data (i.e., from 01-01-2017)
Currently iterating through


  0%|          | 0/3518 [00:00<?, ?it/s]

Data saved to ./data/BTCUSDT-1m-data.csv
