In [8]:
import glob
import pandas as pd
import numpy as np
import yfinance
import time
from binance.client import Client
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor

### Get BTC, ETH, BNB historical data from 2020-2025 at 5 min intervals

In [None]:
client = Client(api_key='', api_secret='')

def date_to_str(date):
    return date.strftime("%Y-%m-%d %H:%M:%S")

def fetch_and_save_year(symbol, interval, year_start, year_end):
    current_start = year_start
    all_klines = []
    while current_start < year_end:
        current_end = min(current_start + timedelta(days=3), year_end)
        klines = client.get_historical_klines(
            symbol=symbol,
            interval=interval,
            start_str=date_to_str(current_start),
            end_str=date_to_str(current_end)
        )
        if not klines:
            break
        all_klines.extend(klines)
        print(f"{symbol} {year_start.year}: Fetched {len(klines)} rows {current_start} to {current_end}")
        current_start = datetime.fromtimestamp(klines[-1][0] / 1000.0) + timedelta(minutes=5)
        time.sleep(0.5)
    if all_klines:
        df = pd.DataFrame(all_klines, columns=[
            'Open Time', 'Open', 'High', 'Low', 'Close', 'Volume',
            'Close Time', 'Quote Asset Volume', 'Number of Trades',
            'Taker Buy Base Asset Volume', 'Taker Buy Quote Asset Volume', 'Ignore'
        ])
        df['Open Time'] = pd.to_datetime(df['Open Time'], unit='ms')
        df['Close Time'] = pd.to_datetime(df['Close Time'], unit='ms')
        df['Ticker'] = symbol
        df = df.drop(columns=['Ignore'])
        filename = f"{symbol}_{year_start.year}.csv"
        df.to_csv(filename, index=False)
        print(f"{symbol}: Saved {len(df)} rows to {filename}")

def runner(symbol):
    for year in range(2020, 2025):
        year_start = datetime(year, 1, 1)
        year_end = datetime(year + 1, 1, 1)
        fetch_and_save_year(symbol, Client.KLINE_INTERVAL_5MINUTE, year_start, year_end)

symbols = ['BTCUSDT', 'ETHUSDT', 'BNBUSDT']
with ThreadPoolExecutor(max_workers=len(symbols)) as executor:
    executor.map(runner, symbols)

and fails to parse leap day. The default behavior will change in Python 3.15
to either always raise an exception or to use a different default year (TBD).
To avoid trouble, add a specific year to the input & format.
See https://github.com/python/cpython/issues/70647.
  klines = client.get_historical_klines(


BTCUSDT 2020: Fetched 865 rows 2020-01-01 00:00:00 to 2020-01-04 00:00:00
ETHUSDT 2020: Fetched 865 rows 2020-01-01 00:00:00 to 2020-01-04 00:00:00
BNBUSDT 2020: Fetched 865 rows 2020-01-01 00:00:00 to 2020-01-04 00:00:00
BTCUSDT 2020: Fetched 865 rows 2020-01-04 08:05:00 to 2020-01-07 08:05:00
ETHUSDT 2020: Fetched 865 rows 2020-01-04 08:05:00 to 2020-01-07 08:05:00
BNBUSDT 2020: Fetched 865 rows 2020-01-04 08:05:00 to 2020-01-07 08:05:00
BTCUSDT 2020: Fetched 865 rows 2020-01-07 16:10:00 to 2020-01-10 16:10:00
ETHUSDT 2020: Fetched 865 rows 2020-01-07 16:10:00 to 2020-01-10 16:10:00
BNBUSDT 2020: Fetched 865 rows 2020-01-07 16:10:00 to 2020-01-10 16:10:00
BTCUSDT 2020: Fetched 865 rows 2020-01-11 00:15:00 to 2020-01-14 00:15:00
BNBUSDT 2020: Fetched 865 rows 2020-01-11 00:15:00 to 2020-01-14 00:15:00
ETHUSDT 2020: Fetched 865 rows 2020-01-11 00:15:00 to 2020-01-14 00:15:00
BTCUSDT 2020: Fetched 865 rows 2020-01-14 08:20:00 to 2020-01-17 08:20:00
BNBUSDT 2020: Fetched 865 rows 2020-01

and fails to parse leap day. The default behavior will change in Python 3.15
to either always raise an exception or to use a different default year (TBD).
To avoid trouble, add a specific year to the input & format.
See https://github.com/python/cpython/issues/70647.
  klines = client.get_historical_klines(


ETHUSDT 2020: Fetched 727 rows 2020-12-29 11:30:00 to 2021-01-01 00:00:00
BNBUSDT: Saved 94762 rows to BNBUSDT_2020.csv
BTCUSDT: Saved 94762 rows to BTCUSDT_2020.csv


and fails to parse leap day. The default behavior will change in Python 3.15
to either always raise an exception or to use a different default year (TBD).
To avoid trouble, add a specific year to the input & format.
See https://github.com/python/cpython/issues/70647.
  klines = client.get_historical_klines(


BTCUSDT 2021: Fetched 865 rows 2021-01-01 00:00:00 to 2021-01-04 00:00:00
BNBUSDT 2021: Fetched 865 rows 2021-01-01 00:00:00 to 2021-01-04 00:00:00
ETHUSDT: Saved 94762 rows to ETHUSDT_2020.csv
ETHUSDT 2021: Fetched 865 rows 2021-01-01 00:00:00 to 2021-01-04 00:00:00
BTCUSDT 2021: Fetched 865 rows 2021-01-04 08:05:00 to 2021-01-07 08:05:00
BNBUSDT 2021: Fetched 865 rows 2021-01-04 08:05:00 to 2021-01-07 08:05:00
ETHUSDT 2021: Fetched 865 rows 2021-01-04 08:05:00 to 2021-01-07 08:05:00
BNBUSDT 2021: Fetched 865 rows 2021-01-07 16:10:00 to 2021-01-10 16:10:00
BTCUSDT 2021: Fetched 865 rows 2021-01-07 16:10:00 to 2021-01-10 16:10:00
ETHUSDT 2021: Fetched 865 rows 2021-01-07 16:10:00 to 2021-01-10 16:10:00
BTCUSDT 2021: Fetched 865 rows 2021-01-11 00:15:00 to 2021-01-14 00:15:00
BNBUSDT 2021: Fetched 865 rows 2021-01-11 00:15:00 to 2021-01-14 00:15:00
ETHUSDT 2021: Fetched 865 rows 2021-01-11 00:15:00 to 2021-01-14 00:15:00
BTCUSDT 2021: Fetched 865 rows 2021-01-14 08:20:00 to 2021-01-17 0

and fails to parse leap day. The default behavior will change in Python 3.15
to either always raise an exception or to use a different default year (TBD).
To avoid trouble, add a specific year to the input & format.
See https://github.com/python/cpython/issues/70647.
  klines = client.get_historical_klines(


ETHUSDT 2021: Fetched 372 rows 2021-12-30 17:05:00 to 2022-01-01 00:00:00
BNBUSDT: Saved 94460 rows to BNBUSDT_2021.csv
BTCUSDT: Saved 94460 rows to BTCUSDT_2021.csv


and fails to parse leap day. The default behavior will change in Python 3.15
to either always raise an exception or to use a different default year (TBD).
To avoid trouble, add a specific year to the input & format.
See https://github.com/python/cpython/issues/70647.
  klines = client.get_historical_klines(


BTCUSDT 2022: Fetched 865 rows 2022-01-01 00:00:00 to 2022-01-04 00:00:00
BNBUSDT 2022: Fetched 865 rows 2022-01-01 00:00:00 to 2022-01-04 00:00:00
ETHUSDT: Saved 94460 rows to ETHUSDT_2021.csv
ETHUSDT 2022: Fetched 865 rows 2022-01-01 00:00:00 to 2022-01-04 00:00:00
BNBUSDT 2022: Fetched 865 rows 2022-01-04 08:05:00 to 2022-01-07 08:05:00
BTCUSDT 2022: Fetched 865 rows 2022-01-04 08:05:00 to 2022-01-07 08:05:00
ETHUSDT 2022: Fetched 865 rows 2022-01-04 08:05:00 to 2022-01-07 08:05:00
BNBUSDT 2022: Fetched 865 rows 2022-01-07 16:10:00 to 2022-01-10 16:10:00
BTCUSDT 2022: Fetched 865 rows 2022-01-07 16:10:00 to 2022-01-10 16:10:00
ETHUSDT 2022: Fetched 865 rows 2022-01-07 16:10:00 to 2022-01-10 16:10:00
BNBUSDT 2022: Fetched 865 rows 2022-01-11 00:15:00 to 2022-01-14 00:15:00
BTCUSDT 2022: Fetched 865 rows 2022-01-11 00:15:00 to 2022-01-14 00:15:00
ETHUSDT 2022: Fetched 865 rows 2022-01-11 00:15:00 to 2022-01-14 00:15:00
BNBUSDT 2022: Fetched 865 rows 2022-01-14 08:20:00 to 2022-01-17 0

and fails to parse leap day. The default behavior will change in Python 3.15
to either always raise an exception or to use a different default year (TBD).
To avoid trouble, add a specific year to the input & format.
See https://github.com/python/cpython/issues/70647.
  klines = client.get_historical_klines(


BNBUSDT 2023: Fetched 865 rows 2023-01-01 00:00:00 to 2023-01-04 00:00:00
ETHUSDT 2023: Fetched 865 rows 2023-01-01 00:00:00 to 2023-01-04 00:00:00
BTCUSDT 2023: Fetched 865 rows 2023-01-01 00:00:00 to 2023-01-04 00:00:00
ETHUSDT 2023: Fetched 865 rows 2023-01-04 08:05:00 to 2023-01-07 08:05:00BNBUSDT 2023: Fetched 865 rows 2023-01-04 08:05:00 to 2023-01-07 08:05:00

BTCUSDT 2023: Fetched 865 rows 2023-01-04 08:05:00 to 2023-01-07 08:05:00
BNBUSDT 2023: Fetched 865 rows 2023-01-07 16:10:00 to 2023-01-10 16:10:00
BTCUSDT 2023: Fetched 865 rows 2023-01-07 16:10:00 to 2023-01-10 16:10:00
ETHUSDT 2023: Fetched 865 rows 2023-01-07 16:10:00 to 2023-01-10 16:10:00
BTCUSDT 2023: Fetched 865 rows 2023-01-11 00:15:00 to 2023-01-14 00:15:00
BNBUSDT 2023: Fetched 865 rows 2023-01-11 00:15:00 to 2023-01-14 00:15:00
ETHUSDT 2023: Fetched 865 rows 2023-01-11 00:15:00 to 2023-01-14 00:15:00
BNBUSDT 2023: Fetched 865 rows 2023-01-14 08:20:00 to 2023-01-17 08:20:00
ETHUSDT 2023: Fetched 865 rows 2023-01

and fails to parse leap day. The default behavior will change in Python 3.15
to either always raise an exception or to use a different default year (TBD).
To avoid trouble, add a specific year to the input & format.
See https://github.com/python/cpython/issues/70647.
  klines = client.get_historical_klines(


ETHUSDT 2024: Fetched 865 rows 2024-01-01 00:00:00 to 2024-01-04 00:00:00BNBUSDT 2024: Fetched 865 rows 2024-01-01 00:00:00 to 2024-01-04 00:00:00

BTCUSDT 2024: Fetched 865 rows 2024-01-01 00:00:00 to 2024-01-04 00:00:00
BNBUSDT 2024: Fetched 865 rows 2024-01-04 08:05:00 to 2024-01-07 08:05:00
ETHUSDT 2024: Fetched 865 rows 2024-01-04 08:05:00 to 2024-01-07 08:05:00
BTCUSDT 2024: Fetched 865 rows 2024-01-04 08:05:00 to 2024-01-07 08:05:00
BNBUSDT 2024: Fetched 865 rows 2024-01-07 16:10:00 to 2024-01-10 16:10:00
BTCUSDT 2024: Fetched 865 rows 2024-01-07 16:10:00 to 2024-01-10 16:10:00
ETHUSDT 2024: Fetched 865 rows 2024-01-07 16:10:00 to 2024-01-10 16:10:00
BNBUSDT 2024: Fetched 865 rows 2024-01-11 00:15:00 to 2024-01-14 00:15:00
ETHUSDT 2024: Fetched 865 rows 2024-01-11 00:15:00 to 2024-01-14 00:15:00
BTCUSDT 2024: Fetched 865 rows 2024-01-11 00:15:00 to 2024-01-14 00:15:00
BTCUSDT 2024: Fetched 865 rows 2024-01-14 08:20:00 to 2024-01-17 08:20:00
BNBUSDT 2024: Fetched 865 rows 2024-01

and fails to parse leap day. The default behavior will change in Python 3.15
to either always raise an exception or to use a different default year (TBD).
To avoid trouble, add a specific year to the input & format.
See https://github.com/python/cpython/issues/70647.
  klines = client.get_historical_klines(


BNBUSDT 2024: Fetched 660 rows 2024-12-29 17:05:00 to 2025-01-01 00:00:00
ETHUSDT: Saved 94945 rows to ETHUSDT_2024.csv
BTCUSDT: Saved 94945 rows to BTCUSDT_2024.csv
BNBUSDT: Saved 94945 rows to BNBUSDT_2024.csv


In [9]:
files = glob.glob('RawData/*USDT_*.csv')
dfs = []
for file in files:
    df = pd.read_csv(file)
    if 'Ignore' in df.columns:
        df = df.drop(columns=['Ignore'])
    dfs.append(df)

full_df = pd.concat(dfs, ignore_index=True)
full_df['Open Time'] = pd.to_datetime(full_df['Open Time'])
full_df = full_df.sort_values('Open Time').reset_index(drop=True)

full_df

Unnamed: 0,Open Time,Open,High,Low,Close,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,Taker Buy Quote Asset Volume,Ticker
0,2020-01-01 00:00:00,129.1600,129.1900,128.8000,128.8800,696.714120,2020-01-01 00:04:59.999,8.987871e+04,278,374.556600,4.831396e+04,ETHUSDT
1,2020-01-01 00:00:00,13.7159,13.7194,13.7004,13.7010,6609.470000,2020-01-01 00:04:59.999,9.059133e+04,134,1317.360000,1.805495e+04,BNBUSDT
2,2020-01-01 00:00:00,7195.2400,7196.2500,7178.6400,7179.7800,95.509133,2020-01-01 00:04:59.999,6.863171e+05,1127,32.773245,2.355373e+05,BTCUSDT
3,2020-01-01 00:05:00,13.7010,13.7105,13.7000,13.7079,5018.620000,2020-01-01 00:09:59.999,6.876644e+04,51,2537.450000,3.477050e+04,BNBUSDT
4,2020-01-01 00:05:00,128.8400,128.9500,128.6900,128.9300,724.370400,2020-01-01 00:09:59.999,9.330236e+04,298,277.021820,3.569125e+04,ETHUSDT
...,...,...,...,...,...,...,...,...,...,...,...,...
1420390,2024-12-31 23:55:00,702.5500,702.5900,702.2000,702.3000,262.079000,2024-12-31 23:59:59.999,1.840937e+05,849,171.118000,1.202054e+05,BNBUSDT
1420391,2024-12-31 23:55:00,3340.4900,3342.3200,3337.4300,3337.7800,264.432800,2024-12-31 23:59:59.999,8.831281e+05,1574,173.940700,5.809203e+05,ETHUSDT
1420392,2025-01-01 00:00:00,93576.0000,93702.1500,93537.5000,93661.2000,45.941600,2025-01-01 00:04:59.999,4.301907e+06,7448,16.879670,1.580504e+06,BTCUSDT
1420393,2025-01-01 00:00:00,702.3100,702.7900,701.4600,702.6000,396.748000,2025-01-01 00:04:59.999,2.785847e+05,1831,149.938000,1.052896e+05,BNBUSDT


### Append with raw technical features

In [10]:
features = []

for ticker in full_df['Ticker'].unique():
    df = full_df[full_df['Ticker'] == ticker].copy()
    df = df.sort_values('Open Time').reset_index(drop=True)
    
    for col in ['Open', 'High', 'Low', 'Close', 'Volume', 'Quote Asset Volume',
                'Taker Buy Base Asset Volume', 'Taker Buy Quote Asset Volume']:
        df[col] = pd.to_numeric(df[col])

    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
    df['pct_change'] = df['Close'].pct_change()
    df['Close-Open'] = df['Close'] - df['Open']
    df['Bar_Direction'] = np.sign(df['Close'] - df['Open'])
    df['Bar_Change_Direction'] = np.sign(df['Close'] - df['Close'].shift(1))

    for N in [6, 12]:
        df[f'roll_mean_ret_{N}'] = df['log_return'].rolling(N).mean()
        df[f'roll_std_ret_{N}'] = df['log_return'].rolling(N).std()
        df[f'roll_mean_close_{N}'] = df['Close'].rolling(N).mean()
        df[f'roll_std_close_{N}'] = df['Close'].rolling(N).std()
        df[f'roll_high_{N}'] = df['High'].rolling(N).max()
        df[f'roll_low_{N}'] = df['Low'].rolling(N).min()
        df[f'stoch_raw_{N}'] = (df['Close'] - df[f'roll_low_{N}']) / (df[f'roll_high_{N}'] - df[f'roll_low_{N}'] + 1e-9)
        df[f'roll_range_{N}'] = df['High'].rolling(N).max() - df['Low'].rolling(N).min()
        df[f'roll_mean_vol_{N}'] = df['Volume'].rolling(N).mean()
        df[f'vol_surge_{N}'] = df['Volume'] / (df[f'roll_mean_vol_{N}'] + 1e-9)
        df[f'roll_sum_signed_vol_{N}'] = (df['Volume'] * np.sign(df['Close'] - df['Close'].shift(1))).rolling(N).sum()

    # EMA raw components for MACD-like patterns
    df['EMA_6'] = df['Close'].ewm(span=6, adjust=False).mean()
    df['EMA_12'] = df['Close'].ewm(span=12, adjust=False).mean()
    df['EMA_24'] = df['Close'].ewm(span=24, adjust=False).mean()

    # True range for ATR-like volatility
    prev_close = df['Close'].shift(1)
    tr1 = df['High'] - df['Low']
    tr2 = abs(df['High'] - prev_close)
    tr3 = abs(df['Low'] - prev_close)
    df['True_Range'] = np.maximum.reduce([tr1, tr2, tr3])

    # +DM and -DM for ADX-like direction
    up_move = df['High'] - df['High'].shift(1)
    down_move = df['Low'].shift(1) - df['Low']
    df['+DM'] = np.where((up_move > down_move) & (up_move > 0), up_move, 0)
    df['-DM'] = np.where((down_move > up_move) & (down_move > 0), down_move, 0)

    # Raw components for RSI-like features
    df['pos_return'] = np.where(df['Close'] > df['Close'].shift(1), df['Close'] - df['Close'].shift(1), 0)
    df['neg_return'] = np.where(df['Close'] < df['Close'].shift(1), df['Close'].shift(1) - df['Close'], 0)
    df['roll_pos_ret_6'] = df['pos_return'].rolling(6).mean()
    df['roll_neg_ret_6'] = df['neg_return'].rolling(6).mean()
    df['roll_pos_ret_12'] = df['pos_return'].rolling(12).mean()
    df['roll_neg_ret_12'] = df['neg_return'].rolling(12).mean()

    df['avg_trade_size'] = df['Volume'] / (df['Number of Trades'] + 1e-9)
    df['taker_buy_ratio'] = df['Taker Buy Base Asset Volume'] / (df['Volume'] + 1e-9)

    features.append(df)

features_df = pd.concat(features, ignore_index=True)
features_df

Unnamed: 0,Open Time,Open,High,Low,Close,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,...,+DM,-DM,pos_return,neg_return,roll_pos_ret_6,roll_neg_ret_6,roll_pos_ret_12,roll_neg_ret_12,avg_trade_size,taker_buy_ratio
0,2020-01-01 00:00:00,129.16,129.19,128.80,128.88,696.71412,2020-01-01 00:04:59.999,8.987871e+04,278,374.55660,...,0.00,0.00,0.00,0.00,,,,,2.506166,0.537604
1,2020-01-01 00:05:00,128.84,128.95,128.69,128.93,724.37040,2020-01-01 00:09:59.999,9.330236e+04,298,277.02182,...,0.00,0.11,0.05,0.00,,,,,2.430773,0.382431
2,2020-01-01 00:10:00,128.93,128.98,128.80,128.91,462.49462,2020-01-01 00:14:59.999,5.960716e+04,203,223.12403,...,0.03,0.00,0.00,0.02,,,,,2.278299,0.482436
3,2020-01-01 00:15:00,128.91,129.05,128.84,128.85,360.50849,2020-01-01 00:19:59.999,4.647832e+04,224,223.46932,...,0.07,0.00,0.00,0.06,,,,,1.609413,0.619873
4,2020-01-01 00:20:00,128.85,128.91,128.69,128.72,1827.53848,2020-01-01 00:24:59.999,2.352866e+05,244,917.60510,...,0.00,0.15,0.00,0.13,,,,,7.489912,0.502099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1420390,2024-12-31 23:40:00,93464.27,93504.00,93456.00,93490.87,23.44102,2024-12-31 23:44:59.999,2.191292e+06,4534,10.09488,...,0.00,0.02,26.61,0.00,19.975000,56.923333,34.813333,59.085000,0.005170,0.430650
1420391,2024-12-31 23:45:00,93490.86,93544.50,93484.30,93544.49,23.00051,2024-12-31 23:49:59.999,2.150716e+06,3875,16.48456,...,40.50,0.00,53.62,0.00,28.911667,48.836667,39.029167,59.085000,0.005936,0.716704
1420392,2024-12-31 23:50:00,93544.49,93702.15,93544.49,93646.96,20.99684,2024-12-31 23:54:59.999,1.966053e+06,5307,10.22470,...,157.65,0.00,102.47,0.00,45.990000,44.465000,47.568333,37.555833,0.003956,0.486964
1420393,2024-12-31 23:55:00,93646.97,93676.98,93576.00,93576.00,17.03553,2024-12-31 23:59:59.999,1.595066e+06,3808,10.38243,...,0.00,0.00,0.00,70.96,30.450000,56.291667,47.568333,40.305000,0.004474,0.609457


### Get Telegram channels messages from 2020-2025 (proxy for sentiment)

In [36]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

tokenizer = AutoTokenizer.from_pretrained("ElKulako/cryptobert")
model = AutoModelForSequenceClassification.from_pretrained("ElKulako/cryptobert")

pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

ImportError: 
AutoModelForSequenceClassification requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFAutoModelForSequenceClassification".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.
