In [80]:
# Run this if facing issue with transformers then restart jupyter
# pip install torch torchvision torchaudio transformers

import glob
import pandas as pd
import numpy as np
import yfinance
import time
from binance.client import Client
from datetime import datetime, timedelta, timezone
from concurrent.futures import ThreadPoolExecutor
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification, pipeline, BertForSequenceClassification, BertTokenizer
from telethon import TelegramClient, errors
import torch
import torch.nn as nn
from tqdm.auto import tqdm
import asyncio
import nest_asyncio
nest_asyncio.apply()
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import joblib
from torch.nn.utils import weight_norm

### Get BTC, ETH, BNB historical data from 2020-2025 at 5 min intervals

In [16]:
def fetch_and_save(symbol, interval, start_date, end_date):
    current_start = start_date
    all_klines = []

    while current_start < end_date:
        current_end = min(current_start + timedelta(days=3), end_date)
        klines = client.get_historical_klines(
            symbol=symbol,
            interval=interval,
            start_str=date_to_str(current_start),
            end_str=date_to_str(current_end)
        )
        if not klines:
            break

        all_klines.extend(klines)
        print(f"{symbol}: Fetched {len(klines)} rows {current_start} to {current_end}")
        current_start = datetime.fromtimestamp(klines[-1][0] / 1000.0) + timedelta(minutes=5)
        time.sleep(0.5)

    if all_klines:
        df = pd.DataFrame(all_klines, columns=[
            'Open Time', 'Open', 'High', 'Low', 'Close', 'Volume',
            'Close Time', 'Quote Asset Volume', 'Number of Trades',
            'Taker Buy Base Asset Volume', 'Taker Buy Quote Asset Volume', 'Ignore'
        ])
        df['Open Time'] = pd.to_datetime(df['Open Time'], unit='ms', utc=True, errors='coerce')
        df['Close Time'] = pd.to_datetime(df['Close Time'], unit='ms', utc=True, errors='coerce')
        df['Ticker'] = symbol
        df = df.drop(columns=['Ignore'])
        filename = f"{symbol}.csv"
        df.to_csv(f"RawData/{filename}", index=False)
        print(f"{symbol}: Saved {len(df)} rows to RawData/{filename}")
    else:
        print(f"{symbol}: No data fetched.")

def runner(symbol):
    start_date = datetime(2019, 12, 1)  # buffer before 2020
    end_date = datetime(2025, 5, 10)
    fetch_and_save(symbol, Client.KLINE_INTERVAL_5MINUTE, start_date, end_date)

symbols = ['BTCUSDT', 'ETHUSDT', 'BNBUSDT']
for symbol in symbols:
    runner(symbol)

BTCUSDT 2019: Fetched 865 rows 2019-12-01 00:00:00 to 2019-12-04 00:00:00
BTCUSDT 2019: Fetched 865 rows 2019-12-04 08:05:00 to 2019-12-07 08:05:00
BTCUSDT 2019: Fetched 865 rows 2019-12-07 16:10:00 to 2019-12-10 16:10:00
BTCUSDT 2019: Fetched 865 rows 2019-12-11 00:15:00 to 2019-12-14 00:15:00
BTCUSDT 2019: Fetched 865 rows 2019-12-14 08:20:00 to 2019-12-17 08:20:00
BTCUSDT 2019: Fetched 865 rows 2019-12-17 16:25:00 to 2019-12-20 16:25:00
BTCUSDT 2019: Fetched 865 rows 2019-12-21 00:30:00 to 2019-12-24 00:30:00
BTCUSDT 2019: Fetched 865 rows 2019-12-24 08:35:00 to 2019-12-27 08:35:00
BTCUSDT 2019: Fetched 865 rows 2019-12-27 16:40:00 to 2019-12-30 16:40:00
BTCUSDT 2019: Fetched 865 rows 2019-12-31 00:45:00 to 2020-01-03 00:45:00
BTCUSDT 2019: Fetched 865 rows 2020-01-03 08:50:00 to 2020-01-06 08:50:00
BTCUSDT 2019: Fetched 865 rows 2020-01-06 16:55:00 to 2020-01-09 16:55:00
BTCUSDT 2019: Fetched 865 rows 2020-01-10 01:00:00 to 2020-01-13 01:00:00
BTCUSDT 2019: Fetched 865 rows 2020-01

In [19]:
files = glob.glob('RawData/*USDT*.csv')
dfs = []
for file in files:
    df = pd.read_csv(file)
    if 'Ignore' in df.columns:
        df = df.drop(columns=['Ignore'])
    dfs.append(df)

full_df = pd.concat(dfs, ignore_index=True)
full_df['Open Time'] = pd.to_datetime(full_df['Open Time'], utc=True)
full_df = full_df.drop_duplicates()
full_df = full_df.sort_values('Open Time').reset_index(drop=True)

full_df

Unnamed: 0,Open Time,Open,High,Low,Close,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,Taker Buy Quote Asset Volume,Ticker
0,2019-12-01 00:00:00+00:00,15.7030,15.7139,15.6500,15.6747,6413.480000,2019-12-01 00:04:59.999000+00:00,1.004906e+05,123,4484.230000,7.026164e+04,BNBUSDT
1,2019-12-01 00:00:00+00:00,7540.6300,7541.8500,7530.0000,7532.2300,84.165204,2019-12-01 00:04:59.999000+00:00,6.342718e+05,801,30.139058,2.271708e+05,BTCUSDT
2,2019-12-01 00:00:00+00:00,151.4300,151.4300,151.0000,151.0000,1169.242100,2019-12-01 00:04:59.999000+00:00,1.767300e+05,344,491.054140,7.423493e+04,ETHUSDT
3,2019-12-01 00:05:00+00:00,15.6663,15.6846,15.6306,15.6408,2992.330000,2019-12-01 00:09:59.999000+00:00,4.686540e+04,108,1763.080000,2.762296e+04,BNBUSDT
4,2019-12-01 00:05:00+00:00,7530.8500,7533.5100,7519.0000,7524.4600,182.159347,2019-12-01 00:09:59.999000+00:00,1.371344e+06,1239,92.656946,6.975391e+05,BTCUSDT
...,...,...,...,...,...,...,...,...,...,...,...,...
1544311,2025-05-09 23:55:00+00:00,666.6700,667.4600,666.2900,666.2900,1781.472000,2025-05-09 23:59:59.999000+00:00,1.188150e+06,4065,794.530000,5.299370e+05,BNBUSDT
1544312,2025-05-09 23:55:00+00:00,2342.6500,2346.6600,2341.9300,2345.0400,734.368100,2025-05-09 23:59:59.999000+00:00,1.722010e+06,3802,362.822200,8.507275e+05,ETHUSDT
1544313,2025-05-10 00:00:00+00:00,2345.0400,2353.2700,2340.7700,2353.1700,2756.061200,2025-05-10 00:04:59.999000+00:00,6.470580e+06,9468,1327.033000,3.116609e+06,ETHUSDT
1544314,2025-05-10 00:00:00+00:00,666.2900,668.0400,665.3000,666.9200,2605.414000,2025-05-10 00:04:59.999000+00:00,1.736511e+06,8790,1330.963000,8.870441e+05,BNBUSDT


### Append with raw technical features

In [20]:
features = []

for ticker in full_df['Ticker'].unique():
    df = full_df[full_df['Ticker'] == ticker].copy()
    df = df.sort_values('Open Time').reset_index(drop=True)
    
    for col in ['Open', 'High', 'Low', 'Close', 'Volume', 'Quote Asset Volume',
                'Taker Buy Base Asset Volume', 'Taker Buy Quote Asset Volume']:
        df[col] = pd.to_numeric(df[col])

    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
    df['pct_change'] = df['Close'].pct_change()
    df['Close-Open'] = df['Close'] - df['Open']
    df['Bar_Direction'] = np.sign(df['Close'] - df['Open'])
    df['Bar_Change_Direction'] = np.sign(df['Close'] - df['Close'].shift(1))

    for N in [6, 12]:
        df[f'roll_mean_ret_{N}'] = df['log_return'].rolling(N).mean()
        df[f'roll_std_ret_{N}'] = df['log_return'].rolling(N).std()
        df[f'roll_mean_close_{N}'] = df['Close'].rolling(N).mean()
        df[f'roll_std_close_{N}'] = df['Close'].rolling(N).std()
        df[f'roll_high_{N}'] = df['High'].rolling(N).max()
        df[f'roll_low_{N}'] = df['Low'].rolling(N).min()
        df[f'stoch_raw_{N}'] = (df['Close'] - df[f'roll_low_{N}']) / (df[f'roll_high_{N}'] - df[f'roll_low_{N}'] + 1e-9)
        df[f'roll_range_{N}'] = df['High'].rolling(N).max() - df['Low'].rolling(N).min()
        df[f'roll_mean_vol_{N}'] = df['Volume'].rolling(N).mean()
        df[f'vol_surge_{N}'] = df['Volume'] / (df[f'roll_mean_vol_{N}'] + 1e-9)
        df[f'roll_sum_signed_vol_{N}'] = (df['Volume'] * np.sign(df['Close'] - df['Close'].shift(1))).rolling(N).sum()

    # EMA raw components for MACD-like patterns
    df['EMA_6'] = df['Close'].ewm(span=6, adjust=False).mean()
    df['EMA_12'] = df['Close'].ewm(span=12, adjust=False).mean()
    df['EMA_24'] = df['Close'].ewm(span=24, adjust=False).mean()

    # True range for ATR-like volatility
    prev_close = df['Close'].shift(1)
    tr1 = df['High'] - df['Low']
    tr2 = abs(df['High'] - prev_close)
    tr3 = abs(df['Low'] - prev_close)
    df['True_Range'] = np.maximum.reduce([tr1, tr2, tr3])

    # +DM and -DM for ADX-like direction
    up_move = df['High'] - df['High'].shift(1)
    down_move = df['Low'].shift(1) - df['Low']
    df['+DM'] = np.where((up_move > down_move) & (up_move > 0), up_move, 0)
    df['-DM'] = np.where((down_move > up_move) & (down_move > 0), down_move, 0)

    # Raw components for RSI-like features
    df['pos_return'] = np.where(df['Close'] > df['Close'].shift(1), df['Close'] - df['Close'].shift(1), 0)
    df['neg_return'] = np.where(df['Close'] < df['Close'].shift(1), df['Close'].shift(1) - df['Close'], 0)
    df['roll_pos_ret_6'] = df['pos_return'].rolling(6).mean()
    df['roll_neg_ret_6'] = df['neg_return'].rolling(6).mean()
    df['roll_pos_ret_12'] = df['pos_return'].rolling(12).mean()
    df['roll_neg_ret_12'] = df['neg_return'].rolling(12).mean()

    df['avg_trade_size'] = df['Volume'] / (df['Number of Trades'] + 1e-9)
    df['taker_buy_ratio'] = df['Taker Buy Base Asset Volume'] / (df['Volume'] + 1e-9)

    features.append(df)

features_df = pd.concat(features, ignore_index=True)
start_2020 = pd.Timestamp('2020-01-01', tz='UTC')
features_df = features_df[features_df['Open Time'] >= start_2020].reset_index(drop=True)
features_df

Unnamed: 0,Open Time,Open,High,Low,Close,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,...,+DM,-DM,pos_return,neg_return,roll_pos_ret_6,roll_neg_ret_6,roll_pos_ret_12,roll_neg_ret_12,avg_trade_size,taker_buy_ratio
0,2020-01-01 00:00:00+00:00,13.7159,13.7194,13.7004,13.7010,6609.4700,2020-01-01 00:04:59.999000+00:00,9.059133e+04,134,1317.3600,...,0.0000,0.0118,0.0000,0.0151,0.006233,0.007217,0.004492,0.004325,49.324403,0.199314
1,2020-01-01 00:05:00+00:00,13.7010,13.7105,13.7000,13.7079,5018.6200,2020-01-01 00:09:59.999000+00:00,6.876644e+04,51,2537.4500,...,0.0000,0.0004,0.0069,0.0000,0.007383,0.006917,0.004983,0.004325,98.404314,0.505607
2,2020-01-01 00:10:00+00:00,13.7125,13.7182,13.7000,13.7041,4358.3200,2020-01-01 00:14:59.999000+00:00,5.972965e+04,64,2996.2300,...,0.0077,0.0000,0.0000,0.0038,0.004950,0.007550,0.004117,0.004642,68.098750,0.687474
3,2020-01-01 00:15:00+00:00,13.7051,13.7172,13.7008,13.7027,3445.1400,2020-01-01 00:19:59.999000+00:00,4.721789e+04,85,2810.7600,...,0.0000,0.0000,0.0000,0.0014,0.003233,0.007783,0.004117,0.004683,40.531059,0.815862
4,2020-01-01 00:20:00+00:00,13.7024,13.7024,13.7000,13.7002,8187.3900,2020-01-01 00:24:59.999000+00:00,1.121783e+05,61,6360.2300,...,0.0000,0.0008,0.0000,0.0025,0.001150,0.008200,0.004117,0.004458,134.219508,0.776832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520119,2025-05-09 23:40:00+00:00,2341.2100,2341.2100,2338.1400,2339.2000,686.3044,2025-05-09 23:44:59.999000+00:00,1.605359e+06,2561,217.0930,...,0.0000,0.0000,0.0000,2.0100,0.476667,0.873333,1.145000,1.148333,0.267983,0.316322
1520120,2025-05-09 23:45:00+00:00,2339.2000,2342.0000,2338.8400,2341.9200,447.1902,2025-05-09 23:49:59.999000+00:00,1.046555e+06,2847,261.6661,...,0.7900,0.0000,2.7200,0.0000,0.930000,0.676667,1.371667,0.842500,0.157074,0.585134
1520121,2025-05-09 23:50:00+00:00,2341.9300,2342.7000,2339.4300,2342.6500,639.5902,2025-05-09 23:54:59.999000+00:00,1.497192e+06,3251,367.2641,...,0.7000,0.0000,0.7300,0.0000,0.581667,0.676667,1.432500,0.725000,0.196736,0.574218
1520122,2025-05-09 23:55:00+00:00,2342.6500,2346.6600,2341.9300,2345.0400,734.3681,2025-05-09 23:59:59.999000+00:00,1.722010e+06,3802,362.8222,...,3.9600,0.0000,2.3900,0.0000,0.978333,0.676667,1.605833,0.725000,0.193153,0.494060


In [21]:
features_df.isna().sum()

Open Time                       0
Open                            0
High                            0
Low                             0
Close                           0
Volume                          0
Close Time                      0
Quote Asset Volume              0
Number of Trades                0
Taker Buy Base Asset Volume     0
Taker Buy Quote Asset Volume    0
Ticker                          0
log_return                      0
pct_change                      0
Close-Open                      0
Bar_Direction                   0
Bar_Change_Direction            0
roll_mean_ret_6                 0
roll_std_ret_6                  0
roll_mean_close_6               0
roll_std_close_6                0
roll_high_6                     0
roll_low_6                      0
stoch_raw_6                     0
roll_range_6                    0
roll_mean_vol_6                 0
vol_surge_6                     0
roll_sum_signed_vol_6           0
roll_mean_ret_12                0
roll_std_ret_1

### Get Telegram channels messages from 2020-2025 (proxy for sentiment)

In [3]:
api_id = 25170345
api_hash = '782970020305efee143d83f63103a9d9'
session_name = 'session_name'

channels = ['cointelegraph', 'wublockchainenglish', 'watcherguru']
start_date = datetime(2020, 1, 1, tzinfo=timezone.utc)
end_date = datetime(2025, 5, 10, tzinfo=timezone.utc)

client = TelegramClient(session_name, api_id, api_hash)

async def scrape_channel(channel_username, start_date, end_date):
    if not client.is_connected():
        await client.start()

    messages = []

    try:
        async for msg in client.iter_messages(channel_username, offset_date=end_date):
            if msg.date < start_date:
                break
            if msg.message:
                messages.append({
                    'channel': channel_username,
                    'text': msg.message,
                    'timestamp': msg.date
                })
            if len(messages) % 500 == 0:
                print(f"{channel_username}: Collected {len(messages)} messages so far...")

    except errors.FloodWaitError as e:
        print(f"{channel_username}: Flood wait hit. Sleeping {e.seconds} seconds...")
        await asyncio.sleep(e.seconds + 1)
        return await scrape_channel(channel_username, start_date, end_date)

    except errors.ConnectionError:
        print(f"{channel_username}: Disconnected. Reconnecting...")
        await client.connect()
        return await scrape_channel(channel_username, start_date, end_date)

    print(f"{channel_username}: Total messages collected: {len(messages)}")
    return pd.DataFrame(messages)

async def main():
    dfs = []
    for channel in channels:
        df = await scrape_channel(channel, start_date, end_date)
        dfs.append(df)
    
    tele_df = pd.concat(dfs, ignore_index=True)
    return tele_df

tele_df = await main()

cointelegraph: Collected 500 messages so far...
cointelegraph: Collected 500 messages so far...
cointelegraph: Collected 1000 messages so far...
cointelegraph: Collected 1000 messages so far...
cointelegraph: Collected 1500 messages so far...
cointelegraph: Collected 1500 messages so far...
cointelegraph: Collected 2000 messages so far...
cointelegraph: Collected 2500 messages so far...
cointelegraph: Collected 3000 messages so far...
cointelegraph: Collected 3000 messages so far...
cointelegraph: Collected 3500 messages so far...
cointelegraph: Collected 3500 messages so far...
cointelegraph: Collected 4000 messages so far...
cointelegraph: Collected 4500 messages so far...
cointelegraph: Collected 5000 messages so far...
cointelegraph: Collected 5500 messages so far...
cointelegraph: Collected 6000 messages so far...
cointelegraph: Collected 6500 messages so far...
cointelegraph: Collected 7000 messages so far...
cointelegraph: Collected 7500 messages so far...
cointelegraph: Collect

Server closed the connection: 0 bytes read on a total of 8 expected bytes
Error executing high-level request after reconnect: <class 'sqlite3.OperationalError'>: database is locked
Server closed the connection: 0 bytes read on a total of 8 expected bytes
Error executing high-level request after reconnect: <class 'sqlite3.OperationalError'>: database is locked


In [None]:
tele_df = tele_df.sort_values('timestamp').reset_index(drop=True)
end_date_2020 = pd.Timestamp('2025-05-10', tz='UTC')
tele_df = tele_df[tele_df['timestamp'] <= end_date_2020].reset_index(drop=True)

tele_df.to_csv(f'RawData/tele_df.csv', index=False)
tele_df

Unnamed: 0,channel,text,timestamp
0,cointelegraph,​​Happy New Year from the Cointelegraph team! ...,2020-01-01 00:01:14+00:00
1,cointelegraph,⁠Wells Fargo analyst Mike Mayo predicts that t...,2020-01-01 00:26:34+00:00
2,cointelegraph,⁠A review of people who prophasized on BTC pri...,2020-01-01 01:00:04+00:00
3,cointelegraph,⁠John McAfee provides light relief as Bitcoin ...,2020-01-01 09:09:59+00:00
4,cointelegraph,"⁠Academic explains why no other currencies, fi...",2020-01-01 12:18:05+00:00
...,...,...,...
65347,wublockchainenglish,"According to the latest 13F filing, as of Marc...",2025-05-09 20:37:02+00:00
65348,wublockchainenglish,Coinbase CEO Brian Armstrong stated that the c...,2025-05-09 20:51:05+00:00
65349,wublockchainenglish,"According to Bloomberg, the Trump administrati...",2025-05-09 21:11:56+00:00
65350,cointelegraph,🚨 NEW: Why is Tether refusing to comply with M...,2025-05-09 21:35:39+00:00


### Get Sentiment Scores (using pretrained Cryptobert)

In [25]:
tokenizer = AutoTokenizer.from_pretrained("ElKulako/cryptobert")
model = AutoModelForSequenceClassification.from_pretrained("ElKulako/cryptobert")

def get_sentiment_scores(texts, batch_size=16):
    all_probs = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Scoring sentiment"):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            probs = torch.softmax(logits, dim=1).numpy()

        all_probs.extend(probs)

    return all_probs

probs = get_sentiment_scores(tele_df['text'].tolist())

tele_df['bullish_prob'] = [p[0] for p in probs]
tele_df['bearish_prob'] = [p[1] for p in probs]
tele_df['neutral_prob'] = [p[2] for p in probs]

print(tele_df.head())


Scoring sentiment:   0%|          | 0/4085 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Scoring sentiment: 100%|██████████| 4085/4085 [41:59<00:00,  1.62it/s]

         channel                                               text  \
0  cointelegraph  ​​Happy New Year from the Cointelegraph team! ...   
1  cointelegraph  ⁠Wells Fargo analyst Mike Mayo predicts that t...   
2  cointelegraph  ⁠A review of people who prophasized on BTC pri...   
3  cointelegraph  ⁠John McAfee provides light relief as Bitcoin ...   
4  cointelegraph  ⁠Academic explains why no other currencies, fi...   

                  timestamp  bullish_prob  bearish_prob  neutral_prob  
0 2020-01-01 00:01:14+00:00      0.000140      0.500064      0.499796  
1 2020-01-01 00:26:34+00:00      0.000120      0.750730      0.249150  
2 2020-01-01 01:00:04+00:00      0.000185      0.938967      0.060848  
3 2020-01-01 09:09:59+00:00      0.000416      0.907637      0.091948  
4 2020-01-01 12:18:05+00:00      0.001047      0.878411      0.120542  





In [None]:
tele_df = tele_df.sort_values('timestamp').reset_index(drop=True)
tele_df

# tele_df.to_csv(f'RawData/tele_sentiment_df.csv', index=False)

Unnamed: 0,channel,text,timestamp,bullish_prob,bearish_prob,neutral_prob
0,cointelegraph,​​Happy New Year from the Cointelegraph team! ...,2020-01-01 00:01:14+00:00,0.000140,0.500064,0.499796
1,cointelegraph,⁠Wells Fargo analyst Mike Mayo predicts that t...,2020-01-01 00:26:34+00:00,0.000120,0.750730,0.249150
2,cointelegraph,⁠A review of people who prophasized on BTC pri...,2020-01-01 01:00:04+00:00,0.000185,0.938967,0.060848
3,cointelegraph,⁠John McAfee provides light relief as Bitcoin ...,2020-01-01 09:09:59+00:00,0.000416,0.907637,0.091948
4,cointelegraph,"⁠Academic explains why no other currencies, fi...",2020-01-01 12:18:05+00:00,0.001047,0.878411,0.120542
...,...,...,...,...,...,...
65347,wublockchainenglish,"According to the latest 13F filing, as of Marc...",2025-05-09 20:37:02+00:00,0.000511,0.789709,0.209781
65348,wublockchainenglish,Coinbase CEO Brian Armstrong stated that the c...,2025-05-09 20:51:05+00:00,0.000837,0.766005,0.233158
65349,wublockchainenglish,"According to Bloomberg, the Trump administrati...",2025-05-09 21:11:56+00:00,0.000185,0.909061,0.090755
65350,cointelegraph,🚨 NEW: Why is Tether refusing to comply with M...,2025-05-09 21:35:39+00:00,0.056685,0.169183,0.774132


In [None]:
def attach_sentiment_to_features(features_df, tele_df):
    agg_results = []

    for _, row in features_df.iterrows():
        mask = (tele_df['timestamp'] >= row['Open Time']) & (tele_df['timestamp'] <= row['Close Time'])
        subset = tele_df[mask]

        if len(subset) > 0:
            agg_results.append({
                'mean_bullish_prob': subset['bullish_prob'].mean(),
                'mean_bearish_prob': subset['bearish_prob'].mean(),
                'mean_neutral_prob': subset['neutral_prob'].mean(),
                'msg_count': len(subset)
            })
        else:
            agg_results.append({
                'mean_bullish_prob': float('nan'),
                'mean_bearish_prob': float('nan'),
                'mean_neutral_prob': float('nan'),
                'msg_count': 0
            })

    sentiment_df = pd.DataFrame(agg_results)
    final_df = pd.concat([features_df.reset_index(drop=True), sentiment_df], axis=1)
    return final_df

final_df = attach_sentiment_to_features(features_df, tele_df)

In [34]:
final_df[['mean_bullish_prob', 'mean_bearish_prob', 'mean_neutral_prob']] = (
    final_df[['mean_bullish_prob', 'mean_bearish_prob', 'mean_neutral_prob']].ffill()
)
final_df

Unnamed: 0,Open Time,Open,High,Low,Close,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,...,roll_pos_ret_6,roll_neg_ret_6,roll_pos_ret_12,roll_neg_ret_12,avg_trade_size,taker_buy_ratio,mean_bullish_prob,mean_bearish_prob,mean_neutral_prob,msg_count
0,2020-01-01 00:00:00+00:00,13.7159,13.7194,13.7004,13.7010,6609.4700,2020-01-01 00:04:59.999000+00:00,9.059133e+04,134,1317.3600,...,0.006233,0.007217,0.004492,0.004325,49.324403,0.199314,0.000140,0.500064,0.499796,1
1,2020-01-01 00:05:00+00:00,13.7010,13.7105,13.7000,13.7079,5018.6200,2020-01-01 00:09:59.999000+00:00,6.876644e+04,51,2537.4500,...,0.007383,0.006917,0.004983,0.004325,98.404314,0.505607,0.000140,0.500064,0.499796,0
2,2020-01-01 00:10:00+00:00,13.7125,13.7182,13.7000,13.7041,4358.3200,2020-01-01 00:14:59.999000+00:00,5.972965e+04,64,2996.2300,...,0.004950,0.007550,0.004117,0.004642,68.098750,0.687474,0.000140,0.500064,0.499796,0
3,2020-01-01 00:15:00+00:00,13.7051,13.7172,13.7008,13.7027,3445.1400,2020-01-01 00:19:59.999000+00:00,4.721789e+04,85,2810.7600,...,0.003233,0.007783,0.004117,0.004683,40.531059,0.815862,0.000140,0.500064,0.499796,0
4,2020-01-01 00:20:00+00:00,13.7024,13.7024,13.7000,13.7002,8187.3900,2020-01-01 00:24:59.999000+00:00,1.121783e+05,61,6360.2300,...,0.001150,0.008200,0.004117,0.004458,134.219508,0.776832,0.000140,0.500064,0.499796,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520119,2025-05-09 23:40:00+00:00,2341.2100,2341.2100,2338.1400,2339.2000,686.3044,2025-05-09 23:44:59.999000+00:00,1.605359e+06,2561,217.0930,...,0.476667,0.873333,1.145000,1.148333,0.267983,0.316322,0.000057,0.818578,0.181365,0
1520120,2025-05-09 23:45:00+00:00,2339.2000,2342.0000,2338.8400,2341.9200,447.1902,2025-05-09 23:49:59.999000+00:00,1.046555e+06,2847,261.6661,...,0.930000,0.676667,1.371667,0.842500,0.157074,0.585134,0.000057,0.818578,0.181365,0
1520121,2025-05-09 23:50:00+00:00,2341.9300,2342.7000,2339.4300,2342.6500,639.5902,2025-05-09 23:54:59.999000+00:00,1.497192e+06,3251,367.2641,...,0.581667,0.676667,1.432500,0.725000,0.196736,0.574218,0.000057,0.818578,0.181365,0
1520122,2025-05-09 23:55:00+00:00,2342.6500,2346.6600,2341.9300,2345.0400,734.3681,2025-05-09 23:59:59.999000+00:00,1.722010e+06,3802,362.8222,...,0.978333,0.676667,1.605833,0.725000,0.193153,0.494060,0.000057,0.818578,0.181365,0


### Target variable labelling

In [39]:
final_df.columns

Index(['Open Time', 'Open', 'High', 'Low', 'Close', 'Volume', 'Close Time',
       'Quote Asset Volume', 'Number of Trades', 'Taker Buy Base Asset Volume',
       'Taker Buy Quote Asset Volume', 'Ticker', 'log_return', 'pct_change',
       'Close-Open', 'Bar_Direction', 'Bar_Change_Direction',
       'roll_mean_ret_6', 'roll_std_ret_6', 'roll_mean_close_6',
       'roll_std_close_6', 'roll_high_6', 'roll_low_6', 'stoch_raw_6',
       'roll_range_6', 'roll_mean_vol_6', 'vol_surge_6',
       'roll_sum_signed_vol_6', 'roll_mean_ret_12', 'roll_std_ret_12',
       'roll_mean_close_12', 'roll_std_close_12', 'roll_high_12',
       'roll_low_12', 'stoch_raw_12', 'roll_range_12', 'roll_mean_vol_12',
       'vol_surge_12', 'roll_sum_signed_vol_12', 'EMA_6', 'EMA_12', 'EMA_24',
       'True_Range', '+DM', '-DM', 'pos_return', 'neg_return',
       'roll_pos_ret_6', 'roll_neg_ret_6', 'roll_pos_ret_12',
       'roll_neg_ret_12', 'avg_trade_size', 'taker_buy_ratio',
       'mean_bullish_prob', 'mean_

In [40]:
# splitting datasets, to decide which coin to use to trade
btc_final_df = final_df[final_df['Ticker'] == 'BTCUSDT'].copy()

horizon = 1
threshold = 0.001  # make a move only when 0.1% change in return is predicted

btc_final_df['future_return'] = btc_final_df['Close'].shift(-horizon) / final_df['Close'] - 1
btc_final_df['target'] = (btc_final_df['future_return'] > threshold).astype(int)
btc_final_df

Unnamed: 0,Open Time,Open,High,Low,Close,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,...,roll_pos_ret_12,roll_neg_ret_12,avg_trade_size,taker_buy_ratio,mean_bullish_prob,mean_bearish_prob,mean_neutral_prob,msg_count,future_return,target
506708,2020-01-01 00:00:00+00:00,7195.24,7196.25,7178.64,7179.78,95.509133,2020-01-01 00:04:59.999000+00:00,6.863171e+05,1127,32.773245,...,2.090833,3.322500,0.084746,0.343143,0.000140,0.500064,0.499796,1,1.572472e-03,1
506709,2020-01-01 00:05:00+00:00,7179.76,7191.77,7178.20,7191.07,59.365225,2020-01-01 00:09:59.999000+00:00,4.264813e+05,631,24.766513,...,2.991667,3.322500,0.094081,0.417189,0.000140,0.500064,0.499796,0,-1.404520e-03,0
506710,2020-01-01 00:10:00+00:00,7193.15,7193.53,7180.24,7180.97,48.068510,2020-01-01 00:14:59.999000+00:00,3.454465e+05,694,19.422283,...,2.473333,4.164167,0.069263,0.404054,0.000140,0.500064,0.499796,0,-3.732086e-04,0
506711,2020-01-01 00:15:00+00:00,7180.97,7186.40,7177.35,7178.29,32.192929,2020-01-01 00:19:59.999000+00:00,2.311626e+05,576,12.963258,...,2.473333,3.784167,0.055891,0.402674,0.000140,0.500064,0.499796,0,-1.852809e-04,0
506712,2020-01-01 00:20:00+00:00,7177.71,7182.46,7175.47,7176.96,49.027397,2020-01-01 00:24:59.999000+00:00,3.519279e+05,710,22.819744,...,2.473333,3.330833,0.069053,0.465449,0.000140,0.500064,0.499796,0,2.076088e-04,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013411,2025-05-09 23:40:00+00:00,102920.65,102930.16,102900.00,102930.16,9.835260,2025-05-09 23:44:59.999000+00:00,1.012148e+06,1609,4.272570,...,14.711667,16.340833,0.006113,0.434414,0.000057,0.818578,0.181365,0,1.827453e-04,0
1013412,2025-05-09 23:45:00+00:00,102930.16,102950.75,102919.86,102948.97,12.717220,2025-05-09 23:49:59.999000+00:00,1.309100e+06,2413,3.356830,...,16.279167,12.595000,0.005270,0.263959,0.000057,0.818578,0.181365,0,2.237031e-04,0
1013413,2025-05-09 23:50:00+00:00,102948.97,102972.00,102948.97,102972.00,17.077310,2025-05-09 23:54:59.999000+00:00,1.758342e+06,1739,5.546290,...,18.198333,5.949167,0.009820,0.324775,0.000057,0.818578,0.181365,0,-9.711378e-08,0
1013414,2025-05-09 23:55:00+00:00,102971.99,102972.00,102971.99,102971.99,13.393400,2025-05-09 23:59:59.999000+00:00,1.379145e+06,1126,3.547240,...,18.198333,2.587500,0.011895,0.264850,0.000057,0.818578,0.181365,0,1.170221e-03,1


### Walk Forward Train Val Test Split

In [45]:
cols_to_drop = ['Close Time', 'Ticker', 'msg_count', 'future_return']
model_df = btc_final_df.drop(columns=cols_to_drop).copy()

In [49]:
def generate_walk_forward_splits_and_dataframes(df):
    splits = []
    min_time = df['Open Time'].min()
    max_time = df['Open Time'].max()

    train_years = 2.5
    val_months = 9
    test_months = 9
    step_months = 9

    years_int = int(train_years)
    months_int = int((train_years - years_int) * 12)

    current_train_end = (min_time + pd.DateOffset(years=years_int, months=months_int) - pd.Timedelta(seconds=1))

    fold_idx = 1

    while True:
        val_start = current_train_end + pd.Timedelta(seconds=1)
        val_end = val_start + pd.DateOffset(months=val_months) - pd.Timedelta(seconds=1)

        test_start = val_end + pd.Timedelta(seconds=1)
        test_end = test_start + pd.DateOffset(months=test_months) - pd.Timedelta(seconds=1)

        if test_end > max_time:
            if test_start > max_time:
                break
            test_end = max_time  # Use remaining data

        splits.append({
            'train_start': min_time,
            'train_end': current_train_end,
            'val_start': val_start,
            'val_end': val_end,
            'test_start': test_start,
            'test_end': test_end
        })

        # Slice dataframes
        globals()[f"train_fold_{fold_idx}_df"] = df[
            (df['Open Time'] >= min_time) & (df['Open Time'] <= current_train_end)
        ]
        globals()[f"val_fold_{fold_idx}_df"] = df[
            (df['Open Time'] >= val_start) & (df['Open Time'] <= val_end)
        ]
        globals()[f"test_fold_{fold_idx}_df"] = df[
            (df['Open Time'] >= test_start) & (df['Open Time'] <= test_end)
        ]

        print(f"Fold {fold_idx}:")
        print(f"Train: {min_time} to {current_train_end}, rows={len(globals()[f'train_fold_{fold_idx}_df'])}")
        print(f"Val: {val_start} to {val_end}, rows={len(globals()[f'val_fold_{fold_idx}_df'])}")
        print(f"Test: {test_start} to {test_end}, rows={len(globals()[f'test_fold_{fold_idx}_df'])}")

        fold_idx += 1
        current_train_end = current_train_end + pd.DateOffset(months=step_months)

    return splits


In [50]:
splits = generate_walk_forward_splits_and_dataframes(model_df)

Fold 1:
Train: 2020-01-01 00:00:00+00:00 to 2022-06-30 23:59:59+00:00, rows=236099
Val: 2022-07-01 00:00:00+00:00 to 2023-03-31 23:59:59+00:00, rows=71024
Test: 2023-04-01 00:00:00+00:00 to 2023-12-31 23:59:59+00:00, rows=71232
Fold 2:
Train: 2020-01-01 00:00:00+00:00 to 2023-03-30 23:59:59+00:00, rows=306835
Val: 2023-03-31 00:00:00+00:00 to 2023-12-30 23:59:59+00:00, rows=71328
Test: 2023-12-31 00:00:00+00:00 to 2024-09-29 23:59:59+00:00, rows=70947
Fold 3:
Train: 2020-01-01 00:00:00+00:00 to 2023-12-30 23:59:59+00:00, rows=378163
Val: 2023-12-31 00:00:00+00:00 to 2024-09-29 23:59:59+00:00, rows=70947
Test: 2024-09-30 00:00:00+00:00 to 2025-05-10 00:00:00+00:00, rows=57598


In [51]:
folds_data = []
for i in range(1, len(splits)+1):
    folds_data.append({
        'train': globals()[f'train_fold_{i}_df'],
        'val': globals()[f'val_fold_{i}_df'],
        'test': globals()[f'test_fold_{i}_df']
    })

In [60]:
def create_sliding_windows(df, feature_cols, target_col, seq_len):
    X, y = [], []
    data = df[feature_cols].values
    target = df[target_col].values

    for i in range(len(df) - seq_len):
        X.append(data[i:i + seq_len])
        y.append(target[i + seq_len])

    return np.array(X), np.array(y)

def generate_fold_data(train_df, val_df, test_df, feature_cols, target_col, seq_len, fold_num):
    globals()[f'X_train_fold_{fold_num}'], globals()[f'y_train_fold_{fold_num}'] = create_sliding_windows(train_df, feature_cols, target_col, seq_len)
    globals()[f'X_val_fold_{fold_num}'], globals()[f'y_val_fold_{fold_num}'] = create_sliding_windows(val_df, feature_cols, target_col, seq_len)
    globals()[f'X_test_fold_{fold_num}'], globals()[f'y_test_fold_{fold_num}'] = create_sliding_windows(test_df, feature_cols, target_col, seq_len)

In [61]:
feature_cols = [col for col in model_df.columns if col not in ['Open Time', 'target']]
target_col = 'target'
seq_len = 12  # aka look back 12 periods aka 60 mins

# Fold 1
generate_fold_data(train_fold_1_df, val_fold_1_df, test_fold_1_df, feature_cols, target_col, seq_len, fold_num=1)
print(f"Fold 1 shapes:")
print(f"X_train_fold_1: {X_train_fold_1.shape}, y_train_fold_1: {y_train_fold_1.shape}")
print(f"X_val_fold_1: {X_val_fold_1.shape}, y_val_fold_1: {y_val_fold_1.shape}")
print(f"X_test_fold_1: {X_test_fold_1.shape}, y_test_fold_1: {y_test_fold_1.shape}")

# Fold 2
generate_fold_data(train_fold_2_df, val_fold_2_df, test_fold_2_df, feature_cols, target_col, seq_len, fold_num=2)
print(f"\nFold 2 shapes:")
print(f"X_train_fold_2: {X_train_fold_2.shape}, y_train_fold_2: {y_train_fold_2.shape}")
print(f"X_val_fold_2: {X_val_fold_2.shape}, y_val_fold_2: {y_val_fold_2.shape}")
print(f"X_test_fold_2: {X_test_fold_2.shape}, y_test_fold_2: {y_test_fold_2.shape}")

# Fold 3
generate_fold_data(train_fold_3_df, val_fold_3_df, test_fold_3_df, feature_cols, target_col, seq_len, fold_num=3)
print(f"\nFold 3 shapes:")
print(f"X_train_fold_3: {X_train_fold_3.shape}, y_train_fold_3: {y_train_fold_3.shape}")
print(f"X_val_fold_3: {X_val_fold_3.shape}, y_val_fold_3: {y_val_fold_3.shape}")
print(f"X_test_fold_3: {X_test_fold_3.shape}, y_test_fold_3: {y_test_fold_3.shape}")


Fold 1 shapes:
X_train_fold_1: (236087, 12, 53), y_train_fold_1: (236087,)
X_val_fold_1: (71012, 12, 53), y_val_fold_1: (71012,)
X_test_fold_1: (71220, 12, 53), y_test_fold_1: (71220,)

Fold 2 shapes:
X_train_fold_2: (306823, 12, 53), y_train_fold_2: (306823,)
X_val_fold_2: (71316, 12, 53), y_val_fold_2: (71316,)
X_test_fold_2: (70935, 12, 53), y_test_fold_2: (70935,)

Fold 3 shapes:
X_train_fold_3: (378151, 12, 53), y_train_fold_3: (378151,)
X_val_fold_3: (70935, 12, 53), y_val_fold_3: (70935,)
X_test_fold_3: (57586, 12, 53), y_test_fold_3: (57586,)


In [71]:
folds_data_windows = []

for i in range(1, 4):  # Assuming 3 folds
    folds_data_windows.append({
        'X_train': globals()[f'X_train_fold_{i}'],
        'y_train': globals()[f'y_train_fold_{i}'],
        'X_val': globals()[f'X_val_fold_{i}'],
        'y_val': globals()[f'y_val_fold_{i}'],
        'X_test': globals()[f'X_test_fold_{i}'],
        'y_test': globals()[f'y_test_fold_{i}']
    })

### LSTM

In [None]:
# def train_and_validate_lstm(X_train, y_train, X_val, y_val, input_dim, epochs=20, lr=1e-3):
#     device = 'cuda' if torch.cuda.is_available() else 'cpu'
#     model = LSTMModel(input_dim=input_dim).to(device)
#     optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#     criterion = nn.BCEWithLogitsLoss()

#     X_train_t = torch.tensor(X_train, dtype=torch.float32).to(device)
#     y_train_t = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1).to(device)
#     X_val_t = torch.tensor(X_val, dtype=torch.float32).to(device)
#     y_val_t = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1).to(device)

#     for epoch in range(epochs):
#         model.train()
#         optimizer.zero_grad()
#         out = model(X_train_t)
#         loss = criterion(out, y_train_t)
#         loss.backward()
#         optimizer.step()

#         model.eval()
#         with torch.no_grad():
#             val_out = model(X_val_t)
#             val_loss = criterion(val_out, y_val_t)

#         print(f"Epoch {epoch+1}: train loss {loss.item():.4f}, val loss {val_loss.item():.4f}")

#     return model

In [65]:
for fold_num in [1, 2, 3]:
    X_train = globals()[f'X_train_fold_{fold_num}']
    y_train = globals()[f'y_train_fold_{fold_num}']
    X_val = globals()[f'X_val_fold_{fold_num}']
    y_val = globals()[f'y_val_fold_{fold_num}']
    X_test = globals()[f'X_test_fold_{fold_num}']
    y_test = globals()[f'y_test_fold_{fold_num}']

    print(f"\n=== Fold {fold_num} ===")
    model = train_and_validate_lstm(X_train, y_train, X_val, y_val, input_dim=X_train.shape[2])

    # Evaluate on test
    model.eval()
    X_test_t = torch.tensor(X_test, dtype=torch.float32).to('cuda' if torch.cuda.is_available() else 'cpu')
    with torch.no_grad():
        preds = torch.sigmoid(model(X_test_t)).cpu().numpy()

    print(f"Fold {fold_num}: Test preds sample: {preds[:5].flatten()}")


=== Fold 1 ===
Epoch 1: train loss 0.7004, val loss 0.6773
Epoch 2: train loss 0.6820, val loss 0.6553
Epoch 3: train loss 0.6654, val loss 0.6346
Epoch 4: train loss 0.6499, val loss 0.6145
Epoch 5: train loss 0.6354, val loss 0.5946
Epoch 6: train loss 0.6218, val loss 0.5750
Epoch 7: train loss 0.6091, val loss 0.5560
Epoch 8: train loss 0.5974, val loss 0.5377
Epoch 9: train loss 0.5871, val loss 0.5204
Epoch 10: train loss 0.5781, val loss 0.5042
Epoch 11: train loss 0.5710, val loss 0.4896
Epoch 12: train loss 0.5661, val loss 0.4773
Epoch 13: train loss 0.5637, val loss 0.4677
Epoch 14: train loss 0.5640, val loss 0.4613
Epoch 15: train loss 0.5663, val loss 0.4577
Epoch 16: train loss 0.5692, val loss 0.4560
Epoch 17: train loss 0.5712, val loss 0.4555
Epoch 18: train loss 0.5719, val loss 0.4558
Epoch 19: train loss 0.5714, val loss 0.4567
Epoch 20: train loss 0.5702, val loss 0.4582
Fold 1: Test preds sample: [0.20806277 0.20831208 0.2084536  0.21011253 0.21197611]

=== Fold

KeyboardInterrupt: 

In [67]:
def train_val_tune_timeseries(
    model_builder,  # function that builds model (e.g., lambda input_dim, **params: LSTMModel(input_dim, **params))
    folds_data,  # list of dicts like before
    model_name='model',
    param_grid=None,  # dict: {'hidden_dim': [32, 64], 'num_layers': [1, 2]}
    threshold_grid=np.arange(0.4, 0.61, 0.1),
    scoring='f1',
    epochs=10,
    lr=1e-3
):
    input_dim = folds_data[0]['X_train'].shape[2]
    results = []
    best_overall_f1 = -1
    best_overall_state = None
    best_overall_params = None

    for fold_idx, fold in enumerate(folds_data, 1):
        X_train, y_train = fold['X_train'], fold['y_train']
        X_val, y_val = fold['X_val'], fold['y_val']
        X_test, y_test = fold['X_test'], fold['y_test']

        # Prepare param combinations
        import itertools
        keys, values = zip(*param_grid.items())
        param_combos = [dict(zip(keys, v)) for v in itertools.product(*values)]

        best_f1_fold = -1
        best_model_state_fold = None
        best_params_fold = None
        best_threshold_fold = None

        for params in param_combos:
            model = model_builder(input_dim, **params)
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            model = model.to(device)

            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            criterion = nn.BCEWithLogitsLoss()

            X_train_t = torch.tensor(X_train, dtype=torch.float32).to(device)
            y_train_t = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1).to(device)
            X_val_t = torch.tensor(X_val, dtype=torch.float32).to(device)

            # Train
            model.train()
            for _ in range(epochs):
                optimizer.zero_grad()
                out = model(X_train_t)
                loss = criterion(out, y_train_t)
                loss.backward()
                optimizer.step()

            # Val probs
            model.eval()
            with torch.no_grad():
                val_logits = model(X_val_t).cpu().numpy().flatten()
                val_probs = 1 / (1 + np.exp(-val_logits))

            # Tune threshold
            for threshold in threshold_grid:
                val_preds = (val_probs >= threshold).astype(int)
                precision = precision_score(y_val, val_preds, zero_division=0)
                recall = recall_score(y_val, val_preds, zero_division=0)
                f1 = f1_score(y_val, val_preds, zero_division=0)

                if f1 > best_f1_fold:
                    best_f1_fold = f1
                    best_model_state_fold = model.state_dict()
                    best_params_fold = params
                    best_threshold_fold = threshold

        # Test with best
        model = model_builder(input_dim, **best_params_fold)
        model.load_state_dict(best_model_state_fold)
        model = model.to(device)
        model.eval()

        X_test_t = torch.tensor(X_test, dtype=torch.float32).to(device)
        with torch.no_grad():
            test_logits = model(X_test_t).cpu().numpy().flatten()
            test_probs = 1 / (1 + np.exp(-test_logits))
            test_preds = (test_probs >= best_threshold_fold).astype(int)

        test_precision = precision_score(y_test, test_preds, zero_division=0)
        test_recall = recall_score(y_test, test_preds, zero_division=0)
        test_f1 = f1_score(y_test, test_preds, zero_division=0)
        test_auc = roc_auc_score(y_test, test_probs)
        test_acc = (test_preds == y_test).mean()

        results.append({
            'Fold': fold_idx,
            'Test_Size': len(y_test),
            'Test_Acc': test_acc,
            'Test_Precision': test_precision,
            'Test_Recall': test_recall,
            'Test_f1': test_f1,
            'Test_AUC': test_auc,
            'best_threshold': best_threshold_fold,
            'best_params': best_params_fold,
            'Optimized_Score': test_f1,
            'Optimized_Score_Type': scoring
        })

        if test_f1 > best_overall_f1:
            best_overall_f1 = test_f1
            best_overall_state = best_model_state_fold
            best_overall_params = {
                'fold': fold_idx,
                'params': best_params_fold,
                'threshold': best_threshold_fold
            }

    # Save best
    torch.save(best_overall_state, f'{model_name}_best_model.pth')
    joblib.dump(best_overall_params, f'{model_name}_best_params.pkl')
    results_df = pd.DataFrame(results)
    results_df.to_csv(f'{model_name}_results.csv', index=False)

    # Aggregate
    agg = {
        'Fold': 'Aggregate',
        'Test_Size': results_df['Test_Size'].sum(),
        'Test_Acc': np.average(results_df['Test_Acc'], weights=results_df['Test_Size']),
        'Test_Precision': np.average(results_df['Test_Precision'], weights=results_df['Test_Size']),
        'Test_Recall': np.average(results_df['Test_Recall'], weights=results_df['Test_Size']),
        'Test_f1': np.average(results_df['Test_f1'], weights=results_df['Test_Size']),
        'Test_AUC': np.average(results_df['Test_AUC'], weights=results_df['Test_Size']),
        'best_threshold': None,
        'best_params': None,
        'Optimized_Score': np.average(results_df['Test_f1'], weights=results_df['Test_Size']),
        'Optimized_Score_Type': scoring
    }
    results_df = pd.concat([results_df, pd.DataFrame([agg])], ignore_index=True)

    return results_df


In [75]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]
        out = self.fc(out)
        return out

param_grid = {
    'hidden_dim': [32, 64],
    'num_layers': [1, 2]
}

lstm_results_df = train_val_tune_timeseries(
    model_builder=lambda input_dim, hidden_dim, num_layers: LSTMModel(input_dim, hidden_dim, num_layers),
    folds_data=folds_data_windows,
    model_name='lstm',
    param_grid=param_grid,
    scoring='f1'
)

  results_df = pd.concat([results_df, pd.DataFrame([agg])], ignore_index=True)


In [78]:
lstm_results_df

Unnamed: 0,Fold,Test_Size,Test_Acc,Test_Precision,Test_Recall,Test_f1,Test_AUC,best_threshold,best_params,Optimized_Score,Optimized_Score_Type
0,1,71220,0.114687,0.114687,1.0,0.205774,0.49721,0.4,"{'hidden_dim': 32, 'num_layers': 2}",0.205774,f1
1,2,70935,0.517121,0.222157,0.625824,0.327911,0.584503,0.6,"{'hidden_dim': 32, 'num_layers': 1}",0.327911,f1
2,3,57586,0.453357,0.213568,0.714829,0.328878,0.555494,0.4,"{'hidden_dim': 32, 'num_layers': 2}",0.328878,f1
3,Aggregate,199741,0.355245,0.181361,0.784901,0.28464,0.545014,,,0.28464,f1


### TCN

In [None]:
class TemporalBlock(nn.Module):
    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding):
        super().__init__()
        self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        self.relu = nn.ReLU()
        self.net = nn.Sequential(self.conv1, self.relu)

    def forward(self, x):
        return self.net(x)

class TCNModel(nn.Module):
    def __init__(self, input_dim, num_channels=[32, 32], kernel_size=3):
        super().__init__()
        layers = []
        in_channels = input_dim
        for out_channels in num_channels:
            layers.append(TemporalBlock(in_channels, out_channels, kernel_size, 1, 1, 1))
            in_channels = out_channels
        self.network = nn.Sequential(*layers)
        self.fc = nn.Linear(num_channels[-1], 1)

    def forward(self, x):
        x = x.transpose(1, 2)
        y = self.network(x)
        y = y[:, :, -1]
        y = self.fc(y)
        return y

param_grid_tcn = {
    'num_channels': [
        [32, 32],
        [64, 64]
    ],
    'kernel_size': [3, 5]
}

tcn_results_df = train_val_tune_timeseries(
    model_builder=lambda input_dim, num_channels, kernel_size:
        TCNModel(input_dim, num_channels, kernel_size),
    folds_data=folds_data_windows,
    model_name='tcn',
    param_grid=param_grid_tcn,
    scoring='f1'
)

tcn_results_df

  val_probs = 1 / (1 + np.exp(-val_logits))
  WeightNorm.apply(module, name, dim)
  val_probs = 1 / (1 + np.exp(-val_logits))
  WeightNorm.apply(module, name, dim)
  val_probs = 1 / (1 + np.exp(-val_logits))
  WeightNorm.apply(module, name, dim)
  val_probs = 1 / (1 + np.exp(-val_logits))
  WeightNorm.apply(module, name, dim)
  test_probs = 1 / (1 + np.exp(-test_logits))
  WeightNorm.apply(module, name, dim)
  val_probs = 1 / (1 + np.exp(-val_logits))
  WeightNorm.apply(module, name, dim)
  val_probs = 1 / (1 + np.exp(-val_logits))
  WeightNorm.apply(module, name, dim)
  val_probs = 1 / (1 + np.exp(-val_logits))
  WeightNorm.apply(module, name, dim)
  val_probs = 1 / (1 + np.exp(-val_logits))
  WeightNorm.apply(module, name, dim)
  test_probs = 1 / (1 + np.exp(-test_logits))
  WeightNorm.apply(module, name, dim)
  val_probs = 1 / (1 + np.exp(-val_logits))
  WeightNorm.apply(module, name, dim)


### Transformer

In [None]:
class TransformerModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, num_heads=2):
        super().__init__()
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=input_dim, nhead=num_heads, dim_feedforward=hidden_dim),
            num_layers=2
        )
        self.fc = nn.Linear(input_dim, 1)

    def forward(self, x):
        x = x.transpose(0, 1)
        y = self.encoder(x)
        y = self.fc(y[-1])
        return y

param_grid_transformer = {
    'hidden_dim': [32, 64],
    'num_heads': [2, 4]
}

transformer_results_df = train_val_tune_timeseries(
    model_builder=lambda input_dim, hidden_dim, num_heads:
        TransformerModel(input_dim, hidden_dim, num_heads),
    folds_data=folds_data_windows,
    model_name='transformer',
    param_grid=param_grid_transformer,
    scoring='f1'
)