In [656]:
import numpy as np 
import pandas as pd 
import os 
import random 
from sklearn.model_selection import train_test_split, StratifiedKFold
from tqdm import tqdm 
import torch 
import torch.nn as nn 
import time 
import datetime 
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor 
import json 
import ccxt 
from datetime import datetime, timedelta 
import seaborn as sns 
import pandas_ta as ta 
import matplotlib.pyplot as plt 
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score 

In [2]:
df = pd.read_csv("tokenpost_속보.csv") 
train_news = pd.read_csv("most_recent_btcusdt/train_finetune.csv")
test_news = pd.read_csv("most_recent_btcusdt/test_finetune.csv") 

df.shape, train_news.shape, test_news.shape

((78331, 3), (10755, 7), (1077, 7))

In [3]:
def add_datetime(df):
    dates = df['dates'].values 
    years, months, days, hours = [], [], [], [] 
    for i in tqdm(range(len(dates)), position=0, leave=True):
        splitted = dates[i].split(' ') 
        d1 = splitted[0].split('-') 
        years.append(int(d1[0])) 
        months.append(int(d1[1])) 
        days.append(int(d1[2])) 
        t = splitted[2].split(":") 
        hours.append(int(t[0]))  
    df['year'] = years 
    df['month'] = months 
    df['day'] = days 
    df['hour'] = hours 
    df.rename(columns={'titles':'title', 'contents':'content'},inplace=True)
    return df 

In [4]:
df = add_datetime(df)

100%|██████████| 78331/78331 [00:00<00:00, 489372.33it/s]


In [5]:
df.drop(columns={'dates'},inplace=True)
df.head(3)

Unnamed: 0,title,content,year,month,day,hour
0,"바이든, 마이클 바 연준 부의장 지명... 전 리플 이사회 멤버",코인데스크에 따르면 바이든 미국 대통령이 미국 재무부 소속 공무원이자 전 리플(XR...,2022,4,16,0
1,스위스 취리히 공항에 대형 비트코인 광고판 등장,"암호화폐 전문 미디어 비트코인매거진에 따르면, 스위스 취리히 공항에 대형 비트코인 ...",2022,4,16,0
2,"BTC, 지난 5분간 1.18% 상승",BTC가 바이낸스 USDT 마켓 기준 지난 5분간 1.18% 상승했다. 현재 BTC...,2022,4,15,23


In [6]:
train_news.drop(columns={"labels"}, inplace=True) 
test_news.drop(columns={"labels"}, inplace=True) 

train_news.head(3)

Unnamed: 0,title,content,year,month,day,hour
0,"미나 재단-이더리움 재단, 닐 재단과 120만달러 규모 계약 체결",미나 재단과 이더리움 재단이 닐 재단 크립토3 팀과 120만달러 규모의 계약을 체결...,2021,10,4,6
1,"암호화폐 환치기 기승, 불법 외환거래 올해 1조2000억 적발",중앙일보에 따르면 4일 국회 기획재정위원회 양경숙 더불어민주당 의원이 관세청으로부터...,2021,10,4,6
2,"서베이 ""아프리카 투자자 절반, 자녀 교육비 위해 암호화폐 투자""","크립토포테이토에 따르면 암호화폐 업체 루노가 나이지리아, 케냐, 남아공, 영국, 호...",2021,10,4,6


In [7]:
test_news.head(3)

Unnamed: 0,title,content,year,month,day,hour
0,NFT 민팅에 55만 달러 가스비 지불 트레이더 등장,암호화폐 전문 미디어 크립토슬레이트가 BAYC #4098를 보유한 트위터 사용자 @...,2022,2,24,13
1,"도지코인 공동 개발자 ""밈코인, 부자가 되려는 사람들을 통해 부자가 되려는 것""","암호화폐 전문 미디어 코인텔레그래프에 따르면, 도지코인(DOGE) 공동 개발자 빌리...",2022,2,24,14
2,"제미니, 암호화폐 로비 단체 CCI에 회원사로 합류","암호화폐 전문 미디어 더블록에 따르면, 미국 암호화폐 거래소 제미니가 최근 암호화폐...",2022,2,24,14


In [8]:
df = pd.concat([df, train_news, test_news], axis=0) 

df = df.drop_duplicates() 

df.shape

(90082, 6)

In [13]:
df = df.sort_values(by=['year','month','day','hour'])
df

Unnamed: 0,title,content,year,month,day,hour
11853,"케이사인, IoT 보안·블록체인 기술 개발 강화",정보보안 전문기업 케이사인이 4차 산업 시대의 핵심인 사물인터넷(IoT) 보안과 블...,2018,1,19,14
78329,"美 테네시 주, 블록체인 법안 도입 검토…""긍정적""",테네시 주 의회가 법안 심리에서 법률 전자 기록에 블록체인 서명 사용을 승인하는 법...,2018,1,26,15
78330,"노원구, 블록체인 기반 지역화폐 '노원(NW)' 가맹점 모집",노원구가 국내 최초 블록체인 기반 지역화폐 '노원(NW)'의 시행을 앞두고 가맹점을...,2018,1,26,15
78328,"日 정부, 일본 내 모든 암호화폐 거래소 긴급 실태조사…코인체크 사건 후폭풍",일본 정부가 최근 암호화폐 거래소 코인체크 해킹 사건을 계기로 모든 거래소에 대해 ...,2018,1,30,16
78326,"中 첫 암호화폐 거래소, 홍콩 펀드에 인수",중국의 첫 비트코인 거래소가 홍콩 블록체인 투자 펀드에 인수됐다. 30일(현지시간)...,2018,1,30,17
...,...,...,...,...,...,...
2,"BTC, 지난 5분간 1.18% 상승",BTC가 바이낸스 USDT 마켓 기준 지난 5분간 1.18% 상승했다. 현재 BTC...,2022,4,15,23
3,"토네이도캐시, 美 OFAC 지정 제재 대상 주소 차단","암호화폐 전문 미디어 더블록에 따르면, 이더리움(ETH) 트랜잭션 믹싱 플랫폼 토네...",2022,4,15,23
4,"폴카닷 기반 바이프로스트, 쿠사마 네트워크서 USDT 크로스체인 이체 지원",폴카닷(DOT) 파라체인 프로젝트 바이프로스트(BNC)가 15일 공식 트위터를 통해...,2022,4,15,23
0,"바이든, 마이클 바 연준 부의장 지명... 전 리플 이사회 멤버",코인데스크에 따르면 바이든 미국 대통령이 미국 재무부 소속 공무원이자 전 리플(XR...,2022,4,16,0


In [22]:
df.fillna('', inplace=True) 
df

Unnamed: 0,title,content,year,month,day,hour
11853,"케이사인, IoT 보안·블록체인 기술 개발 강화",정보보안 전문기업 케이사인이 4차 산업 시대의 핵심인 사물인터넷(IoT) 보안과 블...,2018,1,19,14
78329,"美 테네시 주, 블록체인 법안 도입 검토…""긍정적""",테네시 주 의회가 법안 심리에서 법률 전자 기록에 블록체인 서명 사용을 승인하는 법...,2018,1,26,15
78330,"노원구, 블록체인 기반 지역화폐 '노원(NW)' 가맹점 모집",노원구가 국내 최초 블록체인 기반 지역화폐 '노원(NW)'의 시행을 앞두고 가맹점을...,2018,1,26,15
78328,"日 정부, 일본 내 모든 암호화폐 거래소 긴급 실태조사…코인체크 사건 후폭풍",일본 정부가 최근 암호화폐 거래소 코인체크 해킹 사건을 계기로 모든 거래소에 대해 ...,2018,1,30,16
78326,"中 첫 암호화폐 거래소, 홍콩 펀드에 인수",중국의 첫 비트코인 거래소가 홍콩 블록체인 투자 펀드에 인수됐다. 30일(현지시간)...,2018,1,30,17
...,...,...,...,...,...,...
2,"BTC, 지난 5분간 1.18% 상승",BTC가 바이낸스 USDT 마켓 기준 지난 5분간 1.18% 상승했다. 현재 BTC...,2022,4,15,23
3,"토네이도캐시, 美 OFAC 지정 제재 대상 주소 차단","암호화폐 전문 미디어 더블록에 따르면, 이더리움(ETH) 트랜잭션 믹싱 플랫폼 토네...",2022,4,15,23
4,"폴카닷 기반 바이프로스트, 쿠사마 네트워크서 USDT 크로스체인 이체 지원",폴카닷(DOT) 파라체인 프로젝트 바이프로스트(BNC)가 15일 공식 트위터를 통해...,2022,4,15,23
0,"바이든, 마이클 바 연준 부의장 지명... 전 리플 이사회 멤버",코인데스크에 따르면 바이든 미국 대통령이 미국 재무부 소속 공무원이자 전 리플(XR...,2022,4,16,0


In [136]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("CryptoModel/cryptodeberta-base-all-finetuned", use_auth_token="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")

model = AutoModelForSequenceClassification.from_pretrained("CryptoModel/cryptodeberta-base-all-finetuned", use_auth_token="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", num_labels=3)

model.eval() 

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0): DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (intermed

In [137]:
def deberta_tokenizer(s1,s2,MAX_LEN=512): 
    encoded_dict = tokenizer.encode_plus(
        text = s1, 
        text_pair = s2, 
        add_special_tokens = True, 
        pad_to_max_length = True, 
        max_length = MAX_LEN, 
        return_attention_mask = True 
    )
    input_id = encoded_dict['input_ids'] 
    attention_mask = encoded_dict['attention_mask'] 
    token_type_id = encoded_dict['token_type_ids'] 
    return input_id, attention_mask, token_type_id

In [138]:
s1,s2 = df['title'].values[10], df['content'].values[10]

In [139]:
device = torch.device("cuda") 

model.to(device)
inputs = tokenizer(s1,s2, return_tensors="pt").to(device)
with torch.no_grad():
    logits = model(**inputs).logits

logits

tensor([[-0.7674,  0.3125,  0.3399]], device='cuda:0')

In [140]:
model.cuda()

DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0): DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (intermed

In [141]:
# let us create a dictionary 
# keys are dates and values are sum of sentiment scores for that hour 
news_sentiment_dict = {} 

device = torch.device('cuda')

years = df['year'].values 
months = df['month'].values 
days = df['day'].values 
hours = df['hour'].values 
titles = df['title'].values 
contents = df['content'].values 

model.eval() 

m = nn.Softmax(dim=1) 

for i in range(len(years)):
    datestr = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/' + str(hours[i]) 
    news_sentiment_dict[datestr] = [] 

for i in tqdm(range(len(years)), position=0, leave=True):
    datestr = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/' + str(hours[i]) 
    title = titles[i] 
    content = contents[i] 
    inputs = tokenizer(s1,s2, return_tensors="pt").to(device)
    
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = m(logits)
    probs = probs.detach().cpu().numpy().flatten() 
    
    news_sentiment_dict[datestr].append(probs) 

100%|██████████| 90082/90082 [46:29<00:00, 32.29it/s]


In [310]:
news_sentiment_dict 

{'2018/1/19/14': [array([0.14346546, 0.4224012 , 0.43413332], dtype=float32)],
 '2018/1/26/15': [array([0.14346546, 0.4224012 , 0.43413332], dtype=float32),
  array([0.14346546, 0.4224012 , 0.43413332], dtype=float32)],
 '2018/1/30/16': [array([0.14346546, 0.4224012 , 0.43413332], dtype=float32)],
 '2018/1/30/17': [array([0.14346546, 0.4224012 , 0.43413332], dtype=float32),
  array([0.14346546, 0.4224012 , 0.43413332], dtype=float32)],
 '2018/1/31/12': [array([0.14346546, 0.4224012 , 0.43413332], dtype=float32)],
 '2018/1/31/16': [array([0.14346546, 0.4224012 , 0.43413332], dtype=float32),
  array([0.14346546, 0.4224012 , 0.43413332], dtype=float32)],
 '2018/2/1/10': [array([0.14346546, 0.4224012 , 0.43413332], dtype=float32),
  array([0.14346546, 0.4224012 , 0.43413332], dtype=float32)],
 '2018/2/1/12': [array([0.14346546, 0.4224012 , 0.43413332], dtype=float32)],
 '2018/2/1/13': [array([0.14346546, 0.4224012 , 0.43413332], dtype=float32)],
 '2018/2/1/14': [array([0.14346546, 0.422401

In [717]:
# load dataframe 
import json

with open('BTC_USDT-4h-3.json') as f:
    d = json.load(f)
    
chart_df = pd.DataFrame(d)
chart_df = chart_df.rename(columns={0:"timestamp",
                                    1:"open",
                                    2:"high",
                                    3:"low",
                                    4:"close",
                                    5:"volume"})

def process(df): 
    binance = ccxt.binance() 
    dates = df['timestamp'].values 
    timestamp = [] 
    for i in range(len(dates)): 
        date_string = binance.iso8601(int(dates[i])) 
        date_string = date_string[:10] + " " + date_string[11:-5] 
        timestamp.append(date_string) 
    df['datetime'] = timestamp 
    df = df.drop(columns={'timestamp'})
    return df

chart_df = process(chart_df)

hours = []
days = [] 
months = [] 
years = [] 
for dt in tqdm(chart_df['datetime']):
    hour = pd.to_datetime(dt).hour 
    day = pd.to_datetime(dt).day 
    month = pd.to_datetime(dt).month 
    year = pd.to_datetime(dt).year 
    hours.append(hour) 
    days.append(day) 
    months.append(month)
    years.append(year) 

chart_df['hour'] = hours
chart_df['day'] = days 
chart_df['month'] = months 
chart_df['year'] = years 

targets = [] 
close = chart_df['open'].values 
high = chart_df['high'].values 
low = chart_df['low'].values 

threshold = 0.0075

for i in range(close.shape[0]-1):
    high_volatility = (high[i+1]-close[i]) / close[i] 
    low_volatility = (low[i+1]-close[i]) / close[i] 
    if high_volatility >= threshold:
        targets.append(0) 
    elif low_volatility <= -threshold:
        targets.append(1) 
    else:
        targets.append(2) # do not trade 

targets.append(None) 

chart_df['Targets'] = targets 
        
chart_df.set_index(pd.DatetimeIndex(chart_df["datetime"]), inplace=True)

print("=== Feature Engineering ===")
chart_df['ebsw'] = chart_df.ta.ebsw(lookahead=False)
chart_df['cmf'] = chart_df.ta.cmf(lookahead=False)
chart_df['vwap'] = chart_df.ta.vwap(lookahead=False)  
chart_df['vwap/open'] = chart_df['vwap'] / chart_df['open']
chart_df['high/low'] = chart_df['high'] / chart_df['low'] 
chart_df['close/open'] = chart_df['close'] / chart_df['open']
chart_df['high/open'] = chart_df['high'] / chart_df['open'] 
chart_df['low/open'] = chart_df['low'] / chart_df['open']

# differencing 
for l in range(1,6): 
    for col in ['open','high','low','close','volume']: 
        val = chart_df[col].values 
        val_ret = [None for _ in range(l)] 
        for i in range(l, len(val)):
            if val[i-l] == 0:
                ret = 1
            else:
                ret = val[i] / val[i-l] 
            val_ret.append(ret) 
        chart_df['{}_change_{}'.format(col, l)] = val_ret   


chart_df = chart_df.dropna()  
chart_df = chart_df.drop(columns={'datetime', 
                                  'open', 
                                  'high', 
                                  'low', 
                                  'close', 
                                  'volume', 
                                  'vwap'}) 

chart_df.head(2)

100%|██████████| 10244/10244 [00:05<00:00, 2018.58it/s]


=== Feature Engineering ===


Unnamed: 0_level_0,hour,day,month,year,Targets,ebsw,cmf,vwap/open,high/low,close/open,...,open_change_4,high_change_4,low_change_4,close_change_4,volume_change_4,open_change_5,high_change_5,low_change_5,close_change_5,volume_change_5
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-23 16:00:00,16,23,8,2017,1.0,0.0,0.121501,0.980956,1.038033,0.974007,...,1.046272,1.031877,1.022327,1.004063,1.09382,1.060846,1.04876,1.040436,1.019077,1.406636
2017-08-23 20:00:00,20,23,8,2017,0.0,0.57735,0.09759,1.001464,1.026746,0.994568,...,1.012716,1.015253,1.008545,1.009603,1.075979,1.023881,1.012272,1.013929,1.003317,0.955893


In [718]:
chart_datestr = {}
chart_newscnt = {} 
years = chart_df['year'].values
months = chart_df['month'].values 
days = chart_df['day'].values 
hours = chart_df['hour'].values 

cnt = 0
for i in tqdm(range(len(years))): 
    datestr = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/' + str(hours[i]) 
    if hours[i] == 4:
        partial = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/'  
        cnt = 0 # for calculating average 
        for d in [partial+str(i) for i in range(0,4)]:
            if d in news_sentiment_dict.keys():
                if datestr not in chart_datestr.keys(): 
                    chart_datestr[datestr] = torch.tensor([0,0,0], dtype=torch.float32)
                for arr in news_sentiment_dict[d]:
                    chart_datestr[datestr] += arr 
    elif hours[i] == 8:
        partial = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/'  
        for d in [partial+str(i) for i in range(4,8)]:
            if d in news_sentiment_dict.keys():
                if datestr not in chart_datestr.keys(): 
                    chart_datestr[datestr] = torch.tensor([0,0,0], dtype=torch.float32)
                for arr in news_sentiment_dict[d]:
                    chart_datestr[datestr] += arr 
    elif hours[i] == 12:
        partial = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/'  
        for d in [partial+str(i) for i in range(8,12)]:
            if d in news_sentiment_dict.keys():
                if datestr not in chart_datestr.keys(): 
                    chart_datestr[datestr] = torch.tensor([0,0,0], dtype=torch.float32)
                for arr in news_sentiment_dict[d]:
                    chart_datestr[datestr] += arr 
    elif hours[i] == 16:
        partial = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/'  
        for d in [partial+str(i) for i in range(12,16)]:
            if d in news_sentiment_dict.keys():
                if datestr not in chart_datestr.keys(): 
                    chart_datestr[datestr] = torch.tensor([0,0,0], dtype=torch.float32)
                for arr in news_sentiment_dict[d]:
                    chart_datestr[datestr] += arr 
    elif hours[i] == 20:
        partial = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/'  
        for d in [partial+str(i) for i in range(16,20)]:
            if d in news_sentiment_dict.keys():
                if datestr not in chart_datestr.keys(): 
                    chart_datestr[datestr] = torch.tensor([0,0,0], dtype=torch.float32)
                for arr in news_sentiment_dict[d]:
                    chart_datestr[datestr] += arr 
    elif hours[i] == 0: 
        partial = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/'  
        for d in [partial+str(i) for i in range(20,24)]:
            if d in news_sentiment_dict.keys():
                if datestr not in chart_datestr.keys(): 
                    chart_datestr[datestr] = torch.tensor([0,0,0], dtype=torch.float32)
                for arr in news_sentiment_dict[d]:
                    chart_datestr[datestr] += arr 

100%|██████████| 10204/10204 [00:06<00:00, 1600.94it/s]


In [719]:
chart_datestr

{'2018/1/19/16': tensor([0.1435, 0.4224, 0.4341]),
 '2018/1/26/16': tensor([0.2869, 0.8448, 0.8683]),
 '2018/1/30/20': tensor([0.4304, 1.2672, 1.3024]),
 '2018/1/31/16': tensor([0.1435, 0.4224, 0.4341]),
 '2018/1/31/20': tensor([0.2869, 0.8448, 0.8683]),
 '2018/2/1/12': tensor([0.2869, 0.8448, 0.8683]),
 '2018/2/1/16': tensor([0.4304, 1.2672, 1.3024]),
 '2018/2/1/20': tensor([0.5739, 1.6896, 1.7365]),
 '2018/2/2/12': tensor([0.4304, 1.2672, 1.3024]),
 '2018/2/2/16': tensor([0.2869, 0.8448, 0.8683]),
 '2018/2/2/20': tensor([0.8608, 2.5344, 2.6048]),
 '2018/2/5/12': tensor([0.7173, 2.1120, 2.1707]),
 '2018/2/5/16': tensor([0.4304, 1.2672, 1.3024]),
 '2018/2/5/20': tensor([0.1435, 0.4224, 0.4341]),
 '2018/2/6/12': tensor([0.5739, 1.6896, 1.7365]),
 '2018/2/7/16': tensor([0.2869, 0.8448, 0.8683]),
 '2018/2/7/20': tensor([1.0043, 2.9568, 3.0389]),
 '2018/2/9/12': tensor([0.5739, 1.6896, 1.7365]),
 '2018/2/9/16': tensor([0.2869, 0.8448, 0.8683]),
 '2018/2/12/12': tensor([0.1435, 0.4224, 0.43

In [720]:
positives, negatives, neutrals = [], [], [] 
positive_cnts, negative_cnts, neutral_cnts = [],[],[] 
for i in tqdm(range(len(years)), position=0, leave=True):
    datestr = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/' + str(hours[i])
    if datestr in chart_datestr.keys():
        v = chart_datestr[datestr]
        positives.append(v[0].item()) 
        negatives.append(v[1].item()) 
        neutrals.append(v[2].item())  
    else:
        positives.append(0) 
        negatives.append(0) 
        neutrals.append(0)  
        

100%|██████████| 10204/10204 [00:00<00:00, 89766.09it/s]


In [721]:
chart_df['positive'] = positives 
chart_df['negative'] = negatives

In [722]:
train_size = int(chart_df.shape[0] * 0.95) 
train_df = chart_df.iloc[:train_size,:] 

val_size = int(chart_df.shape[0] * 0.01) 
val_df = chart_df.iloc[train_size:train_size+val_size,:] 

test_df = chart_df.iloc[train_size+val_size:,:] 

train_df.shape, val_df.shape, test_df.shape 

((9693, 39), (102, 39), (409, 39))

In [723]:
input_columns = [] 
for col in train_df.columns:
    if col != 'Targets' and col != 'year':
        input_columns.append(col)         

In [724]:
X_train = train_df[input_columns].values  
Y_train = train_df['Targets'].values

X_val = val_df[input_columns].values 
Y_val = val_df['Targets'].values 

X_test = test_df[input_columns].values
Y_test = test_df['Targets'].values


X_train.shape, Y_train.shape, X_val.shape, Y_val.shape, X_test.shape, Y_test.shape  

((9693, 37), (9693,), (102, 37), (102,), (409, 37), (409,))

In [725]:
class_weights = compute_class_weight(class_weight = "balanced", 
                                     classes = np.unique(Y_train), 
                                     y = Y_train)

d = {0:class_weights[0], 1:class_weights[1], 2:class_weights[2]}
d

{0: 0.6463292658531706, 1: 0.9348958333333334, 2: 2.609854604200323}

In [726]:
clf = TabNetClassifier() 

clf.fit(X_train, 
        Y_train, 
        eval_set=[(X_val, Y_val)], 
        eval_metric=['logloss', 'balanced_accuracy'],
        weights = d, 
        max_epochs=150,
        patience=150)  

Device used : cuda
epoch 0  | loss: 1.22103 | val_0_logloss: 1.21128 | val_0_balanced_accuracy: 0.32323 |  0:00:00s
epoch 1  | loss: 1.00354 | val_0_logloss: 1.29213 | val_0_balanced_accuracy: 0.31167 |  0:00:01s
epoch 2  | loss: 0.93123 | val_0_logloss: 1.05266 | val_0_balanced_accuracy: 0.3025  |  0:00:02s
epoch 3  | loss: 0.82693 | val_0_logloss: 1.03152 | val_0_balanced_accuracy: 0.37108 |  0:00:02s
epoch 4  | loss: 0.75175 | val_0_logloss: 1.04538 | val_0_balanced_accuracy: 0.3328  |  0:00:03s
epoch 5  | loss: 0.69415 | val_0_logloss: 1.60295 | val_0_balanced_accuracy: 0.30409 |  0:00:04s
epoch 6  | loss: 0.66492 | val_0_logloss: 2.16037 | val_0_balanced_accuracy: 0.33333 |  0:00:04s
epoch 7  | loss: 0.66896 | val_0_logloss: 1.93327 | val_0_balanced_accuracy: 0.33333 |  0:00:05s
epoch 8  | loss: 0.64867 | val_0_logloss: 2.07285 | val_0_balanced_accuracy: 0.33333 |  0:00:06s
epoch 9  | loss: 0.63398 | val_0_logloss: 1.68686 | val_0_balanced_accuracy: 0.33333 |  0:00:06s
epoch 10 | 

epoch 85 | loss: 0.58024 | val_0_logloss: 0.67507 | val_0_balanced_accuracy: 0.66746 |  0:00:59s
epoch 86 | loss: 0.58077 | val_0_logloss: 0.64682 | val_0_balanced_accuracy: 0.69524 |  0:00:59s
epoch 87 | loss: 0.58057 | val_0_logloss: 0.67189 | val_0_balanced_accuracy: 0.67172 |  0:01:00s
epoch 88 | loss: 0.56361 | val_0_logloss: 0.61971 | val_0_balanced_accuracy: 0.73644 |  0:01:01s
epoch 89 | loss: 0.58579 | val_0_logloss: 0.63898 | val_0_balanced_accuracy: 0.70534 |  0:01:01s
epoch 90 | loss: 0.5761  | val_0_logloss: 0.6357  | val_0_balanced_accuracy: 0.67172 |  0:01:02s
epoch 91 | loss: 0.57163 | val_0_logloss: 0.63227 | val_0_balanced_accuracy: 0.7306  |  0:01:03s
epoch 92 | loss: 0.58569 | val_0_logloss: 0.66911 | val_0_balanced_accuracy: 0.66175 |  0:01:03s
epoch 93 | loss: 0.57405 | val_0_logloss: 0.695   | val_0_balanced_accuracy: 0.65843 |  0:01:04s
epoch 94 | loss: 0.56469 | val_0_logloss: 0.64944 | val_0_balanced_accuracy: 0.68687 |  0:01:05s
epoch 95 | loss: 0.57728 | val

In [727]:
cnt = 0
pred = clf.predict(X_test)
for i in range(len(pred)):
    if Y_test[i] == float(pred[i]):
        cnt += 1 
        
cnt / len(pred) * 100

71.39364303178483

In [728]:
print(f1_score(Y_test, pred, average='macro'))
print(f1_score(Y_test, pred, average='micro'))
print(f1_score(Y_test, pred, average='weighted')) 

0.6829709452660273
0.7139364303178484
0.737543189505192


In [406]:
# clf.save_model("1hour_with_news")

Successfully saved model at 1hour_with_news.zip


'1hour_with_news.zip'

In [None]:
clf.save_model("1hour_with_news_less_features")