In [13]:
import numpy as np 
import pandas as pd 
import os 
import pickle
import json 
import math 
from transformers import (
    AdamW, 
    AutoConfig, 
    AutoModel, 
    AutoModelForSequenceClassification,
    AutoTokenizer, 
    AlbertTokenizer,
    get_linear_schedule_with_warmup
)
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
from tqdm.auto import tqdm
import ccxt
import logging
import re

In [14]:
def set_global_logging_level(level=logging.ERROR, prefices=("",)):
    """Set the level of every already-registered logger whose name starts with a prefix.

    Args:
        level: Logging level to apply, e.g. ``logging.ERROR`` (the default).
        prefices: Iterable of logger-name prefixes such as
            ``["transformers", "torch"]``. Each prefix is matched literally
            (regex metacharacters like ``.`` are escaped). The default
            ``("",)`` -- and likewise an empty iterable -- matches every logger.

    Only names present in ``logging.root.manager.loggerDict`` at call time are
    touched; loggers created afterwards are not affected.
    """
    # Escape each prefix so "a.b" cannot accidentally match "axb".
    alternatives = "|".join(re.escape(prefix) for prefix in prefices)
    prefix_re = re.compile("^(?:" + alternatives + ")")
    # Iterate over a snapshot of the names: getLogger() writes to the registry.
    for name in list(logging.root.manager.loggerDict):
        if prefix_re.match(name):
            logging.getLogger(name).setLevel(level)

# Raise the level of the loggers registered under these package prefixes to ERROR.
set_global_logging_level(
    logging.ERROR,
    ["transformers", "nlp", "torch", "tensorflow", "tensorboard", "wandb"],
)

In [15]:
# Hugging Face Hub checkpoint shared by the tokenizer and the encoder.
MODEL_NAME = "totoro4007/cryptodeberta-base-all-finetuned"

# NOTE(review): the tokenizer class is AlbertTokenizer although the checkpoint
# name says DeBERTa -- presumably intentional for this checkpoint; confirm.
tokenizer = AlbertTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# Use the GPU when one is available; otherwise stay on the CPU instead of raising.
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
model.eval()  # inference mode
# Ends the cell on a statement so the notebook does not echo the model repr.
print()




In [16]:
# Read the most recent news dump into a DataFrame. Later cells read the
# "titles", "contents", "year", "month", "day" and "hour" columns from it.
news_df = pd.read_csv("full_news_october_1st.csv") 

In [17]:
# Run on whichever device the model's weights already live on, so the inputs
# can never end up on a different device than the model.
device = next(model.parameters()).device

# Maps "year/month/day/hour" -> list of embeddings, one numpy array per article.
news_sentiment_dict = {}

titles = news_df["titles"].values
contents = news_df["contents"].values
years = news_df["year"].values
months = news_df["month"].values
days = news_df["day"].values
hours = news_df["hour"].values

# Not used in this cell; kept because other cells may rely on the name.
softmax_func = nn.Softmax(dim=1)

rows = zip(titles, contents, years, months, days, hours)
for raw_title, raw_content, year, month, day, hour in tqdm(
    rows,
    total=len(years),
    desc="calculating news embedding vectors...",
    position=0,
    leave=True,
):
    datestr = "/".join(str(part) for part in (year, month, day, hour))
    # NOTE(review): str() turns a missing value (NaN) into the text "nan",
    # which is then embedded like real text -- confirm this is intended.
    title = str(raw_title)
    content = str(raw_content)
    # Title and content are encoded as one sentence pair, padded/truncated to 512 tokens.
    inputs = tokenizer(
        title,
        content,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    ).to(device)
    with torch.no_grad():
        embedding = model(**inputs)[0][:, 0]  # CLS pooling: first-token vector
    embedding = embedding.detach().cpu().numpy()
    # Create the per-date list on first sight of a key, then append; keys keep
    # first-occurrence order without a separate initialisation pass.
    news_sentiment_dict.setdefault(datestr, []).append(embedding)


calculating news embedding vectors...:   0%|          | 0/104826 [00:00<?, ?it/s]

In [18]:
import pickle

# Persist the date-keyed embedding dict; the context manager closes the file.
with open('deberta_news_embeddings.pickle', 'wb') as pickle_file:
    pickle.dump(news_sentiment_dict, pickle_file)

In [19]:
# Completion marker: printed once the preceding cells have run.
print("done!") 

done!
