In [9]:
import numpy as np 
import pandas as pd  
import math 
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
from tqdm import tqdm
import ccxt

In [2]:
tokenizer = AutoTokenizer.from_pretrained("totoro4007/cryptodeberta-base-all-finetuned") 
model = AutoModelForSequenceClassification.from_pretrained("totoro4007/cryptodeberta-base-all-finetuned") 
model.cuda() 
model.eval() 
print()

Downloading:   0%|          | 0.00/661 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/245 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]




In [3]:
news_df = pd.read_csv("august_2022_coinness.csv") 

In [5]:
device = torch.device("cuda") 

news_sentiment_dict = {} 

titles = news_df["titles"].values 
contents = news_df["contents"].values 
years = news_df["year"].values 
months = news_df["month"].values 
days = news_df["day"].values 
hours = news_df["hour"].values 

m = nn.Softmax(dim=1) 

for i in range(len(years)):
    datestr = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/' + str(hours[i]) 
    news_sentiment_dict[datestr] = [] 

for i in tqdm(range(len(years)), desc="calculating news sentiment scores", position=0, leave=True):
    datestr = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/' + str(hours[i]) 
    title = str(titles[i]) 
    content = str(contents[i]) 
    inputs = tokenizer(title, content, return_tensors="pt", max_length=512, padding="max_length", truncation=True).to(device)
    
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = m(logits)
    probs = probs.detach().cpu().numpy().flatten() 
    
    news_sentiment_dict[datestr].append(probs) 


calculating news sentiment scores: 100%|██████████| 101919/101919 [30:12<00:00, 56.22it/s]


In [6]:
import pickle

with open('news_sentiment_dict.pickle', 'wb') as handle:
    pickle.dump(news_sentiment_dict, handle)

with open('news_sentiment_dict.pickle', 'rb') as handle:
    b = pickle.load(handle)



In [10]:
import json 
with open("BTC_USDT-4h-10.json") as f: 
    d = json.load(f) 
    
chart_df = pd.DataFrame(d) 
chart_df = chart_df.rename(columns={0:"timestamp", 
                                    1:"open",
                                    2:"high",
                                    3:"low",
                                    4:"close",
                                    5:"volume"}) 

def process(df): 
    binance = ccxt.binance() 
    dates = df["timestamp"].values 
    timestamp = [] 
    for i in range(len(dates)):
        date_string = binance.iso8601(int(dates[i]))
        date_string = date_string[:10] + " " + date_string[11:-5] 
        timestamp.append(date_string) 
    df["datetime"] = timestamp 
    df = df.drop(columns={"timestamp"})  
    return df 

chart_df = process(chart_df) 

hours, days, months, years = [], [], [], [] 
for dt in tqdm(chart_df["datetime"]): 
    hour = pd.to_datetime(dt).hour 
    day = pd.to_datetime(dt).day 
    month = pd.to_datetime(dt).month 
    year = pd.to_datetime(dt).year 
    hours.append(hour) 
    days.append(day) 
    months.append(month) 
    years.append(year)  

chart_df["hour"] = hours 
chart_df["day"] = days 
chart_df["month"] = months  
chart_df["year"] = years 


chart_df.head()

100%|██████████| 10969/10969 [00:02<00:00, 5021.77it/s]


Unnamed: 0,open,high,low,close,volume,datetime,hour,day,month,year
0,4261.48,4349.99,4261.32,4349.99,82.088865,2017-08-17 04:00:00,4,17,8,2017
1,4333.32,4485.39,4333.32,4427.3,63.619882,2017-08-17 08:00:00,8,17,8,2017
2,4436.06,4485.39,4333.42,4352.34,174.562001,2017-08-17 12:00:00,12,17,8,2017
3,4352.33,4354.84,4200.74,4325.23,225.109716,2017-08-17 16:00:00,16,17,8,2017
4,4307.56,4369.69,4258.56,4285.08,249.769913,2017-08-17 20:00:00,20,17,8,2017


In [11]:
chart_datestr = {} 
chart_newscnt = {} 
years = chart_df["year"].values 
months = chart_df["month"].values 
days = chart_df["day"].values 
hours = chart_df["hour"].values 

cnt = 0 

for i in tqdm(range(len(years))): 
    datestr = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/' + str(hours[i]) 
    if hours[i] == 4:
        partial = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/'  
        cnt = 0 # for calculating average 
        for d in [partial+str(i) for i in range(0,4)]:
            if d in news_sentiment_dict.keys():
                if datestr not in chart_datestr.keys(): 
                    chart_datestr[datestr] = torch.tensor([0,0,0], dtype=torch.float32)
                for arr in news_sentiment_dict[d]:
                    chart_datestr[datestr] += arr 
    elif hours[i] == 8:
        partial = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/'  
        for d in [partial+str(i) for i in range(4,8)]:
            if d in news_sentiment_dict.keys():
                if datestr not in chart_datestr.keys(): 
                    chart_datestr[datestr] = torch.tensor([0,0,0], dtype=torch.float32)
                for arr in news_sentiment_dict[d]:
                    chart_datestr[datestr] += arr 
    elif hours[i] == 12:
        partial = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/'  
        for d in [partial+str(i) for i in range(8,12)]:
            if d in news_sentiment_dict.keys():
                if datestr not in chart_datestr.keys(): 
                    chart_datestr[datestr] = torch.tensor([0,0,0], dtype=torch.float32)
                for arr in news_sentiment_dict[d]:
                    chart_datestr[datestr] += arr 
    elif hours[i] == 16:
        partial = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/'  
        for d in [partial+str(i) for i in range(12,16)]:
            if d in news_sentiment_dict.keys():
                if datestr not in chart_datestr.keys(): 
                    chart_datestr[datestr] = torch.tensor([0,0,0], dtype=torch.float32)
                for arr in news_sentiment_dict[d]:
                    chart_datestr[datestr] += arr 
    elif hours[i] == 20:
        partial = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/'  
        for d in [partial+str(i) for i in range(16,20)]:
            if d in news_sentiment_dict.keys():
                if datestr not in chart_datestr.keys(): 
                    chart_datestr[datestr] = torch.tensor([0,0,0], dtype=torch.float32)
                for arr in news_sentiment_dict[d]:
                    chart_datestr[datestr] += arr 
    elif hours[i] == 0: 
        partial = str(years[i]) + '/' + str(months[i]) + '/' + str(days[i]) + '/'  
        for d in [partial+str(i) for i in range(20,24)]:
            if d in news_sentiment_dict.keys():
                if datestr not in chart_datestr.keys(): 
                    chart_datestr[datestr] = torch.tensor([0,0,0], dtype=torch.float32)
                for arr in news_sentiment_dict[d]:
                    chart_datestr[datestr] += arr 


100%|██████████| 10969/10969 [00:04<00:00, 2281.81it/s]


In [25]:
positives, negatives, neutrals = [], [], [] 
positive_cnts, negative_cnts, neutral_cnts = [], [], [] 

flag = False
for i in tqdm(range(len(years)), position=0, leave=True):
    datestr = str(years[i]) + "/" + str(months[i]) + "/" + str(days[i]) + "/" + str(hours[i]) 
    if datestr in chart_datestr.keys(): 
        v = chart_datestr[datestr] 
        probs = nn.Softmax()(torch.tensor([v[0], v[1]])) 
        positives.append(probs[0].item()) 
        negatives.append(probs[1].item()) 
    else:
        positives.append(0) 
        negatives.append(0) 

  probs = nn.Softmax()(torch.tensor([v[0], v[1]]))
100%|██████████| 10969/10969 [00:00<00:00, 39984.50it/s]


In [26]:
chart_df["positive_sentiment"] = positives 
chart_df["negative_sentiment"] = negatives 

In [27]:
chart_df.tail()

Unnamed: 0,open,high,low,close,volume,datetime,hour,day,month,year,positive_sentiment,negative_sentiment
10964,21210.06,21285.08,21069.11,21251.38,21730.29782,2022-08-21 04:00:00,4,21,8,2022,0.415333,0.584667
10965,21250.21,21570.0,21224.48,21524.39,34690.60227,2022-08-21 08:00:00,8,21,8,2022,0.0,0.0
10966,21525.57,21587.74,21314.88,21420.2,24843.69947,2022-08-21 12:00:00,12,21,8,2022,0.0,0.0
10967,21419.0,21557.26,21330.75,21512.01,21034.03801,2022-08-21 16:00:00,16,21,8,2022,0.0,0.0
10968,21512.01,21800.0,21277.73,21515.61,35006.75105,2022-08-21 20:00:00,20,21,8,2022,0.0,0.0


In [30]:
chart_df.to_csv("chart_df_with_deberta_sentiments.csv",index=False) 