In [1]:
import sys
sys.path.append('..')
from lib.download import *
from tqdm import tqdm
import numpy as np

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.by import By

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [16]:
def get_driver():
    options = Options()
    options.binary_location = r"C:\Program Files\Mozilla Firefox\firefox.exe"
    
    capabilities = DesiredCapabilities().FIREFOX
    capabilities["marionette"] = False
    capabilities["firefoxOptions"] = {"args": ["--headless"]}
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
    }
    
    for header, value in headers.items():
        options.set_preference(f"general.useragent.override", headers["User-Agent"])

    driver = webdriver.Firefox(options=options)
    return driver

def ticker_news_dataset(ticker):
    driver = get_driver()
    url = f"https://www.tradingview.com/symbols/BTCUSDT/news/?sort=recent"
    driver.get(url)
    
    times, titles, contents, sources = [], [], [], [] 
    hrefs =[hrf.get_attribute("href") for hrf in driver.find_elements(By.CLASS_NAME, 'card-gaCYEutU')] 
    
    
    for href in tqdm(hrefs): 
        driver.get(href)
        source = driver.find_elements(By.CLASS_NAME,"logoLight-cR1GoJcP")[0].get_attribute("alt")
        time = driver.find_elements(By.CSS_SELECTOR,"time[datetime]")[0].text
        title = driver.find_elements(By.CLASS_NAME, 'title-jEK_kEtx')[0].text
        content = [text.text for text in driver.find_elements(By.CSS_SELECTOR, "div.body-jEK_kEtx.body-op4L5uvo.content-op4L5uvo span p")]
        corresps = [[times,time],[contents,content],[titles,title],[sources,source]]  
        for corresp in corresps:
            corresp[0].append(corresp[1])
    contents = [' '.join(content) for content in contents]
    data = pd.DataFrame({'time' : times, 'title' : titles, 'source' : sources, 'article' : contents})
    data['time'] = pd.to_datetime(data['time'], format="%b %d, %Y%H:%M") + pd.Timedelta(hours=1)
    data['time'] = data['time'].dt.strftime('%Y-%m-%d %H:%M')
    data[["pos","neg","neut"]] = np.nan
    return data

def chunk_weights(chunk_len):
    return [512] * (chunk_len // 512) + [chunk_len % 512] if chunk_len else []

def article_sentiment(df, article_index):
    txt = df['article'][article_index]
    window_size = 510
    tokens = tokenizer.encode_plus(txt, add_special_tokens=False)
    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']
    total_len = len(input_ids)
    
    probs_list = []
    for start in range(0, total_len, window_size):
        end = min(start + window_size, total_len)
        input_ids_chunk = input_ids[start:end]
        attention_mask_chunk = attention_mask[start:end]
        input_ids_chunk = [101] + input_ids_chunk + [102]
        attention_mask_chunk = [1] + attention_mask_chunk + [1]
        padding = [0] * (window_size - len(input_ids_chunk) + 2)
        input_ids_chunk += padding
        attention_mask_chunk += padding
        input_dict = {
            'input_ids': torch.LongTensor([input_ids_chunk]),
            'attention_mask': torch.IntTensor([attention_mask_chunk])
        }
        outputs = model(**input_dict)
        probs = torch.nn.functional.softmax(outputs[0], dim=-1)
        probs_list.append(probs)

    cw = chunk_weights(total_len)
    weights = [x/total_len for x in cw]
    weighted_probs = torch.FloatTensor(probs_list[0].shape)
    for chunk, weight in zip(probs_list, weights):
        weighted_probs += chunk * weight
    weighted_probs = weighted_probs.detach().cpu().numpy()[0] 
    df['pos'].iloc[article_index] = weighted_probs[0]
    df['neg'].iloc[article_index] = weighted_probs[1]
    df['neut'].iloc[article_index] = weighted_probs[2]
    return df

In [18]:
sentiment = ticker_news_dataset('BTCUSDT')  

100%|██████████| 200/200 [02:12<00:00,  1.51it/s]


In [19]:
for i in tqdm(range(0,200)):
    try:
        sentiment = article_sentiment(sentiment,i)   
    except:
        None

100%|██████████| 200/200 [06:33<00:00,  1.97s/it]


In [20]:
sentiment

Unnamed: 0,time,title,source,article,pos,neg,neut
0,2023-02-08 23:05,TeraWulf Mined More BTC in January Due to Redu...,CryptoPotato,The US-based cryptocurrency miner TeraWulf pro...,2.016704e-01,0.718170,0.080160
1,2023-02-08 22:00,Another Bitcoin Metric Turns Bullish With Pric...,NewsBTC,Bitcoin has been moving sideways during this w...,6.682447e-01,0.273339,0.058416
2,2023-02-08 22:00,YouTuber baits MMA fighter to promote fake NFT...,Cointelegraph,"In this week’s newsletter, read about how Yuga...",3.061095e-02,0.350789,0.618600
3,2023-02-08 20:33,Trust in Cryptocurrencies Collapses: Poll,U.Today,Trust in cryptocurrencies took a significant h...,1.711266e-02,0.949906,0.032981
4,2023-02-08 20:08,"Price analysis 2/8: BTC, ETH, BNB, XRP, ADA, D...",Cointelegraph,The United States Federal Reserve Chairman Jer...,1.393739e+00,1.544395,1.462955
...,...,...,...,...,...,...,...
195,2023-02-01 08:14,Decentralized Twitter alternative goes live on...,Cointelegraph,"Damus, a so-called “Twitter killer” built on a...",5.418042e-02,0.341019,0.604800
196,2023-02-01 05:07,Bitcoin Price At Risk of Downside Thrust Befor...,NewsBTC,Bitcoin price is rising and approaching a majo...,1.466158e-01,0.521583,0.331801
197,2023-02-01 00:36,Bitcoin on-chain data and BTC’s recent price r...,Cointelegraph,Bitcoin (BTC) had a rough time all throughout ...,3.255655e+38,0.229859,0.379011
198,2023-01-31 23:00,Bitcoin ($BTC) Flashes ‘Extremely Rare’ Bullis...,CryptoGlobe,A top cryptocurrency analyst that has gained a...,6.644673e-02,0.195967,0.737586


In [85]:
sentiment.to_excel('test.xlsx')

In [None]:
# Calculating the hourly average sentiment score
def get_daily_average_sentiment_score(df):
    return df.groupby("date")["sentiment_score"].mean()

In [None]:
output => sentiment moyen par journal + poids  + changer usd/eur + vader (twitter reddit ideas)

In [None]:
essayer avec btcusd aussi, essayer avec ideas, essayer de pondérer par paragraphe (pour eviter biais des sentiumlents qui peuvnt changer en fonction de la ccl)

In [None]:
historical features