In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/twitter-bullbear/verysmol.csv


Базовая модель - сентимент классифаер, обученный на твиттах
3 сентимента - положительный(label_2), нейтральный(label_1), негативный(label_0)

Датасет - заголовки с новостного сайта, каждый размечен на положительный/негативный. С помощью него можно real-time мониторить ситуацию на рынке и привязать к нему торгового бота. Из-за ограничений API получилось спарсить только 400 заголовков.

In [33]:
from transformers import (
    AutoModelForSequenceClassification,
    TFAutoModelForSequenceClassification,
    AutoTokenizer,
    AutoConfig,
    Trainer,
    TrainingArguments,
)

import random
import csv
import urllib.request
from scipy.special import softmax
from sklearn.model_selection import train_test_split
import torch

In [34]:
def set_seed(seed):
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
set_seed(0)

In [26]:
df = pd.read_csv('/kaggle/input/twitter-bullbear/verysmol.csv', sep=';')
df.head()

Unnamed: 0.1,Unnamed: 0,target,title,source,currencies,url,date
0,0,bearish,Founder of Crypto Capital Venture: $ADA and $X...,cryptoglobe.com,"['XRP', 'Cardano']",https://cryptopanic.com/news/19025185/Founder-...,2023-11-08T19:29:48Z
1,1,bearish,Cardano’s Steady Progress: Foundation CEO Fred...,cryptoglobe.com,['Cardano'],https://cryptopanic.com/news/19025145/Cardanos...,2023-11-08T19:07:29Z
2,2,bearish,ChatGPT predicts price of Cardano for start of...,finbold.com,['Cardano'],https://cryptopanic.com/news/19024549/ChatGPT-...,2023-11-08T15:16:20Z
3,3,bearish,FTX Moves $316 Million Crypto to Different Exc...,coinpaprika.com,['Solana'],https://cryptopanic.com/news/19023693/FTX-Move...,2023-11-08T10:37:04Z
4,4,bearish,Cardano upgrade delays tied to ‘boring’ academ...,cointelegraph.com,['Cardano'],https://cryptopanic.com/news/19023555/Cardano-...,2023-11-08T09:46:00Z


In [5]:
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

Downloading (…)lve/main/config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

text = df.title.iloc[0]
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# Print labels and scores
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

1) LABEL_1 0.6496
2) LABEL_2 0.3446
3) LABEL_0 0.0058


Видим, что изначальная модель плохо реагирует на новый вид разметки.

In [27]:
df['target'] = df['target'].map({'bearish': 0, 'bullish': 1})
df = df.drop(index=0, axis=0)
dataset = df[['target', 'title']].rename(columns={'target': 'label', 'title': 'text'})
dataset.head()

Unnamed: 0,label,text
1,0,Cardano’s Steady Progress: Foundation CEO Fred...
2,0,ChatGPT predicts price of Cardano for start of...
3,0,FTX Moves $316 Million Crypto to Different Exc...
4,0,Cardano upgrade delays tied to ‘boring’ academ...
5,0,Should Cardano holders be prepared for a price...


In [28]:
text = dataset['text'].tolist()
label = dataset['label'].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(text, label, test_size=.2)

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [29]:
class smolDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [30]:
train_dataset = smolDataset(train_encodings, train_labels)
val_dataset = smolDataset(val_encodings, val_labels)

In [31]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10, # т.к. датасет очень маленький, то решил, что нет смысла ставить >10 эпох
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

Step,Training Loss
10,0.4163
20,0.1941
30,0.2209
40,0.2513
50,0.2401
60,0.2058
70,0.1935
80,0.157
90,0.1891
100,0.0883


TrainOutput(global_step=200, training_loss=0.19013698309659957, metrics={'train_runtime': 690.3193, 'train_samples_per_second': 4.621, 'train_steps_per_second': 0.29, 'total_flos': 60654837293820.0, 'train_loss': 0.19013698309659957, 'epoch': 10.0})

In [44]:
from datasets import load_dataset
dataset = load_dataset("yelp_review_full")
type(dataset)

  0%|          | 0/2 [00:00<?, ?it/s]

datasets.dataset_dict.DatasetDict

In [32]:
text = df.title.iloc[0]
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

1) LABEL_0 0.9636
2) LABEL_1 0.0347
3) LABEL_2 0.0017


Теперь видим, что все работает прекрасно. 0-я строка не была в датасете, поэтому проверка на нем является достаточно корректной, больше строк из датасета решил не убирать, т.к. он изначально очень маленький.