In [25]:
import os
from os import listdir
from os.path import isfile, join
import dotenv
import numpy as np
import pandas as pd
import requests
import tweepy as tw
import glob
import mlflow
from datasets import load_dataset #, load_metric
import evaluate
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding, TextClassificationPipeline,
                          Trainer, TrainingArguments)
mlflow.end_run()
from utils.io import load_yaml


In [26]:
#let's load the params
config_dict = load_yaml('./config/btc-config.yaml')
model_name = config_dict['model_name']
sentiment = {'bearish':0, 'neutral':1, 'bullish':2}
input_dir = config_dict['input_dir']
output_dir = config_dict['output_dir']
epochs = int(config_dict['epochs'])
#Load env variables
dotenv.load_dotenv(dotenv.find_dotenv())


True

In [27]:
class LoadTweets:
  def __init__(self, config_dict):
    bearer_token = os.getenv('BEARER_TOKEN')
    self.client = tw.Client(bearer_token,  return_type=requests.Response, wait_on_rate_limit=True)
    self.query = config_dict['v2_query']

  def get_tweets(self, max_tweets=10):
    tweets = self.client.search_recent_tweets(query=self.query, 
                                              tweet_fields=['text'], max_results=max_tweets).json()['data']
    return [tweet['text'] for tweet in tweets]
    
  def load_dataset(self, data_files, get_tokens_function, seed=12):
    dataset = load_dataset("csv", data_files=data_files, delimiter=",")
    print(dataset)
    tokenized_dataset = dataset.map(get_tokens_function, batched=True)
    train = tokenized_dataset["train"].shuffle(seed=seed)
    test = tokenized_dataset["test"].shuffle(seed=seed)
    validation = tokenized_dataset.get("validation")
    if validation:
      return train, test, validation.shuffle(seed=seed)
    else:
      return train, test, None

In [28]:
class RoBERTa_sentiment():
    def __init__(self, name: str):
        self.model_name = name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels = 3)
        self.pipe = TextClassificationPipeline(model=self.model, tokenizer=self.tokenizer)

    def predict(self, tweets):
        preds = self.pipe(tweets)
        df = pd.DataFrame(preds)
        df.insert(0, "tweet", tweets, True)
        return df
    
    def get_token(self, row):
        return self.tokenizer(row["tweet"], padding="max_length", truncation=True)


In [29]:
model = RoBERTa_sentiment(model_name)
LoadData = LoadTweets(config_dict)


train_files=glob.glob(input_dir+'/train/'+'*.csv')
test_files=glob.glob(input_dir+'/test/'+'*.csv')
data_files={"train": train_files, "test": test_files}

train, test, _ = LoadData.load_dataset(data_files, model.get_token)

data_collator = DataCollatorWithPadding(tokenizer=model.tokenizer, return_tensors='pt', max_length=256,
                                        padding='max_length')

#metric = load_metric("accuracy")
metric = evaluate.load("accuracy")

loading file vocab.json from cache at /home/csh/.cache/huggingface/hub/models--ElKulako--cryptobert/snapshots/9db31c7cef52d339197e2245afc302efbb080b83/vocab.json
loading file merges.txt from cache at /home/csh/.cache/huggingface/hub/models--ElKulako--cryptobert/snapshots/9db31c7cef52d339197e2245afc302efbb080b83/merges.txt
loading file tokenizer.json from cache at /home/csh/.cache/huggingface/hub/models--ElKulako--cryptobert/snapshots/9db31c7cef52d339197e2245afc302efbb080b83/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/csh/.cache/huggingface/hub/models--ElKulako--cryptobert/snapshots/9db31c7cef52d339197e2245afc302efbb080b83/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/csh/.cache/huggingface/hub/models--ElKulako--cryptobert/snapshots/9db31c7cef52d339197e2245afc302efbb080b83/tokenizer_config.json
loading configuration file config.json from cache at /home/csh/.cache/huggingface/hu

  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['tweet', 'labels'],
        num_rows: 1082
    })
    test: Dataset({
        features: ['tweet', 'labels'],
        num_rows: 385
    })
})


  0%|          | 0/2 [00:00<?, ?ba/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/1 [00:00<?, ?ba/s]

In [30]:
# We can add more metrics here
def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


In [44]:
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir = True,
    num_train_epochs=epochs,
    save_total_limit = 3,
    save_strategy = "steps",
    evaluation_strategy = "steps",
    load_best_model_at_end=True,
    learning_rate = 5e-5,
    warmup_steps=500,
    logging_steps=5,
)

using `logging_steps` to initialize `eval_steps` to 5
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [45]:
trainer = Trainer(
    model=model.model, 
    args=training_args, 
    train_dataset=train, 
    eval_dataset=test,
    tokenizer=model.tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.save_model()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: tweet. If tweet are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1082
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 136


Step,Training Loss,Validation Loss,Accuracy
5,0.0037,0.92405,0.8
10,0.0023,0.933451,0.8
15,0.0101,0.939597,0.797403
20,0.0727,0.923127,0.797403
25,0.1838,0.897016,0.802597
30,0.159,0.885911,0.812987
35,0.0925,0.91953,0.812987
40,0.1714,0.974112,0.794805
45,0.0954,0.982147,0.797403
50,0.276,0.955195,0.802597


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: tweet. If tweet are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 385
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: tweet. If tweet are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 385
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: tweet. If tweet are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 385
  Batch size