In [None]:
!pip install transformers==4.17.0
!pip install datasets==1.18.3
!pip install datasets transformers huggingface_hub
!apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import json
import gc
import os
import random
import pickle

import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from datasets import load_metric
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Mount Google Drive at /content/drive; the CSV read later in the notebook
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Sanity check that a CUDA device is visible to PyTorch.
# (torch is already imported in the main import cell; this re-import is a no-op.)
import torch
torch.cuda.is_available()

True

In [None]:
# Fine-tuning configuration shared by the cells below.
HYPERPARAMS = {
    "DROPOUT_PROB": 0.3,    # NOTE(review): not referenced by any visible cell
    "LEARNING_RATE": 2e-5,
    "BATCH_SIZE": 8,        # used for both the train and the eval batch size
    "EPOCHS": 4,
    "RNG_SEED": 0,
    "MAX_LEN": 256,         # token length every tweet is padded/truncated to
}

# Directory that holds the project data. load_pickle() joins file names onto
# this value, so it must be a directory rather than the CSV file itself.
# The space before the final slash is part of the actual Drive folder name
# (it matches the path passed to pd.read_csv).
folder_name = '/content/drive/MyDrive/IR_Project/Tweet Sentiment Analysis /'

In [None]:
def load_pickle(file_name):
  """Unpickle and return the object stored at `<folder_name>/<file_name>.pkl`.

  `file_name` is given without the ".pkl" extension; `folder_name` is the
  module-level path defined in the configuration cell.
  """
  pickle_path = os.path.join(folder_name, file_name+".pkl")
  with open(pickle_path, "rb") as handle:
    return pickle.load(handle)

In [None]:
# Load the labelled tweets, drop incomplete rows, and keep only the two columns
# used for fine-tuning under the names 'text' and 'label'.
df = pd.read_csv('/content/drive/MyDrive/IR_Project/Tweet Sentiment Analysis /BTC_tweets_daily_example.csv')
df = df.dropna()
df_final = pd.DataFrame({'text': df['Tweet'], 'label': df['New_Sentiment_State']})
df = df_final

# 70/30 train/test split, then a 70/30 train/validation split of the remainder.
# Both splits use the configured seed so that re-running the notebook
# reproduces the same partitions.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=HYPERPARAMS["RNG_SEED"])

print(len(df_train), len(df_test))

df_train, df_val = train_test_split(df_train, test_size=0.3, random_state=HYPERPARAMS["RNG_SEED"])

print(len(df_train), len(df_val))

# Discard the shuffled original row numbers; each split gets a fresh 0..n-1 index.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

print(len(df_train), len(df_test), len(df_val))


def _texts_with_label(frame, label):
    """Return the 'text' values of the rows of `frame` whose 'label' equals `label`."""
    return list(frame.loc[frame['label'] == label, 'text'])


# Bucket each split by its raw CSV label: 1 -> pos_*, -1 -> neg_*, 0 -> neu_*.
pos_train = _texts_with_label(df_train, 1)
neg_train = _texts_with_label(df_train, -1)
neu_train = _texts_with_label(df_train, 0)

print("Training : ", len(pos_train), len(neg_train), len(neu_train))

pos_test = _texts_with_label(df_test, 1)
neg_test = _texts_with_label(df_test, -1)
neu_test = _texts_with_label(df_test, 0)

print("Testing : ", len(pos_test), len(neg_test), len(neu_test))

pos_val = _texts_with_label(df_val, 1)
neg_val = _texts_with_label(df_val, -1)
neu_val = _texts_with_label(df_val, 0)

print("Validation : ", len(pos_val), len(neg_val), len(neu_val))

35596 15256
24917 10679
24917 15256 10679
Training :  11064 3213 10640
Testing :  6746 1980 6530
Validation :  4735 1372 4572


In [None]:
# Per-class sample counts for every split, one row per split.
print("POS NEG NEU")
for split_name, class_buckets in (
    ("TRAIN", (pos_train, neg_train, neu_train)),
    ("TEST", (pos_test, neg_test, neu_test)),
    ("VAL", (pos_val, neg_val, neu_val)),
):
    print(split_name, *(len(bucket) for bucket in class_buckets))

POS NEG NEU
TRAIN 11064 3213 10640
TEST 6746 1980 6530
VAL 4735 1372 4572


In [None]:
# Concatenate the per-class training texts and attach integer class ids in the
# same order: pos -> 2, neg -> 0, neu -> 1.
train_texts = pos_train + neg_train + neu_train
train_labels = [2]*len(pos_train) + [0]*len(neg_train) + [1]*len(neu_train)

# Shuffle (text, label) pairs together so the two lists stay aligned.
paired_examples = list(zip(train_texts, train_labels))
random.shuffle(paired_examples)

shuffled_texts, shuffled_labels = zip(*paired_examples)
train_texts = list(shuffled_texts)
train_labels = list(shuffled_labels)
print(len(train_texts),len(train_labels))

24917 24917


In [None]:
# Assemble the validation texts class by class: positive, negative, neutral.
val_texts = []
val_texts.extend(pos_val)
val_texts.extend(neg_val)
val_texts.extend(neu_val)

# Class ids must match the ones used for the training labels
# (pos -> 2, neg -> 0, neu -> 1); otherwise the validation metrics are computed
# against swapped negative/neutral classes.
val_labels = [2]*len(pos_val)
val_labels.extend([0]*len(neg_val))
val_labels.extend([1]*len(neu_val))

# Shuffle (text, label) pairs together so the two lists stay aligned.
c = list(zip(val_texts, val_labels))
random.shuffle(c)

val_texts, val_labels = zip(*c)
val_texts, val_labels = list(val_texts), list(val_labels)
print(len(val_texts), len(val_labels))

10679 10679


In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: mapping from field name (must include "input_ids") to an
            indexable collection with one entry per example; entries may be
            tensors or plain Python sequences.
        labels: optional indexable collection of class ids aligned with the
            encodings (list, ndarray or tensor). Pass None or an empty
            collection for unlabeled data.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return `value` as a tensor detached from its source storage."""
        # torch.tensor() on an existing tensor emits a copy-construct warning;
        # clone().detach() is the equivalent, recommended form.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of tensors for example `idx`, plus "labels" if available."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length test: truth-testing a multi-element tensor or
        # ndarray raises, so `if self.labels:` only works for plain lists.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" field."""
        return len(self.encodings["input_ids"])

    
def tokenize_function(text_list):
    """Tokenize `text_list` with the module-level `tokenizer`.

    Every sequence is padded and truncated to HYPERPARAMS["MAX_LEN"] tokens and
    the encodings are returned as PyTorch tensors.
    """
    tokenizer_options = dict(
        padding="max_length",
        return_tensors='pt',
        truncation=True,
        max_length=HYPERPARAMS["MAX_LEN"],
    )
    return tokenizer(text_list, **tokenizer_options)

def compute_metrics(eval_pred):
    """Compute evaluation metrics from a (logits, labels) pair.

    The predicted class is the argmax over the last logits axis. Returns a dict
    with accuracy, macro-averaged precision/recall/F1, and the full
    classification report as a nested dict.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    metrics = {"accuracy": accuracy_score(labels, predictions)}
    macro_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, scorer in macro_scorers:
        metrics[metric_name] = scorer(labels, predictions, average='macro')
    metrics["classification_report_dict"] = classification_report(
        labels, predictions, output_dict=True
    )
    return metrics

In [None]:
## Set MODEL to "vinai/bertweet-base" to fine-tune BERTweet instead.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL, normalization=True,padding=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL,
                                                           num_labels=3)

# Use the GPU when one is present, otherwise stay on CPU instead of crashing
# on a hard-coded "cuda:0".
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(device)

# Tokenize both splits once up front and wrap them for the Trainer.
X_train_tokenized = tokenize_function(train_texts)
X_val_tokenized = tokenize_function(val_texts)
train_dataset = Dataset(X_train_tokenized, train_labels)
val_dataset = Dataset(X_val_tokenized, val_labels)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/7dd97280b5338fb674b5372829a05a1aaaa76f9f2fa71c36199f2ce1ee1104a0.4c7ca95b4fd82b8bbe94fde253f5f82e5a4eedefe6f86f6fa79efc903d6cfe60
Model config RobertaConfig {
  "_name_or_path": "cardiffnlp/twitter-roberta-base-sentiment",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_ep

In [None]:
# Authenticate against the Hugging Face Hub; the Trainer below is configured
# with push_to_hub=True and therefore needs stored credentials.
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
# Local output directory; with push_to_hub=True it is also pushed to the Hub.
repo_name = "finetuning-sentiment-model-25000-samples"
training_args = TrainingArguments(
    output_dir=repo_name,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=HYPERPARAMS["BATCH_SIZE"],
    per_device_eval_batch_size=HYPERPARAMS["BATCH_SIZE"],
    learning_rate=HYPERPARAMS["LEARNING_RATE"],
    num_train_epochs=HYPERPARAMS["EPOCHS"],
    seed=HYPERPARAMS["RNG_SEED"],
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=True,
)

# Passing the tokenizer makes the Trainer save it next to every checkpoint, so
# the pushed repository can be loaded without supplying a tokenizer separately.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
/content/finetuning-sentiment-model-3000-samples is already a clone of https://huggingface.co/yshAggarwal/finetuning-sentiment-model-3000-samples. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
# Run fine-tuning with the arguments and datasets given to `trainer`.
trainer.train()

***** Running training *****
  Num examples = 24917
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 12460
  import sys


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [None]:
# Evaluate on the Trainer's eval_dataset (val_dataset) using compute_metrics.
trainer.evaluate()

In [None]:

# Upload the trainer's output to the Hugging Face Hub (requires the login above).
trainer.push_to_hub()

In [None]:
# Quick inference smoke test with a text-classification pipeline loaded from the Hub.
from transformers import pipeline

# NOTE(review): this loads "federicopascual/finetuning-sentiment-model-3000-samples",
# not the "finetuning-sentiment-model-25000-samples" repo trained in this
# notebook -- confirm which checkpoint is intended before relying on the output.
sentiment_model = pipeline(model="federicopascual/finetuning-sentiment-model-3000-samples")

sentiment_model(["I love this move", "This movie sucks!"])