In [None]:
!pip install transformers
!pip install mlflow
!pip install evaluate
!pip install wandb
!pip install --upgrade accelerate

In [2]:
import torch
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import numpy as np
from imblearn.datasets import make_imbalance
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, TrainingArguments, Trainer, BertModel
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from datasets import Dataset,load_dataset, load_from_disk, DatasetDict
import evaluate
import os
import io

In [3]:
import wandb

In [4]:
# Login to W&B
# Uses the Python API rather than the `!wandb login` shell command: the shell
# variant needs the notebook front end to forward keyboard input to a
# subprocess, which is not guaranteed outside Colab.
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
# Start the W&B run; it is logged under the "01_06" project.
wandb.init(project="01_06")

[34m[1mwandb[0m: Currently logged in as: [33mk-kjoerup[0m ([33mccs-project[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
# Tokenizer and 3-label sequence-classification model, both loaded from the
# same pretrained checkpoint.
checkpoint = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Read the annotated comments; the first CSV column is used as the index.
data = pd.read_csv('text_emoji.csv', index_col=0)

In [None]:
# Delete party column
# (left disabled on purpose: the per-party analysis cells further down group
# on 'party', so the column has to stay in the frame)
#data = data.drop(['party'], axis = 1)

In [13]:
# Store the class labels as 32-bit integers.
data = data.assign(label=data['label'].astype('int32'))

### Fine-tuning model

In [14]:
# Split the data 70 / 15 / 15 into train / test / dev.
# Both cuts are shuffled with the same fixed seed so the split is repeatable.
SPLIT_SEED = 200

dataset = Dataset.from_pandas(data, preserve_index=False)

# First cut: 70% train, 30% held out.
train_test = dataset.train_test_split(test_size=0.3, shuffle=True, seed=SPLIT_SEED)
# Second cut: the held-out 30% is halved into dev ('train' side) and test.
test_val = train_test['test'].train_test_split(test_size=0.50, shuffle=True, seed=SPLIT_SEED)

train_val_test_dataset = DatasetDict(
    {
        'train': train_test['train'],
        'test': test_val['test'],
        'dev': test_val['train'],
    }
)

In [15]:
#Tokenize data

def tokenize_function(examples):
    """Tokenize the ``comment_text`` field of a batch of examples.

    Every text is truncated and padded to a fixed length of 128 tokens.
    Returns whatever the module-level ``tokenizer`` returns for the batch.
    """
    texts = examples["comment_text"]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128)
  
# Apply the tokenizer to every split, one batch of examples at a time.
tokenized_datasets = train_val_test_dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/814 [00:00<?, ? examples/s]

Map:   0%|          | 0/175 [00:00<?, ? examples/s]

Map:   0%|          | 0/174 [00:00<?, ? examples/s]

In [16]:
# Accuracy metric object, used by compute_metrics during evaluation.
metric = evaluate.load(path="accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of
    each example is the argmax over the last axis of the logits.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [17]:
# Training arguments and hyperparameters for the fine-tuning run.
# Logging, evaluation and checkpointing all happen once per epoch; the
# checkpointing is what lets `load_best_model_at_end` restore the best epoch
# after training.
# NOTE(review): no `metric_for_best_model` is set, so the library default
# decides what "best" means - confirm that this is the intended criterion.
training_args = TrainingArguments(output_dir="/content/roberta_model",
                                  logging_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16,
                                  num_train_epochs=20,
                                  save_strategy = "epoch",
                                  load_best_model_at_end=True,
                                  learning_rate=1e-6,
                                  report_to="wandb",
                                  # args to prevent disk overload on Google Colab:
                                  # `save_steps` is ignored when saving per epoch, so it
                                  # never limited anything. `save_total_limit` removes
                                  # older checkpoints as new ones are written (the best
                                  # checkpoint is still retained for load_best_model_at_end).
                                  save_total_limit = 1,
                                  save_on_each_node = False,
                                  overwrite_output_dir = True
                                  )

In [18]:
# Fine-tune on the train split; the dev split is scored with compute_metrics
# at every evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_datasets["dev"],
    train_dataset=tokenized_datasets["train"],
)
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.5573,0.504148,0.793103
2,0.5139,0.485776,0.804598
3,0.463,0.467872,0.810345
4,0.4538,0.458676,0.810345
5,0.4394,0.451624,0.816092
6,0.4122,0.443227,0.816092
7,0.4074,0.439675,0.821839
8,0.3856,0.440318,0.827586
9,0.3999,0.438961,0.827586
10,0.37,0.438055,0.827586


VBox(children=(Label(value='0.001 MB of 0.020 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.057246…

0,1
eval/accuracy,▁▃▃▃▄▄▅▆▆▆▆▆▆▆▆█▇▇▇▇
eval/loss,█▆▄▃▂▂▁▁▁▁▁▂▂▂▂▃▃▃▃▃
eval/runtime,▁▁▃▄▅█▆▆▆▆▇▆▆▆▇▆▆█▆▆
eval/samples_per_second,██▆▅▃▁▂▃▃▃▂▃▃▂▂▃▂▁▃▃
eval/steps_per_second,██▆▅▃▁▂▃▃▃▂▃▃▂▂▃▂▁▃▃
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/learning_rate,██▇▇▇▆▆▅▅▅▄▄▄▃▃▂▂▂▁▁
train/loss,█▇▅▅▅▄▄▃▄▃▂▂▂▂▂▂▂▁▁▁
train/total_flos,▁

0,1
eval/accuracy,0.83908
eval/loss,0.45528
eval/runtime,1.2683
eval/samples_per_second,137.186
eval/steps_per_second,8.673
train/epoch,20.0
train/global_step,1020.0
train/learning_rate,0.0
train/loss,0.3114
train/total_flos,1070871610152960.0


### Make predictions

In [19]:
# Run the trained model on the held-out test split.
predictions = trainer.predict(test_dataset=tokenized_datasets['test'])

In [20]:
# Metrics reported for the test split (third field of the prediction output).
predictions.metrics

{'test_loss': 0.4905324876308441,
 'test_accuracy': 0.7828571428571428,
 'test_runtime': 1.4058,
 'test_samples_per_second': 124.487,
 'test_steps_per_second': 7.825}

In [21]:
# Predicted class id per test example: index of the largest score in each row.
pred_labels = np.argmax(predictions.predictions, axis=-1)

In [22]:
# Reference labels of the test split (second field of the prediction output).
predictions.label_ids

array([0, 0, 2, 0, 0, 0, 0, 1, 2, 1, 0, 2, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2,
       2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 1, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 2, 1, 0, 1, 2, 0, 2, 0, 0,
       0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 1, 2, 2, 0, 0, 2, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 1, 1, 2, 0, 0, 0, 1, 0, 2, 0, 1, 0, 0, 0, 0,
       2, 1, 0, 0, 1, 0, 0, 2, 1, 1, 0, 0, 2, 0, 2, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 2, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 2])

In [23]:
# Predicted class ids for the test split, displayed for comparison with the
# reference labels.
pred_labels

array([0, 0, 2, 0, 0, 0, 0, 1, 2, 1, 0, 1, 0, 2, 0, 2, 0, 0, 0, 0, 0, 2,
       2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0,
       2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0,
       0, 2, 0, 0, 0, 2, 0, 0, 1, 0, 0, 2, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 1, 2, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 2, 0, 2, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 2, 2, 1, 2, 1,
       0, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 2])

In [24]:
# Untokenized test split as a DataFrame, for the per-party analysis.
test_df = pd.DataFrame(data=train_val_test_dataset['test'])

In [25]:

# Count of each annotated label, broken down by party.
test_df.groupby(by='party')['label'].value_counts()

party          label
conservatives  0        57
               1        16
               2        13
labour         0        30
               1        19
               2        14
libdems        0        18
               2         5
               1         3
Name: label, dtype: int64

In [26]:
# Attach the model's predicted class id to each test row.
test_df = test_df.assign(predicted_label=pred_labels)

In [27]:
# Count of each predicted label, broken down by party.
test_df.groupby(by='party')['predicted_label'].value_counts()

party          predicted_label
conservatives  0                  60
               2                  15
               1                  11
labour         0                  41
               2                  19
               1                   3
libdems        0                  17
               2                   5
               1                   4
Name: predicted_label, dtype: int64

In [28]:
# Conservatives: mean sentiment, i.e. the plain average of the predicted
# label ids over that party's test rows.
test_df.loc[test_df['party'] == 'conservatives', 'predicted_label'].mean()

0.47674418604651164

In [29]:
# Labour: mean sentiment, i.e. the plain average of the predicted label ids
# over that party's test rows.
test_df.loc[test_df['party'] == 'labour', 'predicted_label'].mean()

0.6507936507936508

In [30]:
# Lib Dems: mean sentiment, i.e. the plain average of the predicted label ids
# over that party's test rows.
test_df.loc[test_df['party'] == 'libdems', 'predicted_label'].mean()

0.5384615384615384