# Set up

In [1]:
import sys
IN_COLAB = 'google.colab' in sys.modules

print("In Colab:", IN_COLAB)

In Colab: True


In [2]:
if IN_COLAB:
    !pip install -qqq torch 'transformers[torch]' datasets evaluate

[K     |██████████████████████████████  | 834.1 MB 1.2 MB/s eta 0:00:48tcmalloc: large alloc 1147494400 bytes == 0x64e70000 @  0x7fc5e1a33615 0x58ead6 0x4f355e 0x4d222f 0x51041f 0x5b4ee6 0x58ff2e 0x510325 0x5b4ee6 0x58ff2e 0x50d482 0x4d00fb 0x50cb8d 0x4d00fb 0x50cb8d 0x4d00fb 0x50cb8d 0x4bac0a 0x538a76 0x590ae5 0x510280 0x5b4ee6 0x58ff2e 0x50d482 0x5b4ee6 0x58ff2e 0x50c4fc 0x58fd37 0x50ca37 0x5b4ee6 0x58ff2e
[K     |████████████████████████████████| 890.2 MB 6.7 kB/s 
[K     |████████████████████████████████| 5.5 MB 50.9 MB/s 
[K     |████████████████████████████████| 441 kB 51.9 MB/s 
[K     |████████████████████████████████| 72 kB 1.3 MB/s 
[K     |████████████████████████████████| 21.0 MB 1.3 MB/s 
[K     |████████████████████████████████| 849 kB 45.1 MB/s 
[K     |████████████████████████████████| 317.1 MB 26 kB/s 
[K     |████████████████████████████████| 557.1 MB 11 kB/s 
[K     |████████████████████████████████| 95 kB 4.1 MB/s 
[K     |████████████████████████████████

In [3]:
if IN_COLAB:
    from google.colab import drive

    drive.mount('/content/drive', force_remount=True)

    data_path = '/content/drive/MyDrive/openai-hackathon/data/'


Mounted at /content/drive


# Imports

In [4]:
import os
import pandas as pd

from datasets import DatasetDict, load_dataset, load_from_disk, Dataset

import evaluate

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    get_scheduler
)

from tqdm.auto import tqdm

# Load data

In [91]:
def generate_dataset(filepath: str, seed = None, validation_split=0.05, test_split=0.05, sep=",") -> DatasetDict:
    """Load dataset from filepath"""

    # This is not working on the new dataset
    # ds = load_dataset("csv", data_files=filepath, sep=sep)
    df = pd.read_csv(filepath, sep=sep)
    
    # TODO: Fix columns with NaN.
    df = df[["topic", "sentiment", "stance", "text", "aggressiveness"]]
    df.dropna(inplace=True)
    
    ds = Dataset.from_pandas(df)
    ds = ds.remove_columns(['__index_level_0__'])

    ds = ds.class_encode_column("topic")
    ds = ds.class_encode_column("stance")
    
    # Create a function that converts the `created_at` column to a datetime object
    # def convert_to_datetime(example):
    #     example["created_at"] = pd.to_datetime(example["created_at"])
    #     return example

    # ds = ds.map(convert_to_datetime)

    # Split dataset into train, validation, and test
    ds = ds.shuffle(seed=seed)
    ds_first_split = ds.train_test_split(test_size=test_split)
    ds_second_split = ds_first_split["train"].train_test_split(test_size=validation_split)
    ds = DatasetDict(
        {
            "train": ds_second_split["train"],
            "validation": ds_second_split["test"],
            "test": ds_first_split["test"],
        }
    )

    final_ds_size = sum([len(ds[split]) for split in ds.keys()])
    assert len(df) == final_ds_size, "Dataset size mismatch"

    return ds

In [92]:
ds = generate_dataset(f"{data_path}/tweets2019_rehydrated.csv.gz", sep="\t")

  """Entry point for launching an IPython kernel.


Casting to class labels:   0%|          | 0/410 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/41 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/410 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/41 [00:00<?, ?ba/s]

# Tokenizing the dataset

In [95]:
def dataloaders(ds: DatasetDict, tokenizer, data_collator) -> DatasetDict:
    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True)

    tokenized_ds = ds.map(tokenize_function, batched=True)

    # Clean Dataset
    tokenized_ds = tokenized_ds.remove_columns(
        ["sentiment", "stance", "text", "aggressiveness"]
    )
    tokenized_ds = tokenized_ds.rename_column("topic", "labels")

    # Create dataloaders
    dls = {}
    for split in tokenized_ds.keys():
        dls[split] = DataLoader(
            tokenized_ds[split],
            shuffle=True,
            batch_size=8,
            collate_fn=data_collator,
        )

    return dls

In [96]:
checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [98]:
dls = dataloaders(ds, tokenizer, data_collator)

  0%|          | 0/370 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/20 [00:00<?, ?ba/s]

  0%|          | 0/21 [00:00<?, ?ba/s]

# Fine-tuning the model

In [99]:
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=10, ignore_mismatched_sizes=True
)

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([10]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [100]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(dls['train'])
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

138687


In [101]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.to(device);

In [None]:
progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
    for batch in dls['train']:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/138687 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
model.eval()

# I tried using the `evaluate.combine` method and the `add_batch`, but I was
# getting an an error caused by the multiple classes.

# metrics = evaluate.combine(
#     [
#         evaluate.load("accuracy"),
#         evaluate.load("f1"),
#         evaluate.load("precision"),
#         evaluate.load("recall"),
#         evaluate.load("roc_auc", "multiclass"),
#     ]
# )

labels = []
preds = []
all_preds = []
for batch in dls['validation']:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    softmax = torch.softmax(logits, dim=-1)
    predictions = torch.argmax(softmax, dim=-1)

    labels.extend(batch["labels"].tolist())
    preds.extend(predictions.tolist())
    all_preds.extend(softmax.tolist())


f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")
precision_metric = evaluate.load("precision")
accuracy_metric = evaluate.load("accuracy")
roc_auc_metric = evaluate.load("roc_auc", "multiclass")

# Wrap all the metrics above in a dictionary
metrics = {
    "f1": f1_metric,
    "recall": recall_metric,
    "precision": precision_metric,
    "accuracy": accuracy_metric,
    "roc_auc": roc_auc_metric,
}

# Compute the metrics
results = {}
for metric_name, metric in metrics.items():
    print(metric_name)
    if metric_name == "accuracy":
        r = metric.compute(predictions=preds, references=labels)
    elif metric_name == "roc_auc":
        r = metric.compute(
            prediction_scores=all_preds, references=labels, multi_class="ovr"
        )
    else:
        r = metric.compute(predictions=preds, references=labels, average="weighted")
    results.update(r)

print(results)