In [1]:
import os
from datasets import Dataset, DatasetDict, load_metric
import pandas as pd
import os
import numpy as np

In [2]:
#If you don't have the train and dev files, you have to register on the Codalab competition to get access to the data
#https://codalab.lisn.upsaclay.fr/competitions/17730

# Resolve the location of both competition TSV files (relative to this notebook),
# then load each one into a DataFrame. The files are tab-separated.
path_to_trainfile = os.path.join("..", "data", "exalt_emotion_train.tsv")
path_to_dev_file = os.path.join("..", "data", "exalt_emotion_dev_participants.tsv")

train_file = pd.read_table(path_to_trainfile)
dev_file = pd.read_table(path_to_dev_file)

In [3]:
# Distinct values of the "Labels" column of the training data.
unique_labels = train_file["Labels"].unique().tolist()
print("{} Unique Labels: {}".format(len(unique_labels), unique_labels))

# Two lookup tables between label names and consecutive integer ids:
# id2label numbers the labels by their position in unique_labels,
# label2id is the exact inverse of that mapping.
id2label = dict(enumerate(unique_labels))
label2id = {name: idx for idx, name in id2label.items()}

6 Unique Labels: ['Joy', 'Neutral', 'Sadness', 'Love', 'Anger', 'Fear']


In [4]:
from sklearn.model_selection import train_test_split

#We split the TRAIN data into a TRAIN & DEV set
traindf, devdf = train_test_split(train_file, test_size=0.1, random_state=42)

#We will get the predictions on the DEV data and submit to Codalab
#Take a copy: a prediction column is added to testdf later on, and without the copy
#that assignment would also modify dev_file, because both names would refer to the same DataFrame.
testdf = dev_file.copy()

In [5]:
# Wrap the three pandas frames as Hugging Face datasets, keyed by split name.
# "train"/"dev" come from the labelled training file; "test" is the Codalab dev file.
datasets = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (
        ("train", traindf),
        ("dev", devdf),
        ("test", testdf),
    )
})

In [6]:
MODEL_NAME = "xlm-roberta-base"

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    """Tokenize a batch of texts and attach integer label ids when gold labels exist.

    Args:
        examples: A batch as passed by ``datasets.map(..., batched=True)``; it is
            indexed by column name. It must provide "Texts" and may provide
            "Labels" (the split used for the Codalab submission is handled by
            the ``"Labels" in examples`` check when that column is absent).

    Returns:
        The tokenizer output for ``examples["Texts"]`` (truncation enabled), with an
        additional "label" entry holding ``label2id`` ids when "Labels" is present.

    Raises:
        KeyError: If a value in "Labels" is not a key of ``label2id``.
    """
    encoded = tokenizer(examples["Texts"], truncation=True)
    if "Labels" in examples:
        # Return the ids as part of the output instead of writing them into the
        # incoming batch, so the new column does not depend on in-place mutation
        # of the object handed over by map().
        encoded["label"] = [label2id[x] for x in examples["Labels"]]
    return encoded


tokenized_datasets = datasets.map(preprocess_function, batched=True)


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [7]:
from transformers import DataCollatorWithPadding
# Collator built from the same tokenizer that encoded the datasets; it is passed to the
# Trainer as `data_collator` further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Size the classification head from the label mapping rather than a hard-coded 6, and
# store both mappings in the model config so that a saved or pushed checkpoint reports
# the emotion names instead of generic LABEL_n ids.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Metric objects already loaded, keyed by metric name. Loading happens on first use only,
# so the evaluation that runs every epoch does not reload all four metrics each time.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the metric called `name`, loading it through load_metric on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = load_metric(name, trust_remote_code=True)
    return _METRIC_CACHE[name]


def custom_metrics(eval_pred):
    """Compute macro precision/recall/F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class of each example is
            the argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys "precision", "recall", "f1" (all macro-averaged) and
        "accuracy", in that order.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # The three class-wise scores share the same call shape; accuracy takes no `average`.
    scores = {
        name: _get_metric(name).compute(
            predictions=predictions, references=labels, average="macro"
        )[name]
        for name in ("precision", "recall", "f1")
    }
    scores["accuracy"] = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]

    return scores


In [14]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints are written
    output_dir="./results",
    overwrite_output_dir=True,
    # Optimisation
    num_train_epochs=10,
    learning_rate=4e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # load_best_model_at_end=True,
)

# Train on the "train" split and evaluate on the held-out "dev" split;
# custom_metrics supplies the precision/recall/F1/accuracy columns of the evaluation log.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=custom_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.98,1.374867,0.488664,0.436292,0.431642,0.566
2,0.8068,1.749279,0.481157,0.495097,0.470254,0.564
3,0.7501,2.260848,0.544075,0.511825,0.510749,0.594
4,0.6413,2.134241,0.494237,0.513378,0.497726,0.604
5,0.5044,2.769083,0.517468,0.509275,0.505801,0.586
6,0.4152,2.84267,0.518546,0.518891,0.516861,0.608
7,0.2206,3.202114,0.513886,0.515375,0.50429,0.592
8,0.1657,3.321933,0.503249,0.508785,0.502567,0.58
9,0.1077,3.379265,0.51338,0.519011,0.512982,0.598
10,0.0586,3.434682,0.510965,0.522103,0.512233,0.588


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=11250, training_loss=0.4579190306769477, metrics={'train_runtime': 1142.1699, 'train_samples_per_second': 39.399, 'train_steps_per_second': 9.85, 'total_flos': 1410064841023632.0, 'train_loss': 0.4579190306769477, 'epoch': 10.0})

In [15]:
# Run the trained model over the tokenized "test" split (built from the Codalab dev file).
# The returned object's `.predictions` attribute is decoded into label names in the next cell.
results = trainer.predict(tokenized_datasets["test"])

In [16]:
# Highest-scoring class index per example, then map each index back to its label name.
predicted_ids = np.argmax(results.predictions, axis=-1)
results = [id2label[class_id] for class_id in predicted_ids]
print(results)

['Joy', 'Joy', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Joy', 'Neutral', 'Sadness', 'Joy', 'Neutral', 'Joy', 'Joy', 'Anger', 'Neutral', 'Anger', 'Sadness', 'Joy', 'Joy', 'Sadness', 'Sadness', 'Neutral', 'Joy', 'Joy', 'Joy', 'Anger', 'Neutral', 'Neutral', 'Neutral', 'Anger', 'Anger', 'Joy', 'Sadness', 'Joy', 'Anger', 'Anger', 'Fear', 'Fear', 'Neutral', 'Sadness', 'Neutral', 'Love', 'Neutral', 'Neutral', 'Neutral', 'Joy', 'Sadness', 'Joy', 'Love', 'Neutral', 'Sadness', 'Anger', 'Neutral', 'Love', 'Neutral', 'Neutral', 'Anger', 'Sadness', 'Neutral', 'Neutral', 'Joy', 'Fear', 'Anger', 'Anger', 'Joy', 'Sadness', 'Sadness', 'Joy', 'Anger', 'Anger', 'Joy', 'Sadness', 'Anger', 'Joy', 'Sadness', 'Anger', 'Joy', 'Fear', 'Neutral', 'Joy', 'Joy', 'Neutral', 'Joy', 'Anger', 'Neutral', 'Joy', 'Neutral', 'Sadness', 'Love', 'Neutral', 'Neutral', 'Joy', 'Sadness', 'Sadness', 'Neutral', 'Sadness', 'Joy', 'Joy', 'Anger', 'Anger', 'Fear', 'Anger', 'Neutral', 'Joy', 'Joy', 'Sadness', 'Anger', 'Joy', 'L

In [1]:
# Attach the predicted label names as the "Labels" column and write the submission file.
# assign() builds a new DataFrame instead of writing into the existing one, so the frame
# loaded from the dev file (which testdf was bound to) is not modified as a side effect.
testdf = testdf.assign(Labels=results)
testdf.to_csv("Emotions.tsv", sep="\t", index=False)

#We will get the predictions on the DEV data and submit to Codalab
#Remember to first ZIP the file before submitting to Codalab
#This is the same submission marked as EXALT_Baseline on Codalab.

NameError: name 'results' is not defined

In [None]:
# Trainer.push_to_hub() takes a commit message as its first positional argument, not a
# repository id, so the target repository is named explicitly through the model/tokenizer
# push_to_hub API instead.
# The access token is read from the HF_TOKEN environment variable rather than being pasted
# into the notebook; when it is unset or empty, None is passed.
hub_repo_id = "pranaydeeps/EXALT-Baseline"
hub_token = os.environ.get("HF_TOKEN") or None
trainer.model.push_to_hub(hub_repo_id, token=hub_token) #To push the model to the huggingface hub
tokenizer.push_to_hub(hub_repo_id, token=hub_token) #Push the matching tokenizer to the same repository