In [3]:
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from datasets import Dataset, load_dataset, ClassLabel
import numpy as np
from typing import List
from transformers import DataCollatorWithPadding
import evaluate
import torch

In [None]:
%env CUDA_VISIBLE_DEVICES=2
%env TOKENIZERS_PARALLELISM=false
#torch.manual_seed(0)
#torch.use_deterministic_algorithms(True)
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

In [4]:
tgt_lang = "de"
file_path = "/home/sumire/thesis/LLM_Contextual_Prompt_MT/data/iwslt_hf/"

data_files = { "test": f"{file_path}ted_en-{tgt_lang}"}
dataset = load_dataset("json", data_files=data_files)
dataset

Using custom data configuration default-e4b60b9fd844da79
Found cached dataset json (/home/sumire/.cache/huggingface/datasets/json/default-e4b60b9fd844da79/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['talk_id', 'doc'],
        num_rows: 92
    })
})

In [5]:
id2label ={0: "anger", 1: "fear", 2: "joy", 3: "sadness"}
label2id = {"anger" : 0, "fear" : 1, "joy": 2, "sadness": 3}

# XLM-EMO

In [6]:
tokenizer = AutoTokenizer.from_pretrained("MilaNLProc/xlm-emo-t")
model = AutoModelForSequenceClassification.from_pretrained("MilaNLProc/xlm-emo-t", num_labels=4, id2label=id2label, label2id=label2id)

# Predict tgt language gold sentence with emotion label

In [None]:
def preprocess_function(data):
    inputs = [sent for doc in data["doc"] for sent in doc[tgt_lang]][:50]
    return tokenizer(inputs, truncation=True, padding=True, return_tensors="pt")

In [None]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["test"].column_names,)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
# Accuracies for each tgt_language 
tgt_lang_list = ["ja", "de", "fr", "zh", "ar", "ko"]
file_path = "/home/sumire/thesis/LLM_Contextual_Prompt_MT/data/iwslt_hf/"

lang_accuracies= {}
for tgt_lang in tgt_lang_list:
    data_files = { "test": f"{file_path}ted_en-{tgt_lang}"}
    dataset[tgt_lang] = load_dataset("json", data_files=data_files)

    pred_list = {}
    for lang in ["en", tgt_lang]:
        lang_pred_list = []
        print (lang)
        inputs = [sent for doc in dataset[tgt_lang]["test"]["doc"] for sent in doc[lang]]
        tokenized_inputs = tokenizer(inputs, truncation=True, padding=True, return_tensors="pt")
        tokenized_inputs["input_ids"]
        with torch.no_grad():
            logits = model(**tokenized_inputs).logits
        for inst_logits in logits:
            predicted_class_id = inst_logits.argmax(dim=-1).item()
            #print (lang, "predicted Emotion")
            lang_pred_list.append(model.config.id2label[predicted_class_id])
        pred_list[tgt_lang] = lang_pred_list

pred_list

Using custom data configuration default-17fa6efc908acb86
Found cached dataset json (/home/sumire/.cache/huggingface/datasets/json/default-17fa6efc908acb86/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


en
ja


Using custom data configuration default-e4b60b9fd844da79
Found cached dataset json (/home/sumire/.cache/huggingface/datasets/json/default-e4b60b9fd844da79/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

en
de


Using custom data configuration default-911eedd4b6eede3d
Found cached dataset json (/home/sumire/.cache/huggingface/datasets/json/default-911eedd4b6eede3d/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

en
fr


Using custom data configuration default-cc3d46cbb1e70fd4
Found cached dataset json (/home/sumire/.cache/huggingface/datasets/json/default-cc3d46cbb1e70fd4/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

en
zh


Using custom data configuration default-94e223f2f26d1b2a
Found cached dataset json (/home/sumire/.cache/huggingface/datasets/json/default-94e223f2f26d1b2a/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

en
ar


Using custom data configuration default-3a9183ebb6967fe8
Found cached dataset json (/home/sumire/.cache/huggingface/datasets/json/default-3a9183ebb6967fe8/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

en
ko


{'ko': ['joy',
  'joy',
  'anger',
  'anger',
  'anger',
  'joy',
  'joy',
  'joy',
  'joy',
  'joy',
  'joy',
  'anger',
  'anger',
  'joy',
  'sadness',
  'sadness',
  'anger',
  'joy',
  'joy',
  'anger',
  'joy',
  'anger',
  'joy',
  'anger',
  'sadness',
  'sadness',
  'sadness',
  'sadness',
  'anger',
  'anger',
  'sadness',
  'anger',
  'anger',
  'anger',
  'anger',
  'sadness',
  'anger',
  'sadness',
  'anger',
  'anger',
  'sadness',
  'anger',
  'anger',
  'joy',
  'joy',
  'anger',
  'fear',
  'fear',
  'anger',
  'joy',
  'joy',
  'joy',
  'fear',
  'anger',
  'fear',
  'anger',
  'anger',
  'joy',
  'anger',
  'joy',
  'joy',
  'sadness',
  'joy',
  'fear',
  'sadness',
  'sadness',
  'sadness',
  'joy',
  'joy',
  'joy',
  'sadness',
  'sadness',
  'joy',
  'sadness',
  'sadness',
  'joy',
  'joy',
  'joy',
  'sadness',
  'joy',
  'joy',
  'fear',
  'sadness',
  'joy',
  'sadness',
  'sadness',
  'joy',
  'sadness',
  'sadness',
  'sadness',
  'sadness',
  'joy',
  'a

In [None]:
pred_dict = pred_list


# Evalluate emotion label on the predicted translation

In [7]:
def preprocess_function(data):
    inputs = [sent for doc in data["doc"] for sent in doc["en"]][:50]
    #inputs = [kshot + sent + ' = ' for doc in data["doc"] for sent in doc["en"] ][:50]
    return tokenizer(inputs, truncation=True, padding=True, return_tensors="pt")

In [7]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["test"].column_names,)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [10]:
# Accuracies for each tgt_language 
tgt_lang_list = ["ja", "de", "fr", "zh", "ar", "ko"]
file_path = "/home/sumire/thesis/LLM_Contextual_Prompt_MT/data/iwslt_hf/"

lang_accuracies= {}
for tgt_lang in tgt_lang_list:
    data_files = { "test": f"{file_path}ted_en-{tgt_lang}"}
    dataset[tgt_lang] = load_dataset("json", data_files=data_files)

    pred_list = []
    for lang in ["en", tgt_lang]:
        lang_pred_list = []
        print (lang)
        inputs = [sent for doc in dataset[tgt_lang]["test"]["doc"] for sent in doc[lang]]
        tokenized_inputs = tokenizer(inputs, truncation=True, padding=True, return_tensors="pt")
        tokenized_inputs["input_ids"]
        with torch.no_grad():
            logits = model(**tokenized_inputs).logits
        for inst_logits in logits:
            predicted_class_id = inst_logits.argmax(dim=-1).item()
            #print (lang, "predicted Emotion")
            lang_pred_list.append(model.config.id2label[predicted_class_id])
        pred_list.append(lang_pred_list)

    true_false = []
    for en_pred, tgt_pred in zip(pred_list[0], pred_list[1]):
        #print (en_pred, ja_pred)
        if en_pred != tgt_pred:
            true_false.append(False)
        else:
            true_false.append(True)

    lang_accuracies[tgt_lang] = true_false.count(True)/len(true_false)
lang_accuracies 

Using custom data configuration default-17fa6efc908acb86
Found cached dataset json (/home/sumire/.cache/huggingface/datasets/json/default-17fa6efc908acb86/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

en
ja


Using custom data configuration default-e4b60b9fd844da79
Found cached dataset json (/home/sumire/.cache/huggingface/datasets/json/default-e4b60b9fd844da79/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

en
de


Using custom data configuration default-911eedd4b6eede3d


Downloading and preparing dataset json/default to /home/sumire/.cache/huggingface/datasets/json/default-911eedd4b6eede3d/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/sumire/.cache/huggingface/datasets/json/default-911eedd4b6eede3d/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

en
fr


Using custom data configuration default-cc3d46cbb1e70fd4


Downloading and preparing dataset json/default to /home/sumire/.cache/huggingface/datasets/json/default-cc3d46cbb1e70fd4/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/sumire/.cache/huggingface/datasets/json/default-cc3d46cbb1e70fd4/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

en
zh


Using custom data configuration default-94e223f2f26d1b2a
Found cached dataset json (/home/sumire/.cache/huggingface/datasets/json/default-94e223f2f26d1b2a/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

en
ar


Using custom data configuration default-3a9183ebb6967fe8


Downloading and preparing dataset json/default to /home/sumire/.cache/huggingface/datasets/json/default-3a9183ebb6967fe8/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/sumire/.cache/huggingface/datasets/json/default-3a9183ebb6967fe8/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

en
ko


{'ja': 0.6475977653631285,
 'de': 0.6654374790479383,
 'fr': 0.6930780235501266,
 'zh': 0.6496180670873464,
 'ar': 0.5569396979384853,
 'ko': 0.6288636869023794}

In [None]:
# Inference
with torch.no_grad():
    logits = model(**tokenized_inputs).logits
for sent_logits in logits:
    predicted_class_id = sent_logits.argmax(dim=-1).item()
    

    

In [16]:
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [17]:
def compute_metrics(eval_preds):
    
    preds, labels, input_ids = eval_preds
    
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    result = {}
    result["accuracy"] = accuracy.compute(predictions=predictions, references=labels)
    result["f1"] = f1.compute(predictions=predictions, references=labels)
    print (result)
    return result

In [32]:
output_dir="./results/xlm-emo"
training_args = TrainingArguments(
    output_dir=output_dir,
    #learning_rate=2e-5,
    #per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    #num_train_epochs=2,
    #weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    #load_best_model_at_end=True,
    do_eval=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    #train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

eval_preds = trainer.predict(tokenized_dataset["test"])
inputs = [sent for doc in dataset["test"]["doc"] for sent in doc["en"]]


with open(output_dir+'/emotion_pred.txt','w', encoding='utf8') as wf:
    for ip, pred in zip(inputs, eval_preds[0]):
        predicted_class_id = pred.argmax(axis=-1).item()
        #print (lang, "predicted Emotion")
        label = model.config.id2label[predicted_class_id]
        wf.write(ip.strip()+', ')
        wf.write(label +'\n')


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 50
  Batch size = 128
