In [1]:
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from datasets import Dataset, load_dataset
import numpy as np
from typing import List
from transformers import DataCollatorWithPadding
import evaluate
import torch

In [58]:
tgt_lang = "de"
file_path = "/home/sumire/thesis/LLM_Contextual_Prompt_MT/data/iwslt_hf/"

data_files = { "test": f"{file_path}ted_en-{tgt_lang}"}
dataset = load_dataset("json", data_files=data_files)
dataset

Using custom data configuration default-2aff7b021f107e53
Found cached dataset json (/home/sumire/.cache/huggingface/datasets/json/default-2aff7b021f107e53/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['talk_id', 'doc'],
        num_rows: 93
    })
})

In [3]:
id2label ={0: "anger", 1: "fear", 2: "joy", 3: "sadness"}
label2id = {"anger" : 0, "fear" : 1, "joy": 2, "sadness": 3}

In [4]:
tokenizer = AutoTokenizer.from_pretrained("MilaNLProc/xlm-emo-t")
model = AutoModelForSequenceClassification.from_pretrained("MilaNLProc/xlm-emo-t", num_labels=4, id2label=id2label, label2id=label2id)

In [8]:
def preprocess_function(data):
    inputs = [sent for doc in data["doc"] for sent in doc["en"]][:50]
    #inputs = [kshot + sent + ' = ' for doc in data["doc"] for sent in doc["en"] ][:50]
    return tokenizer(inputs, truncation=True, padding=True, return_tensors="pt")

In [9]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["test"].column_names,)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [1]:
# Accuracies for each tgt_language 
tgt_lang_list = ["ja", "de", "fr", "zh", "ar", "ko"]
file_path = "/home/sumire/thesis/LLM_Contextual_Prompt_MT/data/iwslt_hf/"

lang_data_dict = {}
for tgt_lang in lang_list:
    data_files = { "test": f"{file_path}ted_en-{tgt_lang}"}
    dataset[tgt_lang] = load_dataset("json", data_files=data_files)

    pred_list = []
    for lang in ["en", tgt_lang]:
        lang_pred_list = []
        print (lang)
        inputs = [sent for doc in dataset[tgt_lang]["test"]["doc"] for sent in doc[lang]]
        tokenized_inputs = tokenizer(inputs, truncation=True, padding=True, return_tensors="pt")
        tokenized_inputs["input_ids"]
        with torch.no_grad():
            logits = model(**tokenized_inputs).logits
        for inst_logits in logits:
            predicted_class_id = inst_logits.argmax(dim=-1).item()
            #print (lang, "predicted Emotion")
            lang_pred_list.append(model.config.id2label[predicted_class_id])
        pred_list.append(lang_pred_list)

    true_false = []
    for en_pred, tgt_pred in zip(pred_list[0], pred_list[1]):
        #print (en_pred, ja_pred)
        if en_pred != tgt_pred:
            true_false.append(False)
        else:
            true_false.append(True)

    lang_accuracies[tgt_lang] = true_false.count(True)/len(true_false)
lang_accuracies 

NameError: name 'lang_list' is not defined

In [63]:
# Annotate the English data with the emotion label 

tgt_lang_list = ["ja", "de", "fr", "zh", "ar", "ko"]
file_path = "/home/sumire/thesis/LLM_Contextual_Prompt_MT/data/iwslt_hf/"

lang_data_dict = {}
for tgt_lang in lang_list:
    data_files = { "test": f"{file_path}ted_en-{tgt_lang}"}
    dataset[tgt_lang] = load_dataset("json", data_files=data_files)

    pred_list = []
    for lang in ["en", tgt_lang]:
        lang_pred_list = []
        print (lang)
        inputs = [sent for doc in dataset[tgt_lang]["test"]["doc"] for sent in doc[lang]]
        tokenized_inputs = tokenizer(inputs, truncation=True, padding=True, return_tensors="pt")
        tokenized_inputs["input_ids"]
        with torch.no_grad():
            logits = model(**tokenized_inputs).logits
        for inst_logits in logits:
            predicted_class_id = inst_logits.argmax(dim=-1).item()
            #print (lang, "predicted Emotion")
            lang_pred_list.append(model.config.id2label[predicted_class_id])
        pred_list.append(lang_pred_list)
        dataset[]
        
    true_false = []
    for en_pred, tgt_pred in zip(pred_list[0], pred_list[1]):
        #print (en_pred, ja_pred)
        if en_pred != tgt_pred:
            true_false.append(False)
        else:
            true_false.append(True)

    lang_accuracies[tgt_lang] = true_false.count(True)/len(true_false)
lang_accuracies 

{'ja': 0.44, 'ko': 0.58, 'de': 0.6, 'zh': 0.5, 'fr': 0.66, 'ar': 0.48}

In [36]:
# Inference
with torch.no_grad():
    logits = model(**tokenized_inputs).logits
for sent_logits in logits:
    predicted_class_id = sent_logits.argmax(dim=-1).item()
    print (model.config.id2label[predicted_class_id])

fear
sadness
fear
fear
fear
fear
fear
sadness
joy
joy
joy
anger
sadness
joy
joy
sadness
joy
anger
anger
anger
sadness
sadness
anger
anger
anger
joy
fear
fear
sadness
sadness
sadness
sadness
sadness
fear
sadness
anger
fear
joy
sadness
fear
joy
sadness
anger
sadness
sadness
joy
anger
joy
sadness
anger


In [33]:
def compute_metrics(eval_preds):
    print ("a")
    preds, labels, input_ids = eval_preds
    print (preds)
    with open(output_dir+'/translations.txt','w', encoding='utf8') as wf:
         for translation, ids in zip(decoded_preds, preds):
            wf.write(translation.strip()+'\n')
            wf.write(str(ids)+'\n')
    
    
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    result = {}
    result["accuracy"] = accuracy.compute(predictions=predictions, references=labels)
    result["f1"] = f1.compute(predictions=predictions, references=labels)
    print (result)
    return result

In [34]:
training_args = TrainingArguments(
    output_dir="./results/xlm-emo",
    #learning_rate=2e-5,
    #per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    #num_train_epochs=2,
    #weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    #load_best_model_at_end=True,
    do_eval=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    #train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.predict(tokenized_dataset["test"])

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 50
  Batch size = 128


PredictionOutput(predictions=array([[-7.3690170e-01,  1.9864393e+00, -5.0852078e-01, -1.0347528e+00],
       [-9.0651818e-02,  5.5569202e-01, -1.9681559e+00,  1.5604615e+00],
       [-1.4446754e+00,  1.8737437e+00, -8.5840929e-01,  2.7800968e-01],
       [-7.2376180e-01,  6.5672863e-01, -5.2121948e-03, -2.5962692e-01],
       [-6.9333893e-01, -4.2962697e-01,  1.2738094e+00, -2.0840783e-01],
       [-1.9882036e+00,  1.9470445e+00, -7.6120192e-01,  8.5731059e-01],
       [ 2.0553126e+00, -1.1165123e+00, -2.4368372e+00,  1.1474683e+00],
       [ 1.7952882e+00, -1.0295930e+00, -1.4687315e+00,  2.1809448e-01],
       [-1.8654697e+00, -2.2038984e+00,  5.4086227e+00, -9.5053875e-01],
       [ 5.9075296e-01, -1.9366342e+00,  2.4332862e+00, -1.4687426e+00],
       [-5.0563389e-01, -1.3342285e+00,  4.8962390e-01,  1.2540665e+00],
       [-6.4036348e-03, -8.5303217e-01, -1.6723143e+00,  2.6699271e+00],
       [ 4.9261442e-01, -9.9061167e-01,  3.9191538e-01, -2.5541106e-01],
       [ 1.2869259e+00