In [None]:
# Install the third-party packages this notebook needs into the runtime.
# NOTE(review): versions are not pinned, so a re-run may resolve different releases.
!pip install transformers
!pip install pandas
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 7.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 6.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 35.4 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uni

In [None]:
# Mount Google Drive so files under /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer,AutoModelForSequenceClassification,XLMRobertaTokenizer,XLMRobertaForSequenceClassification
from transformers import EarlyStoppingCallback

# Maps the label names found in the TSV files to integer class ids.
label_dict={"literally":0,"figuratively":1,"both":2,"undecidable":3}

def create_sentences_tags_labels(filename):
  """Read a tab-separated idiom file into sentences, per-word tags and label ids.

  Every non-blank line must have at least four tab-separated columns. The
  first two columns are ignored, the third is a label name that must be a key
  of ``label_dict`` and the fourth is the sentence. Words of the target
  expression are expected to be wrapped as ``<b>word</b>``.

  Args:
    filename: Path of the TSV file to read (decoded as UTF-8).

  Returns:
    A tuple ``(sentence_list, tags_list, labels)``:
      * ``sentence_list``: sentences with single-space separated words and the
        ``<b>``/``</b>`` markers removed.
      * ``tags_list``: one list per sentence with one tag per whitespace word;
        every word from the first marked word to the last marked word
        (inclusive, also unmarked words in between) is ``"B"``, all others
        are ``"O"``. A sentence without any marked word is tagged all ``"O"``.
      * ``labels``: integer class ids looked up in ``label_dict``.

  Raises:
    ValueError: If a non-blank line has fewer than four columns.
    KeyError: If a label name is not present in ``label_dict``.
  """
  # Read the file once and close it deterministically; the explicit encoding
  # keeps umlauts/ß intact regardless of the platform default.
  with open(filename,encoding="utf-8") as handle:
    lines=handle.readlines()
  print(len(lines))

  labels=[]
  sentences=[]
  for line_number,line in enumerate(lines,start=1):
    if not line.strip():
      # Blank lines (e.g. a trailing newline at the end of the file) carry no example.
      continue
    columns=line.split("\t")
    if len(columns)<4:
      raise ValueError(
        "%s:%d: expected at least 4 tab-separated columns, found %d"
        % (filename,line_number,len(columns)))
    labels.append(label_dict[columns[2].strip()])
    sentences.append(columns[3])

  tags_list=[]
  sentence_list=[]
  for sentence in sentences:
    words=sentence.strip().split()
    marked=[i for i,word in enumerate(words) if word.startswith("<b>") and word.endswith("</b>")]
    tags=["O"]*len(words)
    if marked:
      # Tag the whole span between the first and last marked word.
      first,last=marked[0],marked[-1]
      tags[first:last+1]=["B"]*(last-first+1)
    tags_list.append(tags)
    sentence_list.append(" ".join(words).replace("<b>","").replace("</b>",""))
  print(len(sentence_list),len(tags_list),len(labels))
  return sentence_list,tags_list,labels


# Leftover path fragment pointing at the dev split on Drive (not used below):
# "drive/MyDrive/vid-disambiguation-sharedtask-main/data/dev/dev.tsv"
# Parse the test and train TSV files (paths relative to the current working
# directory) into sentences, per-word B/O tags and integer labels.
test_sentences,test_tags,test_labels=create_sentences_tags_labels("test.tsv")
train_sentences,train_tags,train_labels=create_sentences_tags_labels("train.tsv")


# Tokenizer of the uncased German BERT checkpoint; the same checkpoint name is
# used when the classification model is loaded.
tokenizer=AutoTokenizer.from_pretrained("dbmdz/bert-base-german-uncased")

def idiom_tokenize(tokenizer, sentences, tags_list):
  """Tokenize sentences and mark the idiom sub-tokens through ``token_type_ids``.

  The sentences are encoded with padding and truncation (``max_length=128``).
  For every sentence the ``token_type_ids`` row is then overwritten: sub-tokens
  that belong to a word tagged anything other than ``"O"`` get 1, every other
  position gets 0.

  Words and sub-tokens are aligned purely by character count: each whitespace
  word is lower-cased and sub-tokens (with ``"##"`` removed) are consumed until
  the word's characters are used up.

  Args:
    tokenizer: Callable returning a mapping with ``"input_ids"`` and
      ``"token_type_ids"``; must also provide ``convert_ids_to_tokens``.
    sentences: List of sentence strings.
    tags_list: One list of per-word tags for each sentence.

  Returns:
    The tokenizer output with its ``"token_type_ids"`` rows replaced.
  """
  encoded = tokenizer(sentences, padding=True, truncation=True, max_length=128)
  print(encoded.keys())
  all_input_ids = encoded["input_ids"]

  for row, (tags, input_ids, sentence) in enumerate(zip(tags_list, all_input_ids, sentences)):
    # Drop the first and the last position; everything in between is walked with a cursor.
    pieces = tokenizer.convert_ids_to_tokens(input_ids)[1:-1]
    cursor = 0
    marks = []
    for tag, word in zip(tags, sentence.strip().split()):
      mark = 0 if tag == "O" else 1
      remaining = word.lower()
      # Consume sub-tokens until the word's characters are exhausted.
      while remaining != "" and cursor < len(pieces):
        piece = pieces[cursor].replace("##", "")
        remaining = remaining[len(piece):]
        marks.append(mark)
        cursor += 1
      if cursor >= len(pieces):
        break
    # Positions that were never matched to a word stay 0, as do the two dropped positions.
    unmatched = len(pieces) - cursor
    encoded["token_type_ids"][row] = [0] + marks + [0] + [0] * unmatched

  # Sanity check: every rewritten row must be as long as its input_ids row.
  for ids_row, type_row in zip(encoded["input_ids"], encoded["token_type_ids"]):
    assert len(ids_row) == len(type_row)
  return encoded

class IdiomDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from field name (e.g. ``"input_ids"``) to a sequence
            of per-example values; it must contain ``"input_ids"``, which
            defines the dataset length.
        labels: Optional sequence of per-example labels (list, numpy array or
            tensor). ``None`` or an empty sequence yields items without a
            ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def _has_labels(self):
        """Return True when a non-empty label sequence was supplied."""
        # An explicit None/length test instead of truthiness: ``if labels:``
        # raises for multi-element numpy arrays and tensors.
        return self.labels is not None and len(self.labels) > 0

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus ``"labels"`` if available."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self._has_labels():
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` field."""
        return len(self.encodings["input_ids"])


In [None]:
# Encode both splits; idiom_tokenize rewrites token_type_ids to mark the idiom sub-tokens.
tokenized_sentences_train=idiom_tokenize(tokenizer,train_sentences,train_tags)
tokenized_sentences_test=idiom_tokenize(tokenizer,test_sentences,test_tags)

# Wrap encodings and labels in torch datasets for the Trainer.
train_dataset=IdiomDataset(tokenized_sentences_train,train_labels)
test_dataset=IdiomDataset(tokenized_sentences_test,test_labels)

# Sequence-classification model with four output classes, one per entry of label_dict.
model=AutoModelForSequenceClassification.from_pretrained("dbmdz/bert-base-german-uncased",num_labels=4)

loading configuration file https://huggingface.co/dbmdz/bert-base-german-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/dd6b21985a95123a88e7444ca271cc9429cf8c4ccbf84e5665ffebb3db0958ca.6ec690b98e01c56d26601258d2be34c3e5a76b949465ed58983cff81e5f9fa88
Model config BertConfig {
  "_name_or_path": "dbmdz/bert-base-german-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type":

In [None]:
def compute_metrics(p):
    """Compute accuracy plus per-class precision/recall/F1 for classes 0 and 1.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` holds one score per
            class for every example (argmax is taken over axis 1) and
            ``labels`` holds the gold class ids.

    Returns:
        Dict with ``"accuracy"`` and ``precision_*``, ``recall_*``, ``f1_*``
        for classes 0 and 1. Scores for classes 2 and 3 are computed but, as
        before, not reported.
    """
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    # Pin the class list so every per-class score array has exactly four
    # entries even when a class is absent from both labels and predictions
    # (otherwise the array is shorter). zero_division=0 yields the same 0.0
    # as the default but without emitting a warning per evaluation.
    class_ids = [0, 1, 2, 3]
    per_class = dict(labels=class_ids, average=None, zero_division=0)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, **per_class)
    precision = precision_score(y_true=labels, y_pred=pred, **per_class)
    f1 = f1_score(y_true=labels, y_pred=pred, **per_class)

    return {"accuracy": accuracy,
            "precision_0": precision[0],
            "precision_1": precision[1],
            "recall_0": recall[0],
            "recall_1": recall[1],
            "f1_0": f1[0],
            "f1_1": f1[1]}

# Training configuration: evaluate and checkpoint once per epoch, keep a single
# checkpoint on disk and reload the best one when training finishes.
args = TrainingArguments(
    output_dir="output",  # relative to the current working directory
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    seed=0,
    load_best_model_at_end=True,
    save_total_limit = 1,
    learning_rate=1e-05,
    save_strategy="epoch"  # same cadence as evaluation_strategy
)
# NOTE(review): eval_dataset is the test split, so the "best" checkpoint picked by
# load_best_model_at_end is selected on test data — consider a separate dev split.
# NOTE(review): early_stopping_patience=10 is larger than the 5 per-epoch
# evaluations of this run, so early stopping presumably never triggers — confirm intent.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Run fine-tuning with the configuration above, then persist the trainer's state.
trainer.train()
trainer.save_state()

***** Running training *****
  Num examples = 6902
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4315


Epoch,Training Loss,Validation Loss,Accuracy,Precision 0,Precision 1,Recall 0,Recall 1,F1 0,F1 1
1,0.384,0.291977,0.921906,0.851163,0.933642,0.690566,0.977383,0.7625,0.955012
2,0.1732,0.261098,0.931833,0.794326,0.963385,0.845283,0.956381,0.819013,0.95987
3,0.1098,0.361276,0.936466,0.826415,0.959872,0.826415,0.966074,0.826415,0.962963
4,0.0751,0.395383,0.936466,0.82397,0.960611,0.830189,0.965267,0.827068,0.962933
5,0.0566,0.404363,0.936466,0.819188,0.962097,0.837736,0.963651,0.828358,0.962873


***** Running Evaluation *****
  Num examples = 1511
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/prateek_models/output/checkpoint-863
Configuration saved in /content/drive/MyDrive/Colab Notebooks/prateek_models/output/checkpoint-863/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/prateek_models/output/checkpoint-863/pytorch_model.bin
Deleting older checkpoint [/content/drive/MyDrive/Colab Notebooks/prateek_models/output/checkpoint-4315] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1511
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/prateek_models/output/checkpoint-1726
Configuration saved in /content/drive/MyDrive/Colab Notebooks/prateek_models/output/checkpoint-1726/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/prateek