In [1]:
!pip install transformers
!pip install huggingface_hub
!pip install datasets
!pip install sentencepiece
!pip install accelerate
!pip install wandb
!pip install nlp
!pip install evaluate
!pip install seqeval

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m61.5 MB/s[0m eta [36m0:00:0

In [2]:
import torch
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer

In [3]:
import nlp

In [4]:
data = load_dataset('lince','lid_hineng')

Downloading builder script:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/31.2k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/433k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4823 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/744 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1854 [00:00<?, ? examples/s]

In [5]:
for split in data:
  print(split, len(data[split]))

train 4823
validation 744
test 1854


In [6]:
from transformers import XLMRobertaForTokenClassification

In [7]:
data['train'][2]

{'idx': 2,
 'words': ['Angry', 'young', 'men', 'sir', 'ji'],
 'lid': ['lang1', 'lang1', 'lang1', 'lang1', 'lang2']}

In [8]:
def preprocess_function(examples):
  ids = {'lang1':0,'lang2':1,'other':2,'ne':3,
         'fw':4,'mixed':5,'ambiguous':6,'unk':7,'':8}
  lid_ids = []
  for id in examples['lid']:
    lid_ids.append(ids[id])
  examples['lid_ids'] = lid_ids
  return examples

In [9]:
processed_data = data.map(
    preprocess_function
    # batched=True,
    # remove_columns=data["train"].column_names,
    # num_proc=4
)

Map:   0%|          | 0/4823 [00:00<?, ? examples/s]

Map:   0%|          | 0/744 [00:00<?, ? examples/s]

Map:   0%|          | 0/1854 [00:00<?, ? examples/s]

In [10]:
processed_data['validation'][0]

{'idx': 0,
 'words': ['@ZahirJ',
  '@BinyavangaW',
  'Loved',
  'the',
  'ending',
  '!',
  'I',
  'could',
  'have',
  'offered',
  'you',
  'some',
  'ironic',
  'chai-tea',
  'for',
  'it',
  ';)'],
 'lid': ['other',
  'other',
  'lang1',
  'lang1',
  'lang1',
  'other',
  'lang1',
  'lang1',
  'lang1',
  'lang1',
  'lang1',
  'lang1',
  'lang1',
  'mixed',
  'lang1',
  'lang1',
  'other'],
 'lid_ids': [2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 2]}

In [11]:
processed_data['train'][0]

{'idx': 0,
 'words': ['Good', 'vibe', 'tribe', '.'],
 'lid': ['lang1', 'lang1', 'lang1', 'other'],
 'lid_ids': [0, 0, 0, 2]}

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("likhithasapu/gcm-xlmr-v2")

Downloading (…)okenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

In [13]:
example = processed_data["train"][0]

tokenized_input = tokenizer(example["words"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokenized_input,tokens

({'input_ids': [0, 18621, 279, 372, 1927, 372, 6, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]},
 ['<s>', '▁Good', '▁vi', 'be', '▁tri', 'be', '▁', '.', '</s>'])

In [14]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"lid_ids"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)



    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [15]:
tokenized_datasets = processed_data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=processed_data["train"].column_names,
)

Map:   0%|          | 0/4823 [00:00<?, ? examples/s]

Map:   0%|          | 0/744 [00:00<?, ? examples/s]

Map:   0%|          | 0/1854 [00:00<?, ? examples/s]

In [16]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [17]:
data_collator

DataCollatorForTokenClassification(tokenizer=XLMRobertaTokenizerFast(name_or_path='likhithasapu/gcm-xlmr-v2', vocab_size=250002, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '</s>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')

In [18]:
import evaluate

seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [19]:
example

{'idx': 0,
 'words': ['Good', 'vibe', 'tribe', '.'],
 'lid': ['lang1', 'lang1', 'lang1', 'other'],
 'lid_ids': [0, 0, 0, 2]}

In [20]:
import numpy as np
label_names = [
    "lang1",
    "lang2",
    "other",
    "ne",
    "fw",
    "mixed",
    "ambiguous",
    "unk",
    "",
]

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [21]:
id2label = {
    0: "lang1",
    1: "lang2",
    2: "other",
    3: "ne",
    4: "fw",
    5: "mixed",
    6: "ambiguous",
    7: "unk",
    8: "",
}
label2id = {
    "lang1":0,
    "lang2":1,
    "other":2,
    "ne":3,
    "fw":4,
    "mixed":5,
    "ambiguous":6,
    "unk":7,
    "":8,
}

In [22]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "likhithasapu/gcm-xlmr-v2", num_labels=9, id2label=id2label, label2id=label2id
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at likhithasapu/gcm-xlmr-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
from huggingface_hub import login
# from huggingface_hub import notebook_login
# notebook_login()
login("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")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [24]:
training_args = TrainingArguments(
    output_dir="gcm-xlmr-lid",
    learning_rate=2e-5,
    num_train_epochs=20,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [25]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.2719,0.116816
2,0.116,0.108055
3,0.0977,0.108249
4,0.0669,0.116485
5,0.0379,0.120706
6,0.0281,0.138476
7,0.0224,0.141025
8,0.0192,0.147487
9,0.0136,0.167478
10,0.0088,0.173012


TrainOutput(global_step=12060, training_loss=0.03275938761508584, metrics={'train_runtime': 3318.0128, 'train_samples_per_second': 29.072, 'train_steps_per_second': 3.635, 'total_flos': 3867360694766226.0, 'train_loss': 0.03275938761508584, 'epoch': 20.0})

In [26]:
tokenizer.push_to_hub("gcm-xlmr-lid")
model.push_to_hub("gcm-xlmr-lid")

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/likhithasapu/gcm-xlmr-lid/commit/775efcdbc99e610e4bad96ee8c4cdd0e5b0f291d', commit_message='Upload XLMRobertaForTokenClassification', commit_description='', oid='775efcdbc99e610e4bad96ee8c4cdd0e5b0f291d', pr_url=None, pr_revision=None, pr_num=None)

In [27]:
tokenized_datasets["train"]['input_ids'][0]

[0, 18621, 279, 372, 1927, 372, 6, 5, 2]

In [28]:
tokenized_datasets["train"]['labels'][0]

[-100, 0, 0, -100, 0, -100, 2, -100, -100]