In [None]:
# Use the %pip magic so the packages are installed into the running kernel's environment.
%pip install transformers datasets evaluate

In [None]:
from datasets import load_dataset

In [None]:
# sentencepiece is installed here for the DeBERTa-v3 tokenizer loaded in the next cell; %pip targets the running kernel.
%pip install sentencepiece

In [None]:
from transformers import AutoTokenizer

# Sentiment dataset used for the sequence-classification part of the notebook.
text_classification_dataset = load_dataset("rotten_tomatoes")

# The Hub checkpoint id is hyphenated ("deberta-v3-base"); the previous
# "deberta_v3-base" spelling does not resolve to a repository.
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
def preprocessing_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook-level ``tokenizer``.

    Truncation is enabled; whatever the tokenizer returns is passed back unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
from transformers import DataCollatorWithPadding

# Collator that builds batches from tokenized examples, padding them with the given tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate

# Accuracy metric object loaded through the `evaluate` library.
accuracy = evaluate.load(path="accuracy")

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy for a sequence-classification evaluation pass.

    Args:
        eval_pred: a ``(predictions, labels)`` pair. ``predictions`` holds one
            row of per-class scores per example (argmax is taken over axis 1).

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids.
    """
    predictions, labels = eval_pred
    # Turn per-class scores into the index of the highest-scoring class.
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [None]:
# Tokenize every split in batches; the helper defined earlier is named `preprocessing_function`.
tokenized_classification = text_classification_dataset.map(preprocessing_function, batched=True)

In [None]:
# Show which tokens ids 1 and 2 map to (printed explicitly: only the last
# expression of a cell is displayed automatically).
print(tokenizer.convert_ids_to_tokens([1, 2]))

# Preview the encoded ids of the first few training examples instead of dumping the whole column.
tokenized_classification["train"][:3]["input_ids"]

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# rotten_tomatoes keeps its target in the "label" column (a ClassLabel
# feature), so the class names and count are read from the dataset schema.
class_names = text_classification_dataset["train"].features["label"].names

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(class_names),
    # Store readable names so model.config.id2label returns them at inference time.
    id2label=dict(enumerate(class_names)),
    label2id={name: idx for idx, name in enumerate(class_names)},
)

# Hyper-parameters for fine-tuning the sequence-classification model.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="my_deberta_review_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_classification["train"],
    # training_args asks for an evaluation every epoch and for the best model
    # to be reloaded at the end, both of which need an evaluation split.
    eval_dataset=tokenized_classification["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
#script to test the model

In [None]:
import torch

text = "the movie was no terrible. i really hate it"
# Put the inputs on whichever device the model lives on instead of assuming a GPU.
inputs = tokenizer(text, return_tensors="pt").to(model.device)

model.eval()  # make sure dropout is disabled for inference
with torch.no_grad():
    logits = model(**inputs).logits

# Printed explicitly: only the last expression of a cell is displayed automatically.
print(logits.shape)
print(logits)

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

In [None]:
from torch import nn


def init_normal(m):
    """Re-initialise the weight of an ``nn.Linear`` module with Xavier-normal values.

    Modules of any other type are left untouched, which makes the function
    suitable for ``Module.apply``.

    Args:
        m: the module visited by ``apply``.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)


# Use the module's apply function to recursively apply the initialization.
# NOTE: this overwrites the weight of every nn.Linear inside `model`,
# including pretrained / fine-tuned ones.
model.apply(init_normal)

## Token classification task

In [None]:

token_classification_dataset = load_dataset("conll2003")

target_col = "ner_tags"  # or "pos_tags"
# Human-readable tag names, indexed by the integer ids stored in `target_col`.
label_list = token_classification_dataset["train"].features[target_col].feature.names

# Sanity-check sub-word tokenization on a single pre-split sentence rather
# than encoding the whole training split just to look at it.
sample_tokens = token_classification_dataset["train"][0]["tokens"]
tokenized_input = tokenizer(sample_tokens, is_split_into_words=True)
tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])


In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align the word-level tags to the resulting tokens.

    Args:
        examples: a batch with a ``"tokens"`` entry (lists of words) and a
            ``target_col`` entry (one tag id per word).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. For every
        example, the first token of each word carries that word's tag id;
        special tokens and the remaining tokens of a word are set to -100.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[target_col]):
        # Map each token of example i back to the index of the word it came from.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens have no source word.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # Only the first token of a given word is labelled.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
from transformers import DataCollatorForTokenClassification

# Tokenize every split in batches and attach the aligned labels.
tokenized_inputs = token_classification_dataset.map(tokenize_and_align_labels, batched=True)
# Collator for token-classification batches, built from the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# A bare `pip install` only works as the sole line of a cell; the explicit
# %pip magic is required when other code follows it.
%pip install seqeval
import evaluate

seqeval = evaluate.load("seqeval")

In [None]:
import numpy as np

labels = label_list


def compute_metrics(p):
    """Compute seqeval precision / recall / F1 / accuracy for token classification.

    Args:
        p: a ``(predictions, labels)`` pair. ``predictions`` holds per-tag
            scores for every token of every example; ``labels`` holds the tag
            ids, with -100 marking positions to ignore.

    Returns:
        A dict with the overall ``precision``, ``recall``, ``f1`` and ``accuracy``.
    """
    predictions, labels = p
    # Scores are laid out as (example, token, tag), so the tag axis is the last one.
    predictions = np.argmax(predictions, axis=2)

    # Drop ignored positions (-100) and convert ids to tag names.
    true_predictions = [
        [label_list[pred] for pred, lab in zip(prediction, label) if lab != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[lab] for lab in label if lab != -100]
        for label in labels
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Lookup tables between integer tag ids and tag names.
id2label = dict(enumerate(label_list))
label2id = {label: num for num, label in id2label.items()}

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Load the checkpoint with a token-classification head sized to `label_list`,
# passing the id <-> label lookup tables through to the model.
token_head_kwargs = dict(
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForTokenClassification.from_pretrained(model_name, **token_head_kwargs)

In [None]:
# Hyper-parameters for fine-tuning the token-classification model.
# NOTE(review): output_dir is the same directory used for the
# sequence-classification run earlier in the notebook - confirm that sharing
# it is intended. Newer transformers releases also spell
# `evaluation_strategy` as `eval_strategy`.
training_args = TrainingArguments(
    output_dir="my_deberta_review_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    # Train on the CoNLL data with aligned token labels, not on the
    # sentiment dataset tokenized earlier in the notebook.
    train_dataset=tokenized_inputs["train"],
    # Needed for the per-epoch evaluation requested in training_args.
    eval_dataset=tokenized_inputs["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

## Script to try the model on a sample sentence

In [None]:
import torch

text = "some input I want my model to part of speech tag"
# Put the inputs on whichever device the model lives on instead of assuming a GPU.
inputs = tokenizer(text, return_tensors="pt").to(model.device)

model.eval()  # make sure dropout is disabled for inference
with torch.no_grad():
    logits = model(**inputs).logits

# Highest-scoring tag id per token (the tag axis is dim 2), then mapped to its name.
predictions = torch.argmax(logits, dim=2)
predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]

for token, pred_class in zip(inputs.tokens(), predicted_token_class):
    print(token, pred_class)

logits.shape