In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# pip install -q transformers

In [3]:
dataset = pd.read_csv("/kaggle/input/postagger/POS.csv")

In [4]:
import datasets
datasets.__version__

'4.1.1'

In [5]:
dataset.head()

Unnamed: 0,sentence_id,words,labels
0,0,﻿६१,CD
1,0,वर्षीय,JJ
2,0,पियरे,NNP
3,0,भिन्केन,NNP
4,0,नोभेम्बर,NNP


In [6]:
label_list = dataset['labels'].unique().tolist()
print(len(label_list))

39


In [7]:
label2ids = {label:i for i,label in enumerate(label_list)}
ids2label = {values:keys for keys,values in label2ids.items()}

In [8]:
print(label2ids)
print(ids2label)

{'CD': 0, 'JJ': 1, 'NNP': 2, 'POP': 3, 'NN': 4, 'PKO': 5, 'VBX': 6, 'YF': 7, 'FB': 8, 'VBF': 9, 'PLAI': 10, 'DUM': 11, 'VBKO': 12, 'RBO': 13, 'VBI': 14, 'VBO': 15, 'HRU': 16, 'JJD': 17, 'YM': 18, 'PLE': 19, 'JJM': 20, 'RP': 21, 'VBNE': 22, 'CS': 23, 'YQ': 24, 'CL': 25, 'PP': 26, 'PP$': 27, 'CC': 28, 'SYM': 29, 'PPR': 30, 'DM': 31, 'OD': 32, 'QW': 33, 'UNW': 34, 'RBM': 35, 'FW': 36, 'YB': 37, 'ALPH': 38}
{0: 'CD', 1: 'JJ', 2: 'NNP', 3: 'POP', 4: 'NN', 5: 'PKO', 6: 'VBX', 7: 'YF', 8: 'FB', 9: 'VBF', 10: 'PLAI', 11: 'DUM', 12: 'VBKO', 13: 'RBO', 14: 'VBI', 15: 'VBO', 16: 'HRU', 17: 'JJD', 18: 'YM', 19: 'PLE', 20: 'JJM', 21: 'RP', 22: 'VBNE', 23: 'CS', 24: 'YQ', 25: 'CL', 26: 'PP', 27: 'PP$', 28: 'CC', 29: 'SYM', 30: 'PPR', 31: 'DM', 32: 'OD', 33: 'QW', 34: 'UNW', 35: 'RBM', 36: 'FW', 37: 'YB', 38: 'ALPH'}


In [9]:
dataset.isna().sum()

sentence_id    0
words          2
labels         0
dtype: int64

In [10]:
index = dataset[dataset["words"].isna()].index.tolist()
print(index)

[61652, 73919]


In [11]:
dataset = dataset.drop(index=index)

In [12]:
print(len(dataset["sentence_id"].unique()))

4251


In [13]:
grouped = dataset.groupby("sentence_id")
print(len(grouped["words"]))

4251


In [14]:
df_grouped = grouped.agg({
    "words":lambda x:list(x),
    "labels":lambda x:list(x)
})

In [15]:
df_grouped.columns

Index(['words', 'labels'], dtype='object')

In [16]:
train,test = train_test_split(
    df_grouped,
    test_size = 0.2,
    shuffle = True
)

In [17]:
train.reset_index(drop=True,inplace=True)
test.reset_index(drop=True,inplace=True)

In [18]:
train.head()

Unnamed: 0,words,labels
0,"[सर्वसाधारण, मानिस, हरू, बजार, मा, खेल, खेल्न,...","[JJ, NN, HRU, NN, POP, NN, VBI, VBF, YM, PP, H..."
1,"[कारखाना, दुर्घटना, र, अन्य, अनपेक्षित, खर्च, ...","[NN, NN, CC, DUM, JJ, NN, POP, JJ, NNP, NN, PL..."
2,"[कम्पनी, ले, एक, पटक, को, दायित्व, आकलन, ले, ए...","[NN, PLE, CD, RBO, PKO, NN, NN, PLE, NN, PKO, ..."
3,"[पेन्सिल्भानिया, बैंक, ले, युनिभेस्ट, कर्पोरेस...","[NNP, NN, PLE, NNP, NN, POP, PKO, CD, POP, JJ,..."
4,"[यस, ले, जुलाई, मा, साइता, मा, उच्च, अधिकृत, क...","[DUM, PLE, NNP, POP, NN, POP, JJD, NN, PKO, NN..."


In [19]:
test.head()

Unnamed: 0,words,labels
0,"[काङ्‍ग्रेस, र, बुस, ले, तर्जुमा, गरेको, न्युन...","[NN, CC, NNP, PLE, NN, VBKO, JJD, NN, PKO, NN,..."
1,"[दिउँसो, अबेर, को, न्युयोर्क, कारोबार, मा, सो,...","[NN, RBO, PKO, NNP, NN, POP, DUM, NN, CD, NNP,..."
2,"[``, हामी, यो, योजना, प्रस्ताव, गरिरहेका, छौँ,...","[YQ, PP, DUM, NN, NN, VBKO, VBF, CS, PP, PLAI,..."
3,"[यस, को, अतिरिक्त, ,, उल्लेखनीय, रूप, मा, कम, ...","[DUM, PKO, JJ, YM, JJ, NN, POP, JJ, NNP, NN, N..."
4,"[कम्पनी, ले, समय-अगावै, को, अवकाश, कार्यक्रम, ...","[NN, PLE, NN, PKO, NN, NN, PKO, POP, NN, NN, N..."


In [20]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Shushant/nepaliBERT")


In [21]:
print(train['words'].dtype)


object


In [22]:
print(type(train["words"]))

<class 'pandas.core.series.Series'>


In [23]:
tokenized_input = tokenizer(train.iloc[0]["words"],is_split_into_words=True)

In [24]:
print(tokenized_input.keys())

KeysView({'input_ids': [2, 1807, 23792, 1005, 2646, 2555, 2699, 2435, 1723, 9795, 9795, 1030, 12561, 1030, 31, 4625, 2699, 2218, 1010, 1701, 1774, 10023, 10831, 3127, 17200, 1723, 2013, 2335, 2522, 1030, 16, 605, 1791, 6114, 5844, 2047, 14774, 1005, 13957, 23792, 1718, 2377, 2918, 1028, 7835, 1030, 7254, 16, 2630, 1010, 2054, 2335, 14609, 23823, 1701, 1799, 2408, 1047, 362, 18, 7835, 1030, 7254, 27195, 428, 4], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})


In [25]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])

In [26]:
word_ids = tokenized_input.word_ids(batch_index=0) 
print(word_ids)

[None, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 6, 7, 7, 8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 18, 19, 19, 19, 20, 21, 22, 23, 23, 23, 24, 24, 25, 25, 25, 25, 26, 26, 27, 27, 28, 29, 30, 30, 30, 31, 31, 31, 32, 33, 33, 33, 34, 34, 35, 35, 36, 37, 38, None]


In [27]:
print(tokens)

['[CLS]', 'सर', '##वस', '##ा', '##धारण', 'मानिस', 'हर', 'बजार', 'मा', 'खल', 'खल', '##न', 'आएन', '##न', ';', 'तिनी', 'हर', 'तय', '##स', 'का', 'लागि', 'चोर', '##बाटो', 'बाट', 'होड', 'मा', 'जान', 'सक', '##दछ', '##न', ',', '“', 'एक', 'परम', '##परा', '##गत', 'मदर', '##ा', 'वय', '##वस', '##था', '##पन', 'फर', '##म', 'बरि', '##न', 'मर', ',', 'फो', '##स', '##टर', 'सक', '##यरि', '##टिज', 'का', 'अध', '##यक', '##ष', 'ए', '.', 'बरि', '##न', 'मर', 'बताउछन', '।', '[SEP]']


In [28]:
from datasets import Dataset
dataset_train = Dataset.from_pandas(train)
dataset_test = Dataset.from_pandas(test)

In [29]:
print(type(dataset_train['words']))              # datasets.arrow_dataset.Column
print(type(dataset_train['words'][0]))           # list
print(type(dataset_train['words'][0][0]))        # str
print(type(dataset_train['labels'][0]))          # list
print(type(dataset_train['labels'][0][0]))       # str or int depending on mapping


<class 'datasets.arrow_dataset.Column'>
<class 'list'>
<class 'str'>
<class 'list'>
<class 'str'>


In [30]:
tokenized_input = tokenizer(dataset_train["words"][:100],is_split_into_words=True)
tokenized_input = tokenizer(dataset_test["words"][:100],is_split_into_words=True)

In [31]:
# print(dataset_train['words'][0])
# for i,data in enumerate(dataset_train['labels'][0]):
#     print(f"{i}:{data}")
#     print(label2ids[data])
    
 

In [57]:
def tokenize_and_align_labels(examples):
    tokenize_inputs = tokenizer(examples['words'],truncation=True,max_length = 512,is_split_into_words=True)
    labels = []

    for i,label in enumerate(examples['labels']):
        word_ids = tokenize_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2ids[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenize_inputs['labels'] = labels
    return tokenize_inputs
                
        
    

In [73]:

tokenized_inputs = dataset_train.map(tokenize_and_align_labels,batched=True,batch_size=100)
tokenized_tests = dataset_test.map(tokenize_and_align_labels,batched=True,batch_size=100)

Map:   0%|          | 0/3400 [00:00<?, ? examples/s]

Map:   0%|          | 0/851 [00:00<?, ? examples/s]

In [74]:
from transformers import DataCollatorForTokenClassification

datacollator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [75]:
# !pip install -q evaluate
# !pip install -q seqeval


In [76]:
import evaluate

seqeval = evaluate.load("seqeval")

Now creating a function that passes predicition and true label to compute metrics


In [77]:
import numpy as np
def compute_metrics(p):
    predictions, labels = p    
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [ ids2label[p] for (p,l) in zip(prediction, label) if l != -100 ]
        for prediction,label in zip(predictions,labels) 
    ]

    true_label = [
        [ ids2label[l] for (p,l) in zip(prediction, label) if l != -100 ]
        for prediction,label in zip(predictions,labels) 
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_label)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [78]:
import torch
def cleaner(tokenized_inputs):
    tokenized_inputs = tokenized_inputs.remove_columns(['words'])
    tokenized_inputs.set_format("torch")
    return tokenized_inputs

tokenized_inputs = cleaner(tokenized_inputs)
tokenized_tests = cleaner(tokenized_tests)

In [79]:
tokenized_tests.features


{'labels': List(Value('int64')),
 'input_ids': List(Value('int32')),
 'token_type_ids': List(Value('int8')),
 'attention_mask': List(Value('int8'))}

In [80]:
tokenized_inputs.features

{'labels': List(Value('int64')),
 'input_ids': List(Value('int32')),
 'token_type_ids': List(Value('int8')),
 'attention_mask': List(Value('int8'))}

In [81]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_inputs, shuffle=True, batch_size=32, collate_fn=datacollator)
test_dataloader = DataLoader(tokenized_tests, batch_size=32, collate_fn=datacollator)



In [82]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}


{'input_ids': torch.Size([32, 113]),
 'token_type_ids': torch.Size([32, 113]),
 'attention_mask': torch.Size([32, 113]),
 'labels': torch.Size([32, 113])}

In [83]:
from transformers import AutoModelForTokenClassification
NUM_LABELS = len(label2ids)
model = AutoModelForTokenClassification.from_pretrained(
    "Shushant/nepaliBERT",
    num_labels=NUM_LABELS
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at Shushant/nepaliBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [84]:
output = model(**batch)
print(output.loss,output.logits.shape)


tensor(3.8433, grad_fn=<NllLossBackward0>) torch.Size([32, 113, 39])


In [85]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(),lr=5e-5)

In [86]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [87]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(),lr=5e-5)

In [88]:
from transformers import get_scheduler
epochs = 20
num_training_steps = epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps
)

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))
best_f1 = 0

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        output = model(**batch)
        loss = output.loss
        loss.backward()
        total_loss += loss.item()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    avg_loss = total_loss / len(train_dataloader)
    print(f"Training Loss: {avg_loss:.4f}")

    # ----------------- Evaluation -----------------
    overall_results = {"precision": 0, "recall": 0, "f1": 0, "accuracy": 0, "count": 0}
    
    model.eval()
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        model_inputs = {k: v for k, v in batch.items() if k in ["input_ids", "attention_mask", "labels"]}
        with torch.no_grad():
            output = model(**model_inputs)

        logits = output.logits.detach().cpu().numpy()
        labels = batch["labels"].detach().cpu().numpy()
        batch_size = labels.shape[0]

        batch_results = compute_metrics((logits, labels))

        # update weighted average
        overall_results["precision"] += batch_results["precision"] * batch_size
        overall_results["recall"] += batch_results["recall"] * batch_size
        overall_results["f1"] += batch_results["f1"] * batch_size
        overall_results["accuracy"] += batch_results["accuracy"] * batch_size
        overall_results["count"] += batch_size

    # final average
    final_results = {k: v / overall_results["count"] for k, v in overall_results.items() if k != "count"}
    print(f"F1: {final_results['f1']:.4f} | Accuracy: {final_results['accuracy']:.4f}")
    
    # save best model
    if final_results['f1'] > best_f1:
        torch.save(model.state_dict(), "bestmodel.pt")
        best_f1 = final_results['f1']
        print(f"✅ New best model saved with F1: {best_f1:.4f}")


  0%|          | 0/2140 [00:00<?, ?it/s]

Training Loss: 0.1343


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1: 0.9367 | Accuracy: 0.9500
✅ New best model saved with F1: 0.9367
Training Loss: 0.1029
F1: 0.9366 | Accuracy: 0.9503
Training Loss: 0.0819
F1: 0.9375 | Accuracy: 0.9506
✅ New best model saved with F1: 0.9375
Training Loss: 0.0672
F1: 0.9376 | Accuracy: 0.9510
✅ New best model saved with F1: 0.9376
