In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [37]:
pip install -q transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [38]:
# Load the token-level POS corpus (columns shown below: sentence_id, words, labels).
# NOTE(review): pandas' default NA parsing converts literal tokens such as "NA" or
# "null" into NaN. TODO confirm whether the missing `words` values reported further
# down are real gaps or such tokens (keep_default_na=False would preserve them).
dataset = pd.read_csv("/kaggle/input/postagger/POS.csv")

In [39]:
import datasets
datasets.__version__

'4.1.1'

In [40]:
dataset.head()

Unnamed: 0,sentence_id,words,labels
0,0,﻿६१,CD
1,0,वर्षीय,JJ
2,0,पियरे,NNP
3,0,भिन्केन,NNP
4,0,नोभेम्बर,NNP


In [41]:
label_list = dataset['labels'].unique().tolist()
print(len(label_list))

39


In [42]:
# Bidirectional lookup tables between POS tag strings and integer class ids.
# Ids follow the order in which the tags appear in `label_list`.
label2ids = dict(zip(label_list, range(len(label_list))))
ids2label = {class_id: tag for tag, class_id in label2ids.items()}

In [43]:
print(label2ids)
print(ids2label)

{'CD': 0, 'JJ': 1, 'NNP': 2, 'POP': 3, 'NN': 4, 'PKO': 5, 'VBX': 6, 'YF': 7, 'FB': 8, 'VBF': 9, 'PLAI': 10, 'DUM': 11, 'VBKO': 12, 'RBO': 13, 'VBI': 14, 'VBO': 15, 'HRU': 16, 'JJD': 17, 'YM': 18, 'PLE': 19, 'JJM': 20, 'RP': 21, 'VBNE': 22, 'CS': 23, 'YQ': 24, 'CL': 25, 'PP': 26, 'PP$': 27, 'CC': 28, 'SYM': 29, 'PPR': 30, 'DM': 31, 'OD': 32, 'QW': 33, 'UNW': 34, 'RBM': 35, 'FW': 36, 'YB': 37, 'ALPH': 38}
{0: 'CD', 1: 'JJ', 2: 'NNP', 3: 'POP', 4: 'NN', 5: 'PKO', 6: 'VBX', 7: 'YF', 8: 'FB', 9: 'VBF', 10: 'PLAI', 11: 'DUM', 12: 'VBKO', 13: 'RBO', 14: 'VBI', 15: 'VBO', 16: 'HRU', 17: 'JJD', 18: 'YM', 19: 'PLE', 20: 'JJM', 21: 'RP', 22: 'VBNE', 23: 'CS', 24: 'YQ', 25: 'CL', 26: 'PP', 27: 'PP$', 28: 'CC', 29: 'SYM', 30: 'PPR', 31: 'DM', 32: 'OD', 33: 'QW', 34: 'UNW', 35: 'RBM', 36: 'FW', 37: 'YB', 38: 'ALPH'}


In [44]:
dataset.isna().sum()

sentence_id    0
words          2
labels         0
dtype: int64

In [45]:
index = dataset[dataset["words"].isna()].index.tolist()
print(index)

[61652, 73919]


In [46]:
# Remove the rows whose token is missing. dropna() selects the same rows as the
# `words`.isna() mask, but it does not rely on the `index` list built in another
# cell and is a harmless no-op when this cell is executed again.
dataset = dataset.dropna(subset=["words"])

In [47]:
print(len(dataset["sentence_id"].unique()))

4251


In [48]:
grouped = dataset.groupby("sentence_id")
print(len(grouped["words"]))

4251


In [49]:
# Collapse the token-level rows into one row per sentence_id: each cell of
# `words` / `labels` becomes the list of that sentence's tokens / tags.
df_grouped = grouped.agg({column: list for column in ("words", "labels")})

In [50]:
df_grouped.columns

Index(['words', 'labels'], dtype='object')

In [51]:
# Hold out 20% of the sentences for evaluation.
# A fixed random_state makes the shuffled split identical on every run, so the
# train/test membership (and every metric computed from it) is reproducible.
train, test = train_test_split(
    df_grouped,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [52]:
# Replace the shuffled sentence_id index of the training frame with 0..n-1.
train = train.reset_index(drop=True)

In [53]:
train.head()

Unnamed: 0,words,labels
0,"[पेशागत, सुरक्षा, र, स्वास्थ्य, प्रशासन, का, प...","[JJ, NN, CC, NN, NN, PKO, JJ, NNP, NNP, PLE, V..."
1,"[टाइनी, टोट्स, इन्कर्पोरेसन, ,, क्याम्पबेल, ,,...","[NNP, NNP, NN, YM, NNP, YM, NNP, JJ, NN, PKO, ..."
2,"[गत, वर्ष, अभिग्रहण, गरेको, क्राउन, प्रकाशन, स...","[JJ, NN, NN, VBKO, NNP, NN, NN, JJ, NN, POP, J..."
3,"[प्याकेजिङ, का, केही, बेफाइदा, छन्, ।]","[NN, PKO, DUM, NN, VBX, YF]"
4,"[किन, यी, विशाल, काल्पनिक, बिक्रेता, न्युयोर्क...","[QW, DUM, JJ, JJ, NN, NNP, PKO, NN, NNP, POP, ..."


In [54]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Shushant/nepaliBERT")


In [55]:
print(train['words'].dtype)


object


In [56]:
print(type(train["words"]))

<class 'pandas.core.series.Series'>


In [57]:
tokenized_input = tokenizer(train.iloc[0]["words"],is_split_into_words=True)

In [58]:
print(tokenized_input.keys())

KeysView({'input_ids': [2, 388, 2025, 2047, 1807, 1034, 2592, 394, 16210, 1010, 1041, 1022, 1996, 5501, 1701, 6114, 1053, 3679, 1007, 1051, 2335, 6295, 1024, 396, 8198, 1030, 12514, 16, 393, 3076, 7517, 1010, 1996, 8915, 1064, 1034, 2699, 3236, 7035, 1047, 1063, 384, 26122, 10431, 1723, 1807, 1034, 2592, 1803, 16210, 1010, 1041, 1022, 1733, 8915, 2700, 4555, 6909, 2699, 25490, 3056, 373, 16, 68, 68, 10765, 5982, 3236, 4892, 1030, 2162, 12316, 1034, 6340, 3172, 1030, 12165, 14050, 1955, 2650, 428, 601, 601, 4], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [59]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])

In [60]:
word_ids = tokenized_input.word_ids(batch_index=0) 
print(word_ids)

[None, 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 3, 4, 4, 5, 6, 6, 7, 7, 7, 8, 8, 8, 9, 10, 10, 10, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 15, 16, 16, 16, 17, 17, 18, 19, 20, 20, 20, 21, 22, 22, 22, 22, 23, 23, 23, 24, 25, 26, 27, 28, 29, 30, 31, 31, 32, 33, 34, 35, 35, 36, 36, 36, 37, 38, 38, 39, 39, 40, 41, 42, 43, 43, None]


In [61]:
print(tokens)

['[CLS]', 'प', '##शा', '##गत', 'सर', '##क', '##षा', 'र', 'सवा', '##स', '##थ', '##य', 'पर', '##शासन', 'का', 'परम', '##ख', 'गरा', '##र', '##ड', 'सक', '##यान', '##ल', 'ल', 'भन', '##न', '##भयो', ',', 'य', '##एस', '##एक', '##स', 'पर', '##बन', '##ध', '##क', 'हर', 'लाई', 'वर', '##ष', '##ौ', 'द', '##खि', 'कारखाना', 'मा', 'सर', '##क', '##षा', 'तथा', 'सवा', '##स', '##थ', '##य', 'सम', '##बन', '##धी', 'धर', 'कमजोरी', 'हर', 'रहको', 'थाहा', 'छ', ',', '`', '`', 'तथापि', 'खतरा', 'लाई', 'रोक', '##न', 'आव', '##शय', '##क', 'कदम', 'चाल', '##न', 'असम', '##रथ', 'भएका', 'छन', '।', '‘', '‘', '[SEP]']


In [62]:
from datasets import Dataset
# Convert both splits to Hugging Face datasets without carrying the pandas index.
# `test` still has the shuffled `sentence_id` index (only `train` was reset), and
# from_pandas would otherwise store it as an extra `sentence_id` column in the
# test dataset only - a column that is not a model input.
dataset_train = Dataset.from_pandas(train, preserve_index=False)
dataset_test = Dataset.from_pandas(test, preserve_index=False)

In [63]:
print(type(dataset_train['words']))              # datasets.arrow_dataset.Column
print(type(dataset_train['words'][0]))           # list
print(type(dataset_train['words'][0][0]))        # str
print(type(dataset_train['labels'][0]))          # list
print(type(dataset_train['labels'][0][0]))       # str or int depending on mapping


<class 'datasets.arrow_dataset.Column'>
<class 'list'>
<class 'str'>
<class 'list'>
<class 'str'>


In [64]:
tokenized_input = tokenizer(dataset_train["words"][:100],is_split_into_words=True)
tokenized_input = tokenizer(dataset_test["words"][:100],is_split_into_words=True)

In [65]:
# print(dataset_train['words'][0])
# for i,data in enumerate(dataset_train['labels'][0]):
#     print(f"{i}:{data}")
#     print(label2ids[data])
    
 

In [66]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align POS labels to sub-tokens.

    Args:
        examples: batch mapping with a 'words' entry (one list of words per
            sentence) and a 'labels' entry (one list of tag strings per sentence).

    Returns:
        The tokenizer output (truncated to at most 512 tokens per sentence) with
        an added 'labels' entry: for every sentence, one integer per token.
        The first sub-token of each word gets the word's id from `label2ids`;
        special tokens (word id None) and continuation sub-tokens get -100.
    """
    encoded = tokenizer(
        examples['words'][:],
        truncation=True,
        max_length=512,
        is_split_into_words=True,
    )

    aligned_labels = []
    for sentence_idx, sentence_tags in enumerate(examples['labels'][:]):
        token_labels = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=sentence_idx):
            # Only the first sub-token of a real word carries the word's tag.
            if current_word is None or current_word == last_word:
                token_labels.append(-100)
            else:
                token_labels.append(label2ids[sentence_tags[current_word]])
            last_word = current_word
        aligned_labels.append(token_labels)

    encoded['labels'] = aligned_labels
    return encoded
                
        
    

In [67]:

tokenized_inputs = dataset_train.map(tokenize_and_align_labels,batched=True,batch_size=100)
tokenized_tests = dataset_test.map(tokenize_and_align_labels,batched=True,batch_size=100)

Map:   0%|          | 0/3400 [00:00<?, ? examples/s]

Map:   0%|          | 0/851 [00:00<?, ? examples/s]

In [68]:
from transformers import DataCollatorForTokenClassification

datacollator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [72]:
# pip install -q evaluate
!pip install -q seqeval


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=2f28b1cef4b25755e50a0f0d88fd3e94ef18da48e6d5f2bb721d036d455b371f
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [73]:
import evaluate

seqeval = evaluate.load("seqeval")

Next, define a function that takes the model's predictions and the true labels and computes the evaluation metrics.


In [74]:
import numpy as np
def compute_metrics(p):
    """Compute token-level accuracy and macro-averaged precision/recall/F1.

    The scores are computed directly on label ids instead of through seqeval:
    seqeval is a chunk (IOB/NER) scorer that reads the first character of every
    tag as a chunk prefix, so bare POS tags such as 'OD', 'NN' or 'NNP' are
    mis-parsed and its precision/recall/F1 are not meaningful for POS tagging.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds per-token class
            scores with the class dimension on axis 2; ``labels`` holds the gold
            class ids per token, with -100 marking positions to ignore.

    Returns:
        dict with float values for "precision", "recall", "f1" (macro-averaged
        over every class that occurs in the gold labels or the predictions;
        a class with an empty denominator scores 0) and "accuracy" (fraction of
        non-ignored tokens predicted correctly). All values are 0.0 when no
        token is left after removing the ignored positions.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Flatten to parallel id lists, skipping ignored positions. Iterating row by
    # row keeps support for ragged label lists as well as padded arrays.
    gold_ids, predicted_ids = [], []
    for prediction, label in zip(predictions, labels):
        for predicted_id, gold_id in zip(prediction, label):
            if gold_id != -100:
                gold_ids.append(int(gold_id))
                predicted_ids.append(int(predicted_id))

    if not gold_ids:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0, "accuracy": 0.0}

    y_true = np.asarray(gold_ids, dtype=np.int64)
    y_pred = np.asarray(predicted_ids, dtype=np.int64)

    # confusion[g, q] = number of tokens with gold class g predicted as class q.
    num_classes = int(max(y_true.max(), y_pred.max())) + 1
    confusion = np.zeros((num_classes, num_classes), dtype=np.int64)
    np.add.at(confusion, (y_true, y_pred), 1)

    true_positive = np.diag(confusion).astype(np.float64)
    predicted_count = confusion.sum(axis=0).astype(np.float64)
    gold_count = confusion.sum(axis=1).astype(np.float64)

    precision = np.divide(
        true_positive, predicted_count,
        out=np.zeros_like(true_positive), where=predicted_count > 0,
    )
    recall = np.divide(
        true_positive, gold_count,
        out=np.zeros_like(true_positive), where=gold_count > 0,
    )
    pr_sum = precision + recall
    f1 = np.divide(
        2.0 * precision * recall, pr_sum,
        out=np.zeros_like(pr_sum), where=pr_sum > 0,
    )

    # Average only over classes that actually occur, so unused ids below the
    # largest observed id do not drag the macro scores down.
    observed = (predicted_count + gold_count) > 0
    return {
        "precision": float(precision[observed].mean()),
        "recall": float(recall[observed].mean()),
        "f1": float(f1[observed].mean()),
        "accuracy": float(true_positive.sum() / y_true.size),
    }

In [75]:
tokenized_inputs.features

{'words': List(Value('string')),
 'labels': List(Value('int64')),
 'input_ids': List(Value('int32')),
 'token_type_ids': List(Value('int8')),
 'attention_mask': List(Value('int8'))}

In [76]:
import torch
def cleaner(tokenized_inputs):
    """Return a copy of the dataset without the raw 'words' column, in torch format.

    The 'words' column is removed only when it is present, so calling this on an
    already-cleaned dataset (e.g. when the cell is re-run) does not raise.
    The dataset passed in is left unchanged.

    Args:
        tokenized_inputs: dataset exposing `column_names`, `remove_columns`
            and `with_format` (a Hugging Face `datasets.Dataset`).

    Returns:
        A new dataset with the 'words' column dropped and its output format
        set to "torch".
    """
    stale_columns = [name for name in ['words'] if name in tokenized_inputs.column_names]
    if stale_columns:
        tokenized_inputs = tokenized_inputs.remove_columns(stale_columns)
    # with_format returns a new dataset object instead of mutating in place.
    return tokenized_inputs.with_format("torch")

tokenized_inputs = cleaner(tokenized_inputs)
tokenized_tests = cleaner(tokenized_tests)

In [77]:
tokenized_inputs.features

{'labels': List(Value('int64')),
 'input_ids': List(Value('int32')),
 'token_type_ids': List(Value('int8')),
 'attention_mask': List(Value('int8'))}

In [78]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_inputs, shuffle=True, batch_size=16, collate_fn=datacollator)
test_dataloader = DataLoader(tokenized_tests, batch_size=16, collate_fn=datacollator)



In [80]:
for batch in train_dataloader:
    break
{k:v.shape for k,v in batch.items()}

{'input_ids': torch.Size([16, 83]),
 'token_type_ids': torch.Size([16, 83]),
 'attention_mask': torch.Size([16, 83]),
 'labels': torch.Size([16, 83])}