1. Let's install and import the packages.

In [1]:
!pip install flair
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertForTokenClassification
import requests,os
from flair.datasets import ColumnCorpus
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.data import Sentence
# Set the PYTORCH_CUDA_ALLOC_CONF environment variable for this process and
# echo it back to confirm the value that is now in effect.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})
print(os.environ.get("PYTORCH_CUDA_ALLOC_CONF"))



  from .autonotebook import tqdm as notebook_tqdm


expandable_segments:True


2. Loading tokenizers, models, etc.

In [2]:
# The tokenizer and the 9-label token-classification model are both loaded
# from the same pretrained checkpoint.
# NOTE(review): neither `tokenizer` nor `model` is referenced again in the
# visible cells - confirm they are still needed.
bert_checkpoint = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertForTokenClassification.from_pretrained(bert_checkpoint, num_labels=9)

# Locations of the train and test .iob files that are downloaded below.
train_url = 'https://raw.githubusercontent.com/lang-uk/flair-ner/main/fixed-split/train.iob'
test_url = 'https://raw.githubusercontent.com/lang-uk/flair-ner/main/fixed-split/test.iob'

def download_data(url, file_name):
    """Download ``url`` into ``file_name`` unless that file already exists.

    Args:
        url: HTTP(S) address of the text file to fetch.
        file_name: Destination path. If it already exists nothing is
            downloaded and the existing file is left untouched.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    if os.path.exists(file_name):
        print('Data found:', file_name)
        return

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail loudly on an error status; otherwise the error page would be saved
    # and then accepted as valid cached data on every later run.
    response.raise_for_status()

    # Write to a temporary sibling first and rename afterwards, so that an
    # interrupted write never leaves a truncated file that the existence
    # check above would treat as a complete download.
    partial_name = f'{file_name}.part'
    with open(partial_name, 'w', encoding='utf-8') as f:
        f.write(response.text)
    os.replace(partial_name, file_name)
    print(f'Data downloaded: {file_name}')
# Local folder that holds the downloaded splits; created on first run.
data_folder = './data/'
os.makedirs(data_folder, exist_ok=True)

# Fetch the train split first, then the test split, into the data folder.
for split_url, split_file in ((train_url, 'train.iob'), (test_url, 'test.iob')):
    download_data(split_url, os.path.join(data_folder, split_file))

# Multilingual DistilBERT word embeddings, created with fine_tune=True.
distilbert_embeddings = TransformerWordEmbeddings(
    'distilbert-base-multilingual-cased',
    fine_tune=True,
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Data found: ./data/train.iob
Data found: ./data/test.iob


Let's build the corpus

In [3]:
# Column layout handed to ColumnCorpus: column 0 is 'text', column 1 is 'ner'.
columns = {0: 'text', 1: 'ner'}
corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.iob',
    test_file='test.iob',
)
print(corpus)

# Peek at the first 15 training sentences.
preview_size = 15
for sample_idx in range(preview_size):
    print(corpus.train[sample_idx])

# Build the 'ner' label dictionary from the corpus and show it.
tag_types = corpus.make_label_dictionary(label_type='ner')
print(tag_types)




2024-09-07 18:45:36,204 Reading data from data
2024-09-07 18:45:36,218 Train: data/train.iob
2024-09-07 18:45:36,219 Dev: None
2024-09-07 18:45:36,220 Test: data/test.iob
2024-09-07 18:45:39,069 No dev split found. Using 10% (i.e. 876 samples) of the train split as dev data
Corpus: 7886 train + 876 dev + 4045 test sentences
Sentence[18]: "Зрозуміло , що український бізнес почав використовувати КСВ як інструмент формування своєї репутації буквально декілька років тому ."
Sentence[22]: "З одного боку , саме через це більшість проектів КСВ здійснюються епізодично та деколи виглядають , радше , як просто благодійність ."
Sentence[8]: "Винятком будуть хіба що представництва іноземних корпорацій ."
Sentence[20]: "З іншого боку , для українських компаній відкривається потужне « вікно можливостей » , щоб втілювати власні унікальні стратегії ."
Sentence[3]: "Філантропія як пріоритет"
Sentence[15]: "Це може бути як власна ініціатива , так і відповідь на запит від громади ."
Sentence[17]: "Це абс

7it [00:00, 92.10it/s]
7886it [00:00, 31743.44it/s]

2024-09-07 18:45:39,477 Dictionary created for label 'ner' with 4 values: PERS (seen 2419 times), LOC (seen 1063 times), ORG (seen 471 times), MISC (seen 373 times)
Dictionary with 4 tags: PERS, LOC, ORG, MISC





Let's create the tagger and trainer

In [4]:
# Tagger settings: built on the DistilBERT embeddings and the 'ner' label
# dictionary created above, with the CRF layer switched off.
tagger_config = dict(
    hidden_size=128,
    embeddings=distilbert_embeddings,
    tag_dictionary=tag_types,
    tag_type='ner',
    use_crf=False,
)
tagger = SequenceTagger(**tagger_config)

# The trainer couples the tagger with the corpus it will be trained on.
trainer = ModelTrainer(tagger, corpus)


2024-09-07 18:45:39,486 SequenceTagger predicts: Dictionary with 17 tags: O, S-PERS, B-PERS, E-PERS, I-PERS, S-LOC, B-LOC, E-LOC, I-LOC, S-ORG, B-ORG, E-ORG, I-ORG, S-MISC, B-MISC, E-MISC, I-MISC


Model name and model training

In [5]:
# Name of the output folder that is passed to trainer.train() as its first argument.
finetuned_model_name = 'dist-finetuned.04'


In [6]:
# Training options, forwarded unchanged to trainer.train().
# Disabled alternatives kept for reference: main_evaluation_metric='f1', use_amp=True
train_options = {
    'learning_rate': 0.01,
    'mini_batch_size': 1,
    'max_epochs': 10,
    'embeddings_storage_mode': 'cpu',
    'monitor_test': True,
    'train_with_dev': True,
}
trainer.train(finetuned_model_name, **train_options)

2024-09-07 18:45:44,683 ----------------------------------------------------------------------------------------------------
2024-09-07 18:45:44,685 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(119548, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0-5): 6 x TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_featu

  scaler = torch.cuda.amp.GradScaler(enabled=use_amp and flair.device.type != "cpu")


2024-09-07 18:46:44,491 epoch 1 - iter 876/8762 - loss 0.34136188 - time (sec): 59.79 - samples/sec: 271.07 - lr: 0.010000 - momentum: 0.000000
2024-09-07 18:47:19,976 ----------------------------------------------------------------------------------------------------
2024-09-07 18:47:19,978 Exiting from training early.
2024-09-07 18:47:19,978 Saving model ...
2024-09-07 18:47:25,202 Done.
2024-09-07 18:47:25,225 ----------------------------------------------------------------------------------------------------
2024-09-07 18:47:25,227 Testing using last state of model ...


  8%|▊         | 5/64 [00:06<01:04,  1.10s/it]

Let's evaluate it

In [None]:
# Evaluate the in-memory tagger on the test set and print the detailed report.
test_split = corpus.test
result = tagger.evaluate(test_split, gold_label_type='ner')
print(result.detailed_results)

100%|██████████| 127/127 [00:50<00:00,  2.52it/s]


Results:
- F-score (micro) 0.737
- F-score (macro) 0.4779
- Accuracy 0.6374

By class:
              precision    recall  f1-score   support

        PERS     0.8565    0.9321    0.8927      1678
         LOC     0.5123    0.6758    0.5828       401
         ORG     0.3110    0.3372    0.3235       261
        MISC     0.3636    0.0667    0.1127       240

   micro avg     0.7230    0.7516    0.7370      2580
   macro avg     0.5108    0.5029    0.4779      2580
weighted avg     0.7020    0.7516    0.7144      2580






Let's test with our own data

In [None]:
# Load the final model from the folder that trainer.train() was given
# (`finetuned_model_name`). The path is resolved relative to the working
# directory rather than a machine-specific absolute path, so the notebook
# also runs outside the original author's home directory.
finetuned_model = SequenceTagger.load(os.path.join(finetuned_model_name, 'final-model.pt'))
def print_tagged_sentences(sentences, model=None):
    """Tag each text with ``model`` and print it with its entities marked inline.

    Every predicted 'ner' span is printed as ``[<span text>|(<TAG>=<score>%)]``;
    all other tokens are printed unchanged. The whole sentence is always
    printed, including tokens that follow the last entity and sentences in
    which no entity was found.

    Args:
        sentences: Iterable of raw text strings.
        model: A loaded tagger exposing ``predict(sentence)``. Required; the
            ``None`` default is kept only for signature compatibility.

    Raises:
        ValueError: If ``model`` is ``None``.
    """
    if model is None:
        raise ValueError('print_tagged_sentences() needs a loaded tagger passed as `model`.')

    for text in sentences:
        sentence = Sentence(text)  # Create Sentence object
        model.predict(sentence)    # Predict tags
        spans = sentence.get_spans('ner')

        tagged_sentence = []
        span_idx = 0       # index of the next span still to be rendered
        tokens_to_skip = 0  # remaining tokens of the span that was just rendered
        for token in sentence:
            if tokens_to_skip:
                # This token was already emitted as part of a multi-token span.
                tokens_to_skip -= 1
                continue
            # Spans are matched in order against the token that starts them.
            if span_idx < len(spans) and token == spans[span_idx][0]:
                span = spans[span_idx]
                label = span.get_label('ner')
                span_text = ' '.join(span_token.text for span_token in span.tokens)
                tagged_sentence.append(f'[{span_text}|({label.value}={label.score*100:.1f}%)]')
                tokens_to_skip = len(span.tokens) - 1
                span_idx += 1
            else:
                tagged_sentence.append(token.text)
        print(' '.join(tagged_sentence))

# Two hand-written sample texts to run through the fine-tuned tagger.
sentences = [
    "Привіт, мене звати Тімо і я живу в Турку, навчаюсь в Универсітеті Турку.",
    "Цікаву і пізнавальну подорож місцями , де народився та виріс видатний український поет Тарас Шевченко , минулого тижня влаштував департамент культури і туризму Кіровоградської ОДА .",
]
print_tagged_sentences(sentences, model=finetuned_model)

2024-09-07 17:14:24,452 SequenceTagger predicts: Dictionary with 17 tags: O, S-PERS, B-PERS, E-PERS, I-PERS, S-LOC, B-LOC, E-LOC, I-LOC, S-ORG, B-ORG, E-ORG, I-ORG, S-MISC, B-MISC, E-MISC, I-MISC
