# Training Notebook for NER exercise

### Importing all necessary packages in the code

In [2]:
import os
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob

import torch
import spacy
import evaluate
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm

from transformers import (AutoTokenizer,
                          AutoModelForTokenClassification,
                          DataCollatorForTokenClassification,
                          Trainer, 
                          TrainingArguments)
from datasets import load_dataset, Dataset, concatenate_datasets, DatasetDict
from IPython.display import Markdown

import warnings
warnings.filterwarnings('ignore')

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


### Data-Preprocessing & EDA - all of them executed in glob code cell for all datasets

- quick data overview *(see function 'quick_eda')*

- Removing duplicates

- Removing rows where label's - start index and end index spill over text length or start index is greater than end index; *see function ('idx_to_remove')*

- Processing 'tags' column of data which is string into tuples of entities index and text; *see function ('create_entities)*

- preprocess functions which cleans data and creates some new columns *see function ('preprocess)*

In [3]:
def idxs_to_remove(tags_list, text_list):
    idx_to_remove = []

    for i, x in enumerate(tags_list):
        mx_idx = -1
        for y in x:
            start_idx = int(y.split(':')[0])
            end_idx = int(y.split(':')[1])
            mx_idx = max(mx_idx, start_idx, end_idx)

            if(start_idx>end_idx):
                # print(y.split(':'))
                # print(i,x,"====", start_idx, end_idx)
                idx_to_remove.append(i)
            
            if(mx_idx > len(text_list[i])+1):
                # print(i, x, text_list[i])
                idx_to_remove.append(i)

    return idx_to_remove

def quick_eda(df):
    print(df.info(), "\n")
    print("No. of duplicates in dataframe - {}".format(df.loc[df.duplicated()].shape[0]))
    
    tags = df['tags'].tolist()
    text = df['text'].tolist()
    tags = [[y for y in x.split(',') if len(y)!=0] for x in tags]
    set_labels = set([y.split(':')[-1] for x in tags for y in x])
    print("Unique labels in tags columns: {}".format(set_labels),"\n\n")

    return 

def create_entites(tags, text):
    entities = []
    for x in tags:
        temp = []
        for y in x:
            temp_ls = y.split(':')
            start_idx = int(temp_ls[0])
            end_idx = int(temp_ls[1])
            label = temp_ls[-1]
            tup = (start_idx, end_idx, label)
            # print(tup)
            temp.append(tup)
        entities.append(temp)
    
    return entities

def preprocess(df):
    print('original_shape: {}'.format(df.shape))
    df = df.drop_duplicates()
    print('shape after duplicates drop: {}'.format(df.shape))
    tags = df['tags'].tolist()
    text = df['text'].tolist()
    tags = [[y for y in x.split(',') if len(y)!=0] for x in tags]
    # print(tags[:5])
    entities = create_entites(tags=tags, text=text)
    df['entities'] = entities
    remove_idx_ls = idxs_to_remove(tags, text)
    print("{} indices to remove".format(len(remove_idx_ls)))
    print('Removing_index: {}'.format(remove_idx_ls))
    df = df.drop(remove_idx_ls, axis=0).reset_index()

    print("Final columns: {}".format(df.columns))
    return df


##### Running all the above EDA and processing function in the bleow glob code

In [4]:
# Running all the cleaning and processing on all datasets with glob
dfs = []
for f in glob('../data_source/*.xlsx'):
    print(f,"\n")
    df = pd.read_excel(f)
    df = df.drop(columns=['Unnamed: 0'])
    df = df.dropna(axis=0)
    quick_eda(df)
    df = preprocess(df)

    dfs.append(df)
    print("\n\n-------------------------------------------------------------------------")
    

../data_source/G1.xlsx 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7356 entries, 0 to 7355
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      7356 non-null   object
 1   tags    7356 non-null   object
 2   text    7356 non-null   object
dtypes: object(3)
memory usage: 172.5+ KB
None 

No. of duplicates in dataframe - 167
Unique labels in tags columns: {'treatment', 'allergy_name', 'cancer', 'chronic_disease'} 


original_shape: (7356, 3)
shape after duplicates drop: (7189, 3)
2 indices to remove
Removing_index: [1522, 4236]
Final columns: Index(['index', 'ID', 'tags', 'text', 'entities'], dtype='object')


-------------------------------------------------------------------------
../data_source/G3.xlsx 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6277 entries, 0 to 6276
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      6277 non-null   object

***
### Next steps of Preprocessing - *check docstings of functions for details*
- Changing our entities label and text into IOB format 

- IOB format is converting text into list/array of token and having a same length list/array of labels

- IOB - where I denotes inside a label, B denotes beginning of a label - see below cell (*ner_label_iob*)

- This transformation is used for training and evaluation - given we have selected transformers based training

In [5]:
ner_labels = ['chronic_disease', 'cancer', 'treatment', 'allergy_name']
#ner lablels converted in iob format for data transformation into tokens and IOB entity list
ner_label_iob = ['O', 'B-CHR', 'I-CHR', 'B-CAN', 'I-CAN', 'B-TRE', 'I-TRE', 'B-ALL', 'I-ALL']

id2label = {k: v for k, v in enumerate(ner_label_iob)}
label2id = {v: k for k, v in id2label.items()}

In [6]:
nlp = spacy.load('en_core_web_sm')

def remove_special_characters(s):
    """function to remove special characters which shall be used in get_token_iob_label_list,
        to remove special characters from initial tokens, 
        so that bracketed/abbreviated disease names etc can be include with the disease label"""
    
    pattern = r'^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$'
    result = re.sub(pattern, '', s)
    return result

def get_token_iob_label_list(text_list, entities_list):
    """function to convert [text and entites] into [token_list and IOB entities] list respectively.
        text is converted to [spacy tokens list] 
        and the ent_text for the entites are put in a dictionary with ner_labels as keys.
        Every elements from [spacy token list] is checked if they are in the text of ner_label in ent_text dictionary
          and then using `beginning and inside flag` IOB entites are appended in [label_list]"""
    
    tokens_list = []
    labels_list = []

    for text, entities in zip(text_list, entities_list):
        ents_text = {ele : "" for ele in ner_labels}

        for y in entities:
            # print(text[y[0]])
            # print(text[y[0]-1:y[1]])
            ents_text[y[-1]] += text[y[0]-1:y[1]]  ## start index always has one positive offset in data, hence y[0]-1

        ents_text[y[-1]] = [remove_special_characters(item) for item in ents_text[y[-1]].split()]
        # print(ents_text)
        doc = nlp(text)
        token_list = [token.text for token in doc]
        label_list = ['O']*len(token_list)

        cd_flag, c_flag, t_flag, a_flag = 0, 0, 0, 0
        for i, token in enumerate(token_list):
            if(token in ents_text['chronic_disease']):
                if(cd_flag==0):
                    label_list[i] = 'B-CHR'
                    cd_flag = 1
                    c_flag, t_flag, a_flag = 0, 0, 0
                else:
                    label_list[i] = 'I-CHR'
            
            elif(token in ents_text['cancer']):
                if(c_flag==0):
                    label_list[i] = 'B-CAN'
                    c_flag = 1
                    cd_flag, t_flag, a_flag = 0, 0, 0
                else:
                    label_list[i] = 'I-CAN'
                
            elif(token in ents_text['treatment']):
                if(t_flag==0):
                    label_list[i] = 'B-TRE'
                    t_flag = 1
                    cd_flag, c_flag, a_flag = 0, 0, 0
                else:
                    label_list[i] = 'I-TRE'
                
            elif(token in ents_text['allergy_name']):
                if(a_flag==0):
                    label_list[i] = 'B-ALL'
                    a_flag = 1
                    cd_flag, c_flag, t_flag= 0, 0, 0
                else:
                    label_list[i] = 'I-ALL'
            else:
                cd_flag, c_flag, t_flag, a_flag = 0, 0, 0, 0
                
        assert(len(token_list)==len(label_list)), "len of token_list and label_list mismatch at some iteration ."
        tokens_list.append(token_list)
        labels_list.append([label2id[label] for label in label_list])

    return (tokens_list, labels_list)
    

### Preparing data for training

- creating function to align labels with tokens because transformers tokenizer would spill our tokens to subtokens and then we need to have labels for those subtokens to be available in our tokenized data labels that will be fed for supervised training.

- **['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']** with transformers tokeinzer would be converted to: <br> **['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']**

- but entities we will provide will be of less length , to align those entity labels in tokenized datasets for same size as input_ids(tokens_list), we will use function *align_labels_with_tokens*

- function *align_labels_with_tokens* - makes the entity labels for subwords created by tokenizer and gives them correct labels and length.

- fucntion *tokenize and align* - just tokenizes all the datasetdict that we shall make for training loop and makes sures that ***inputs_ids and labels in input data*** are of same size and labels for extra tokens generated by tokenizer are correctly in place.

In [7]:
def align_labels_with_tokens(word_ids, labels):
    new_labels = []
    last_word = None
    for word_id in word_ids:
        if(word_id is None):
            new_labels.append(-100)
            last_word = None
        else:
            if(word_id!=last_word):
                label = -100 if word_id is None else labels[word_id]
                new_labels.append(label)
                last_word=word_id
            else:
                label = labels[word_id]
                if(label % 2 == 1):
                    label += 1
                new_labels.append(label)
    
    return new_labels

def tokenize_and_align(examples):
    """tokenizer function that shall be used to map datasetdicts into tokenized data with tensors, truncated and padded;
      and well aligned input_labels and labels, before training"""
    
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)
    all_labels = examples['ner_tags']
    new_labels = []
    for i, label in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        labels = align_labels_with_tokens(word_ids=word_ids, labels=label)
        new_labels.append(labels)
    tokenized_inputs['labels'] = new_labels
    return tokenized_inputs

def compute_metrics(eval_preds):
    """Compute metrics for training pipeline to evaluate metrics on validations set while training"""
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens like [CLS] and [SEP] or any unknown tokens) and convert to labels
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [10]:
## looping over all dataset to convert existing dataset into IOB format for entites and text into list of tokens for training with transformers model

for i, x in tqdm(enumerate(dfs)):
    print(x.shape)
    tokens, labels = get_token_iob_label_list(x['text'].tolist(), x['entities'].tolist())
    # dfs[i] = pd.DataFrame(data = {'tokens': tokens,
    #                               'labels': labels})
    print(x.shape, len(tokens), len(labels))
    x['tokens'] = tokens
    x['ner_tags'] = labels
    dfs[i] = x[['tokens', 'ner_tags']]

0it [00:00, ?it/s]

(7187, 5)


1it [00:32, 32.93s/it]

(7187, 5) 7187 7187
(6277, 5)


2it [01:00, 29.50s/it]

(6277, 5) 6277 6277
(6452, 5)


3it [01:27, 29.11s/it]

(6452, 5) 6452 6452





In [11]:
len(dfs)

3

In [12]:
"""
creating datasetdicts for continous training with 100 samples moving to every next train dataset
and train_test_val_split for every dataset and creating another `test_dataset_including_prev_tests` dataset including test data of previous datasets
for evaluation
"""

ls_datadicts = []
for i, x in enumerate(dfs):
    temp_datadict = DatasetDict({'train' : Dataset.from_pandas(x)})
    # print(temp_datadict)
    if(i-1>=0):
        sample_prev_100 = ls_datadicts[i-1]['train'].select(np.random.randint(0, (ls_datadicts[i-1]['train'].num_rows), size=100))
        temp_datadict['train'] = concatenate_datasets([temp_datadict['train'], sample_prev_100])

    temp_datadict = temp_datadict['train'].train_test_split(0.2, seed=42, shuffle=True)
    val_data = temp_datadict['train'].train_test_split(0.1, shuffle=True)
    temp_datadict['validation'] = val_data['test']
    temp_datadict['train']= val_data['train']
    temp_datadict['test_prevs'] = temp_datadict['test']
    
    if(i-1>=0):
        temp_datadict['test_prevs'] = concatenate_datasets([temp_datadict['test_prevs'], ls_datadicts[i-1]['test_prevs']])

    ls_datadicts.append(temp_datadict)
    # print(ls_datadicts)

final_dataset = Dataset.from_pandas(pd.concat(dfs)).train_test_split(test_size=0.2, seed=42, shuffle=True)
final_dataset = final_dataset.remove_columns(column_names=['__index_level_0__'])
val_data = final_dataset['train'].train_test_split(0.1, shuffle=True)
final_dataset['validation'] = val_data['test']
final_dataset['train']= val_data['train']
final_dataset['test_prevs'] = ls_datadicts[-1]['test_prevs']
ls_datadicts.append(final_dataset)

ls_datadicts = ls_datadicts[-1:]   #only one item for memeory fot (G1+G2+G3) training, comment for full pipeline run with all models training 
print(len(ls_datadicts))

1


In [36]:
ls_datadicts

[DatasetDict({
     train: Dataset({
         features: ['tokens', 'ner_tags'],
         num_rows: 14338
     })
     test: Dataset({
         features: ['tokens', 'ner_tags'],
         num_rows: 3984
     })
     validation: Dataset({
         features: ['tokens', 'ner_tags'],
         num_rows: 1594
     })
     test_prevs: Dataset({
         features: ['tokens', 'ner_tags'],
         num_rows: 4025
     })
 })]

In [37]:
## Shows 100 addition
# print(dfs[1].shape[0])
# print(ls_datadicts[1]['train'].num_rows + ls_datadicts[1]['test'].num_rows +ls_datadicts[1]['validation'].num_rows)

##### Initialising model check points in and out of the training loop, which will be consumed in every iteration of continous learning

In [38]:
model_ckpt_in = ['bert-base-uncased', 'raunak6898/bert-finetuned-ner-t1',
                  'raunak6898/bert-finetuned-ner-t2','bert-base-cased']
model_ckpt_eval = ['raunak6898/bert-finetuned-ner-t1',
                  'raunak6898/bert-finetuned-ner-t2','raunak6898/bert-finetuned-ner-t3', 'raunak6898/bert-finetuned-ner-all_data']

In [39]:
model_ckpt_in

['bert-base-uncased',
 'raunak6898/bert-finetuned-ner-t1',
 'raunak6898/bert-finetuned-ner-t2',
 'bert-base-cased']

## Training Pipeline

This was run in two breaks beacuse of MPS memory issues with the final (G1+G2+G3) training, the current 
code is left for last iter (iter = 3 in loop) with a break statement for the final training.

- The training loop runs over the [model_ckpt_in] list and has corresponding [model_ckpt_eval] values that are model names pushed to huggingface-hub after every subsequent training

- metric used is 'seqeval' sequence labeling evaluator, its standard norm for all sort of chunking tasks like ner evaluation. 

- result are stored in a list of dictionaries one for same test_set scores and another with current + prev test_sets scores

- then that list of scores is converted into table of scores as desired 

In [17]:
results_self_ls = []
results_self_prev_ls = []

results_self_ls_final = []
results_self_prev_ls_final = []

for iter in tqdm(range(len(model_ckpt_in))):
    iter = 3        # comment for full pipeline run in one go
    print("\n---------------------------------------Task {} with base model -> {}-----------------------------------------\n".format(iter+1, model_ckpt_in[iter]))
    model_ckpt = model_ckpt_in[iter]
    model_ckpt_out = model_ckpt_eval[iter]
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_ckpt)
    data_collater = DataCollatorForTokenClassification(tokenizer=tokenizer)
    model = AutoModelForTokenClassification.from_pretrained(model_ckpt, id2label=id2label, label2id=label2id)
    metric = evaluate.load('seqeval')

    tokenized_datasets = ls_datadicts[0].map(
        tokenize_and_align,
        batched=True,
        remove_columns=ls_datadicts[0]["train"].column_names,
    )

    args = TrainingArguments(
        output_dir=model_ckpt_eval[iter],
        evaluation_strategy='epoch',
        # logging_steps=100,
        # logging_strategy='steps',
        save_strategy='epoch',
        learning_rate=2e-5,
        weight_decay=0.01,
        num_train_epochs=3,
        use_mps_device=True,
        load_best_model_at_end=True,
        push_to_hub=True
    )

    trainer = Trainer(
        model=model,
        args=args,
        data_collator=data_collater,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    trainer.train()

    mertic = evaluate.load('seqeval')
    test_preds = trainer.predict(tokenized_datasets['test'])

    for i in range(len(tokenized_datasets['test'])):
        y_true = [id2label[id] for id in tokenized_datasets['test'][i]['labels'] if id != -100]
        y_pred = [id2label[id] for id in np.argmax(test_preds[0][i], axis=1)[1:len(y_true)+1]]
        metric.add_batch(predictions=[y_pred], references=[y_true])

    results_self = metric.compute()                 
    # results_self_ls.append(results_self)          # uncomment for full pipeline run in one go
    results_self_ls_final.append(results_self)      # comment for full pipeline run in one go
    print("\n\n-------------------------------------------------------------Result on test_set of this dataset ------------------------------------------------------------- \n\n",(results_self),"\n\n")

    metric = evaluate.load('seqeval')
    test_preds_prev = trainer.predict(tokenized_datasets['test_prevs'])
    for i in range(len(tokenized_datasets['test_prevs'])):
        y_true = [id2label[id] for id in tokenized_datasets['test_prevs'][i]['labels'] if id != -100]
        y_pred = [id2label[id] for id in np.argmax(test_preds_prev[0][i], axis=1)[1:len(y_true)+1]]
        metric.add_batch(predictions=[y_pred], references=[y_true])

    results_self_prevs = metric.compute()
    # results_self_ls_prevs.append(results_self_prevs)          # uncomment for full pipeline run in one go
    results_self_prev_ls_final.append(results_self_prevs)       # comment for full pipeline run in one go
    
    print("------------------------------------Result on test_set including all previous test sets------------------------------- \n\n",results_self_prevs, "\n\n")

    print("------------------------------------Training and Evaluation complete--------------------------------------------------")
    print("------------------------------------Model -> {} pushed to huggingface-hub ------------------------------------------".format(model_ckpt_eval[iter]))
    break



  0%|          | 0/4 [00:00<?, ?it/s]


---------------------------------------Task 4 with base model -> bert-base-cased-----------------------------------------



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/14338 [00:00<?, ? examples/s]

Map:   0%|          | 0/3984 [00:00<?, ? examples/s]

Map:   0%|          | 0/1594 [00:00<?, ? examples/s]

Map:   0%|          | 0/4025 [00:00<?, ? examples/s]

  0%|          | 0/5379 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  0%|          | 0/4 [02:32<?, ?it/s]

{'loss': 0.5983, 'learning_rate': 1.8140918386317162e-05, 'epoch': 0.28}


  0%|          | 0/4 [04:50<?, ?it/s]

{'loss': 0.4361, 'learning_rate': 1.6281836772634322e-05, 'epoch': 0.56}


  0%|          | 0/4 [06:56<?, ?it/s]

{'loss': 0.3768, 'learning_rate': 1.4422755158951478e-05, 'epoch': 0.84}


  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/4 [08:34<?, ?it/s]Checkpoint destination directory raunak6898/bert-finetuned-ner-all_data/checkpoint-1793 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 0.3464497923851013, 'eval_precision': 0.5816847479259732, 'eval_recall': 0.5537667071688943, 'eval_f1': 0.5673825085589792, 'eval_accuracy': 0.8780963780183181, 'eval_runtime': 24.8476, 'eval_samples_per_second': 64.151, 'eval_steps_per_second': 8.049, 'epoch': 1.0}


  0%|          | 0/4 [09:25<?, ?it/s]

{'loss': 0.3382, 'learning_rate': 1.2563673545268638e-05, 'epoch': 1.12}


  0%|          | 0/4 [11:22<?, ?it/s]

{'loss': 0.2999, 'learning_rate': 1.0704591931585797e-05, 'epoch': 1.39}


  0%|          | 0/4 [13:20<?, ?it/s]

{'loss': 0.2919, 'learning_rate': 8.845510317902957e-06, 'epoch': 1.67}


  0%|          | 0/4 [15:18<?, ?it/s]

{'loss': 0.2674, 'learning_rate': 6.986428704220116e-06, 'epoch': 1.95}


  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/4 [15:50<?, ?it/s]

{'eval_loss': 0.3223128616809845, 'eval_precision': 0.5896296296296296, 'eval_recall': 0.6044957472660997, 'eval_f1': 0.5969701514924254, 'eval_accuracy': 0.8876977518734388, 'eval_runtime': 10.2839, 'eval_samples_per_second': 155.0, 'eval_steps_per_second': 19.448, 'epoch': 2.0}


  0%|          | 0/4 [17:31<?, ?it/s]

{'loss': 0.2335, 'learning_rate': 5.127347090537274e-06, 'epoch': 2.23}


  0%|          | 0/4 [19:29<?, ?it/s]

{'loss': 0.2139, 'learning_rate': 3.268265476854434e-06, 'epoch': 2.51}


  0%|          | 0/4 [21:26<?, ?it/s]

{'loss': 0.2196, 'learning_rate': 1.4091838631715934e-06, 'epoch': 2.79}


  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/4 [23:06<?, ?it/s]

{'eval_loss': 0.3217698037624359, 'eval_precision': 0.5944218477629285, 'eval_recall': 0.6215066828675577, 'eval_f1': 0.6076626076626077, 'eval_accuracy': 0.894150707743547, 'eval_runtime': 10.1386, 'eval_samples_per_second': 157.22, 'eval_steps_per_second': 19.727, 'epoch': 3.0}


  0%|          | 0/4 [23:09<?, ?it/s]

{'train_runtime': 1383.198, 'train_samples_per_second': 31.097, 'train_steps_per_second': 3.889, 'train_loss': 0.3194687732158263, 'epoch': 3.0}


  0%|          | 0/498 [00:00<?, ?it/s]



-------------------------------------------------------------Result on test_set of this dataset ------------------------------------------------------------- 

 {'ALL': {'precision': 0.611764705882353, 'recall': 0.5621621621621622, 'f1': 0.5859154929577466, 'number': 185}, 'CAN': {'precision': 0.5642915642915642, 'recall': 0.5666118421052632, 'f1': 0.5654493229380386, 'number': 1216}, 'CHR': {'precision': 0.6043336301573167, 'recall': 0.6260762607626076, 'f1': 0.6150128379398881, 'number': 3252}, 'TRE': {'precision': 0.5913068021450748, 'recall': 0.6348484848484849, 'f1': 0.6123045447902967, 'number': 3300}, 'overall_precision': 0.5930386607250392, 'overall_recall': 0.6191374324154407, 'overall_f1': 0.6058070866141733, 'overall_accuracy': 0.8936436675475743} 




  0%|          | 0/504 [00:00<?, ?it/s]

  0%|          | 0/4 [26:09<?, ?it/s]

------------------------------------Result on test_set including all previous test sets------------------------------- 

 {'ALL': {'precision': 0.6349206349206349, 'recall': 0.7100591715976331, 'f1': 0.6703910614525138, 'number': 169}, 'CAN': {'precision': 0.6359516616314199, 'recall': 0.6373959121877366, 'f1': 0.6366729678638943, 'number': 1321}, 'CHR': {'precision': 0.6936236391912908, 'recall': 0.7165809768637532, 'f1': 0.704915441757547, 'number': 3112}, 'TRE': {'precision': 0.6676008968609866, 'recall': 0.6878429107710078, 'f1': 0.6775707580713981, 'number': 3463}, 'overall_precision': 0.6718900675024108, 'overall_recall': 0.6911345319280843, 'overall_f1': 0.6813764439826416, 'overall_accuracy': 0.9272658074205129} 


------------------------------------Training and Evaluation complete--------------------------------------------------
------------------------------------Model -> raunak6898/bert-finetuned-ner-all_data pushed to huggingface-hub --------------------------------------




In [34]:
## takes output list of metric dictionaries and converts in desired table3 output as stated in problem statement.

def measures_out(results_self_ls):
    df = pd.DataFrame()
    for i, dict in enumerate(results_self_ls):
        # print(dict)
        temp = {k.split('_')[-1]: v for k, v in dict.items() if k[0]=='o'}
        temp['number'] = temp.pop('accuracy')
        temp['number'] = 0
        # print(temp)
        one = {k :v for k, v in dict.items() if k[0]!='o'}
        one['overall'] = temp
        # print(one)
        df = pd.concat([df, pd.DataFrame(one)])

    index = [x for x in df.index.tolist() if x=='f1']
    df = df.loc[index].drop_duplicates()
    ['chronic_disease', 'cancer', 'treatment', 'allergy_name']
    columns = {'ALL': 'allergy_name',
            'CAN': 'cancer',
            'TRE': 'treatment',
            'CHR': 'chronic_disease'}
    df = df.rename(columns=columns)
    df.index = ['T1+T2+T3']
    return df.transpose()

print('self_test_f1s: \n\n', measures_out(results_self_ls_final),"\n")

print('self_plus_prevs_test_f1s: \n\n',measures_out(results_self_prev_ls_final))

self_test_f1s: 

                  T1+T2+T3
allergy_name     0.585915
cancer           0.565449
chronic_disease  0.615013
treatment        0.612305
overall          0.605807 

self_plus_prevs_test_f1s: 

                  T1+T2+T3
allergy_name     0.670391
cancer           0.636673
chronic_disease  0.704915
treatment        0.677571
overall          0.681376


In [29]:
# measures_out(results_self_ls_final).to_excel('self_result.csv')
# measures_out(results_self_prev_ls_final).to_excel('self_prevs_result1.csv')

In [2]:
import numpy as np
ls = []
for i in range(1,9):
    ls.append(0.3/i)

np.mean(ls)

0.10191964285714286

In [None]:
o.8