In [1]:
import os
import pathlib
import pandas as pd
import ast
import json
import numpy as np
from pathlib import Path
import fastparquet
from tqdm.auto import tqdm

In [2]:
import logging

logger = logging.getLogger(__name__)

In [3]:
from datasets import ClassLabel, load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from transformers import Trainer
import evaluate

In [19]:
from sklearn.metrics import f1_score

In [4]:
## Note: The test file is expected as <parent_directory>/data/test_data.parquet

# TODO: Should we do literal_eval on the test data?

In [18]:
contesting_models = ['roberta', 'scibert', 'deberta', 'biomed_roberta', 'cs_roberta']

## Functions to be called in the order of execution:
0. Save the test file as notebooks/data/test_data.parquet
1. pre_process_test_file(model_name)
2. get_pretrained_models(config_params, my_datasets)
3. make_and_save_predictions()
4. post_process_prediction()


In [6]:
def model_pipeline(model_name, filename):
    config = get_config_params(model_name, filename)
    test_datasets = pre_process_test_file(model_name, config)
    trainer, test_dataset = get_pretrained_models(config, test_datasets)
    make_and_save_predictions(trainer, test_dataset, config)
    post_process_prediction(config)

In [7]:
def get_config_params(model_name, filename):
    config = {'max_length': 512, 'parent_directory': os.getcwd(),
             'test_filepath': f'data/{filename}.parquet', 'model_name': model_name, 
              'json_test_filepath': f'data/data_gen_content_{filename}_{model_name}.json',
              'intermediate_extension': 'json', 'label_column_name': 'ner_tags',
              'label_list': ['0', '1', '2', '3'], 'label_to_id': {'0': 0, '1': 1, '2': 2, '3': 3},
              'padding': "max_length", 'batched': True, 'num_proc': 16,
              'metric': evaluate.load("f1"), 'return_entity_level_metrics': False,
              'output_json_predictions_file': f"data/{filename}_finetuned_predictions_{model_name}.json",
              'output_parquet_predictions_file': f"data/{filename}_predictions_{model_name}.parquet",
             }
 
    return config

In [8]:
def pre_process_test_file(model_name, config_params):
    convert_parquet_data_to_json(config_params['parent_directory'], config_params['test_filepath'], 
                                 config_params['json_test_filepath'], config_params)
    # Load the test data set
    my_datasets = load_dataset(config_params['intermediate_extension'],
                               data_files={'test': config_params['json_test_filepath']})
    return my_datasets

In [9]:
def get_pretrained_models(config_params, my_datasets):
    model_name = config_params['model_name']
    logger.info(f"Loading the model and tokenizer from fine tuned model {model_name}")
    
    finetuned_tokenizer = AutoTokenizer.from_pretrained(f"TheOptimusPrimes/{model_name}-finetuned-dagpap24")
    finetuned_model = AutoModelForTokenClassification.from_pretrained(f"TheOptimusPrimes/{model_name}-finetuned-dagpap24")
    
    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        if type(examples['tokens'][0]) is bytes:
            logger.info("Converting list of bytes to list of string")
            examples["tokens"] = [ast.literal_eval(x.decode()) for x in examples['tokens']]

        tokenized_inputs = finetuned_tokenizer(
            examples['tokens'],
            padding=config_params['padding'],
            truncation=True,
            max_length=config_params['max_length'],
            # We use this argument because the texts in our dataset are lists
            # of words (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        for i, label in enumerate(examples['ner_tags']):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label
                # to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(config_params['label_to_id'][label[word_idx]])
                # For the other tokens in a word, we set the label
                # to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(
                        config_params['label_to_id'][label[word_idx]]
                        # if data_args.label_all_tokens
                        if False
                        else -100
                    )
                previous_word_idx = word_idx

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs
    
    
    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        merged_predictions = [
            config_params['label_list'][p]
            for (p, l) in zip(predictions.flatten(), labels.flatten())
            if l != -100
        ]
        merged_labels = [
            config_params['label_list'][l]
            for (p, l) in zip(predictions.flatten(), labels.flatten())
            if l != -100
        ]

        results = config_params['metric'].compute(
            predictions=merged_predictions,
            references=merged_labels,
            average="macro",
        )

        if config_params['return_entity_level_metrics']:

            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "f1": results["f1"],
            }
    
    test_dataset = my_datasets["test"]
    test_dataset = test_dataset.map(
        tokenize_and_align_labels,
        batched=config_params['batched'],
        num_proc=config_params['num_proc'],
        # load_from_cache_file=not data_args.overwrite_cache,
        load_from_cache_file=False,
    )
    
    data_collator = DataCollatorForTokenClassification(
        finetuned_tokenizer, pad_to_multiple_of=None
    )

    trainer = Trainer(
        model=finetuned_model,
        tokenizer=finetuned_tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    
    return trainer, test_dataset

In [10]:
def make_and_save_predictions(my_trainer, test_dataset, config_params):
    # Run the predictions on the model that was finetuned
    predictions, labels, metrics = my_trainer.predict(test_dataset)
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [config_params['label_list'][p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    assert len(predictions) == len(test_dataset)
    data_list = []
    for i in range(len(predictions)):
        data_list.append(
            {
                "index": test_dataset[i]["index"],
                "predictions": predictions[i].tolist(),
            }
        )
    with open(config_params['output_json_predictions_file'], "w") as f:
        f.write(json.dumps(data_list))

In [11]:
def post_process_prediction(config_params):
    convert_preds_to_original_format(config_params['test_filepath'],
                                     config_params['output_json_predictions_file'], 
                                     config_params['output_parquet_predictions_file'])

    

In [12]:

def convert_preds_to_original_format(
    path_to_test_data: str = "",
    path_to_test_preds: str = "",
    path_to_final_output: str = "",
):
    """
    This function takes the chunked preds and groups them into the original format
    """
    logger.info(f"Original Test Data Path: {path_to_test_data}")
    logger.info(f"Test Set Predictions path:{path_to_test_preds}")
    logger.info(f"Final Output Path:{path_to_final_output}")
    orig_test_data = pd.read_parquet(path_to_test_data, engine="fastparquet")
    if orig_test_data.index.name != "index":
        orig_test_data.set_index("index", inplace=True)
    logger.info(f"Original Test Data Loaded, {orig_test_data.shape}")
    
    with open(path_to_test_preds, "r") as f:
        test_preds = json.load(f)

    test_preds_df = pd.DataFrame(test_preds).groupby(by="index").agg(list)

    logger.info(f"Original Test DF = {orig_test_data.columns}, \
                  Index Range = {max(orig_test_data.index.tolist())}, {min(orig_test_data.index.tolist())},\
                  Original Test DF Shape = {orig_test_data.shape}")
    logger.info(f"Predicted DF before apply = {test_preds_df.columns}")
    test_preds_df["preds"] = test_preds_df["predictions"].apply(
        lambda x: sum(x, [])
    )
    
    logger.info(f"Predicted DF after apply Info")
    logger.info(f"Predictions after DF = {test_preds_df.columns}, \
                  Index Range = {max(test_preds_df.index.tolist())}, {min(test_preds_df.index.tolist())},\
                  Original Test DF Shape = {test_preds_df.shape}")


    for index, row in test_preds_df.iterrows():
        #logger.info(f"Checking Index = {index}")
        #logger.info(f"Original Length = {len(orig_test_data.loc[index, 'tokens'])}")
        #logger.info(f"Predicted Length = {len(row['preds'])}")
        #logger.info(f"Original Values = {orig_test_data.loc[index, 'tokens']}")
        #logger.info(f"Predicted Values = {test_preds_df.at[index, 'preds']}")
        if len(row["preds"]) > len(orig_test_data.loc[index, "tokens"]):
            test_preds_df.at[index, "preds"] = row["preds"][
                : len(orig_test_data.loc[index, "tokens"])
            ]

        elif len(row["preds"]) < len(orig_test_data.loc[index, "tokens"]):
            test_preds_df.at[index, "preds"] = row["preds"] + [0 for _ in range(
                len(orig_test_data.loc[index, "tokens"]) - len(row["preds"]))] 
    for index, row in test_preds_df.iterrows():
        #logger.info(f"Checking Index = {index}")
        assert len(row["preds"]) == len(orig_test_data.loc[index, "tokens"])

    pd.DataFrame(test_preds_df["preds"]).to_parquet(path_to_final_output)
    print(f"final dataset saved to {path_to_final_output}")

    return None

In [13]:
# Expected param for test_filepath = 'data/test_data.parquet'
# json_test_filepath: data_gen_content_test_roberta.json
def convert_parquet_data_to_json(parent_directory, test_filepath, json_test_filepath, config_params):
    test_df = prep_test_data(
        path_to_file=Path(parent_directory) / Path(test_filepath),
        max_length=config_params['max_length'],
    )
    logger.info("Writing test df to json...")
    write_df_to_json(
        test_df,
        f"{parent_directory}/{json_test_filepath}",
    )

In [14]:
def write_df_to_json(df: pd.DataFrame, path_to_json: str):
    """
    This function writes pandas dataframes into a compatible json format
    to be used by hf_token_classification.py
    """
    index_list = df["index"].values.tolist()
    tokens_list = df["tokens"].values.tolist()
    labels_list = df["labels"].values.tolist()
    data_list = []
    for i in tqdm(range(len(tokens_list)), total=len(tokens_list)):
        index = index_list[i]
        tokens = tokens_list[i]
        labels = [str(el) for el in labels_list[i]]
        data_list.append(
            {"index": index, "tokens": tokens, "ner_tags": labels}
        )
    with open(path_to_json, "w") as f:
        f.write(json.dumps(data_list))

In [15]:
def prep_test_data(path_to_file, max_length):
    logger.info(f"Loading test dataset from file")
    df = pd.read_parquet(path_to_file, engine="fastparquet")
    if df.index.name != "index":
        df.set_index("index", inplace=True)

    # the external NER Classification script needs a target column
    # for the test set as well, which is not available.
    # Therefore, we're subsidizing this column with a fake label column
    # Which we're not using anyway, since we're only using the test set
    # for predictions.
    if "token_label_ids" not in df.columns:
        df["token_label_ids"] = df["tokens"].apply(
            lambda x: np.zeros(len(x), dtype=int)
        )
    df = df[["tokens", "token_label_ids"]]

    logger.info(f"Initial test data length: {len(df)}")
    df = chunk_tokens_labels(df, max_length=max_length)
    logger.info(
        f"Test data length after chunking to max {max_length} tokens: {len(df)}"
    )

    return df

In [16]:
def chunk_tokens_labels(df: pd.DataFrame, max_length: int):
    """
    This function chunks tokens and their respective labels to
    max_length token length
    """
    index_list = []
    tokens_list = []
    labels_list = []
    for index, row in tqdm(df.iterrows(), total=len(df)):
        if len(row["token_label_ids"]) > max_length:
            remaining_tokens = row["tokens"]
            remaining_labels = row["token_label_ids"]

            # While the remaining list is larger than max_length,
            # truncate and append
            while len(remaining_labels) > max_length:
                index_list.append(index)
                tokens_list.append(remaining_tokens[:max_length])
                labels_list.append(remaining_labels[:max_length])
                remaining_tokens = remaining_tokens[max_length:]
                remaining_labels = remaining_labels[max_length:]
            # Append last chunk
            index_list.append(index)
            tokens_list.append(remaining_tokens)
            labels_list.append(remaining_labels)
        else:
            index_list.append(index)
            tokens_list.append(row["tokens"])
            labels_list.append(row["token_label_ids"])

    return pd.DataFrame(
        {"index": index_list, "tokens": tokens_list, "labels": labels_list}
    )


In [None]:
%%time
# Get the predictions for the roberta model on the train_data
model_pipeline('roberta', 'train_data')

In [None]:
%%time
# Get the predictions for the scibert model on the train_data
model_pipeline('scibert', 'train_data')

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/56929 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/717k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/921 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/437M [00:00<?, ?B/s]

Map (num_proc=16):   0%|          | 0/56929 [00:00<?, ? examples/s]

In [None]:
%%time
# Get the predictions for the deberta model on the train_data
model_pipeline('deberta', 'train_data')

In [None]:
%%time
# Get the predictions for the biomed_roberta model on the train_data
model_pipeline('biomed_roberta', 'train_data')

In [None]:
# Get the predictions on all the shortlisted models on test_data.parquet and save the predictions parquet file
for model in contesting_models:
    model_pipeline(model, 'test_data')

In [23]:
def merge_train_model_predictions(model_list):
    train_df = pd.read_parquet('data/train_data.parquet', engine='fastparquet')
    train_df = train_df[['text', 'tokens', 'token_label_ids']]
    train_df.rename(columns={'token_label_ids': 'true_labels'}, inplace=True)
    
    merged_df = None
    for model in model_list:
        train_preds_df = pd.read_parquet(f'data/train_data_predictions_{model}.parquet',
                                         engine='fastparquet')
        train_preds_df.rename(columns={'preds': f'{model}_preds'}, inplace=True)
        
        if merged_df is None:
            merged_df = train_preds_df.copy(deep=True)
        else:
            merged_df = merged_df.merge(train_preds_df, how='inner', left_index=True, right_index=True)
        print(merged_df.shape)
    
    merged_train_preds = merged_df.merge(train_df, left_index=True, right_index=True, how='inner')
    
    for model in model_list:
        merged_train_preds[f"{model}_f1_score"] = merged_train_preds.apply(
        lambda x: f1_score(
            x["true_labels"], x[f"{model}_preds"], average="macro"), axis=1,)
        print(f"Average F1 score for {model} train data is: {merged_train_preds[f'{model}_f1_score'].mean()*100:0.4f}%")
    
    merged_train_preds.to_parquet('data/merged_train_predictions.parquet')
    
    return merged_train_preds

In [24]:
%%time
merged_predictions = merge_train_model_predictions(contesting_models)

(5000, 1)
(5000, 2)
(5000, 3)
(5000, 4)
(5000, 5)
Average F1 score for roberta train data is: 89.0101%
Average F1 score for scibert train data is: 89.2137%
Average F1 score for deberta train data is: 90.2066%
Average F1 score for biomed_roberta train data is: 89.0659%
Average F1 score for cs_roberta train data is: 89.0239%
CPU times: user 3min 29s, sys: 5.69 s, total: 3min 35s
Wall time: 3min 35s


In [26]:
merged_predictions.shape

(5000, 13)

In [25]:
merged_predictions.head()

Unnamed: 0_level_0,roberta_preds,scibert_preds,deberta_preds,biomed_roberta_preds,cs_roberta_preds,text,tokens,true_labels,roberta_f1_score,scibert_f1_score,deberta_f1_score,biomed_roberta_f1_score,cs_roberta_f1_score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",the world summit on sustainable development fi...,"[the, world, summit, on, sustainable, developm...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",0.889083,0.900512,0.906215,0.890868,0.888261
13,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bioresorbable implants have progressively move...,"[Bioresorbable, implants, have, progressively,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.830665,0.869474,0.862254,0.830504,0.829883
19,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","For centuries, midwives have encouraged women ...","[For, centuries,, midwives, have, encouraged, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.906498,0.912712,0.919406,0.90463,0.904718
28,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",a study of 170 contacts in the uk reported a 5...,"[a, study, of, 170, contacts, in, the, uk, rep...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",0.858801,0.858801,0.864228,0.858801,0.858801
40,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",There is a growing concern over potentially el...,"[There, is, a, growing, concern, over, potenti...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0,1.0,1.0,1.0,1.0


In [27]:
merged_predictions[['roberta_f1_score', 'scibert_f1_score', 'deberta_f1_score', 
                    'biomed_roberta_f1_score', 'cs_roberta_f1_score']].describe()

Unnamed: 0,roberta_f1_score,scibert_f1_score,deberta_f1_score,biomed_roberta_f1_score,cs_roberta_f1_score
count,5000.0,5000.0,5000.0,5000.0,5000.0
mean,0.890101,0.892137,0.902066,0.890659,0.890239
std,0.072697,0.073494,0.068171,0.072569,0.071771
min,0.467366,0.482727,0.491797,0.467366,0.490759
25%,0.859531,0.861742,0.874075,0.859971,0.85925
50%,0.899727,0.902719,0.910865,0.900814,0.899627
75%,0.933445,0.93669,0.943246,0.93406,0.933423
max,1.0,1.0,1.0,1.0,1.0
