In [69]:
import os
import pathlib
import pandas as pd
import ast
import json
import numpy as np

In [59]:
from datasets import ClassLabel

In [2]:
parent_directory = os.getcwd()
print(parent_directory)

/home/ec2-user/SageMaker/dagpap24-submission/notebooks


In [3]:
contesting_models = ['roberta', 'scibert', 'deberta', 'biomed_roberta']

In [4]:
model_path = []
for model in contesting_models:
    model_path.append(f'{parent_directory}/data/output_dev_{model}')
print(model_path)

['/home/ec2-user/SageMaker/dagpap24-submission/notebooks/data/output_dev_roberta', '/home/ec2-user/SageMaker/dagpap24-submission/notebooks/data/output_dev_scibert', '/home/ec2-user/SageMaker/dagpap24-submission/notebooks/data/output_dev_deberta', '/home/ec2-user/SageMaker/dagpap24-submission/notebooks/data/output_dev_biomed_roberta']


In [5]:
from transformers import AutoModelForTokenClassification

In [None]:
model_roberta = AutoModelForTokenClassification.from_pretrained(model_path[0])

In [None]:
model_roberta.push_to_hub('TheOptimusPrimes/roberta-finetuned-dagpap24',
                          "Pushing Roberta fine-tuned model")

In [None]:
from transformers import RobertaTokenizerFast

roberta_tokenizer = RobertaTokenizerFast.from_pretrained('FacebookAI/roberta-base',
        cache_dir=model_path[0],
        use_fast=True,
        revision="main",
        use_auth_token=None,
        add_prefix_space=True,
    )
roberta_tokenizer.pad_token = roberta_tokenizer.eos_token

In [None]:
roberta_tokenizer.push_to_hub('TheOptimusPrimes/roberta-finetuned-dagpap24',
                          "Pushing Roberta fast tokenizer")

In [6]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

test_uploaded_tokenizer = AutoTokenizer.from_pretrained("TheOptimusPrimes/roberta-finetuned-dagpap24")
test_uploaded_model = AutoModelForTokenClassification.from_pretrained("TheOptimusPrimes/roberta-finetuned-dagpap24")

In [15]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(
        test_uploaded_tokenizer, pad_to_multiple_of=8 if False else None
    )

In [71]:
import evaluate

In [74]:
metric = evaluate.load("f1")
metric

EvaluationModule(name: "f1", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    labels (`list` of `int`): The set of labels to include when `average` is not set to `'binary'`, and the order of the labels if `average` is `None`. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class. Labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in `predictions` and `references` are used in sorted order. Defaults to None.
    pos_label (`int`): The class to be considered the positive class, in the case where `average` is set to `binary`. Defaults to 1.
    average (`string`): This parameter is required for multiclass/multilabel t

In [77]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    merged_predictions = [
        label_list[p]
        for (p, l) in zip(predictions.flatten(), labels.flatten())
        if l != -100
    ]
    merged_labels = [
        label_list[l]
        for (p, l) in zip(predictions.flatten(), labels.flatten())
        if l != -100
    ]
    
    results = metric.compute(
        predictions=merged_predictions,
        references=merged_labels,
        average="macro",
    )

    # if data_args.return_entity_level_metrics:
    if False:
        # Unpack nested dictionaries
        final_results = {}
        for key, value in results.items():
            if isinstance(value, dict):
                for n, v in value.items():
                    final_results[f"{key}_{n}"] = v
            else:
                final_results[key] = value
        return final_results
    else:
        return {
            "f1": results["f1"],
        }


In [78]:
from transformers import Trainer

trainer = Trainer(
        model=test_uploaded_model,
        # args=training_args,
        # train_dataset=train_dataset if training_args.do_train else None,
        # eval_dataset='data/data_gen_content_val_roberta.json',
        tokenizer=test_uploaded_tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [54]:
from datasets import load_dataset

my_datasets = load_dataset('json', data_files={'test': 'data/data_gen_content_dev_roberta.json', 
                                               'validation': 'data/data_gen_content_val_roberta.json'})

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [None]:
from datasets import load_dataset

my_datasets = load_dataset('parquet', data_files={'test': 'data/dev_data.parquet'})

In [56]:
features = my_datasets["validation"].features
print(features)

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'index': Value(dtype='int64', id=None), 'ner_tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}


In [57]:
def get_label_list(labels):
    unique_labels = set()
    for label in labels:
        unique_labels = unique_labels | set(label)
    label_list = list(unique_labels)
    label_list.sort()
    # label_list = list(str(x) for x in range(4))
    return label_list

In [63]:
label_column_name = 'ner_tags'

if isinstance(features[label_column_name].feature, ClassLabel):
    label_list = features[label_column_name].feature.names
    # No need to convert the labels since they are already ints.
    label_to_id = {i: i for i in range(len(label_list))}
else:
    label_list = get_label_list(my_datasets["validation"][label_column_name])
    label_to_id = {l: i for i, l in enumerate(label_list)}
    
print(label_list)
print(label_to_id)

['0', '1', '2', '3']
{'0': 0, '1': 1, '2': 2, '3': 3}


In [66]:
# Tokenize all texts and align the labels with them.
def tokenize_and_align_labels(examples):
    if type(examples['tokens'][0]) is bytes:
        print("Hello")
        examples["tokens"] = [ast.literal_eval(x.decode()) for x in examples['tokens']] # examples['tokens'].map(lambda x:ast.literal_eval(x.decode()))
        
#     print(type(examples['tokens']))
#     print(type(examples['tokens'][0]))
#     print(examples['tokens'][0])
    
    tokenized_inputs = test_uploaded_tokenizer(
        examples['tokens'],
        padding='max_length',
        truncation=True,
        max_length=512,
        # We use this argument because the texts in our dataset are lists
        # of words (with a label for each word).
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label
            # to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[label[word_idx]])
            # For the other tokens in a word, we set the label
            # to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(
                    label_to_id[label[word_idx]]
                    # if data_args.label_all_tokens
                    if False
                    else -100
                )
            previous_word_idx = word_idx

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [67]:
test_dataset = my_datasets["test"]

# Just for the purpose of testing, restricting the number of rows to be tested
test_dataset = test_dataset.select(range(500))

test_dataset = test_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=16,
    # load_from_cache_file=not data_args.overwrite_cache,
    load_from_cache_file=False,
)

Map (num_proc=16):   0%|          | 0/500 [00:00<?, ? examples/s]

  table = cls._concat_blocks(blocks, axis=0)


In [48]:
test_dataset = my_datasets["test"]

# Just for the purpose of testing, restricting the number of rows to be tested
# test_dataset = test_dataset.select(range(500))

test_dataset = test_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=16,
    # load_from_cache_file=not data_args.overwrite_cache,
    load_from_cache_file=False,
)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Hello


KeyError: 'ner_tags'

In [None]:
from transformers import RobertaTokenizerFast

imported_tokenizer = RobertaTokenizerFast.from_pretrained(
        'FacebookAI/roberta-base',
        use_fast=True,
        revision="main",
        use_auth_token=None,
        add_prefix_space=True,
    )
imported_tokenizer.pad_token = imported_tokenizer.eos_token

In [21]:
test_dataset.head()

Unnamed: 0,index,tokens,ner_tags
0,12313,"[Phylogenetic, networks, are, a, generalizatio...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,12313,"[This, raises, the, question, of, whether, lev...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,12313,"[x, in, N, ,, or, •, S, is, an, arc, side, of,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,12313,"[The, pseudo, code, is, in, Algorithm, 1, ., W...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,12313,"[First, let, (, x, ,, y, ), be, an, arc, of, Ω...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [79]:
# Run the predictions on the model that was finetuned
predictions, labels, metrics = trainer.predict(test_dataset)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

In [81]:
output_json_predictions_file = "data/test_finetuned_predictions_roberta.json"

data_list = []
for i in range(len(predictions)):
    data_list.append(
        {
            "index": test_dataset[i]["index"],
            "predictions": predictions[i].tolist(),
        }
    )
with open(output_json_predictions_file, "w") as f:
    f.write(json.dumps(data_list))

In [83]:
import logging
logger = logging.getLogger(__name__)

In [84]:

def convert_preds_to_original_format(
    path_to_test_data: str = "",
    path_to_test_preds: str = "",
    path_to_final_output: str = "",
):
    """
    This function takes the chunked preds and groups them into the original format
    """
    logger.info(f"Original Test Data Path: {path_to_test_data}")
    logger.info(f"Test Set Predictions path:{path_to_test_preds}")
    logger.info(f"Final Output Path:{path_to_final_output}")
    orig_test_data = pd.read_parquet(path_to_test_data, engine="fastparquet")
    if orig_test_data.index.name != "index":
        orig_test_data.set_index("index", inplace=True)
    logger.info(f"Original Test Data Loaded, {orig_test_data.shape}")
    
    with open(path_to_test_preds, "r") as f:
        test_preds = json.load(f)

    test_preds_df = pd.DataFrame(test_preds).groupby(by="index").agg(list)

    logger.info(f"Original Test DF = {orig_test_data.columns}, \
                  Index Range = {max(orig_test_data.index.tolist())}, {min(orig_test_data.index.tolist())},\
                  Original Test DF Shape = {orig_test_data.shape}")
    logger.info(f"Predicted DF before apply = {test_preds_df.columns}")
    test_preds_df["preds"] = test_preds_df["predictions"].apply(
        lambda x: sum(x, [])
    )
    
    logger.info(f"Predicted DF after apply Info")
    logger.info(f"Predictions after DF = {test_preds_df.columns}, \
                  Index Range = {max(test_preds_df.index.tolist())}, {min(test_preds_df.index.tolist())},\
                  Original Test DF Shape = {test_preds_df.shape}")


    for index, row in test_preds_df.iterrows():
        #logger.info(f"Checking Index = {index}")
        #logger.info(f"Original Length = {len(orig_test_data.loc[index, 'tokens'])}")
        #logger.info(f"Predicted Length = {len(row['preds'])}")
        #logger.info(f"Original Values = {orig_test_data.loc[index, 'tokens']}")
        #logger.info(f"Predicted Values = {test_preds_df.at[index, 'preds']}")
        if len(row["preds"]) > len(orig_test_data.loc[index, "tokens"]):
            test_preds_df.at[index, "preds"] = row["preds"][
                : len(orig_test_data.loc[index, "tokens"])
            ]

        elif len(row["preds"]) < len(orig_test_data.loc[index, "tokens"]):
            test_preds_df.at[index, "preds"] = row["preds"] + [0 for _ in range(
                len(orig_test_data.loc[index, "tokens"]) - len(row["preds"]))] 
    for index, row in test_preds_df.iterrows():
        #logger.info(f"Checking Index = {index}")
        assert len(row["preds"]) == len(orig_test_data.loc[index, "tokens"])

    pd.DataFrame(test_preds_df["preds"]).to_parquet(path_to_final_output)
    print(f"final dataset saved to {path_to_final_output}")

    return None

In [85]:
convert_preds_to_original_format('data/dev_data.parquet', output_json_predictions_file, 
                                 'data/finetined_model_predictions_roberta.parquet')

final dataset saved to finetined_model_predictions_roberta.parquet


In [None]:
# Save predictions
output_test_predictions_file = os.path.join(
    training_args.output_dir, "test_predictions_roberta.txt"
)

output_json_predictions_file = os.path.join(
    training_args.output_dir, "test_predictions_roberta.json"
)

if trainer.is_world_process_zero():
    with open(output_test_predictions_file, "w") as writer:
        for prediction in true_predictions:
            writer.write(" ".join([str(i) for i in prediction]) + "\n")

    # save preds to json
    assert len(predictions) == len(test_dataset)

    

In [None]:
data_list = []
for i in range(len(predictions)):
    data_list.append(
        {
            "index": test_dataset[i]["index"],
            "predictions": predictions[i].tolist(),
        }
    )
with open("data/roberta_predictions_from_pretrained_test.json", "w") as f:
    f.write(json.dumps(data_list))

In [7]:
from transformers import TokenClassificationPipeline

In [11]:
roberta_pipeline = TokenClassificationPipeline('roberta_finetuned_token_classification',
                                               model=test_uploaded_model)
# , 
#                                               tokenizer=test_uploaded_tokenizer)

In [9]:
test_dataset = pd.read_json('data/data_gen_content_dev_roberta.json')
test_dataset.head()

Unnamed: 0,index,tokens,ner_tags
0,12313,"[Phylogenetic, networks, are, a, generalizatio...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,12313,"[This, raises, the, question, of, whether, lev...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,12313,"[x, in, N, ,, or, •, S, is, an, arc, side, of,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,12313,"[The, pseudo, code, is, in, Algorithm, 1, ., W...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,12313,"[First, let, (, x, ,, y, ), be, an, arc, of, Ω...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
type(test_dataset['tokens'][0])

In [None]:
test_dataset['tokens'][0][1:5]

In [None]:
type(roberta_pipeline)

In [12]:
test_roberta_preds_from_finetuned = roberta_pipeline(['networks'])
test_roberta_preds_from_finetuned.head()

TypeError: 'str' object is not callable

In [None]:
dir(roberta_pipeline.predict)

In [None]:
# Get a pre-trained model

model_1 = 

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="TheOptimusPrimes/scibert-finetuned-dagpap24")

In [None]:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("TheOptimusPrimes/scibert-finetuned-dagpap24")