In [None]:
import numpy as np
from transformers import TFAutoModelForMaskedLM

# Hub identifier of the pretrained checkpoint; the same value is reused when
# loading the tokenizer so that model and vocabulary match.
model_checkpoint = "allenai/scibert_scivocab_uncased"
# from_pt=True asks transformers to load the PyTorch weights of the checkpoint
# into the TensorFlow masked-LM class.
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

# Print the Keras layer / parameter overview as a quick sanity check.
model.summary()

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer belonging to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
from datasets import load_dataset
# Read the train and test splits from local CSV files. Later cells read the
# "masked" and "outputs" columns of these splits.
dataset = load_dataset("csv", data_files={"train":"acronym_train.csv", "test":"acronym_test.csv"})

# Display the loaded splits.
dataset

In [None]:
# Show one training row (index 20) to eyeball the raw columns.
dataset["train"][20]

In [None]:
# Take three rows from a reproducibly shuffled (seed=42) copy of the training split.
sample = dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    # NOTE: despite the "Input IDs" label, this prints the "masked" column
    # itself (the value later handed to the tokenizer), not token ids.
    print(f"'>>> Input IDs: {row['masked']}")
    print(f"'>>> Outputs: {row['outputs']}")

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"masked"`` column of a batch.

    Args:
        examples: Batch mapping handed over by ``Dataset.map(batched=True)``;
            only its ``"masked"`` entry is read.

    Returns:
        The tokenizer output with every row padded or truncated to 30
        tokens. When the tokenizer is a fast tokenizer, a ``"word_ids"``
        entry (one list per row) is added as well.
    """
    # padding='max_length' on its own never shortens a row, so rows longer
    # than max_length would come out ragged; truncation=True pins every row
    # to the same length.
    result = tokenizer(
        examples["masked"],
        padding='max_length',
        truncation=True,
        max_length=30,
    )
    if tokenizer.is_fast:
        # word_ids() is only offered by fast tokenizers.
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]

    return result

def tokenize_labels(examples):
    """Tokenize the ``"outputs"`` column of a batch and derive the labels.

    Args:
        examples: Batch mapping handed over by ``Dataset.map(batched=True)``;
            only its ``"outputs"`` entry is read.

    Returns:
        The tokenizer output with every row padded or truncated to 30
        tokens, plus a ``"labels"`` entry that is a copy of ``"input_ids"``.
        When the tokenizer is a fast tokenizer, a ``"word_ids"`` entry is
        added as well.
    """
    # padding='max_length' on its own never shortens a row, so rows longer
    # than max_length would come out ragged; truncation=True pins every row
    # to the same length.
    result = tokenizer(
        examples["outputs"],
        padding='max_length',
        truncation=True,
        max_length=30,
    )
    if tokenizer.is_fast:
        # word_ids() is only offered by fast tokenizers.
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]

    # Labels do not depend on the tokenizer flavour, so they are created
    # unconditionally; otherwise a slow tokenizer yields no label column.
    result["labels"] = result["input_ids"].copy()

    return result

In [None]:
# First pass: tokenize the "outputs" column. tokenize_labels is the function
# that adds the "labels" column (a copy of its input_ids).
tokenized_dataset = dataset.map(
    tokenize_labels, batched=True, remove_columns=["outputs"]
)

# Second pass: tokenize the "masked" column. The encoder columns left over
# from the first pass are listed in remove_columns so that only "labels" is
# carried over from it; the columns returned by tokenize_function then
# describe the masked text.
# NOTE(review): this relies on Dataset.map keeping function outputs whose
# names also appear in remove_columns — confirm against the installed
# datasets version.
tokenized_dataset = tokenized_dataset.map(
    tokenize_function, batched=True, remove_columns=["word_ids", "masked", "input_ids", "token_type_ids", "attention_mask"]
)

# Display the resulting splits and columns.
tokenized_dataset

In [None]:
from transformers import DataCollatorWithPadding

# Training batches: reshuffled so batch composition is not tied to row order.
tf_train = tokenized_dataset["train"].to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask'],
    label_cols=["labels"],
    shuffle=True,
    batch_size=32,
)

# Evaluation batches: NOT shuffled, so the rows returned by
# model.predict(tf_test) stay aligned with tokenized_dataset["test"].
tf_test = tokenized_dataset["test"].to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask'],
    label_cols=["labels"],
    shuffle=False,
    batch_size=32,
)

# Display the tf.data pipeline description for the training split.
tf_train

In [None]:
from transformers import create_optimizer
import tensorflow as tf

# Number of passes over the training data; keep in sync with the `epochs`
# argument given to model.fit.
num_epochs = 25
# create_optimizer needs the TOTAL number of optimizer steps of the run
# (batches per epoch * epochs). Using only len(tf_train) would exhaust the
# learning-rate schedule after the first epoch.
num_train_steps = len(tf_train) * num_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01
)

# The model head returns raw logits, hence from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

# NOTE(review): the mixed-precision policy is set after the model has been
# built and compiled; Keras applies the global policy to layers created
# afterwards, so this presumably does not affect `model` — confirm, and set
# it before loading the model if mixed precision is actually wanted.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Short model name: the checkpoint id without its organisation prefix.
model_name = model_checkpoint.split("/")[-1]

In [None]:
# Fine-tune for 25 epochs; the learning-rate schedule's total step count
# (num_train_steps) has to cover all of these epochs.
model.fit(tf_train, epochs=25)

In [None]:
def get_length(lf):
    """Count the words of a phrase that are expected to contribute a letter.

    The phrase is split on whitespace. Words containing a parenthesis and the
    connectives "and", "or" and "of" are ignored. Every remaining word counts
    once; a word with an inner hyphen (e.g. "post-operative") counts twice,
    while a word that merely ends in a hyphen (e.g. "pre-") counts once.

    Args:
        lf: The phrase to measure.

    Returns:
        The number of counted words as an int (0 for an empty phrase).
    """
    skipped_words = ("and", "or", "of")
    count = 0
    for word in lf.split():
        if "(" in word or ")" in word or word in skipped_words:
            continue
        count += 1
        # A trailing hyphen does not introduce a second part. The check has
        # to look at the single last character; comparing it with "- " (two
        # characters) could never match.
        if "-" in word and not word.endswith("-"):
            count += 1

    return count

In [None]:
def predict(lf):
    """Fill the mask token of a phrase with the model's top prediction(s).

    The first mask position is filled with the highest-scoring token,
    upper-cased. If that token has fewer characters than ``get_length``
    reports for the filled phrase, a second mask is inserted in front of
    every ")" and filled with the model's top token at the following
    position.

    Args:
        lf: Phrase containing the tokenizer's mask token.

    Returns:
        The phrase with the mask replaced by the predicted text.

    Raises:
        IndexError: If the tokenized phrase contains no mask token.
    """
    mask_token = tokenizer.mask_token

    inputs = tokenizer(lf, return_tensors="np")
    token_logits = model(**inputs).logits

    # Column index of the first mask token in the single input row.
    mask_token_index = np.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
    mask_token_logits = token_logits[0, mask_token_index, :]

    # Highest logit first.
    top_token_index_1 = np.argsort(-mask_token_logits)[0]
    first_piece = tokenizer.decode(top_token_index_1)

    lf = lf.replace(mask_token, first_piece.upper())

    # One predicted character per counted word is the target; stop when the
    # first token is already long enough.
    if len(first_piece) >= get_length(lf):
        return lf

    lf = lf.replace(")", " " + mask_token + ")")
    inputs = tokenizer(lf, return_tensors="np")
    token_logits = model(**inputs).logits
    # NOTE(review): assumes the first prediction re-tokenizes to exactly one
    # token, so the new mask sits directly after the old mask position —
    # confirm for multi-piece predictions and phrases with several ")".
    mask_token_index2 = mask_token_index + 1
    mask_token_logits2 = token_logits[0, mask_token_index2, :]
    top_token_index_2 = np.argsort(-mask_token_logits2)[0]

    second_piece = tokenizer.decode(top_token_index_2)
    # Only strip the word-piece continuation marker when it is really there;
    # slicing unconditionally would eat the first two characters of a
    # regular token.
    if second_piece.startswith("##"):
        second_piece = second_piece[2:]

    return lf.replace(" " + mask_token, second_piece.upper())

In [None]:
# Run the predictor over the "masked" text of every test row, in row order.
results = [predict(example["masked"]) for example in dataset["test"]]

In [None]:
import csv

# Header row of the output file.
fields = ["prediction", "gold_labels"]

# Reference outputs from the untokenized test split, in row order.
gold_labels = dataset["test"]["outputs"]

# Pair every prediction with its reference; zip stops at the shorter input.
data = zip(results, gold_labels)

# newline='' is what the csv module expects from the file object (otherwise
# extra blank rows appear on platforms that translate line endings); the
# explicit encoding keeps non-ASCII text portable across platforms.
with open('predictions.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    writer.writerows(data)

In [None]:
# Run the fine-tuned model over the batched test split and keep the raw output.
test_results = model.predict(tf_test)
# Display the tokenized test split.
tokenized_dataset["test"]