In [None]:
from transformers import TFAutoModelForMaskedLM

# Checkpoint identifier; the tokenizer cell loads from the same name.
model_checkpoint = "allenai/scibert_scivocab_uncased"

# from_pt=True requests that the TensorFlow model be built from PyTorch weights.
model = TFAutoModelForMaskedLM.from_pretrained(
    model_checkpoint,
    from_pt=True,
)

# Print the layer / parameter overview of the loaded model.
model.summary()

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer from the same checkpoint as the model so both are created
# from one identifier.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
from datasets import load_dataset
# The arXiv CSV is exposed as a single "train" split; it is carved into four
# consecutive quarters, the last of which is used as the held-out test set.
arxiv_csv = "arXiv_dataset.csv"
train_dataset = load_dataset("csv", data_files=arxiv_csv, split="train[:25%]")
train_dataset2 = load_dataset("csv", data_files=arxiv_csv, split="train[25%:50%]")
train_dataset3 = load_dataset("csv", data_files=arxiv_csv, split="train[50%:75%]")
test_dataset = load_dataset("csv", data_files=arxiv_csv, split="train[75%:100%]")

# Both acronym files are loaded into one dataset. They are passed as an ordered
# list rather than a set literal: a set's iteration order depends on string
# hash randomisation, so the row order could differ between kernel restarts.
corpus_dataset = load_dataset("csv", data_files=["acronym_train.csv", "acronym_test.csv"])

In [None]:
def tokenize_function(examples):
    """Tokenize the masked input text of a batch.

    Args:
        examples: Batch mapping with a "masked" entry holding the input strings.

    Returns:
        The tokenizer's encoding, padded/truncated to 70 tokens. When the
        tokenizer is a fast tokenizer, a "word_ids" entry with one list per
        example is added.
    """
    encoded = tokenizer(
        examples["masked"],
        truncation=True,
        max_length=70,
        padding="max_length",
    )

    # Slow tokenizers have nothing more to add.
    if not tokenizer.is_fast:
        return encoded

    n_rows = len(encoded["input_ids"])
    encoded["word_ids"] = list(map(encoded.word_ids, range(n_rows)))
    return encoded

def tokenize_labels(examples):
    """Tokenize the target text and expose its token ids as training labels.

    Args:
        examples: Batch mapping with an "outputs" entry holding the target
            strings.

    Returns:
        The tokenizer's encoding, padded/truncated to 70 tokens, with a
        "labels" entry copied from "input_ids". When the tokenizer is a fast
        tokenizer, a "word_ids" entry with one list per example is added too.
    """
    result = tokenizer(examples["outputs"], padding='max_length', max_length=70, truncation=True)
    if tokenizer.is_fast:
        # word_ids() is only queried for fast tokenizers.
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]

    # Labels do not depend on the tokenizer backend, so they are assigned
    # outside the is_fast branch; otherwise a slow tokenizer would yield a
    # batch without any "labels" entry.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Step 1: encode the target text ("outputs"); this is where the `labels`
# column is created.
tokenized_train = train_dataset.map(
    tokenize_labels,
    batched=True,
    remove_columns=["outputs"],
)

# Step 2: encode the masked input text. The encoder columns left over from
# step 1 are listed for removal so that the columns of the same name returned
# by tokenize_function take their place; `labels` is not listed and is kept.
stale_columns = [
    "paper_id",
    "word_ids",
    "masked",
    "input_ids",
    "token_type_ids",
    "attention_mask",
]
tokenized_train = tokenized_train.map(
    tokenize_function,
    batched=True,
    remove_columns=stale_columns,
)

In [None]:
# Convert the tokenized split into a shuffled tf.data.Dataset with batches of
# 32: the three encoder columns are the features, `labels` is the target.
feature_columns = ["input_ids", "token_type_ids", "attention_mask"]
tf_train = tokenized_train.to_tf_dataset(
    columns=feature_columns,
    label_cols=["labels"],
    batch_size=32,
    shuffle=True,
)

In [None]:
from transformers import create_optimizer
import tensorflow as tf

# Total optimizer steps: the number of batches in the training dataset.
num_train_steps = len(tf_train)

# Warm up for 1,000 steps whenever the run is longer than that. For shorter
# runs a fixed 1,000-step warm-up would leave a non-positive number of decay
# steps, so fall back to 10% of the available steps instead.
num_warmup_steps = 1_000 if num_train_steps > 1_000 else num_train_steps // 10

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=num_warmup_steps,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

# The loss is configured with from_logits=True, i.e. it is given raw scores
# rather than probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

# NOTE(review): the global dtype policy is the default picked up by layers when
# they are constructed; `model` was already built in an earlier cell, so
# confirm this call has the intended effect at this point in the notebook.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Last path component of the checkpoint id, i.e. the name without the owner.
model_name = model_checkpoint.split("/")[-1]

In [None]:
# Fine-tune the compiled model for one epoch over the batched training set.
model.fit(tf_train, epochs=1)

In [None]:
# Write the tokenizer to the same directory the model is saved to, so both can
# be reloaded from a single path.
tokenizer.save_pretrained("in_progress/model")

In [None]:
# Save the fine-tuned model into the directory that also holds the tokenizer.
model.save_pretrained('in_progress/model')