In [None]:
# Mount Google Drive at /content/drive so the following cells can work with
# files stored there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import tqdm

In [None]:
%cd drive/MyDrive/Colab Notebooks/info_extraction/synth_notes

In [None]:
!wget https://dl.fbaipublicfiles.com/biolm/RoBERTa-base-PM-hf.tar.gz
!tar -zxvf RoBERTa-base-PM-hf.tar.gz

In [None]:
# Install the packages listed in requirements.txt (read from the current
# directory), then download spaCy's en_core_web_sm pipeline.
%pip install -r requirements.txt
!python -m spacy download en_core_web_sm

In [None]:
%%shell
# Build and install NVIDIA Apex from source, passing the --cpp_ext, --cuda_ext
# and --deprecated_fused_adam build options.
# NOTE(review): no commit or tag is pinned, so this builds whatever the
# repository's default branch holds at run time.
git clone https://github.com/NVIDIA/apex.git
cd apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" ./
cd ../

In [None]:
%%shell
# Install Hugging Face transformers from source (editable install), pinned to
# one specific commit.
# NOTE(review): the clone uses an SSH remote (git@github.com:...), which needs
# a GitHub SSH key on this machine; without one the clone presumably fails. If
# the shell then carries on, `git reset --hard` and `pip install -e .` would
# run in the current directory instead of the clone - verify.
# NOTE(review): confirm the pinned commit provides the APIs used further down
# (e.g. the evaluation_strategy and ignore_mismatched_sizes arguments).
git clone git@github.com:huggingface/transformers.git
cd transformers
git reset --hard 601ac5b1dc1438f00d09696588f2deb0f045ae3b
pip install -e .
cd ..

In [None]:
from datasets import load_dataset, Dataset
import evaluate
import numpy as np
import os
import pandas as pd
import tokenizers
from transformers import DataCollatorForLanguageModeling, RobertaTokenizer, RobertaForMaskedLM, RobertaModel, RobertaConfig, AutoModel, AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Build the masked-LM model from the RoBERTa-base-PM checkpoint. The
# configuration is loaded first with the position-embedding count and the
# vocabulary size overridden, and the weights are then loaded against it.
# NOTE(review): if the checkpoint's vocabulary size is not 50272, the
# ignore_mismatched_sizes flag presumably means the mismatched weights are not
# loaded from the checkpoint - confirm that this is intended.
CHECKPOINT = "bio-lm/RoBERTa-base-PM-hf"

config = RobertaConfig.from_pretrained(
    CHECKPOINT,
    max_position_embeddings=514,
    vocab_size=50272,
)
model = RobertaForMaskedLM.from_pretrained(
    CHECKPOINT,
    config=config,
    ignore_mismatched_sizes=True,
)

In [None]:
# Load the synthetic notes and keep only their 'text' column.
df = pd.read_csv('gpt_synth_notes/detailed_withhistory_augmented.csv')['text'].to_frame()
# Append an all-zero 'labels' column (presumably a placeholder, since the MLM
# collator produces its own labels - confirm).
df.insert(len(df.columns), 'labels', [0] * len(df))
# Wrap the frame in a Hugging Face Dataset for the tokenization step.
dataset = Dataset.from_pandas(df)

In [None]:
# Preprocess: tokenize the notes, split them into train/test sets, and build
# the masked-language-modeling collator.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


def tokenize_function(examples):
    """Tokenize one batch of examples.

    Args:
        examples: A batch supplied by ``dataset.map(..., batched=True)``; its
            ``"text"`` entry is handed to the tokenizer.

    Returns:
        The tokenizer output for the batch, padded to the maximum length and
        truncated where the text is longer.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)


dataset = dataset.map(tokenize_function, batched=True)

# Shuffle, then hold out 10% of the examples for evaluation. The split gets
# its own seed as well: train_test_split shuffles again by default, so without
# a seed the train/test membership would differ from run to run.
shuffled_dataset = dataset.shuffle(seed=42)
shuffled_dataset = shuffled_dataset.train_test_split(test_size=0.1, seed=42)

# Collator for masked language modeling with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
# Sanity-check the byte-level BPE vocabulary stored under vocab/ by encoding a
# sample sentence and displaying its tokens.
# The tokenizer is bound to its own name so that it does not replace the
# RobertaTokenizer held in `tokenizer`, which `tokenize_function` calls;
# otherwise re-running the preprocessing cell after this one would break.
from tokenizers import ByteLevelBPETokenizer
bpe_tokenizer = ByteLevelBPETokenizer('vocab/vocab.json', 'vocab/merges.txt')
# Post-process encodings with the <s> / </s> special tokens looked up in the
# loaded vocabulary.
bpe_tokenizer._tokenizer.post_processor = tokenizers.processors.RobertaProcessing(
  sep=("</s>", bpe_tokenizer._tokenizer.token_to_id("</s>")),
  cls=("<s>", bpe_tokenizer._tokenizer.token_to_id("<s>"))
)
bpe_tokenizer.enable_truncation(max_length=512)
bpe_tokenizer.encode("Patient will follow up with a gastroenterologist.").tokens

In [None]:
# Training setup: evaluate once per epoch, batches of 8 per device.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch",
                                  per_device_train_batch_size=8, per_device_eval_batch_size=8)

# eval
metric = evaluate.load('accuracy')


def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to predicted token ids before they are accumulated.

    Keeping only the argmax avoids holding the full per-token vocabulary
    logits for the whole evaluation set in memory.

    Args:
        logits: Model logits, or a tuple whose first element is the logits.
        labels: Evaluation labels (unused; part of the hook signature).

    Returns:
        The index of the highest-scoring vocabulary entry at each position.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


def compute_metrics(eval_pred):
    """Compute token-level accuracy over the masked positions only.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` may be
            either predicted token ids or raw logits (one extra trailing
            dimension), in which case the argmax is taken here.

    Returns:
        The result of the accuracy metric, computed on the positions whose
        label is not -100.
    """
    predictions, labels = eval_pred
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    if predictions.ndim == labels.ndim + 1:
        # Raw logits were passed in: reduce them to token ids first.
        predictions = np.argmax(predictions, axis=-1)
    # -100 is the ignore label given to positions that were not masked; only
    # masked positions are scored. Boolean indexing also flattens the arrays
    # to the 1-D form the metric is given.
    scored = labels != -100
    return metric.compute(predictions=predictions[scored], references=labels[scored])


trainer = Trainer(model=model, args=training_args, train_dataset=shuffled_dataset['train'],
                  eval_dataset=shuffled_dataset['test'],
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  preprocess_logits_for_metrics=preprocess_logits_for_metrics)
trainer.train()

In [None]:
# Save the trained model under 'roberta-trained-on-synth-mlm' (a path relative
# to the current working directory).
trainer.save_model('roberta-trained-on-synth-mlm')