## Environment setup (shell)



In [None]:
# Mount Google Drive inside the Colab VM. The mount is forced
# (force_remount=True) on every run of this cell.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

### Get requirements and data

In [None]:
%cd drive/MyDrive/Colab Notebooks/info_extraction/synth_notes

In [None]:
%%shell
# Install the project's Python requirements, then those of the bio-lm checkout.
# set -e stops at the first failure; previously a failed `cd bio-lm` silently
# re-installed the top-level requirements.txt instead of bio-lm's.
set -e
pip install -r requirements.txt
# NOTE(review): bio-lm is cloned by a later cell in this notebook; this step
# assumes a checkout already exists in the working directory.
# The subshell keeps the directory change local, so no trailing `cd ..` is needed.
(cd bio-lm && pip install -r requirements.txt)

In [None]:
# Download the spaCy English pipeline `en_core_web_sm` through the spaCy CLI.
!python -m spacy download en_core_web_sm

In [None]:
!apt install git-lfs

In [None]:
!wget https://dl.fbaipublicfiles.com/biolm/RoBERTa-base-PM-hf.tar.gz
!tar -zxvf RoBERTa-base-PM-hf.tar.gz

In [None]:
%%shell
# Install Hugging Face transformers from source, pinned to a fixed commit.
# set -e aborts on the first failure, so a failed clone/cd can no longer let
# `git reset --hard` and `pip install -e .` run in the wrong directory.
set -e
# Skip the clone when a checkout is already present (makes the cell re-runnable).
[ -d transformers ] || git clone https://github.com/huggingface/transformers.git
cd transformers
git reset --hard 601ac5b1dc1438f00d09696588f2deb0f045ae3b
pip install -e .

In [None]:
%%shell
# Check out the BLUE_Benchmark repository at a fixed commit.
# set -e aborts on the first failure, so a failed clone/cd can no longer let
# `git reset --hard` run against whatever repository the cell started in.
set -e
# Skip the clone when a checkout is already present (makes the cell re-runnable).
[ -d BLUE_Benchmark ] || git clone https://github.com/ncbi-nlp/BLUE_Benchmark.git
cd BLUE_Benchmark
git reset --hard b6216f2cb9bba209ee7028fc874123d8fd5a810c

In [None]:
!wget https://raw.githubusercontent.com/spyysalo/conlleval.py/master/conlleval.py

In [None]:
%%shell
# Clone the bio-lm repository unless a checkout already exists; a plain
# `git clone` fails with "destination path already exists" on re-run and
# aborts the cell.
set -e
[ -d bio-lm ] || git clone https://github.com/facebookresearch/bio-lm.git

In [None]:
%%shell
# Run bio-lm's task-data download script, then its classification
# preprocessing script.
# NOTE(review): both scripts are addressed as bio-lm/..., but the second one is
# invoked after `cd ..`, i.e. from the parent directory — confirm which working
# directory each script actually expects.
bash bio-lm/preprocessing/download_all_task_data.sh
cd ..
bash bio-lm/preprocessing/preprocess_all_classification_datasets.sh
# NOTE(review): presumably intended to leave later cells inside bio-lm, but a
# directory change made inside a %%shell cell likely does not carry over to
# other cells — verify.
cd bio-lm

## Classification task
* SynthCLIP data: base vs synth vs CLIP
* CLIP data: base vs synth vs CLIP

### SynthCLIP

Test base model on SynthCLIP

In [None]:
%%shell
# Evaluate the pretrained base checkpoint on the SynthCLIP task (--do_test only).
# NOTE(review): DATADIR is relative — presumably this runs from inside the
# bio-lm checkout; confirm the working directory.
# The final argument previously ended in a dangling "\" continuation; it is
# removed so the command ends unambiguously.
TASK="SynthCLIP"
DATADIR="../data/tasks/SynthCLIP"
MODEL=RoBERTa-base-PM-hf
MODEL_TYPE=roberta
python -m biolm.run_classification \
    --task_name "${TASK}" \
    --data_dir "${DATADIR}" \
    --model_type "${MODEL_TYPE}" \
    --model_name_or_path "${MODEL}" \
    --tokenizer_name roberta-base \
    --output_dir model \
    --do_test \
    --overwrite_output_dir \
    --overwrite_cache

Train and test on SynthCLIP data

In [None]:
%%shell
# Fine-tune the pretrained base checkpoint on SynthCLIP (--do_train, --do_eval)
# and write the result to model_synth_undersampled.
# The final argument previously ended in a dangling "\" continuation; it is
# removed so the command ends unambiguously.
TASK="SynthCLIP"
DATADIR="../data/tasks/SynthCLIP"
MODEL=RoBERTa-base-PM-hf
MODEL_TYPE=roberta
python -m biolm.run_classification \
    --task_name "${TASK}" \
    --data_dir "${DATADIR}" \
    --model_type "${MODEL_TYPE}" \
    --model_name_or_path "${MODEL}" \
    --tokenizer_name roberta-base \
    --output_dir model_synth_undersampled \
    --do_train \
    --do_eval \
    --overwrite_output_dir \
    --overwrite_cache

In [None]:
%%shell
# Run the test pass (--do_test) of the SynthCLIP-fine-tuned model
# (model_synth_undersampled) on the SynthCLIP task.
# The final argument previously ended in a dangling "\" continuation; it is
# removed so the command ends unambiguously.
TASK="SynthCLIP"
DATADIR="../data/tasks/SynthCLIP"
MODEL=model_synth_undersampled
MODEL_TYPE=roberta
python -m biolm.run_classification \
    --task_name "${TASK}" \
    --data_dir "${DATADIR}" \
    --model_type "${MODEL_TYPE}" \
    --model_name_or_path "${MODEL}" \
    --tokenizer_name roberta-base \
    --output_dir model_synth_undersampled \
    --do_test \
    --overwrite_output_dir \
    --overwrite_cache

### CLIP experiments

In [None]:
!wget -r -N -c -np --user sathomas --ask-password https://physionet.org/files/mimic-iii-clinical-action/1.0.0/

Base model for CLIP test

In [None]:
%%shell
# Evaluate the pretrained base checkpoint on the CLIP task (--do_test only).
# --output_dir deliberately stays equal to the model directory, as in the
# original cell.
# The final argument previously ended in a dangling "\" continuation; it is
# removed so the command ends unambiguously.
TASK="CLIP"
DATADIR="../data/tasks/CLIP"
MODEL=RoBERTa-base-PM-hf
MODEL_TYPE=roberta
python -m biolm.run_classification \
    --task_name "${TASK}" \
    --data_dir "${DATADIR}" \
    --model_type "${MODEL_TYPE}" \
    --model_name_or_path "${MODEL}" \
    --tokenizer_name roberta-base \
    --output_dir "${MODEL}" \
    --do_test \
    --overwrite_output_dir \
    --overwrite_cache

Train CLIP model for CLIP task

In [None]:
%%shell
# Fine-tune on the CLIP task (--do_train, --do_eval) and write the result to
# model_clip_undersampled.
# MODEL now points at the pretrained base checkpoint, mirroring the SynthCLIP
# training cell. It previously pointed at model_clip_undersampled, this cell's
# own output directory, which does not exist before the first training run.
TASK="CLIP"
DATADIR="../data/tasks/CLIP"
MODEL=RoBERTa-base-PM-hf
MODEL_TYPE=roberta
python -m biolm.run_classification \
    --task_name "${TASK}" \
    --data_dir "${DATADIR}" \
    --model_type "${MODEL_TYPE}" \
    --model_name_or_path "${MODEL}" \
    --tokenizer_name roberta-base \
    --output_dir model_clip_undersampled \
    --do_train \
    --do_eval \
    --overwrite_output_dir

In [None]:
%%shell
# Run the test pass (--do_test) of the CLIP-fine-tuned model
# (model_clip_undersampled) on the CLIP task.
MODEL_TYPE=roberta
MODEL=model_clip_undersampled
TASK="CLIP"
DATADIR="../data/tasks/CLIP"
python -m biolm.run_classification \
    --task_name "${TASK}" \
    --data_dir "${DATADIR}" \
    --model_type "${MODEL_TYPE}" \
    --model_name_or_path "${MODEL}" \
    --tokenizer_name roberta-base \
    --output_dir model_clip_undersampled \
    --do_test \
    --overwrite_output_dir

Test CLIP model on the SynthCLIP test set

In [None]:
%%shell
# Cross-evaluation: run the test pass (--do_test) of the CLIP-fine-tuned model
# (model_clip_undersampled) on the SynthCLIP task.
# The final argument previously ended in a dangling "\" continuation; it is
# removed so the command ends unambiguously.
TASK="SynthCLIP"
DATADIR="../data/tasks/SynthCLIP"
MODEL=model_clip_undersampled
MODEL_TYPE=roberta
python -m biolm.run_classification \
    --task_name "${TASK}" \
    --data_dir "${DATADIR}" \
    --model_type "${MODEL_TYPE}" \
    --model_name_or_path "${MODEL}" \
    --tokenizer_name roberta-base \
    --output_dir model_clip_undersampled \
    --do_test \
    --overwrite_output_dir

Test SynthCLIP model on CLIP task

In [None]:
%%shell
# Cross-evaluation: run the test pass (--do_test) of the SynthCLIP-fine-tuned
# model (model_synth_undersampled) on the CLIP task.
# The final argument previously ended in a dangling "\" continuation; it is
# removed so the command ends unambiguously.
TASK="CLIP"
DATADIR="../data/tasks/CLIP"
MODEL=model_synth_undersampled
MODEL_TYPE=roberta
python -m biolm.run_classification \
    --task_name "${TASK}" \
    --data_dir "${DATADIR}" \
    --model_type "${MODEL_TYPE}" \
    --model_name_or_path "${MODEL}" \
    --tokenizer_name roberta-base \
    --output_dir model_synth_undersampled \
    --do_test \
    --overwrite_output_dir

## i2b2 concept extraction task

Import data and preprocess

In [None]:
%%shell
# Unpack the i2b2 training archive inside data/.
# -C makes tar itself change into data/ for extraction; the previous unchecked
# `cd data` would have let tar run in the wrong directory if data/ was missing.
set -e
tar -zxvf data/concept_assertion_relation_training_data.tar.gz -C data

In [None]:
%%shell
# Unpack the i2b2 test reference-standard archive inside data/.
# -C makes tar itself change into data/ for extraction; the previous unchecked
# `cd data` would have let tar run in the wrong directory if data/ was missing.
set -e
tar -zxvf data/reference_standard_for_test_data.tar.gz -C data

In [None]:
%%shell
# Run bio-lm's i2b2-2010 NER preprocessing script from inside the bio-lm
# checkout, reading the extracted archives under ../data and writing the task
# files to ../data/tasks/i2b2.
cd bio-lm
DATA_ROOT=../data
TRAIN_ROOT=${DATA_ROOT}/concept_assertion_relation_training_data
python preprocessing/preprocess_i2b2_2010_ner.py \
    --beth_dir ${TRAIN_ROOT}/beth \
    --partners_dir ${TRAIN_ROOT}/partners \
    --test_txt_dir ${DATA_ROOT}/test_data \
    --test_dir ${DATA_ROOT}/reference_standard_for_test_data \
    --task_dir ${DATA_ROOT}/tasks/i2b2

Test base model on i2b2 task

In [None]:
%%shell
# Run prediction (--do_predict) for the base model on the i2b2 task.
# MODEL was "RoBERTa-base-PM"; every other base-model cell in this notebook uses
# RoBERTa-base-PM-hf (the archive downloaded during setup), so the name is
# aligned with them.
# The final argument previously ended in a dangling "\" continuation; it is
# removed so the command ends unambiguously.
TASK="i2b2"
DATADIR="../data/tasks/i2b2/merged"
MODEL=RoBERTa-base-PM-hf
MODEL_TYPE=roberta
python -m biolm.run_sequence_labelling \
    --data_dir "${DATADIR}" \
    --model_type "${MODEL_TYPE}" \
    --labels "${DATADIR}/../mergedlabels.txt" \
    --model_name_or_path "${MODEL}" \
    --tokenizer_name roberta-base \
    --output_dir base_ft_i2b2 \
    --max_seq_length 512 \
    --per_gpu_train_batch_size 8 \
    --save_steps 500 \
    --seed 10 \
    --gradient_accumulation_steps 4 \
    --do_predict \
    --overwrite_output_dir \
    --overwrite_cache

Train and test base RoBERTa (RoBERTa-base-PM-hf) on i2b2 task

In [None]:
%%shell
# Fine-tune the pretrained base checkpoint on the i2b2 task (3 epochs, seed 10)
# with evaluation during and after training; output goes to base_ft_i2b2.
# The final argument previously ended in a dangling "\" continuation; it is
# removed so the command ends unambiguously.
TASK="i2b2"
DATADIR="../data/tasks/i2b2/merged"
MODEL=RoBERTa-base-PM-hf
MODEL_TYPE=roberta
python -m biolm.run_sequence_labelling \
    --data_dir "${DATADIR}" \
    --model_type "${MODEL_TYPE}" \
    --labels "${DATADIR}/../mergedlabels.txt" \
    --model_name_or_path "${MODEL}" \
    --tokenizer_name roberta-base \
    --output_dir base_ft_i2b2 \
    --max_seq_length 512 \
    --num_train_epochs 3 \
    --per_gpu_train_batch_size 8 \
    --save_steps 500 \
    --seed 10 \
    --gradient_accumulation_steps 4 \
    --do_train \
    --do_eval \
    --eval_all_checkpoints \
    --evaluate_during_training \
    --overwrite_output_dir \
    --overwrite_cache

In [None]:
%%shell
# Run prediction (--do_predict) on the i2b2 task with the fine-tuned model
# stored in base_ft_i2b2.
# The final argument previously ended in a dangling "\" continuation; it is
# removed so the command ends unambiguously.
TASK="i2b2"
DATADIR="../data/tasks/i2b2/merged"
MODEL=base_ft_i2b2
MODEL_TYPE=roberta
python -m biolm.run_sequence_labelling \
    --data_dir "${DATADIR}" \
    --model_type "${MODEL_TYPE}" \
    --labels "${DATADIR}/../mergedlabels.txt" \
    --model_name_or_path "${MODEL}" \
    --tokenizer_name roberta-base \
    --output_dir base_ft_i2b2 \
    --max_seq_length 512 \
    --save_steps 500 \
    --seed 10 \
    --do_predict

Train and test Synth-trained RoBERTa (roberta-trained-on-synth-mlm) on i2b2 task

In [None]:
%%shell
# Fine-tune the synth-MLM RoBERTa checkpoint on the i2b2 task (3 epochs, seed 10).
# Paths are aligned with the prediction cell that follows this one:
#   * --output_dir was "bio-lm/mlm_ft_i2b2", while the prediction cell uses
#     "mlm_ft_i2b2"; both now name the same directory.
#   * MODEL gains the "../" prefix the prediction cell uses for this checkpoint.
# NOTE(review): this assumes the cell runs from inside the bio-lm checkout
# (consistent with DATADIR="../data/...") — confirm the working directory.
# The final argument previously ended in a dangling "\" continuation; removed.
TASK="i2b2"
DATADIR="../data/tasks/i2b2/merged"
MODEL=../roberta-trained-on-synth-mlm
MODEL_TYPE=roberta
python -m biolm.run_sequence_labelling \
    --data_dir "${DATADIR}" \
    --model_type "${MODEL_TYPE}" \
    --labels "${DATADIR}/../mergedlabels.txt" \
    --model_name_or_path "${MODEL}" \
    --tokenizer_name roberta-base \
    --output_dir mlm_ft_i2b2 \
    --max_seq_length 512 \
    --num_train_epochs 3 \
    --per_gpu_train_batch_size 8 \
    --save_steps 500 \
    --seed 10 \
    --gradient_accumulation_steps 4 \
    --do_train \
    --do_eval \
    --eval_all_checkpoints \
    --overwrite_output_dir \
    --overwrite_cache

In [None]:
%%shell
# Run prediction (--do_predict) on the i2b2 task with the fine-tuned synth-MLM
# model.
# MODEL previously pointed at ../roberta-trained-on-synth-mlm, the checkpoint
# *before* i2b2 fine-tuning. The other prediction cells in this notebook load
# the fine-tuned output directory (base_ft_i2b2, mlm2_ft_i2b2), so this one now
# loads mlm_ft_i2b2, matching its own --output_dir.
# NOTE(review): confirm mlm_ft_i2b2 is where the preceding training cell wrote.
# The final argument previously ended in a dangling "\" continuation; removed.
TASK="i2b2"
DATADIR="../data/tasks/i2b2/merged"
MODEL=mlm_ft_i2b2
MODEL_TYPE=roberta
python -m biolm.run_sequence_labelling \
    --data_dir "${DATADIR}" \
    --model_type "${MODEL_TYPE}" \
    --labels "${DATADIR}/../mergedlabels.txt" \
    --model_name_or_path "${MODEL}" \
    --tokenizer_name roberta-base \
    --output_dir mlm_ft_i2b2 \
    --max_seq_length 512 \
    --save_steps 500 \
    --seed 10 \
    --do_predict

Train and test Synthea-trained RoBERTa on i2b2 task

In [None]:
%%shell
# Fine-tune the Synthea-notes MLM RoBERTa checkpoint on the i2b2 task
# (3 epochs, seed 10).
# --output_dir was "bio-lm/mlm2_ft_i2b2", but the prediction cell that follows
# loads and writes "mlm2_ft_i2b2"; the two now name the same directory.
# NOTE(review): this assumes the cell runs from inside the bio-lm checkout
# (consistent with the "../" prefixes on DATADIR and MODEL) — confirm.
# The final argument previously ended in a dangling "\" continuation; removed.
TASK="i2b2"
DATADIR="../data/tasks/i2b2/merged"
MODEL=../roberta-trained-on-synthea-notes-mlm2
MODEL_TYPE=roberta
python -m biolm.run_sequence_labelling \
    --data_dir "${DATADIR}" \
    --model_type "${MODEL_TYPE}" \
    --labels "${DATADIR}/../mergedlabels.txt" \
    --model_name_or_path "${MODEL}" \
    --tokenizer_name roberta-base \
    --output_dir mlm2_ft_i2b2 \
    --max_seq_length 512 \
    --num_train_epochs 3 \
    --per_gpu_train_batch_size 8 \
    --save_steps 500 \
    --seed 10 \
    --gradient_accumulation_steps 4 \
    --do_train \
    --do_eval \
    --eval_all_checkpoints \
    --overwrite_output_dir \
    --overwrite_cache

In [None]:
%%shell
# Run prediction (--do_predict) on the i2b2 task with the fine-tuned
# Synthea-notes model stored in mlm2_ft_i2b2.
# The final argument previously ended in a dangling "\" continuation; it is
# removed so the command ends unambiguously.
TASK="i2b2"
DATADIR="../data/tasks/i2b2/merged"
MODEL=mlm2_ft_i2b2
MODEL_TYPE=roberta
python -m biolm.run_sequence_labelling \
    --data_dir "${DATADIR}" \
    --model_type "${MODEL_TYPE}" \
    --labels "${DATADIR}/../mergedlabels.txt" \
    --model_name_or_path "${MODEL}" \
    --tokenizer_name roberta-base \
    --output_dir mlm2_ft_i2b2 \
    --max_seq_length 512 \
    --save_steps 500 \
    --seed 10 \
    --do_predict \
    --eval_all_checkpoints \
    --overwrite_cache