In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell runs (handy while editing ../src).
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Put the sibling source directory first on the import path so that modules
# located there can be imported by the cells that follow.
sys.path.insert(0, "../src")

In [3]:
import pandas as pd
import numpy as np

from fastai2 import *
from fastai2.text.all import *
import vectorize
import helpers
import transformers
from utils import *
from loss.loss import *

In [4]:
# Load the pickled dataset used throughout the notebook.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
full_df = pd.read_pickle("../data/full_df.pkl")

In [5]:
# NOTE(review): neither constant is referenced again in the visible cells —
# confirm whether they are still needed.
MAX_VOCAB = None
MAX_SEQ_LENGTH = 5000

# Clean the raw note text with the project helper and keep it in a new column.
full_df['TEXT_PROCESSED'] = vectorize.clean_notes(full_df, 'TEXT')

# Turn the whitespace-separated ICD9 group string into a list of labels.
# str.split() with no argument ignores leading/trailing whitespace and returns
# [] for an empty string, so rows without any code no longer receive a
# spurious '' label. It also removes the dependency on `re`, which this
# notebook never imports explicitly.
full_df['ICD9_GRP_LIST'] = full_df.ICD9_GRP.apply(lambda codes: codes.split())

# Use FastAI AWD LSTM

In [6]:
from fastai2.text.all import *

In [7]:
from transformers import BertConfig, BertForSequenceClassification, BertModel, BertTokenizer

# Load the pretrained tokenizer that ships with the `bert-base-uncased` model.
bert_tok = BertTokenizer.from_pretrained("bert-base-uncased")

# Collect the tokenizer's vocabulary tokens (the keys of its vocab mapping)
# into a fastai `L`.
fastai_bert_vocab = L(bert_tok.vocab.keys())

In [8]:
# Work on a 30% random sample of the rows. The fixed random_state makes the
# sample (and therefore everything trained on it) reproducible across kernel
# restarts; without it every run would draw different rows.
sample_df = full_df.sample(frac=0.3, random_state=42)

In [None]:
# Single (text-only) pipeline for the language model: read the cleaned text
# column and tokenize it with the BERT tokenizer.
item_tfms = [[ColReader('TEXT_PROCESSED'), FastAIBertTokenizer(tokenizer=bert_tok, fill_to_max=False)]]

# Create datasource & dataloaders

# Seed the splitter so the train/validation split is identical on every run.
splits = RandomSplitter(seed=42)(range_of(sample_df))
dsrc = Datasets(sample_df, tfms=item_tfms, splits=splits, dl_type=LMDataLoader)
dls = dsrc.dataloaders(bs=128, seq_len=120)

In [None]:
# We will first train the language model; preview up to two samples from its
# dataloaders as a sanity check.
dls.show_batch(max_n=2)

In [None]:
# Attach the BERT token list to the dataloaders as their vocab.
# NOTE(review): presumably the learner built afterwards reads its vocab from
# here — confirm.
dls.vocab = fastai_bert_vocab

In [None]:
# Build an AWD-LSTM language-model learner that reports accuracy and
# perplexity, then switch it to mixed-precision (fp16) training.
learn = language_model_learner(
    dls,
    AWD_LSTM,
    drop_mult=0.3,
    metrics=[accuracy, Perplexity()],
)
learn = learn.to_fp16()

In [None]:
# Optional: uncomment the next line to run the learning-rate finder before
# training. It is left disabled so a full run of the notebook skips it.
#learn.lr_find()

# Model training

It takes quite a while to train each epoch, so we'll save intermediate model results during the training process. Since `fine_tune` doesn't do that for us, we'll use `fit_one_cycle` instead. Just like `cnn_learner`, `language_model_learner` automatically calls `freeze` when using a pretrained model (which is the default), so this first epoch will only train the embeddings, which are the only part of the model that contains randomly initialized weights — i.e. embeddings for tokens that are in our vocab (here, the BERT tokenizer's vocab) but aren't in the pretrained model's vocab.

In [None]:
# Train for one epoch with the one-cycle policy at a maximum learning rate of 2e-2.
learn.fit_one_cycle(1, 2e-2)

In [None]:
# Checkpoint the model after the first epoch under the name '1epoch', then
# reload that checkpoint into the learner.
learn.save('1epoch')
learn = learn.load('1epoch')

In [None]:
# Unfreeze all layer groups and fine-tune the whole language model for
# ten one-cycle epochs at a maximum learning rate of 2e-3.
learn.unfreeze()
learn.fit_one_cycle(n_epoch=10, lr_max=2e-3)

# Save only the encoder part of the model under the name 'finetuned'.
learn.save_encoder('finetuned')


# Check the language model

In [None]:
# Prompt and sampling settings for a quick qualitative check of the model.
# NOTE(review): the prompt is a movie-review sentence while the model was
# fine-tuned on the TEXT_PROCESSED notes — consider a domain-relevant prompt.
TEXT = "I liked this movie because"
N_WORDS = 40
N_SENTENCES = 2

# Generate N_SENTENCES continuations of the prompt, N_WORDS words each.
preds = [
    learn.predict(TEXT, N_WORDS, temperature=0.75)
    for _ in range(N_SENTENCES)
]

# Show one generated continuation per line.
print("\n".join(preds))


# Classifier model

In [None]:
# Two pipelines: the inputs are the tokenized cleaned text, the targets are
# the ICD9 group lists passed through MultiCategorize and OneHotEncode.
item_tfms = [
    [ColReader('TEXT_PROCESSED'), FastAIBertTokenizer(tokenizer=bert_tok, fill_to_max=False)],
    [ColReader('ICD9_GRP_LIST'), MultiCategorize, OneHotEncode],
]

# Create datasource & dataloaders

# Seed the splitter so the train/validation split is identical on every run;
# otherwise the reported validation metrics are computed on different rows
# each time the notebook is executed.
splits = RandomSplitter(seed=42)(range_of(sample_df))
dsrc = Datasets(sample_df, tfms=item_tfms, splits=splits)
dls = dsrc.dataloaders(bs=4)

In [None]:
# Pull a single batch from the classifier dataloaders as a sanity check.
# `b` is not used again in the visible cells.
b = dls.one_batch()

In [None]:
# Preview up to three samples from the classifier dataloaders.
dls.show_batch(max_n=3)

In [None]:
# Metrics: multi-label accuracy and multi-label F1, both using a 0.2 threshold.
acc_02 = partial(accuracy_multi, thresh=0.2)
f_score = F1ScoreMulti(thresh=0.2)

# Build the AWD-LSTM text classifier and switch it to mixed precision (fp16).
# NOTE(review): unlike the language-model section, `dls.vocab` is not assigned
# the BERT vocab before this learner is created — confirm the classifier is
# built with the same vocab as the saved encoder.
learn = text_classifier_learner(
    dls,
    AWD_LSTM,
    drop_mult=0.5,
    metrics=[acc_02, f_score],
)
learn = learn.to_fp16()

In [None]:
# Load the encoder weights saved under the name 'finetuned' by the
# language-model section into the classifier.
learn = learn.load_encoder('finetuned')


## Training 

In [None]:
# First classifier epoch: one-cycle policy with a maximum learning rate of 2e-2.
learn.fit_one_cycle(1, 2e-2)


In [None]:
# Gradual unfreezing, stage 1: freeze up to the second-to-last parameter group
# and train one epoch with a learning-rate range of [lr / 2.6**4, lr].
stage_lr = 1e-2
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(stage_lr / (2.6**4), stage_lr))

In [None]:
# Gradual unfreezing, stage 2: freeze up to the third-to-last parameter group
# and train one epoch with a learning-rate range of [lr / 2.6**4, lr].
stage_lr = 5e-3
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(stage_lr / (2.6**4), stage_lr))

In [None]:
# Final stage: unfreeze everything and train two epochs with a learning-rate
# range of [lr / 2.6**4, lr].
stage_lr = 1e-3
learn.unfreeze()
learn.fit_one_cycle(2, slice(stage_lr / (2.6**4), stage_lr))