In [None]:
import re
import os
import time
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from torch.nn.utils.rnn import pack_padded_sequence
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import numpy as np
from torch.optim import lr_scheduler
from torch.autograd import Variable 
import copy
import random
import seaborn as sns
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, random_split
from transformers import AutoModel, AutoTokenizer, BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig

In [None]:
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from datetime import datetime, timedelta

In [None]:
import pickle

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
torch.cuda.is_available()

In [None]:
# Fix the seeds of every random number generator used in the notebook.
NUMERIC_SEED, PYTHON_SEED = 666, 42
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, np.random.seed):
    seed_fn(NUMERIC_SEED)
random.seed(PYTHON_SEED)
# Ask cuDNN to pick deterministic kernels.
torch.backends.cudnn.deterministic = True

In [None]:
# pd.read_csv('final_df.csv').groupby(['SUBJECT_ID','ICUSTAY_ID','HADM_ID', 'rel_stay_diff','read_binary_90day']).agg(
#     lambda x: '\n '.join(x)).reset_index().to_csv('final_concat_df.csv', index=False)

In [None]:
nrows = 'all_rows'

In [None]:
df = pd.read_csv('final_concat_df.csv')

In [None]:
df['read_binary_90day'].value_counts()

In [None]:
df.head()

In [None]:
df['ICUSTAY_ID'].value_counts()

In [None]:
# make data loader with full data

In [None]:
df.shape

In [None]:
df.columns

In [None]:
# remove dead patients

In [None]:
pat = pd.read_csv('../physionet.org/files/mimiciii/1.4/PATIENTS.csv')

In [None]:
pat.head()

In [None]:
pat.shape

In [None]:
stays = pd.read_csv('../physionet.org/files/mimiciii/1.4/ICUSTAYS.csv')

In [None]:
stays.head()

In [None]:
exclude_stay = stays.merge(pat, on='SUBJECT_ID')

In [None]:
exclude_stay.head()

In [None]:
exclude_stay.shape

In [None]:
exclude_stay['OUTTIME'] = pd.to_datetime(exclude_stay['OUTTIME'])

In [None]:
exclude_stay['INTIME'] = pd.to_datetime(exclude_stay['INTIME'])

In [None]:
exclude_stay['DOB'] = pd.to_datetime(exclude_stay['DOB'])
exclude_stay['DOD'] = pd.to_datetime(exclude_stay['DOD'])

In [None]:
exclude_stay['90day_after_out'] = exclude_stay['OUTTIME'] + timedelta(days=90)

In [None]:
# Drop stays where the patient died after ICU admission and no later than 90 days
# after ICU discharge, unless the stay is labelled as a 90-day readmission in `df`.
# The readmission flag is looked up by ICUSTAY_ID: `df` and `exclude_stay` are
# different tables, so combining their boolean masks directly would align them on
# the row index and compare unrelated rows.
died_in_window = ((exclude_stay['DOD'] > exclude_stay['INTIME']) &
                  (exclude_stay['DOD'] <= exclude_stay['90day_after_out']))
readmitted_stay_ids = df.loc[df['read_binary_90day'] == 1, 'ICUSTAY_ID']
was_readmitted = exclude_stay['ICUSTAY_ID'].isin(readmitted_stay_ids)
exclude_stay = exclude_stay[~died_in_window | was_readmitted]

In [None]:
exclude_stay.shape

In [None]:
# did not die before ICU stay started
exclude_stay = exclude_stay[~(exclude_stay['DOD'] < exclude_stay['INTIME'])]

In [None]:
# # was not born within the 10 days prior to intime 
# exclude_stay = exclude_stay[~(exclude_stay['DOB'] >= exclude_stay['INTIME'] - timedelta(days=10))]

In [None]:
exclude_stay.shape

In [None]:
exclude_stay[exclude_stay['ROW_ID_x'].isna()]

In [None]:
# Left-join the surviving stays onto the notes; rows without a match get NaN in ROW_ID_x.
# validate='many_to_one' makes the merge fail loudly if ICUSTAY_ID is duplicated on the
# right-hand side, which would add rows and shift the row positions that later cells use
# to index the tokenised lists.
mort_check = df.merge(exclude_stay[['ROW_ID_x', 'ICUSTAY_ID']], how='left', on='ICUSTAY_ID',
                      validate='many_to_one')

In [None]:
mort_check.shape

In [None]:
mort_check[~mort_check['ROW_ID_x'].isna()].index

In [None]:
# Keep only the notes whose ICU stay found a match in the merge above.
has_matching_stay = mort_check['ROW_ID_x'].notna()
df = mort_check[has_matching_stay]

In [None]:
df.shape

In [None]:
# create even assignments

In [None]:
df['read_binary_90day'].value_counts()

In [None]:
# https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets

In [None]:
# Class count, looked up by label. value_counts() is ordered by frequency, so
# unpacking it positionally would swap the two counts whenever class 1 is the majority.
class_counts = df['read_binary_90day'].value_counts()
count_class_0 = int(class_counts.get(0, 0))
count_class_1 = int(class_counts.get(1, 0))

# Divide by class
df_class_0 = df[df['read_binary_90day'] == 0]
df_class_1 = df[df['read_binary_90day'] == 1]

In [None]:
count_class_0

In [None]:
count_class_1

In [None]:
# Random under-sampling: bring both classes down to the size of the smaller one.
# Sizing from the frames themselves avoids a ValueError from sample() when class 1
# happens to be the larger class.
n_per_class = min(len(df_class_0), len(df_class_1))
df_class_0_under = df_class_0.sample(n_per_class, random_state=666)
if len(df_class_1) > n_per_class:
    df_class_1_under = df_class_1.sample(n_per_class, random_state=666)
else:
    # class 1 is already the smaller class: keep every row, in its original order
    df_class_1_under = df_class_1
df_test_under = pd.concat([df_class_0_under, df_class_1_under], axis=0)

In [None]:
df_test_under['read_binary_90day'].value_counts()

In [None]:
df_test_under.index

In [None]:
df = df_test_under

In [None]:
df.to_csv('downsampled_df.csv')

In [None]:
#tokenize

In [None]:
df['TEXT'].apply(lambda x: len(x)).describe([.75,.8,.9])

In [None]:
# 90th percentile of the note length in characters. quantile() is asked for directly
# instead of picking the second-to-last row of describe() by position, which relies on
# deprecated positional indexing of a label-indexed Series.
df['TEXT'].apply(len).quantile(.9)

In [None]:
# Truncation length in characters: the 90th percentile of note length, so 90% of all
# notes fit without being cut. quantile() replaces describe(...)[-2], which depended on
# deprecated positional indexing of a label-indexed Series.
sentence_max = int(df['TEXT'].apply(len).quantile(.9))

In [None]:
sentence_max

In [None]:
# Cut every note to at most `sentence_max` characters (slicing leaves shorter notes untouched).
sentences = df['TEXT'].apply(lambda note: note[:sentence_max]).values

In [None]:
labels = df['read_binary_90day'].values

In [None]:
# data loader

In [None]:
#https://mccormickml.com/2019/07/22/BERT-fine-tuning/

In [None]:
# Store the model we want to use
MODEL_NAME = "bert-base-uncased"

# We need to create the model and tokenizer
# model = AutoModel.from_pretrained(MODEL_NAME)
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)

In [None]:
# Print the original sentence.
print(' Original: ', sentences[3])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[3]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[3])))

In [None]:
# ## Tokenize all of the sentences and map the tokens to their word IDs.
# This was done in the tokenizer notebook to take advantage of multiple machines.
# input_ids = []
# attention_masks = []

# LOG_EVERY_N = 100
# counting = 0
# # For every sentence...
# for sent in sentences:
#     # `encode_plus` will:
#     #   (1) Tokenize the sentence.
#     #   (2) Prepend the `[CLS]` token to the start.
#     #   (3) Append the `[SEP]` token to the end.
#     #   (4) Map tokens to their IDs.
#     #   (5) Pad or truncate the sentence to `max_length`
#     #   (6) Create attention masks for [PAD] tokens.
#     encoded_dict = tokenizer.encode_plus(
#                         sent,                      # Sentence to encode.
#                         add_special_tokens = True, # Add '[CLS]' and '[SEP]'
#                         max_length = 512,           # Pad & truncate all sentences.
#                         pad_to_max_length = True,
#                         return_attention_mask = True,   # Construct attn. masks.
#                         return_tensors = 'pt',     # Return pytorch tensors.
#                    )
    
#     # Add the encoded sentence to the list.    
#     input_ids.append(encoded_dict['input_ids'])
    
#     # And its attention mask (simply differentiates padding from non-padding).
#     attention_masks.append(encoded_dict['attention_mask'])

#     if (counting % LOG_EVERY_N) == 0:
#         print (f'logging: ...{str(counting)} sentences at {datetime.now()}')
# #     if (counting % LOG_EVERY_N*10) == 0:
# #         file_out = open(f'input_ids_sentences_{str(counting)}.pk', "wb")
# #         pickle.dump(input_ids, file_out)
# #         file_out = open(f'attention_masks_sentences_{str(counting)}.pk', "wb")
# #         pickle.dump(attention_masks, file_out)
        
#     counting+=1

# # Convert the lists into tensors.
# input_ids = torch.cat(input_ids, dim=0)
# attention_masks = torch.cat(attention_masks, dim=0)
# labels = torch.tensor(labels)

In [None]:
# file_out = open(f'input_ids_nrows_{str(nrows)}.pk', "wb")

In [None]:
# pickle.dump(input_ids, file_out)

In [None]:
# file_out = open(f'attention_masks_nrows_{str(nrows)}.pk', "wb")

In [None]:
# pickle.dump(attention_masks, file_out)

In [None]:
with open('token_output/input_ids_sentences_14800.pk', 'rb') as pickle_file:
    input_ids_test = pickle.load(pickle_file)

In [None]:
len(input_ids_test)

In [None]:
with open('token_output/input_ids_token_processing_sentences_14800_15000.pk', 'rb') as pickle_file:
    input_ids_test2 = pickle.load(pickle_file)

In [None]:
input_ids_test[-2] == input_ids_test2[0]

In [None]:
# The two files overlap by 2 entries, so the last 2 entries of the first file are dropped before concatenating.

In [None]:
master_input_ids = input_ids_test[:-2]

In [None]:
master_input_ids = master_input_ids + input_ids_test2

In [None]:
step=1000

In [None]:
# Append the remaining input-id chunks; each pickle file name carries its start and end row.
for chunk_start in range(15000, 61134, step):
    print (f'batch starting row {chunk_start}')
    # the chunk starting at row 61000 is the last one and uses a shorter row count
    nrows = (61134 - 61000 + 1) if chunk_start == 61000 else step
    chunk_path = f'token_output/input_ids_token_processing_sentences_{str(chunk_start)}_{str(chunk_start+nrows)}.pk'
    with open(chunk_path, 'rb') as pickle_file:
        master_input_ids+=pickle.load(pickle_file)
        

In [None]:
len(master_input_ids)

In [None]:
with open('token_output/attention_masks_sentences_14700.pk', 'rb') as pickle_file:
    master_attention_masks = pickle.load(pickle_file)

In [None]:
len(master_attention_masks)

In [None]:
with open('token_output/attention_masks_token_processing_sentences_14700_15000.pk', 'rb') as pickle_file:
    attention_masks_test = pickle.load(pickle_file)

In [None]:
master_attention_masks[-2] == attention_masks_test[0]

In [None]:
master_attention_masks = master_attention_masks[:-2]

In [None]:
master_attention_masks+=attention_masks_test

In [None]:
step=1000

In [None]:
# Append the remaining attention-mask chunks; each pickle file name carries its start and end row.
for chunk_start in range(15000, 61134, step):
    print (f'batch starting row {chunk_start}')
    # the chunk starting at row 61000 is the last one and uses a shorter row count
    nrows = (61134 - 61000 + 1) if chunk_start == 61000 else step
    chunk_path = f'token_output/attention_masks_token_processing_sentences_{str(chunk_start)}_{str(chunk_start+nrows)}.pk'
    with open(chunk_path, 'rb') as pickle_file:
        master_attention_masks+=pickle.load(pickle_file)
        

In [None]:
len(master_attention_masks)

In [None]:
# Persist the merged attention masks. The context manager flushes and closes the file,
# which the bare open() call never did.
with open('master_attention_masks.pk', "wb") as file_out:
    pickle.dump(master_attention_masks, file_out)

In [None]:
# Persist the merged input ids. The context manager flushes and closes the file,
# which the bare open() call never did.
with open('master_input_ids.pk', "wb") as file_out:
    pickle.dump(master_input_ids, file_out)

In [None]:
index_keep_mort_check = mort_check[~mort_check['ROW_ID_x'].isna()].index

In [None]:
index_keep_mort_check

In [None]:
len([master_input_ids[i] for i in index_keep_mort_check])

In [None]:
master_input_ids_mort_filt = [master_input_ids[i] for i in index_keep_mort_check]

In [None]:
master_attention_masks_mort_filt = [master_attention_masks[i] for i in index_keep_mort_check]

In [None]:
# Persist the mortality-filtered token data. Each file is written inside its own context
# manager so it is flushed and closed; previously the first handle was overwritten while
# still open and neither was ever closed.
with open('master_input_ids_mort_filt.pk', "wb") as file_out:
    pickle.dump(master_input_ids_mort_filt, file_out)
with open('master_attention_masks_mort_filt.pk', "wb") as file_out:
    pickle.dump(master_attention_masks_mort_filt, file_out)

In [None]:
# Keep only the records that passed the mortality filter and belong to the balanced (down-sampled) set

In [None]:
# Keep only the mask entries whose list position appears in df.index.
# NOTE(review): this rebinds master_attention_masks to a subset of itself, so running the
# cell a second time would index into the already-filtered list.
master_attention_masks = [master_attention_masks[i] for i in df.index]

In [None]:
# Keep only the input-id entries whose list position appears in df.index.
# NOTE(review): this rebinds master_input_ids to a subset of itself, so running the
# cell a second time would index into the already-filtered list.
master_input_ids = [master_input_ids[i] for i in df.index]

In [None]:
len(master_input_ids)

In [None]:
len(master_attention_masks)

In [None]:
labels = df['read_binary_90day'].values

In [None]:
len(labels)

In [None]:
# Convert the lists into tensors.
input_ids = torch.cat(master_input_ids, dim=0)
attention_masks = torch.cat(master_attention_masks, dim=0)
labels = torch.tensor(labels)

In [None]:
t_dataset = TensorDataset(input_ids, attention_masks, labels)

In [None]:
len(t_dataset)

In [None]:
# Split sizes in the order train, test, val (roughly 60/20/20).
# The last share is computed as the remainder so the three sizes always add up to
# len(df); random_split rejects size lists that do not sum to the dataset length.
n_total = len(df)
n_train = min(int(n_total * .6) + 1, n_total)
n_test = min(int(n_total * .2) + 1, n_total - n_train)
splits = [n_train, n_test, n_total - n_train - n_test]

In [None]:
sum(splits)

In [None]:
splits

In [None]:
train, test, val = random_split(t_dataset, splits)

In [None]:
data_dict = {'train':train, 'test':test, 'val': val}

In [None]:
data_dict.keys()

In [None]:
batch_size = 16

In [None]:
# One DataLoader per split.
dataloaders = {}
for split_name, split_data in data_dict.items():
    if split_name == 'train':
        # draw training batches in a new random order every epoch
        split_sampler = RandomSampler(split_data)
    else:
        # evaluation splits are read in a fixed order
        split_sampler = SequentialSampler(split_data)
    dataloaders[split_name] = DataLoader(
                split_data,
                sampler = split_sampler,
                batch_size = batch_size
            )

In [None]:
# Number of samples per split, read from the datasets themselves. Zipping `splits`
# (ordered train, test, val) with ['train', 'val', 'test'] swapped the val and test sizes.
dataset_sizes = {split_name: len(split_data) for split_name, split_data in data_dict.items()}

In [None]:
dataloaders

In [None]:
dataset_sizes

In [None]:
#verified the same data sizes are loaded

In [None]:
MODEL_NAME

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME, # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the device chosen at the top of the notebook (GPU when available,
# CPU otherwise) instead of calling .cuda(), which fails on a machine without a GPU.
model.to(device)

In [None]:
# Collect the model's parameters as a list of (name, tensor) tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# (header, slice of `params`) for each group of parameters to display
param_sections = [
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
]
for section_header, section_params in param_sections:
    print(section_header)
    for param_name, param_tensor in section_params:
        print("{:<55} {:>12}".format(param_name, str(tuple(param_tensor.size()))))

In [None]:
# first iteration of models without balanced data

In [None]:
def train_BERT(model, dataloaders, 
          learning_rate=2e-5,  # args.learning_rate - default is 5e-5
               num_epoch=25
         ):
    """Fine-tune a sequence classifier, keep the best validation weights, and test it.

    Args:
        model: classifier that, when called with ``labels``, returns the loss first and
            the logits second (e.g. ``BertForSequenceClassification``).
        dataloaders: dict with 'train', 'val' and 'test' DataLoaders, each yielding
            ``(input_ids, attention_mask, labels)`` batches.
        learning_rate: initial AdamW learning rate; decays linearly to 0 over the run.
        num_epoch: number of passes over the training split.

    Returns:
        dict with keys
        'Model' (the model, with the best validation-accuracy weights loaded),
        'LossDict' / 'AccDict' (per-epoch lists under 'train' and 'validation'),
        'test_predictions' and 'test_truths' (one entry per test sample), and
        'test_pred_prob' (CPU tensor of softmax probabilities for every test sample).

    Side effects: prints progress, shows the learning curves, and saves a checkpoint under
    ``model/`` every time the validation accuracy improves.
    """
    start_time = time.time()
    # Run on whatever device the model already lives on.
    device = next(model.parameters()).device

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    acc_dict = {'train':[],'validation':[]}
    loss_dict = {'train':[],'validation':[]}
    history_key = {'train': 'train', 'val': 'validation'}

    optimizer = AdamW(model.parameters(),
                  lr = learning_rate, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8
                )

    # One schedule for the whole run. Building it inside the epoch loop restarted the
    # linear decay at every epoch.
    total_steps = len(dataloaders['train']) * num_epoch
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                num_warmup_steps = 0, # Default value in run_glue.py
                                num_training_steps = total_steps)

    # torch.save does not create missing directories
    os.makedirs('model', exist_ok=True)

    for epoch in range(num_epoch):
        for phase in ['train','val']:
            is_train = phase == 'train'
            model.train(is_train)

            running_loss = 0.0
            running_corrects = 0
            n_seen = 0

            for data, b_input_mask, labels in dataloaders[phase]:
                data, b_input_mask, labels = data.to(device), b_input_mask.to(device), labels.to(device)
                model.zero_grad()
                # no autograd graph is needed (or kept) during validation
                with torch.set_grad_enabled(is_train):
                    model_out = model(data,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           labels=labels)
                    # integer indexing works for both tuple and dict-style model outputs
                    loss, logits = model_out[0], model_out[1]
                    if is_train:
                        loss.backward()
                        optimizer.step()
                        scheduler.step()

                _, preds = torch.max(logits, dim = 1)
                n_batch = labels.size(0)
                # the model's loss is a batch mean: weight it by batch size so that the
                # epoch figure below is a true per-sample average
                running_loss += loss.item() * n_batch
                running_corrects += torch.sum(preds == labels).item()
                n_seen += n_batch

            # Evaluate after every epoch, using the number of samples actually seen
            epoch_loss = running_loss / max(n_seen, 1)
            epoch_acc = running_corrects / max(n_seen, 1)

            loss_dict[history_key[phase]].append(epoch_loss)
            acc_dict[history_key[phase]].append(epoch_acc)

            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(best_model_wts, f'model/full_data_readmission_bert_epoch{epoch + 1}.pth')

            print('{} set | epoch: {}/{} | Loss: {:.4f} Acc: {:.4f}'.format(
                    phase, epoch + 1, num_epoch, epoch_loss, epoch_acc))  
            time_elapsed = time.time() - start_time
            print('Training time so far: {}minutes {}s'.format(int(time_elapsed / 60), time_elapsed % 60))

    time_elapsed = time.time() - start_time
    print('Training time: {}minutes {}s'.format(int(time_elapsed / 60), time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # Learning curves: one row per phase, loss on the left and accuracy on the right.
    fig, axes = plt.subplots(2, 2, figsize=(10, 8))
    for row, phase in enumerate(['train','validation']):
        axes[row][0].plot(loss_dict[phase])
        axes[row][0].set_title('Loss per epoch for ' + phase)
        axes[row][1].plot(acc_dict[phase])
        axes[row][1].set_title('Accuracy per epoch for ' + phase)
    fig.tight_layout()
    plt.show()

    model.load_state_dict(best_model_wts)

    # test set evaluation
    correct = 0
    total = 0
    model.eval()

    predictions = []
    truths = []
    prob_batches = []
    with torch.no_grad():
        for data, b_input_mask, labels in dataloaders['test']:
            data, b_input_mask, labels = data.to(device), b_input_mask.to(device), labels.to(device)
            model_out = model(data,
                   token_type_ids=None,
                   attention_mask=b_input_mask,
                   labels=labels)
            logits = model_out[1]
            _, preds = torch.max(logits, dim = 1)
            predictions += list(preds.cpu().numpy())
            truths += list(labels.cpu().numpy())
            total += labels.size(0)
            correct += (preds == labels).sum().item()
            # keep the probabilities of every batch, moved off the GPU
            prob_batches.append(torch.softmax(logits, dim = 1).cpu())

    acc = correct / total if total else 0.0
    elapse = time.strftime('%H:%M:%S', time.gmtime(int((time.time() - start_time))))
    print('Test set | Accuracy: {:6.4f} | time elapse: {:>9}'.format(
        acc, elapse))

    test_pred_prob = torch.cat(prob_batches, dim = 0) if prob_batches else torch.empty(0)

    return {'Model': model, 'LossDict': loss_dict, 'AccDict': acc_dict, 
            'test_predictions': predictions, 'test_pred_prob': test_pred_prob, 'test_truths': truths}

In [None]:
torch.manual_seed(666)
Bestmodel = train_BERT(model, dataloaders)

In [None]:
confusion = confusion_matrix(Bestmodel['test_truths'], Bestmodel['test_predictions'],
labels=None, sample_weight=None, normalize=None)

In [None]:
confusion

In [None]:
df_cm = pd.DataFrame(confusion, index=['not readmitted', 'readmitted'], columns=['not readmitted', 'readmitted'])
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True, cmap=plt.cm.Blues)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('test set confusion matrix')

In [None]:
confusion = confusion_matrix(Bestmodel['test_truths'], Bestmodel['test_predictions'],
labels=None, sample_weight=None, normalize='true')

In [None]:
df_cm = pd.DataFrame(confusion, index=['not readmitted', 'readmitted'], columns=['not readmitted', 'readmitted'])
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True, cmap=plt.cm.Blues)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('normalized confusion matrix')

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model2 = BertForSequenceClassification.from_pretrained(
    MODEL_NAME, # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the device chosen at the top of the notebook (GPU when available,
# CPU otherwise) instead of calling .cuda(), which fails on a machine without a GPU.
model2.to(device)

In [None]:
# Collect the model's parameters as a list of (name, tensor) tuples.
params = list(model2.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# (header, slice of `params`) for each group of parameters to display
param_sections = [
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
]
for section_header, section_params in param_sections:
    print(section_header)
    for param_name, param_tensor in section_params:
        print("{:<55} {:>12}".format(param_name, str(tuple(param_tensor.size()))))

In [None]:
def train_BERT_2(model, dataloaders, 
          learning_rate=5e-5,  # args.learning_rate - default is 5e-5
               num_epoch=3
         ):
    """Fine-tune a sequence classifier, keep the best validation weights, and test it.

    Args:
        model: classifier that, when called with ``labels``, returns the loss first and
            the logits second (e.g. ``BertForSequenceClassification``).
        dataloaders: dict with 'train', 'val' and 'test' DataLoaders, each yielding
            ``(input_ids, attention_mask, labels)`` batches.
        learning_rate: initial AdamW learning rate; decays linearly to 0 over the run.
        num_epoch: number of passes over the training split.

    Returns:
        dict with keys
        'Model' (the model, with the best validation-accuracy weights loaded),
        'LossDict' / 'AccDict' (per-epoch lists under 'train' and 'validation'),
        'test_predictions' and 'test_truths' (one entry per test sample), and
        'test_pred_prob' (CPU tensor of softmax probabilities for every test sample).

    Side effects: prints progress, shows the learning curves, and saves a checkpoint under
    ``model/`` (file name includes learning rate and epoch counts) every time the
    validation accuracy improves.
    """
    start_time = time.time()
    # Run on whatever device the model already lives on.
    device = next(model.parameters()).device

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    acc_dict = {'train':[],'validation':[]}
    loss_dict = {'train':[],'validation':[]}
    history_key = {'train': 'train', 'val': 'validation'}

    optimizer = AdamW(model.parameters(),
                  lr = learning_rate, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8
                )

    # One schedule for the whole run. Building it inside the epoch loop restarted the
    # linear decay at every epoch.
    total_steps = len(dataloaders['train']) * num_epoch
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                num_warmup_steps = 0, # Default value in run_glue.py
                                num_training_steps = total_steps)

    # torch.save does not create missing directories
    os.makedirs('model', exist_ok=True)

    for epoch in range(num_epoch):
        for phase in ['train','val']:
            is_train = phase == 'train'
            model.train(is_train)

            running_loss = 0.0
            running_corrects = 0
            n_seen = 0

            for data, b_input_mask, labels in dataloaders[phase]:
                data, b_input_mask, labels = data.to(device), b_input_mask.to(device), labels.to(device)
                model.zero_grad()
                # no autograd graph is needed (or kept) during validation
                with torch.set_grad_enabled(is_train):
                    model_out = model(data,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           labels=labels)
                    # integer indexing works for both tuple and dict-style model outputs
                    loss, logits = model_out[0], model_out[1]
                    if is_train:
                        loss.backward()
                        optimizer.step()
                        scheduler.step()

                _, preds = torch.max(logits, dim = 1)
                n_batch = labels.size(0)
                # the model's loss is a batch mean: weight it by batch size so that the
                # epoch figure below is a true per-sample average
                running_loss += loss.item() * n_batch
                running_corrects += torch.sum(preds == labels).item()
                n_seen += n_batch

            # Evaluate after every epoch, using the number of samples actually seen
            epoch_loss = running_loss / max(n_seen, 1)
            epoch_acc = running_corrects / max(n_seen, 1)

            loss_dict[history_key[phase]].append(epoch_loss)
            acc_dict[history_key[phase]].append(epoch_acc)

            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(best_model_wts, f'model/full_data_read_bert_lr_{learning_rate}_numepoch_{num_epoch}_currepoch{epoch + 1}.pth')

            print('{} set | epoch: {}/{} | Loss: {:.4f} Acc: {:.4f}'.format(
                    phase, epoch + 1, num_epoch, epoch_loss, epoch_acc))  
            time_elapsed = time.time() - start_time
            print('Training time so far: {}minutes {}s'.format(int(time_elapsed / 60), time_elapsed % 60))

    time_elapsed = time.time() - start_time
    print('Training time: {}minutes {}s'.format(int(time_elapsed / 60), time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # Learning curves: one row per phase, loss on the left and accuracy on the right.
    fig, axes = plt.subplots(2, 2, figsize=(10, 8))
    for row, phase in enumerate(['train','validation']):
        axes[row][0].plot(loss_dict[phase])
        axes[row][0].set_title('Loss per epoch for ' + phase)
        axes[row][1].plot(acc_dict[phase])
        axes[row][1].set_title('Accuracy per epoch for ' + phase)
    fig.tight_layout()
    plt.show()

    model.load_state_dict(best_model_wts)

    # test set evaluation
    correct = 0
    total = 0
    model.eval()

    predictions = []
    truths = []
    prob_batches = []
    with torch.no_grad():
        for data, b_input_mask, labels in dataloaders['test']:
            data, b_input_mask, labels = data.to(device), b_input_mask.to(device), labels.to(device)
            model_out = model(data,
                   token_type_ids=None,
                   attention_mask=b_input_mask,
                   labels=labels)
            logits = model_out[1]
            _, preds = torch.max(logits, dim = 1)
            predictions += list(preds.cpu().numpy())
            truths += list(labels.cpu().numpy())
            total += labels.size(0)
            correct += (preds == labels).sum().item()
            # keep the probabilities of every batch, moved off the GPU
            prob_batches.append(torch.softmax(logits, dim = 1).cpu())

    acc = correct / total if total else 0.0
    elapse = time.strftime('%H:%M:%S', time.gmtime(int((time.time() - start_time))))
    print('Test set | Accuracy: {:6.4f} | time elapse: {:>9}'.format(
        acc, elapse))

    test_pred_prob = torch.cat(prob_batches, dim = 0) if prob_batches else torch.empty(0)

    return {'Model': model, 'LossDict': loss_dict, 'AccDict': acc_dict, 
            'test_predictions': predictions, 'test_pred_prob': test_pred_prob, 'test_truths': truths}

In [None]:
torch.manual_seed(666)
Bestmodel = train_BERT_2(model2, dataloaders)

In [None]:
confusion = confusion_matrix(Bestmodel['test_truths'], Bestmodel['test_predictions'],
labels=None, sample_weight=None, normalize=None)

In [None]:
confusion

In [None]:
df_cm = pd.DataFrame(confusion, index=['not readmitted', 'readmitted'], columns=['not readmitted', 'readmitted'])
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True, cmap=plt.cm.Blues)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('test set confusion matrix')

In [None]:
confusion = confusion_matrix(Bestmodel['test_truths'], Bestmodel['test_predictions'],
labels=None, sample_weight=None, normalize='true')

In [None]:
df_cm = pd.DataFrame(confusion, index=['not readmitted', 'readmitted'], columns=['not readmitted', 'readmitted'])
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True, cmap=plt.cm.Blues)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('normalized confusion matrix')

In [None]:
torch.manual_seed(666)

# Grid search over learning rate and epoch count.
for learn_opt in [5e-5, 3e-5, 2e-5]:
    for epoch_opt in [3, 4, 5, 6]:
        print (f'learning rate of {learn_opt}, count of epochs {epoch_opt}')

        # Start every configuration from the pretrained weights; reusing one model
        # object would keep fine-tuning the result of the previous configuration.
        model2 = BertForSequenceClassification.from_pretrained(
            MODEL_NAME, # Use the 12-layer BERT model, with an uncased vocab.
            num_labels = 2, # The number of output labels--2 for binary classification.
            output_attentions = False, # Whether the model returns attentions weights.
            output_hidden_states = False, # Whether the model returns all hidden-states.
        )
        model2.to(device)

        # train for the epoch count of this configuration (was hard-coded to 3)
        Bestmodel = train_BERT_2(model2, dataloaders, learning_rate=learn_opt,
               num_epoch=epoch_opt)

        # raw counts first, then the row-normalised matrix
        for normalize_opt, plot_title in [(None, 'test set confusion matrix'),
                                          ('true', 'normalized confusion matrix')]:
            confusion = confusion_matrix(Bestmodel['test_truths'], Bestmodel['test_predictions'],
                    labels=None, sample_weight=None, normalize=normalize_opt)
            print (confusion)
            df_cm = pd.DataFrame(confusion, index=['not readmitted', 'readmitted'], columns=['not readmitted', 'readmitted'])
            plt.figure(figsize = (10,7))
            sns.heatmap(df_cm, annot=True, cmap=plt.cm.Blues)
            plt.ylabel('True label')
            plt.xlabel('Predicted label')
            plt.title(plot_title)
            plt.show()

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model3 = BertForSequenceClassification.from_pretrained(
    MODEL_NAME, # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the device chosen at the top of the notebook (GPU when available,
# CPU otherwise) instead of calling .cuda(), which fails on a machine without a GPU.
model3.to(device)

In [None]:
torch.manual_seed(666)

# Grid search over learning rate and epoch count, one freshly loaded model per configuration.
for learn_opt in [5e-5, 3e-5, 2e-5]:
    for epoch_opt in [4, 5, 6]:
        print (f'learning rate of {learn_opt}, count of epochs {epoch_opt}')

        # Load BertForSequenceClassification, the pretrained BERT model with a single 
        # linear classification layer on top. 
        model3 = BertForSequenceClassification.from_pretrained(
            MODEL_NAME, # Use the 12-layer BERT model, with an uncased vocab.
            num_labels = 2, # The number of output labels--2 for binary classification.
            output_attentions = False, # Whether the model returns attentions weights.
            output_hidden_states = False, # Whether the model returns all hidden-states.
        )

        # Use the notebook-wide device (GPU when available, CPU otherwise) rather than
        # .cuda(), which fails on a machine without a GPU.
        model3.to(device)

        Bestmodel = train_BERT_2(model3, dataloaders, learning_rate=learn_opt,
               num_epoch=epoch_opt)

        # raw counts first, then the row-normalised matrix
        for normalize_opt, plot_title in [(None, 'test set confusion matrix'),
                                          ('true', 'normalized confusion matrix')]:
            confusion = confusion_matrix(Bestmodel['test_truths'], Bestmodel['test_predictions'],
                    labels=None, sample_weight=None, normalize=normalize_opt)
            print (confusion)
            df_cm = pd.DataFrame(confusion, index=['not readmitted', 'readmitted'], columns=['not readmitted', 'readmitted'])
            plt.figure(figsize = (10,7))
            sns.heatmap(df_cm, annot=True, cmap=plt.cm.Blues)
            plt.ylabel('True label')
            plt.xlabel('Predicted label')
            plt.title(plot_title)
            plt.show()

In [None]:
torch.manual_seed(666)

# Six-epoch runs for the two lower learning rates, one freshly loaded model per configuration.
for learn_opt in [3e-5, 2e-5]:
    for epoch_opt in [6]:
        print (f'learning rate of {learn_opt}, count of epochs {epoch_opt}')

        # Load BertForSequenceClassification, the pretrained BERT model with a single 
        # linear classification layer on top. 
        model3 = BertForSequenceClassification.from_pretrained(
            MODEL_NAME, # Use the 12-layer BERT model, with an uncased vocab.
            num_labels = 2, # The number of output labels--2 for binary classification.
            output_attentions = False, # Whether the model returns attentions weights.
            output_hidden_states = False, # Whether the model returns all hidden-states.
        )

        # Use the notebook-wide device (GPU when available, CPU otherwise) rather than
        # .cuda(), which fails on a machine without a GPU.
        model3.to(device)

        Bestmodel = train_BERT_2(model3, dataloaders, learning_rate=learn_opt,
               num_epoch=epoch_opt)

        # raw counts first, then the row-normalised matrix
        for normalize_opt, plot_title in [(None, 'test set confusion matrix'),
                                          ('true', 'normalized confusion matrix')]:
            confusion = confusion_matrix(Bestmodel['test_truths'], Bestmodel['test_predictions'],
                    labels=None, sample_weight=None, normalize=normalize_opt)
            print (confusion)
            df_cm = pd.DataFrame(confusion, index=['not readmitted', 'readmitted'], columns=['not readmitted', 'readmitted'])
            plt.figure(figsize = (10,7))
            sns.heatmap(df_cm, annot=True, cmap=plt.cm.Blues)
            plt.ylabel('True label')
            plt.xlabel('Predicted label')
            plt.title(plot_title)
            plt.show()

In [None]:
# Seed torch once for the whole cell; configurations after the first one
# continue from the RNG state left by the previous run.
torch.manual_seed(666)

for learn_opt in [3e-5]:
    for epoch_opt in [3, 4, 5]:
        print (f'learning rate of {learn_opt}, count of epochs {epoch_opt}')

        # Load BertForSequenceClassification (the pretrained model with a single
        # linear classification layer on top) anew for every configuration.
        model4 = BertForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels = 2, # binary classification
            output_attentions = False, # do not return attention weights
            output_hidden_states = False, # do not return all hidden states
        )

        # Use the device chosen at the top of the notebook instead of
        # hard-coding CUDA, so the cell also runs on the CPU fallback.
        model4.to(device)

        Bestmodel = train_BERT_2(model4, dataloaders, learning_rate=learn_opt,
               num_epoch=epoch_opt)

        # Raw counts first, then the matrix normalised over the true labels.
        class_names = ['not readmitted', 'readmitted']
        for normalize, title in [(None, 'test set confusion matrix'),
                                 ('true', 'normalized confusion matrix')]:
            confusion = confusion_matrix(Bestmodel['test_truths'], Bestmodel['test_predictions'],
                    labels=None, sample_weight=None, normalize=normalize)
            print (confusion)
            df_cm = pd.DataFrame(confusion, index=class_names, columns=class_names)
            plt.figure(figsize = (10,7))
            sns.heatmap(df_cm, annot=True, cmap=plt.cm.Blues)
            plt.ylabel('True label')
            plt.xlabel('Predicted label')
            plt.title(title)
            plt.show()

In [None]:
# Seed torch once for the whole cell; configurations after the first one
# continue from the RNG state left by the previous run.
torch.manual_seed(666)

for learn_opt in [2e-5]:
    for epoch_opt in [3, 4, 5]:
        print (f'learning rate of {learn_opt}, count of epochs {epoch_opt}')

        # Load BertForSequenceClassification (the pretrained model with a single
        # linear classification layer on top) anew for every configuration.
        model4 = BertForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels = 2, # binary classification
            output_attentions = False, # do not return attention weights
            output_hidden_states = False, # do not return all hidden states
        )

        # Use the device chosen at the top of the notebook instead of
        # hard-coding CUDA, so the cell also runs on the CPU fallback.
        model4.to(device)

        Bestmodel = train_BERT_2(model4, dataloaders, learning_rate=learn_opt,
               num_epoch=epoch_opt)

        # Raw counts first, then the matrix normalised over the true labels.
        class_names = ['not readmitted', 'readmitted']
        for normalize, title in [(None, 'test set confusion matrix'),
                                 ('true', 'normalized confusion matrix')]:
            confusion = confusion_matrix(Bestmodel['test_truths'], Bestmodel['test_predictions'],
                    labels=None, sample_weight=None, normalize=normalize)
            print (confusion)
            df_cm = pd.DataFrame(confusion, index=class_names, columns=class_names)
            plt.figure(figsize = (10,7))
            sns.heatmap(df_cm, annot=True, cmap=plt.cm.Blues)
            plt.ylabel('True label')
            plt.xlabel('Predicted label')
            plt.title(title)
            plt.show()

In [None]:
# Rebuild the same BertForSequenceClassification architecture that was used
# for training, then restore fine-tuned weights from a saved checkpoint.
model_load = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels = 2, # binary classification
    output_attentions = False, # do not return attention weights
    output_hidden_states = False, # do not return all hidden states
)

# Place the model on the device chosen at the top of the notebook, and map
# the checkpoint tensors onto that same device so a file written on a GPU
# can still be restored on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints you produced.
model_load.to(device)
model_load.load_state_dict(
    torch.load('model/full_data_read_bert_lr_2e-05_numepoch_3_currepoch2.pth',
               map_location=device))
model_load.eval()

In [None]:
def model_test (model):
    """Evaluate ``model`` on the notebook-level ``dataloaders['test']`` split.

    Each batch is expected to unpack into ``(data, attention mask, labels)``.
    The model is called with ``labels`` so its output carries the loss first
    and the per-class scores second; only the scores are used here.

    Prints the test accuracy and returns a dict with
    ``'test_predictions'`` (argmax class per sample) and ``'test_truths'``.
    """
    model.eval()

    correct = 0
    total = 0
    predictions = []
    truths = []
    with torch.no_grad():
        for data, b_input_mask, labels in dataloaders['test']:
            data, b_input_mask, labels = data.to(device), b_input_mask.to(device), labels.to(device)
            outputs = model(data,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=labels)
            # Integer indexing works for a plain (loss, logits) tuple and for
            # the ModelOutput objects returned by newer transformers releases.
            logits = outputs[1]
            _, preds = torch.max(logits, dim = 1)
            predictions += list(preds.cpu().numpy())
            truths += list(labels.cpu().numpy())
            total += labels.size(0)
            # .item() keeps the running count a Python int instead of a device tensor.
            correct += (preds == labels).sum().item()

    # Guard against an empty test loader instead of dividing by zero.
    acc = correct / total if total else float('nan')
    print('Test set | Accuracy: {:6.4f}'.format(
        acc))

    return {'test_predictions': predictions, 'test_truths': truths}

In [None]:
# Evaluate the reloaded checkpoint (model_load) on the test split; the returned
# dict holds 'test_predictions' and 'test_truths', which the cells below plot.
Bestmodel_preds = model_test(model_load)

In [None]:
# Pulled the most successful model back in (reloaded from its saved checkpoint) to create some of the output visuals.

In [None]:
# Draw the test-set confusion matrix twice: raw counts first, then
# normalised over the true labels.
class_names = ['not readmitted', 'readmitted']
for normalize, title in [(None, 'test set confusion matrix'),
                         ('true', 'normalized confusion matrix')]:
    confusion = confusion_matrix(
        Bestmodel_preds['test_truths'],
        Bestmodel_preds['test_predictions'],
        labels=None,
        sample_weight=None,
        normalize=normalize,
    )
    print (confusion)
    df_cm = pd.DataFrame(confusion, index=class_names, columns=class_names)
    plt.figure(figsize = (10,7))
    sns.heatmap(df_cm, annot=True, cmap=plt.cm.Blues)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.title(title)
    plt.show()

In [None]:
# ROC AUC on the test split. 'test_predictions' holds the argmax class labels
# returned by model_test, not continuous scores, so this value reflects a
# single thresholded operating point rather than a ranking over scores.
roc_auc_score(Bestmodel_preds['test_truths'], Bestmodel_preds['test_predictions'])

In [None]:
# Seed torch once for the whole cell before the (single) configuration runs.
torch.manual_seed(666)

for learn_opt in [2e-5]:
    for epoch_opt in [15]:
        print (f'learning rate of {learn_opt}, count of epochs {epoch_opt}')

        # Load BertForSequenceClassification (the pretrained model with a single
        # linear classification layer on top) anew for every configuration.
        model4 = BertForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels = 2, # binary classification
            output_attentions = False, # do not return attention weights
            output_hidden_states = False, # do not return all hidden states
        )

        # Use the device chosen at the top of the notebook instead of
        # hard-coding CUDA, so the cell also runs on the CPU fallback.
        model4.to(device)

        Bestmodel = train_BERT_2(model4, dataloaders, learning_rate=learn_opt,
               num_epoch=epoch_opt)

        # Raw counts first, then the matrix normalised over the true labels.
        class_names = ['not readmitted', 'readmitted']
        for normalize, title in [(None, 'test set confusion matrix'),
                                 ('true', 'normalized confusion matrix')]:
            confusion = confusion_matrix(Bestmodel['test_truths'], Bestmodel['test_predictions'],
                    labels=None, sample_weight=None, normalize=normalize)
            print (confusion)
            df_cm = pd.DataFrame(confusion, index=class_names, columns=class_names)
            plt.figure(figsize = (10,7))
            sns.heatmap(df_cm, annot=True, cmap=plt.cm.Blues)
            plt.ylabel('True label')
            plt.xlabel('Predicted label')
            plt.title(title)
            plt.show()

In [None]:
# models trained with balanced data

In [None]:
def train_BERT_3(model, dataloaders, 
          learning_rate=5e-5,  # args.learning_rate - default is 5e-5
               num_epoch=3
         ):
    """Fine-tune ``model``, keep the best-validation weights, and test them.

    Parameters
    ----------
    model : sequence-classification model that, when called with ``labels``,
        returns the loss first and the per-class scores second.
    dataloaders : mapping with ``'train'``, ``'val'`` and ``'test'`` loaders;
        every batch unpacks into ``(data, attention mask, labels)``.
    learning_rate : AdamW learning rate.
    num_epoch : number of passes over the training loader.

    Returns
    -------
    dict with keys
        ``'Model'`` (model holding the best validation weights),
        ``'LossDict'`` / ``'AccDict'`` (per-epoch history keyed by
        ``'train'`` / ``'validation'``),
        ``'test_predictions'`` (argmax class per test sample),
        ``'test_pred_prob'`` (softmax probability of the predicted class),
        ``'test_probas'`` (softmax probabilities for every class, one row
        per test sample) and ``'test_truths'``.

    Side effects: prints progress, shows the loss/accuracy curves, and writes
    a checkpoint under ``model/`` whenever validation accuracy improves.
    """
    start_time = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    acc_dict = {'train':[],'validation':[]}
    loss_dict = {'train':[],'validation':[]}
    history_key = {'train': 'train', 'val': 'validation'}

    optimizer = AdamW(model.parameters(),
                  lr = learning_rate, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8
                )

    # Build the schedule ONCE for the whole run. Re-creating it at the start of
    # every epoch restarts the linear decay, so the learning rate would never
    # follow the intended schedule across ``total_steps``.
    total_steps = len(dataloaders['train']) * num_epoch
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                num_warmup_steps = 0, # Default value in run_glue.py
                                num_training_steps = total_steps)

    # torch.save does not create missing directories.
    os.makedirs('model', exist_ok=True)

    for epoch in range(num_epoch):
        for phase in ['train','val']:
            is_train = phase == 'train'
            model.train(is_train)

            running_loss = 0.0
            running_corrects = 0
            samples_seen = 0

            for data, b_input_mask, labels in dataloaders[phase]:
                data, b_input_mask, labels = data.to(device), b_input_mask.to(device), labels.to(device)
                if is_train:
                    model.zero_grad()
                # Skip autograd bookkeeping during validation.
                with torch.set_grad_enabled(is_train):
                    outputs = model(data, 
                           token_type_ids=None, 
                           attention_mask=b_input_mask,
                           labels=labels)
                    # Integer indexing works for a plain (loss, logits) tuple and
                    # for ModelOutput objects from newer transformers releases.
                    loss, logits = outputs[0], outputs[1]
                _, preds = torch.max(logits, dim = 1)
                if is_train:
                    loss.backward()
                    optimizer.step()
                    scheduler.step()

                # The model's loss is a batch mean; weight it by the batch size
                # so the epoch figure is a true per-sample average.
                batch_size = labels.size(0)
                running_loss += loss.item() * batch_size
                running_corrects += torch.sum(preds == labels).item()
                samples_seen += batch_size

            # Evaluate after every epoch, using the number of samples actually
            # iterated rather than a notebook-level size table.
            denom = max(samples_seen, 1)
            epoch_loss = running_loss / denom
            epoch_acc = running_corrects / denom

            loss_dict[history_key[phase]].append(epoch_loss)
            acc_dict[history_key[phase]].append(epoch_acc)

            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                # NOTE(review): the file name encodes only lr / num_epoch / epoch, so
                # another run with the same settings presumably overwrites it - confirm.
                torch.save(best_model_wts, f'model/full_data_read_bert_lr_{learning_rate}_numepoch_{num_epoch}_currepoch{epoch + 1}.pth')

            print('{} set | epoch: {}/{} | Loss: {:.4f} Acc: {:.4f}'.format(
                    phase, epoch + 1, num_epoch, epoch_loss, epoch_acc))  
            time_elapsed = time.time() - start_time
            print('Training time so far: {}minutes {}s'.format(int(time_elapsed / 60), time_elapsed % 60))

    time_elapsed = time.time() - start_time
    print('Training time: {}minutes {}s'.format(int(time_elapsed / 60), time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # One figure: a row per phase, loss on the left and accuracy on the right.
    fig, axes = plt.subplots(2, 2)
    for row, phase in enumerate(['train','validation']):
        axes[row, 0].plot(loss_dict[phase])
        axes[row, 0].set_title('Loss per epoch for ' + phase)
        axes[row, 1].plot(acc_dict[phase])
        axes[row, 1].set_title('Accuracy per epoch for ' + phase)
    fig.tight_layout()
    plt.show()

    # Test with the weights that scored best on validation.
    model.load_state_dict(best_model_wts)            
    model.eval()

    correct = 0
    total = 0
    predictions = []
    truths = []
    pred_prob_lst = []
    probas = []
    with torch.no_grad():
        for data, b_input_mask, labels in dataloaders['test']:
            data, b_input_mask, labels = data.to(device), b_input_mask.to(device), labels.to(device)
            outputs = model(data, 
                   token_type_ids=None, 
                   attention_mask=b_input_mask,
                   labels=labels)
            logits = outputs[1]
            # Predictions come from the raw scores (unchanged behaviour); the
            # reported probability is the softmax value of the predicted class
            # rather than the raw maximum score.
            _, preds = torch.max(logits, dim = 1)
            probs = torch.softmax(logits, dim = 1)
            pred_prob = probs.gather(1, preds.unsqueeze(1)).squeeze(1)
            predictions += list(preds.cpu().numpy())
            pred_prob_lst += list(pred_prob.cpu().numpy())
            probas += list(probs.cpu().numpy())
            truths += list(labels.cpu().numpy())
            total += labels.size(0)
            correct += (preds == labels).sum().item()

    # Guard against an empty test loader instead of dividing by zero.
    acc = correct / total if total else float('nan')
    elapse = time.strftime('%H:%M:%S', time.gmtime(int((time.time() - start_time))))
    print('Test set | Accuracy: {:6.4f} | time elapse: {:>9}'.format(
        acc, elapse))

    return {'Model': model, 'LossDict': loss_dict, 'AccDict': acc_dict, 
            'test_predictions': predictions, 'test_pred_prob': pred_prob_lst,
            'test_probas': probas, 'test_truths': truths}


In [None]:
# Seed torch once for the whole cell before the (single) configuration runs.
torch.manual_seed(666)

for learn_opt in [2e-5]:
    for epoch_opt in [6]:
        print (f'learning rate of {learn_opt}, count of epochs {epoch_opt}')

        # Load BertForSequenceClassification (the pretrained model with a single
        # linear classification layer on top) anew for every configuration.
        model5 = BertForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels = 2, # binary classification
            output_attentions = False, # do not return attention weights
            output_hidden_states = False, # do not return all hidden states
        )

        # Use the device chosen at the top of the notebook instead of
        # hard-coding CUDA, so the cell also runs on the CPU fallback.
        model5.to(device)

        Bestmodel = train_BERT_3(model5, dataloaders, learning_rate=learn_opt,
               num_epoch=epoch_opt)

        # Raw counts first, then the matrix normalised over the true labels.
        class_names = ['not readmitted', 'readmitted']
        for normalize, title in [(None, 'test set confusion matrix'),
                                 ('true', 'normalized confusion matrix')]:
            confusion = confusion_matrix(Bestmodel['test_truths'], Bestmodel['test_predictions'],
                    labels=None, sample_weight=None, normalize=normalize)
            print (confusion)
            df_cm = pd.DataFrame(confusion, index=class_names, columns=class_names)
            plt.figure(figsize = (10,7))
            sns.heatmap(df_cm, annot=True, cmap=plt.cm.Blues)
            plt.ylabel('True label')
            plt.xlabel('Predicted label')
            plt.title(title)
            plt.show()

In [None]:
# Seed torch once for the whole cell before the (single) configuration runs.
torch.manual_seed(666)

for learn_opt in [2e-5]:
    for epoch_opt in [5]:
        print (f'learning rate of {learn_opt}, count of epochs {epoch_opt}')

        # Load BertForSequenceClassification (the pretrained model with a single
        # linear classification layer on top) anew for every configuration.
        model5 = BertForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels = 2, # binary classification
            output_attentions = False, # do not return attention weights
            output_hidden_states = False, # do not return all hidden states
        )

        # Use the device chosen at the top of the notebook instead of
        # hard-coding CUDA, so the cell also runs on the CPU fallback.
        model5.to(device)

        Bestmodel = train_BERT_3(model5, dataloaders, learning_rate=learn_opt,
               num_epoch=epoch_opt)

        # Raw counts first, then the matrix normalised over the true labels.
        class_names = ['not readmitted', 'readmitted']
        for normalize, title in [(None, 'test set confusion matrix'),
                                 ('true', 'normalized confusion matrix')]:
            confusion = confusion_matrix(Bestmodel['test_truths'], Bestmodel['test_predictions'],
                    labels=None, sample_weight=None, normalize=normalize)
            print (confusion)
            df_cm = pd.DataFrame(confusion, index=class_names, columns=class_names)
            plt.figure(figsize = (10,7))
            sns.heatmap(df_cm, annot=True, cmap=plt.cm.Blues)
            plt.ylabel('True label')
            plt.xlabel('Predicted label')
            plt.title(title)
            plt.show()

In [None]:
# Rebuild the same BertForSequenceClassification architecture that was used
# for training, then restore fine-tuned weights from a saved checkpoint.
model_load = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels = 2, # binary classification
    output_attentions = False, # do not return attention weights
    output_hidden_states = False, # do not return all hidden states
)

# Place the model on the device chosen at the top of the notebook, and map
# the checkpoint tensors onto that same device so a file written on a GPU
# can still be restored on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints you produced.
model_load.to(device)
model_load.load_state_dict(
    torch.load('model/full_data_read_bert_lr_2e-05_numepoch_6_currepoch2.pth',
               map_location=device))
model_load.eval()

In [None]:
def test_model_with_pred(model):
    """Evaluate ``model`` on the notebook-level ``dataloaders['test']`` split.

    Each batch is expected to unpack into ``(data, attention mask, labels)``.
    The model is called with ``labels`` so its output carries the loss first
    and the per-class scores second; only the scores are used here.

    Prints the test accuracy and returns a dict with keys
    ``'Model'``, ``'test_predictions'`` (argmax class per sample),
    ``'test_pred_prob'`` (softmax probability of the predicted class),
    ``'test_probas'`` (softmax probabilities for every class, one row per
    sample) and ``'test_truths'``.
    """
    model.eval()

    correct = 0
    total = 0
    predictions = []
    truths = []
    pred_prob_lst = []
    probas = []
    with torch.no_grad():
        for data, b_input_mask, labels in dataloaders['test']:
            data, b_input_mask, labels = data.to(device), b_input_mask.to(device), labels.to(device)
            outputs = model(data,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=labels)
            # Integer indexing works for a plain (loss, logits) tuple and for
            # the ModelOutput objects returned by newer transformers releases.
            logits = outputs[1]
            # Predictions come from the raw scores (unchanged behaviour); the
            # reported probability is the softmax value of the predicted class
            # rather than the raw maximum score.
            _, preds = torch.max(logits, dim = 1)
            probs = torch.softmax(logits, dim = 1)
            pred_prob = probs.gather(1, preds.unsqueeze(1)).squeeze(1)
            predictions += list(preds.cpu().numpy())
            pred_prob_lst += list(pred_prob.cpu().numpy())
            probas += list(probs.cpu().numpy())
            truths += list(labels.cpu().numpy())
            total += labels.size(0)
            # .item() keeps the running count a Python int instead of a device tensor.
            correct += (preds == labels).sum().item()

    # Guard against an empty test loader instead of dividing by zero.
    acc = correct / total if total else float('nan')
    print('Test set | Accuracy: {:6.4f}'.format(
        acc))

    return {'Model': model, 
            'test_predictions': predictions, 'test_pred_prob': pred_prob_lst,
            'test_probas': probas, 'test_truths': truths}

In [None]:
# Evaluate the reloaded checkpoint (model_load) on the test split; the returned
# dict includes 'test_predictions', 'test_pred_prob' and 'test_truths', which
# the metric and plotting cells below read.
Bestmodel_preds = test_model_with_pred(model_load)

In [None]:
# ROC AUC on the test split. 'test_predictions' holds the argmax class labels
# returned by test_model_with_pred, not continuous scores, so this value
# reflects a single thresholded operating point rather than a ranking.
roc_auc_score(Bestmodel_preds['test_truths'], Bestmodel_preds['test_predictions'])

In [None]:
from sklearn.metrics import roc_curve
import scikitplot as skplt

In [None]:
# ROC curve for the test split, computed from the predicted class labels
# stored under 'test_predictions'.
fpr, tpr, thresholds = roc_curve(
    Bestmodel_preds['test_truths'],
    Bestmodel_preds['test_predictions'],
)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.axis([0, 1, 0, 1])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.show()

In [None]:
# Precision-recall curves for the test split.
# NOTE(review): the second argument is built by zipping two per-sample lists, so
# column 0 is 'test_pred_prob' (a score for the predicted class only) and
# column 1 is the predicted class label. scikit-plot presumably expects one
# probability column per class here - confirm before relying on these curves.
skplt.metrics.plot_precision_recall_curve(np.array(Bestmodel_preds['test_truths']), 
                                          np.array(list(zip(Bestmodel_preds['test_pred_prob'], 
                                                    Bestmodel_preds['test_predictions']))))

In [None]:
# ROC curves for the test split.
# NOTE(review): the second argument is built by zipping two per-sample lists, so
# column 0 is 'test_pred_prob' (a score for the predicted class only) and
# column 1 is the predicted class label. scikit-plot presumably expects one
# probability column per class here - confirm before relying on these curves.
skplt.metrics.plot_roc_curve(np.array(Bestmodel_preds['test_truths']), np.array(list(zip(Bestmodel_preds['test_pred_prob'], Bestmodel_preds['test_predictions']))))
plt.show()

In [None]:
# Rebuild the same BertForSequenceClassification architecture that was used
# for training, then restore fine-tuned weights from a saved checkpoint.
model_load = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels = 2, # binary classification
    output_attentions = False, # do not return attention weights
    output_hidden_states = False, # do not return all hidden states
)

# Place the model on the device chosen at the top of the notebook, and map
# the checkpoint tensors onto that same device so a file written on a GPU
# can still be restored on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints you produced.
model_load.to(device)
model_load.load_state_dict(
    torch.load('model/full_data_read_bert_lr_2e-05_numepoch_5_currepoch2.pth',
               map_location=device))
model_load.eval()

In [None]:
def test_model_with_pred(model):
    """Evaluate ``model`` on the notebook-level ``dataloaders['test']`` split.

    This name is also defined in an earlier cell; whichever definition is
    executed last is the one in effect.

    Each batch is expected to unpack into ``(data, attention mask, labels)``.
    The model is called with ``labels`` so its output carries the loss first
    and the per-class scores second; only the scores are used here.

    Prints the test accuracy and returns a dict with keys
    ``'Model'``, ``'test_predictions'`` (argmax class per sample),
    ``'test_pred_prob'`` (softmax probability of the predicted class),
    ``'test_probas'`` (softmax probabilities for every class, one row per
    sample) and ``'test_truths'``.
    """
    model.eval()

    correct = 0
    total = 0
    predictions = []
    truths = []
    pred_prob_lst = []
    probas = []
    with torch.no_grad():
        for data, b_input_mask, labels in dataloaders['test']:
            data, b_input_mask, labels = data.to(device), b_input_mask.to(device), labels.to(device)
            outputs = model(data,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=labels)
            # Integer indexing works for a plain (loss, logits) tuple and for
            # the ModelOutput objects returned by newer transformers releases.
            logits = outputs[1]
            # Predictions come from the raw scores (unchanged behaviour); the
            # reported probability is the softmax value of the predicted class
            # rather than the raw maximum score.
            _, preds = torch.max(logits, dim = 1)
            probs = torch.softmax(logits, dim = 1)
            pred_prob = probs.gather(1, preds.unsqueeze(1)).squeeze(1)
            predictions += list(preds.cpu().numpy())
            pred_prob_lst += list(pred_prob.cpu().numpy())
            probas += list(probs.cpu().numpy())
            truths += list(labels.cpu().numpy())
            total += labels.size(0)
            # .item() keeps the running count a Python int instead of a device tensor.
            correct += (preds == labels).sum().item()

    # Guard against an empty test loader instead of dividing by zero.
    acc = correct / total if total else float('nan')
    print('Test set | Accuracy: {:6.4f}'.format(
        acc))

    return {'Model': model, 
            'test_predictions': predictions, 'test_pred_prob': pred_prob_lst,
            'test_probas': probas, 'test_truths': truths}

In [None]:
# Evaluate the most recently loaded checkpoint (model_load) on the test split;
# this overwrites the Bestmodel_preds produced by the earlier evaluation cell.
Bestmodel_preds = test_model_with_pred(model_load)

In [None]:
# ROC AUC on the test split. 'test_predictions' holds the argmax class labels
# returned by test_model_with_pred, not continuous scores, so this value
# reflects a single thresholded operating point rather than a ranking.
roc_auc_score(Bestmodel_preds['test_truths'], Bestmodel_preds['test_predictions'])

In [None]:
from sklearn.metrics import roc_curve
import scikitplot as skplt

In [None]:
# ROC curve for the test split, computed from the predicted class labels
# stored under 'test_predictions'.
fpr, tpr, thresholds = roc_curve(
    Bestmodel_preds['test_truths'],
    Bestmodel_preds['test_predictions'],
)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.axis([0, 1, 0, 1])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.show()

In [None]:
# Precision-recall curves for the test split.
# NOTE(review): the second argument is built by zipping two per-sample lists, so
# column 0 is 'test_pred_prob' (a score for the predicted class only) and
# column 1 is the predicted class label. scikit-plot presumably expects one
# probability column per class here - confirm before relying on these curves.
skplt.metrics.plot_precision_recall_curve(np.array(Bestmodel_preds['test_truths']), 
                                          np.array(list(zip(Bestmodel_preds['test_pred_prob'], 
                                                    Bestmodel_preds['test_predictions']))))

In [None]:
# ROC curves for the test split.
# NOTE(review): the second argument is built by zipping two per-sample lists, so
# column 0 is 'test_pred_prob' (a score for the predicted class only) and
# column 1 is the predicted class label. scikit-plot presumably expects one
# probability column per class here - confirm before relying on these curves.
skplt.metrics.plot_roc_curve(np.array(Bestmodel_preds['test_truths']), np.array(list(zip(Bestmodel_preds['test_pred_prob'], Bestmodel_preds['test_predictions']))))
plt.show()