# Setup

In [None]:
import pickle
import numpy as np
import pandas as pd

import os
import time
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sn
from tabulate import tabulate

import torch
import torch.nn as nn

from tqdm.notebook import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, hamming_loss, roc_auc_score
from scipy.special import softmax

In [None]:
from torch import cuda

#Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(device)

# Set Variables

In [None]:
os.makedirs("_MODELS", exist_ok = True) #Folder to save all models

#Path to emotion embeddings
#File must contain vocabulary map and weights
#(read with pickle.load by getEmoEmbeddings, which looks up the keys "vocab_map" and "weights")
#Must be set before running the MM-EMOG cell; the empty default cannot be opened
emoEmbPath = ""

#Upper bound on the tokenised sequence length used when building model inputs
MAX_LENGTH = 256
#Batch size shared by the train, validation and test DataLoaders
BATCH_SIZE = 64

# Load Data

In [None]:
#Data Preparation
#Fill in the three splits below; every sentence list must line up with its label list
original_train_sentences = []
original_val_sentences = []
original_test_sentences = []

original_train_labels = []
original_val_labels = []
original_test_labels = []

for split_sentences, split_labels in [(original_train_sentences, original_train_labels),
                                      (original_val_sentences, original_val_labels),
                                      (original_test_sentences, original_test_labels)]:
  assert len(split_sentences) == len(split_labels)

train_size, val_size, test_size = len(original_train_sentences), len(original_val_sentences), len(original_test_sentences)

#Position of each split inside the concatenated arrays (train first, then val, then test)
val_start = train_size
test_start = val_start + val_size
train_idx = np.arange(0, val_start)
val_idx = np.arange(val_start, test_start)
test_idx = np.arange(test_start, test_start + test_size)

all_sentences = np.array(original_train_sentences + original_val_sentences + original_test_sentences)
all_labels = np.array(original_train_labels + original_val_labels + original_test_labels)

#Label Encoding
#The class set comes from the training labels only
unique_labels = np.unique(original_train_labels)
num_class = len(unique_labels)

lEnc = LabelEncoder()
lEnc.fit(unique_labels)

print(unique_labels)
print(lEnc.transform(unique_labels))

#Integer targets for every sentence, in the same order as all_sentences
all_targets = lEnc.transform(all_labels)

# Load Resources

In [None]:
#Load list of emoticons
#Source: https://c.r74n.com/faces

#Decode explicitly as UTF-8 so the result does not depend on the platform's default encoding
#(assumes the list was saved as UTF-8 - confirm if the file comes from another source)
with open("TextEmoticonList.txt", "r", encoding = "utf-8") as file:
  emoticonList = file.read().split("\n")

#Keep only emoticons without spaces in-between and longer than one character
#(blank lines are dropped by the length check)
emoticonList = [emoticon for emoticon in emoticonList if " " not in emoticon and len(emoticon) > 1]

print(len(emoticonList))
print(emoticonList[:10])

In [None]:
#Load list of emojis
#Source: https://www.airtable.com/universe/exphjm5ifnV0bX4Kb/emojis-database?explore=true

emoji_table = pd.read_csv("Emojis-Grid view.csv")

#Drop the rows whose "Emoji" cell is the literal letter "C", then keep the column as a plain list
is_real_emoji = emoji_table["Emoji"] != "C"
emojiList = emoji_table.loc[is_real_emoji, "Emoji"].tolist()

#Unicode versions: the same emojis written as ASCII escape sequences
emojiList_uni = []
for symbol in emojiList:
  emojiList_uni.append(symbol.encode('unicode-escape').decode('ASCII'))

print(len(emojiList))
print(emojiList[:10])
print(emojiList_uni[:10])

# Preprocess

## Text

In [None]:
#FLAGS
#Read as globals by preprocess_str and get_tokenizer; change them before running the cells below
DEIDENTIFY = True     #Replace urls, emails, and usernames
EMOPRESERVE = True    #Identify emojis/emoticons on text and skip text cleaning on them
TEXTCLEAN = False     #Minimal cleaning of separating certain conjunctions
#NOTE(review): the visible training cells call get_tokenizer("wp", ...) directly, so this flag looks unused - confirm
TOKEN_TYPE = "wp"     #wp: word piece (BERT Tokenizer); ws: word split

In [None]:
import re

#Placeholder tokens substituted for identifying content when DEIDENTIFY is on
tokenURL = "_URL_"
tokenEmail = "_EMAIL_"
tokenUsername = "_USER_"
#Collected so they can be protected from cleaning and added to the tokenizer vocabulary
reserveTokens = [tokenURL, tokenEmail, tokenUsername]

#CLEANING PROCESS
#- Include emojis and emoticons
#- Replace url, email, and usernames with tokens
#- Remove non-major punctuation and separate it from words with whitespace
#- Lowercase
def preprocess_str(string):
  """Clean one raw sentence and return it as a single space-joined string.

  Behaviour is driven by the module-level flags:
    DEIDENTIFY  - replace links, e-mail addresses and @usernames with the reserved tokens.
    EMOPRESERVE - split known emoticons/emojis/reserved tokens out of the text and leave
                  them untouched (no cleaning, no lower-casing).
    TEXTCLEAN   - strip unusual characters and pad contractions/punctuation with spaces.
  Every fragment that is not a preserved emoticon is lower-cased and stripped.

  Args:
    string: the raw text.

  Returns:
    The cleaned text, fragments joined by single spaces.
  """

  #Preclean
  if DEIDENTIFY:
    string = re.sub(r"https?://[^\s]+", tokenURL, string)              #Links
    string = re.sub(r"[\w.+-]+@[\w-]+\.[\w.-]+", tokenEmail, string)   #Email
    string = re.sub(r"@[a-zA-Z0-9_]{2,}", tokenUsername, string)       #Usernames

  #Emoticon/Emoji split
  tokens = [string]
  protected = set()   #Fragments that must be skipped by the cleaning loop
  if EMOPRESERVE:
    allEmo = emoticonList + emojiList + emojiList_uni + reserveTokens
    protected = set(allEmo)
    for emoticon in allEmo:
      if emoticon not in string:
        continue

      #Alphabetic emoticons (ie "omo") only count when delimited by whitespace,
      #otherwise ordinary words containing them (ie "tomorrow") would be split.
      #The delimiters are non-capturing so re.split only returns the matched span.
      if emoticon.isalpha():
        regEx = r"(?:^|\s)" + re.escape(emoticon) + r"(?:\s|$)"
      else:
        regEx = re.escape(emoticon)

      fragments = []
      for piece in tokens:
        fragments.extend(re.split("(" + regEx + ")", piece))
      #Drop fragments that are empty once stripped so the final join has no runs of spaces
      tokens = [fragment.strip() for fragment in fragments if fragment.strip() != ""]

  #Each cleaning step: (pattern, replacement), applied in this order
  cleaning_steps = [(r"[^A-Za-z0-9(),!?\.\'\`]", " "),
                    (r"\'s", " 's"),
                    (r"\'ve", " 've"),
                    (r"n\'t", " n't"),
                    (r"\'re", " 're"),
                    (r"\'d", " 'd"),
                    (r"\'ll", " 'll"),
                    (r",", " , "),
                    (r"!", " ! "),
                    (r"\(", " ( "),
                    (r"\)", " ) "),
                    (r"\?", " ? "),
                    (r"\.", " . "),
                    (r"\s{2,}", " ")]

  for idx in range(len(tokens)):
    if EMOPRESERVE and tokens[idx] in protected: #Skip emoticons, emojis
      continue

    if TEXTCLEAN:
      for pattern, replacement in cleaning_steps:
        tokens[idx] = re.sub(pattern, replacement, tokens[idx])

    #Lower case and strip by default
    tokens[idx] = tokens[idx].lower().strip()

  return " ".join(tokens)

## Tokenizer

In [None]:
def get_tokenizer(token_type, checkpoint = None):
  """Build the tokenizer matching `token_type`.

  Args:
    token_type: "wp" for a word-piece tokenizer from Hugging Face, "ws" for plain
      whitespace splitting (case-insensitive).
    checkpoint: Hugging Face checkpoint name. None, "bert-base-uncased" and
      "custom/MM-EMOG-SenticNet" all use the bert-base-uncased BertTokenizer;
      anything else is loaded through AutoTokenizer.

  Returns:
    For "wp": the tokenizer, extended with the reserved tokens (when DEIDENTIFY) and
    with the emoticon/emoji lists (when EMOPRESERVE).
    For "ws": a callable that splits a string on whitespace.

  Raises:
    ValueError: if `token_type` is neither "wp" nor "ws".
  """
  kind = token_type.lower()

  if kind == "wp":
    if checkpoint in [None, "bert-base-uncased", "custom/MM-EMOG-SenticNet"]:
      from transformers import BertTokenizer
      tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    else:
      from transformers import AutoTokenizer
      tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    if DEIDENTIFY:
      tokenizer.add_tokens(reserveTokens)

    if EMOPRESERVE:
      #Add spaces to alpha emotions to avoid splitting words that commonly has them (ie "omo" in "tomorrow")
      temp = [" %s " % x if x.isalpha() else x for x in emoticonList]
      tokenizer.add_tokens(temp + emojiList + emojiList_uni)

    return tokenizer
  elif kind == "ws":
    #The previous code evaluated `string.split()` on an undefined name and raised NameError;
    #return the splitting function itself so callers can apply it to each sentence
    return str.split
  else:
    raise ValueError("Unknown value for TOKEN_TYPE")

# Models

## Pretrained LM

In [None]:
from transformers import AutoConfig, AutoModelForSequenceClassification
from transformers import logging
logging.set_verbosity_error()

def get_plm(checkpoint, num_class, args = None):
  """Instantiate the classifier for `checkpoint`.

  Args:
    checkpoint: one of the supported Hugging Face checkpoints, or a name starting
      with "custom/" to build the embedding-based MLP.
    num_class: number of output classes.
    args: for Hugging Face checkpoints, optional config overrides passed to
      AutoConfig.from_pretrained; for "custom/..." checkpoints, the keyword
      arguments of MLP (must contain "pt_weights" and "pt_weights_dim").

  Returns:
    An AutoModelForSequenceClassification instance or an MLP instance.

  Raises:
    AssertionError: if a custom checkpoint is requested without the required args.
    Exception: if the checkpoint is not recognised.
  """
  #"medicalai/ClinicalBERT" is included because the ClinicalBERT cell trains it through this function
  hf_checkpoints = ["bert-base-uncased", "roberta-base", "mental/mental-bert-base-uncased", "medicalai/ClinicalBERT"]

  if checkpoint in hf_checkpoints:
    #Load the config once, applying the overrides only when some were given
    overrides = args if args is not None else {}
    config = AutoConfig.from_pretrained(checkpoint, num_labels = num_class, **overrides)

    return AutoModelForSequenceClassification.from_pretrained(checkpoint, config = config)
  elif checkpoint.split("/")[0] == "custom":
    assert args is not None, "custom checkpoints need the MLP arguments"
    assert "pt_weights" in args
    assert "pt_weights_dim" in args

    return MLP(num_class = num_class, **args)
  else:
    raise Exception("Unknown checkpoint")

## MLP

In [None]:
class MLP(torch.nn.Module):
  """Feed-forward classifier on top of pretrained token embeddings.

  The embeddings of all positions are flattened into one vector and passed through
  `num_layers` Linear layers, each followed by ReLU and Dropout.

  Args:
    pt_weights: 2-D float tensor of pretrained embedding weights (vocab x dim);
      loaded with nn.Embedding.from_pretrained, which freezes them by default.
    pt_weights_dim: embedding dimension.
    num_class: number of output classes.
    num_layers: total number of Linear layers; values below 2 build a single layer.
    hidden_dim: width of the hidden layers.
    dropout: dropout probability used after every layer.
    actual_max: sequence length of the inputs; fixes the size of the flattened vector.
  """
  def __init__(self, pt_weights, pt_weights_dim, num_class, num_layers, hidden_dim, dropout, actual_max):
    super(MLP, self).__init__()

    self.num_layers = num_layers

    #Load Embeddings
    self.embeddings = nn.Embedding.from_pretrained(pt_weights)

    #MLP
    if self.num_layers >= 2:
      self.l1 = nn.Linear(actual_max * pt_weights_dim, hidden_dim)
      self.r1 = nn.ReLU()
      self.d1 = nn.Dropout(dropout)

      #Each extra hidden block contributes three modules: Linear, ReLU, Dropout
      hidden_blocks = []
      for _ in range(num_layers - 2):
        hidden_blocks.extend([nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Dropout(dropout)])

      self.mod_list = nn.ModuleList(hidden_blocks)
      self.lf = nn.Linear(hidden_dim, num_class)
      self.rf = nn.ReLU()
      self.df = nn.Dropout(dropout)
    else:
      self.l1 = nn.Linear(actual_max * pt_weights_dim, num_class)
      self.r1 = nn.ReLU()
      self.d1 = nn.Dropout(dropout)

  def forward(self, input_ids):
    """Return {"logits": tensor} for a batch of token ids.

    `input_ids` must have `actual_max` tokens per row so the flattened
    embeddings match the input size of the first Linear layer.
    """

    #Generate embeddings
    features = self.embeddings(input_ids)      # embedded = [batch size, sent_len, emb dim]

    x = features.view(features.shape[0], -1)  #Flatten

    x = self.l1(x)
    x = self.r1(x)
    x = self.d1(x)

    if self.num_layers >= 2:
      #Run every module of every hidden block. Indexing only the first
      #(num_layers - 2) entries skipped most of them, because each block holds three modules.
      for layer in self.mod_list:
        x = layer(x)

      x = self.lf(x)
      #NOTE(review): ReLU and dropout are applied to the output logits here and in the
      #single-layer case above; kept as-is, but confirm this is intended for the loss used.
      x = self.rf(x)
      x = self.df(x)

    output = {"logits": x}
    return output

# Training Functions

## Initialize Model

In [None]:
def cal_accuracy(predictions,labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    `predictions` holds one score vector per example (arg-max over the last axis),
    `labels` the matching integer classes.
    """
    predicted = torch.argmax(predictions,-1).cpu().tolist()
    expected = labels.cpu().tolist()
    hits = sum(1 for position, guess in enumerate(predicted) if guess == expected[position])
    return hits/len(predicted)

class BERTDataset(torch.utils.data.Dataset):
    """Dataset pairing a dict of per-example input fields with their labels.

    `inputs` maps field names to indexable collections; `labels` is indexable too and
    defines the dataset length. Each item is a dict with one entry per input field
    plus a 'labels' entry.
    """
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {}
        for key, val in self.inputs.items():
            item[key] = val[idx]
        item['labels'] = self.labels[idx]
        return item

def prepare_datasets(train_percent = 0.9, cv = False, train_idx = None, test_idx = None, show_result = False):
  """Build the train/validation/test DataLoaders from the global inputs and targets.

  Reads the globals all_input, all_targets, all_sentences, train_size, val_size,
  test_size and BATCH_SIZE. The numpy global seed is reset to 123 so the random
  train/validation split is reproducible.

  Args:
    train_percent: share of the pool used for training when the split is random.
    cv: when True, split `train_idx` randomly into train/validation and use
      `test_idx` as the test set.
    train_idx: indices of the train/validation pool (required when cv is True).
    test_idx: indices of the test set when cv is True; None means test with all data.
    show_result: unused, kept for backward compatibility.

  Returns:
    (train_loader, val_loader, test_loader), or (train_loader, val_loader) when the
    test set is empty.

  Raises:
    Exception: if no split strategy applies (no validation and no test data, cv False).
  """

  np.random.seed(123)

  def _random_split(pool):
    #Draw the training part at random; whatever is left becomes validation (order preserved)
    pool = np.asarray(pool)
    chosen = np.random.choice(pool, int(len(pool) * train_percent), replace = False)
    return chosen, pool[~np.isin(pool, chosen)]

  #Cross validation
  if cv == True:
    assert len(train_idx) > 0

    idx_train, idx_val = _random_split(train_idx)
    #test_idx = None -> split only train/val, test with all data.
    #Indexing with None itself would add an axis and produce a single bogus test item.
    if test_idx is None:
      idx_test = np.arange(len(all_targets))
    else:
      idx_test = np.asarray(test_idx, dtype = np.int64)

  #Train and test only
  elif (val_size == 0) and (test_size != 0):
    idx_train, idx_val = _random_split(np.arange(train_size))
    idx_test = np.arange(train_size, len(all_sentences))

  #Train, val, and test
  elif val_size != 0:
    idx_train = np.arange(0, train_size)
    idx_val = np.arange(train_size, train_size + val_size)
    idx_test = np.arange(train_size + val_size, len(all_sentences))
  else:
    raise Exception("Unknown split.")

  print("Data Loader split")
  print("  - Train:", len(idx_train))
  print("  - Val:", len(idx_val))
  print("  - Test:", len(idx_test))

  if type(all_input) == np.ndarray:
    train_input = all_input[idx_train]
    val_input = all_input[idx_val]
    test_input = all_input[idx_test]
  else:
    #Convert every field once, then slice it three times
    tensors = {key: torch.LongTensor(value) for key, value in all_input.items()}
    train_input = {key: value[idx_train] for key, value in tensors.items()}
    val_input = {key: value[idx_val] for key, value in tensors.items()}
    test_input = {key: value[idx_test] for key, value in tensors.items()}

  train_targets = torch.LongTensor(all_targets[idx_train])
  val_targets = torch.LongTensor(all_targets[idx_val])
  test_targets = torch.LongTensor(all_targets[idx_test])

  train_loader = torch.utils.data.DataLoader(BERTDataset(train_input, train_targets), shuffle=True, batch_size = BATCH_SIZE)
  val_loader = torch.utils.data.DataLoader(BERTDataset(val_input, val_targets), shuffle = True, batch_size = BATCH_SIZE)
  test_loader = torch.utils.data.DataLoader(BERTDataset(test_input, test_targets), shuffle = False, batch_size = BATCH_SIZE)
  print("Batch size:", BATCH_SIZE)

  if len(test_targets) == 0:
    return train_loader, val_loader
  return train_loader, val_loader, test_loader

def getEmoEmbeddings():
  """Load the emotion embeddings stored at the global `emoEmbPath`.

  The file is unpickled and must hold a mapping with the keys "vocab_map" and "weights".
  Only point `emoEmbPath` at trusted files: unpickling can execute arbitrary code.

  Returns:
    (vocab_map, weights) as stored in the file.
  """
  with open(emoEmbPath, "rb") as handle:
    payload = pickle.load(handle)

  vocab_map = payload["vocab_map"]
  weights = payload["weights"]
  return vocab_map, weights

## Train

In [None]:
def collate_input(batch):
  """Turn a DataLoader batch into the keyword arguments of the current model.

  Uses the globals `checkpoint` and `device`. Custom ("custom/...") models only
  receive the token ids; every other model also gets the labels and, when present in
  the batch, the attention mask and token type ids. All tensors are moved to `device`.
  """
  if checkpoint.split("/")[0] == "custom":
    return {"input_ids": batch["input_ids"].to(device)}

  input_args = {"input_ids": batch["input_ids"].to(device),
                "labels": batch["labels"].to(device)}

  #Optional fields are forwarded only when the tokenizer produced them
  for optional_key in ("attention_mask", "token_type_ids"):
    if optional_key in batch.keys():
      input_args[optional_key] = batch[optional_key].to(device)

  return input_args

In [None]:
import time
import torch.optim as optim

def train_model(show_result = True, epochs = 3, early_stop = 10):
    """Train the global `model` in place, with optional early stopping.

    Uses the globals model, train_loader, val_loader, criterion, optimizer and device.
    After every epoch the mean validation loss is recorded. Training stops early once
    more than `early_stop` epochs have run and the best loss of the last `early_stop`
    epochs is worse than the best loss seen before them.

    Args:
        show_result: print per-epoch metrics, and plot the validation loss curve
            when early stopping triggers.
        epochs: maximum number of epochs.
        early_stop: patience in epochs; None disables early stopping.
    """
    def _forward(batch):
        #Forward pass plus loss for one batch
        batch_targets = batch["labels"].to(device)
        batch_output = model(**collate_input(batch))
        batch_loss = criterion(batch_output["logits"], batch_targets)
        return batch_output["logits"], batch_targets, batch_loss

    def _weighted_f1(logits, batch_targets):
        return f1_score(batch_targets.cpu(), torch.argmax(logits.cpu(), axis = -1), average = "weighted")

    val_loss = []
    for epoch in range(epochs):
        t = time.time()

        #Optimisation pass over the training batches
        model.train()
        loss_batch_train, acc_batch_train, f1_batch_train = [], [], []
        for batch in train_loader:
            logits, targets, loss_train = _forward(batch)
            optimizer.zero_grad()
            loss_train.backward()
            optimizer.step()

            loss_batch_train.append(loss_train.item())
            acc_batch_train.append(cal_accuracy(logits, targets))
            f1_batch_train.append(_weighted_f1(logits, targets))

        #Evaluation pass over the validation batches (no gradients)
        model.eval()
        loss_batch_val, acc_batch_val, f1_batch_val = [], [], []
        with torch.no_grad():
            for batch in val_loader:
                logits, targets, loss_val = _forward(batch)

                loss_batch_val.append(loss_val.item())
                acc_batch_val.append(cal_accuracy(logits, targets))
                f1_batch_val.append(_weighted_f1(logits, targets))

        val_loss.append(np.mean(loss_batch_val))

        if show_result:
            #Metrics are means of the per-batch values
            report = ['Epoch: {:04d}'.format(epoch+1),
                      'loss_train: {:.4f}'.format(np.mean(loss_batch_train)),
                      'acc_train: {:.4f}'.format(np.mean(acc_batch_train)),
                      'f1w_train: {:.4f}'.format(np.mean(f1_batch_train)),
                      'loss_val: {:.4f}'.format(np.mean(loss_batch_val)),
                      'acc_val: {:.4f}'.format(np.mean(acc_batch_val)),
                      'f1w_val: {:.4f}'.format(np.mean(f1_batch_val)),
                      'time: {:.4f}s'.format(time.time() - t)]
            print(*report, flush = True)

        if early_stop is None or epoch <= early_stop:
            continue

        recent_best = np.min(val_loss[-early_stop:])
        earlier_best = np.min(val_loss[:-early_stop])
        if recent_best > earlier_best:
            if show_result:
                print("Early Stopping...")
                plt.plot(val_loss)
                plt.show()
            break

# Tuning

In [None]:
import optuna

def objective(trial):
  """Optuna objective: train one candidate model and return its validation weighted F1.

  Uses the globals checkpoint, num_class, tokenizer, train_loader, val_loader, device
  and, for custom checkpoints, pt_weights, pt_weights_dim and actual_max.

  Transformer checkpoints tune dropout, weight decay, learning rate, epoch count,
  number of hidden layers and number of attention heads. Custom (MLP) checkpoints
  tune dropout, weight decay, learning rate, layer count and hidden width, training
  for up to 100 epochs with an early-stopping patience of 10.

  Args:
    trial: the optuna trial supplying the hyper-parameters.

  Returns:
    Mean per-batch weighted F1 on the validation loader after the last epoch run.

  Raises:
    optuna.exceptions.TrialPruned: when the pruner stops the trial.
  """

  tune_dropout = trial.suggest_categorical("dropout", [0.01, 0.05, 0.1, 0.5])
  tune_decay = trial.suggest_categorical("weight_decay", [0, 0.01, 0.1])

  if checkpoint.split("/")[0] != "custom":
    tune_lr = trial.suggest_categorical("learning_rate", [1e-04, 1e-05, 2e-05, 3e-05, 4e-05, 5e-05])
    tune_epochs = trial.suggest_int("num_epochs", 2, 5)
    #step is passed by keyword: recent optuna releases declare it keyword-only
    tune_layers = trial.suggest_int("num_hidden_layers", 2, 12, step = 2)
    tune_heads = trial.suggest_categorical("num_attention_heads", [ 2,  3,  4,  6,  8, 12]) #choose num heads % 768
    early_stop = None

    args = {"num_hidden_layers": tune_layers,
            "num_attention_heads": tune_heads,
            "hidden_act": "relu"}
    if checkpoint.split("/")[0] == "medicalai":
      #This checkpoint is configured with differently named dropout fields
      args["dropout"] = tune_dropout
      args["attention_dropout"] = tune_dropout
    else:
      #BERT/RoBERTa configs name the attention dropout `attention_probs_dropout_prob`;
      #the previous key `attention_dropout_prob` is not a config field, so it was never tuned
      args["hidden_dropout_prob"] = tune_dropout
      args["attention_probs_dropout_prob"] = tune_dropout

    tune_model = get_plm(checkpoint, num_class, args).to(device)
    tune_model.resize_token_embeddings(len(tokenizer))         #Resize vocab for added emojis and reserved tokens
  else: #MLP default
    tune_layers = trial.suggest_int("num_hidden_layers", 2, 5)
    tune_lr = trial.suggest_categorical("learning_rate", [1e-03,1e-04, 1e-05])
    tune_hidden_dim = trial.suggest_int("hidden_dim", 100, 500, step = 100)

    tune_epochs = 100
    early_stop = 10

    args = {"pt_weights": pt_weights,
            "pt_weights_dim": pt_weights_dim,
            "num_layers": tune_layers,
            "hidden_dim": tune_hidden_dim,
            "dropout": tune_dropout,
            "actual_max": actual_max}
    tune_model = get_plm(checkpoint, num_class, args).to(device)

  #Local loss/optimizer: the globals of the same name belong to the final training run
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.Adam(tune_model.parameters(), lr = tune_lr, weight_decay = tune_decay)

  #Training
  val_loss = []
  for epoch in range(tune_epochs):
    tune_model.train()

    for batch in train_loader:
      targets = batch["labels"].to(device)
      input_args = collate_input(batch)

      output = tune_model(**input_args)
      loss_train = criterion(output["logits"], targets)
      optimizer.zero_grad()
      loss_train.backward()
      optimizer.step()

    tune_model.eval()
    with torch.no_grad():
      f1_batch_val = []
      loss_batch_val = []
      for batch in val_loader:
        targets = batch["labels"].to(device)
        input_args = collate_input(batch)

        output = tune_model(**input_args)
        loss_val = criterion(output["logits"], targets)

        loss_batch_val.append(loss_val.item())
        f1_batch_val.append(f1_score(targets.cpu(), torch.argmax(output["logits"].cpu(), axis = -1), average = "weighted"))

    val_loss.append(np.mean(loss_batch_val))
    f1_val = np.mean(f1_batch_val)

    #Record metric
    trial.report(f1_val, epoch)

    #Early stopping: best loss of the last `early_stop` epochs is worse than the best before them
    if early_stop is not None and epoch > early_stop and np.min(val_loss[-early_stop:]) > np.min(val_loss[:-early_stop]):
      break

    # Handle pruning based on the intermediate value.
    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()

  return f1_val

In [None]:
def tune_parameters(n_trials = 50):
  """Run an optuna study that maximises `objective` and return the study.

  The first trial is enqueued with default hyper-parameters, chosen according to
  whether the global `checkpoint` is a custom (MLP) model or a transformer.

  Args:
    n_trials: number of trials to run.
  """

  study = optuna.create_study(direction = "maximize")

  #default parameters shared by both model families
  defaults = {"dropout": 0.1,
              "learning_rate": 1e-05,
              "weight_decay": 0}

  if checkpoint.split("/")[0] == "custom":
    defaults.update({"num_hidden_layers": 3,
                     "hidden_dim": 200})
  else:
    defaults.update({"num_hidden_layers": 12,
                     "num_attention_heads": 12,
                     "num_epochs": 3})

  study.enqueue_trial(defaults)
  study.optimize(objective, n_trials = n_trials)

  return study

# Train Teachers

In [None]:
#Preprocess every sentence once; all model cells below reuse the result
clean_sentences = list(map(preprocess_str, tqdm(all_sentences)))

## MMEMOG

In [None]:
print()
print("=" * 50)
print("=" * 20, "MM-EMOG", "=" * 20)
print("=" * 50)
print()

#SETUP
checkpoint = "custom/MM-EMOG-SenticNet"
filePath_model = "_MODELS/%s.pt" % ( checkpoint.replace("/", "_"))

#Load embeddings
mmemog_map, mmemog_weights = getEmoEmbeddings()

#Build tokenizer
tokenizer = get_tokenizer("wp", "bert-base-uncased")
#add_tokens expects a string or a list of strings, not a dict_keys view
tokenizer.add_tokens(list(mmemog_map.keys()))

#Align tokenizer and mmemog weights
#Row i of pt_weights is the embedding of tokenizer id i
np.random.seed(123)
pt_weights_dim = mmemog_weights.shape[-1]
pt_weights = np.random.random((max(len(tokenizer.get_vocab()), len(mmemog_map.keys())), pt_weights_dim))

for key, val in tokenizer.get_vocab().items():
  if key in mmemog_map:
    pt_weights[val] = mmemog_weights[mmemog_map[key]]
  # else keep randomly generated weights
pt_weights = torch.FloatTensor(pt_weights).to(device)
assert pt_weights.shape[0] == len(tokenizer.get_vocab())

#Create input
#actual_max caps the sequence length at the longest tokenised sentence (at most MAX_LENGTH)
actual_max = min(MAX_LENGTH, max([len(tokenizer.tokenize(x)) for x in clean_sentences]))
all_input = tokenizer(clean_sentences, padding = True, truncation = True, max_length = actual_max)
print("Actual max length:", actual_max)

#Create loaders
#NOTE(review): the train/val pool covers every sentence, test split included - confirm this is intended
train_loader, val_loader, _ = prepare_datasets(cv = True, train_idx = np.arange(len(clean_sentences)))

#TUNE
start = datetime.now()
print("Tuning...", flush = True)
study = tune_parameters(50)
print("Total tuning time: %s\n" % (datetime.now() - start), flush = True)

best_trial = study.best_trial
best_params = best_trial.params
print("BEST:", best_trial.value)
print("Params:")
for key, value in best_params.items():
  print("    {}: {}".format(key, value))

#TRAIN
#Rebuild the model with the best hyper-parameters and train it with early stopping
criterion = nn.CrossEntropyLoss()
args = {"pt_weights": pt_weights,
        "pt_weights_dim": pt_weights_dim,
        "num_layers": best_params["num_hidden_layers"],
        "hidden_dim": best_params["hidden_dim"],
        "dropout": best_params["dropout"],
        "actual_max": actual_max}

model = get_plm(checkpoint, num_class, args).to(device)
optimizer = optim.Adam(model.parameters(), lr = best_params["learning_rate"], weight_decay = best_params["weight_decay"])

print("=" * 20, "MODEL CONFIG", "=" * 20)
print(model)

train_model(epochs = 100, early_stop = 10)

torch.save(model, filePath_model) #Save model

## BERT

In [None]:
print()
print("=" * 50)
print("=" * 20, "BERT", "=" * 20)
print("=" * 50)
print()

#SETUP
checkpoint = "bert-base-uncased"
filePath_model = "_MODELS/%s.pt" % ( checkpoint.replace("/", "_"))

tokenizer = get_tokenizer("wp", checkpoint)

#Create input
#actual_max caps the sequence length at the longest tokenised sentence (at most MAX_LENGTH)
actual_max = min(MAX_LENGTH, max([len(tokenizer.tokenize(x)) for x in clean_sentences]))
all_input = tokenizer(clean_sentences, padding = True, truncation = True, max_length = actual_max)
print("Actual max length:", actual_max)

#Create loaders
#NOTE(review): the train/val pool covers every sentence, test split included - confirm this is intended
train_loader, val_loader, _ = prepare_datasets(cv = True, train_idx = np.arange(len(clean_sentences)))

#TUNE
start = datetime.now()
print("Tuning...", flush = True)
study = tune_parameters(50)
print("Total tuning time: %s\n" % (datetime.now() - start), flush = True)

best_trial = study.best_trial
best_params = best_trial.params
print("BEST:", best_trial.value)
print("Params:")
for key, value in best_params.items():
  print("    {}: {}".format(key, value))

#TRAIN
#Rebuild the model with the best hyper-parameters and train it for the tuned number of epochs
criterion = nn.CrossEntropyLoss()
#The BERT config field for attention dropout is `attention_probs_dropout_prob`;
#`attention_dropout_prob` is not a config field and left the default in place
args = {"num_hidden_layers": best_params["num_hidden_layers"],
        "num_attention_heads": best_params["num_attention_heads"],
        "hidden_dropout_prob": best_params["dropout"],
        "attention_probs_dropout_prob": best_params["dropout"],
        "hidden_act": "relu"}

model = get_plm(checkpoint, num_class, args).to(device)
model.resize_token_embeddings(len(tokenizer))         #Resize vocab for added emojis and reserved tokens
optimizer = optim.Adam(model.parameters(), lr = best_params["learning_rate"], weight_decay = best_params["weight_decay"])

print("=" * 20, "MODEL CONFIG", "=" * 20)
print(model.config)

train_model(epochs = best_params["num_epochs"])

torch.save(model, filePath_model) #Save model
#uploadFile(filePath_model, filePath_model.split("/")[-1])

## RoBERTa

In [None]:
print()
print("=" * 50)
print("=" * 20, "RoBERTa", "=" * 20)
print("=" * 50)
print()

#SETUP
checkpoint = "roberta-base"
filePath_model = "_MODELS/%s.pt" % ( checkpoint.replace("/", "_"))

tokenizer = get_tokenizer("wp", checkpoint)

#Create input
#actual_max caps the sequence length at the longest tokenised sentence (at most MAX_LENGTH)
actual_max = min(MAX_LENGTH, max([len(tokenizer.tokenize(x)) for x in clean_sentences]))
all_input = tokenizer(clean_sentences, padding = True, truncation = True, max_length = actual_max)
print("Actual max length:", actual_max)

#Create loaders
#NOTE(review): the train/val pool covers every sentence, test split included - confirm this is intended
train_loader, val_loader, _ = prepare_datasets(cv = True, train_idx = np.arange(len(clean_sentences)))

#TUNE
start = datetime.now()
print("Tuning...", flush = True)
study = tune_parameters(50)
print("Total tuning time: %s\n" % (datetime.now() - start), flush = True)

best_trial = study.best_trial
best_params = best_trial.params
print("BEST:", best_trial.value)
print("Params:")
for key, value in best_params.items():
  print("    {}: {}".format(key, value))

#TRAIN
#Rebuild the model with the best hyper-parameters and train it for the tuned number of epochs
criterion = nn.CrossEntropyLoss()
#The RoBERTa config field for attention dropout is `attention_probs_dropout_prob`;
#`attention_dropout_prob` is not a config field and left the default in place
args = {"num_hidden_layers": best_params["num_hidden_layers"],
        "num_attention_heads": best_params["num_attention_heads"],
        "hidden_dropout_prob": best_params["dropout"],
        "attention_probs_dropout_prob": best_params["dropout"],
        "hidden_act": "relu"}

model = get_plm(checkpoint, num_class, args).to(device)
model.resize_token_embeddings(len(tokenizer))         #Resize vocab for added emojis and reserved tokens
optimizer = optim.Adam(model.parameters(), lr = best_params["learning_rate"], weight_decay = best_params["weight_decay"])

print("=" * 20, "MODEL CONFIG", "=" * 20)
print(model.config)

train_model(epochs = best_params["num_epochs"])

torch.save(model, filePath_model) #Save model
#uploadFile(filePath_model, filePath_model.split("/")[-1])

## MentalBERT

In [None]:
print()
print("=" * 50)
print("=" * 20, "MentalBERT", "=" * 20)
print("=" * 50)
print()

#from huggingface_hub import notebook_login
#notebook_login()

#SETUP
checkpoint = "mental/mental-bert-base-uncased"
filePath_model = "_MODELS/%s.pt" % ( checkpoint.replace("/", "_"))

tokenizer = get_tokenizer("wp", checkpoint)

#Create input
#actual_max caps the sequence length at the longest tokenised sentence (at most MAX_LENGTH)
actual_max = min(MAX_LENGTH, max([len(tokenizer.tokenize(x)) for x in clean_sentences]))
all_input = tokenizer(clean_sentences, padding = True, truncation = True, max_length = actual_max)
print("Actual max length:", actual_max)

#Create loaders
#NOTE(review): the train/val pool covers every sentence, test split included - confirm this is intended
train_loader, val_loader, _ = prepare_datasets(cv = True, train_idx = np.arange(len(clean_sentences)))

#TUNE
start = datetime.now()
print("Tuning...", flush = True)
study = tune_parameters(50)
print("Total tuning time: %s\n" % (datetime.now() - start), flush = True)

best_trial = study.best_trial
best_params = best_trial.params
print("BEST:", best_trial.value)
print("Params:")
for key, value in best_params.items():
  print("    {}: {}".format(key, value))

#TRAIN
#Rebuild the model with the best hyper-parameters and train it for the tuned number of epochs
criterion = nn.CrossEntropyLoss()
#Same dropout keys as the BERT cell: the config field for attention dropout is
#`attention_probs_dropout_prob` (assumes this checkpoint uses a BERT config - confirm)
args = {"num_hidden_layers": best_params["num_hidden_layers"],
        "num_attention_heads": best_params["num_attention_heads"],
        "hidden_dropout_prob": best_params["dropout"],
        "attention_probs_dropout_prob": best_params["dropout"],
        "hidden_act": "relu"}

model = get_plm(checkpoint, num_class, args).to(device)
model.resize_token_embeddings(len(tokenizer))         #Resize vocab for added emojis and reserved tokens
optimizer = optim.Adam(model.parameters(), lr = best_params["learning_rate"], weight_decay = best_params["weight_decay"])

print("=" * 20, "MODEL CONFIG", "=" * 20)
print(model.config)

train_model(epochs = best_params["num_epochs"])
torch.save(model, filePath_model) #Save model

## ClinicalBERT

In [None]:
print()
print("=" * 50)
print("=" * 20, "ClinicalBERT", "=" * 20)
print("=" * 50)
print()

#from huggingface_hub import notebook_login
#notebook_login()

#SETUP
#get_plm must recognise this checkpoint name, otherwise it raises "Unknown checkpoint"
checkpoint = "medicalai/ClinicalBERT"
#Same naming scheme as the other model cells; `datasetName` is not defined in this notebook
filePath_model = "_MODELS/%s.pt" % ( checkpoint.replace("/", "_"))

tokenizer = get_tokenizer("wp", checkpoint)

#Create input
#actual_max caps the sequence length at the longest tokenised sentence (at most MAX_LENGTH)
actual_max = min(MAX_LENGTH, max([len(tokenizer.tokenize(x)) for x in clean_sentences]))
all_input = tokenizer(clean_sentences, padding = True, truncation = True, max_length = actual_max)
print("Actual max length:", actual_max)

#Create loaders
#NOTE(review): the train/val pool covers every sentence, test split included - confirm this is intended
train_loader, val_loader, _ = prepare_datasets(cv = True, train_idx = np.arange(len(clean_sentences)))

#TUNE
start = datetime.now()
print("Tuning...", flush = True)
study = tune_parameters()
print("Total tuning time: %s\n" % (datetime.now() - start), flush = True)

best_trial = study.best_trial
best_params = best_trial.params
print("BEST:", best_trial.value)
print("Params:")
for key, value in best_params.items():
  print("    {}: {}".format(key, value))

#TRAIN
#Rebuild the model with the best hyper-parameters and train it for the tuned number of epochs
criterion = nn.CrossEntropyLoss()
#This checkpoint takes `dropout` / `attention_dropout` instead of the BERT-style keys,
#matching the medicalai branch of objective
args = {"num_hidden_layers": best_params["num_hidden_layers"],
        "num_attention_heads": best_params["num_attention_heads"],
        "dropout": best_params["dropout"],
        "attention_dropout": best_params["dropout"],
        "hidden_act": "relu"}

model = get_plm(checkpoint, num_class, args).to(device)
model.resize_token_embeddings(len(tokenizer))         #Resize vocab for added emojis and reserved tokens
optimizer = optim.Adam(model.parameters(), lr = best_params["learning_rate"], weight_decay = best_params["weight_decay"])

print("=" * 20, "MODEL CONFIG", "=" * 20)
print(model.config)

train_model(epochs = best_params["num_epochs"])
torch.save(model, filePath_model) #Save model