In [None]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statistics
from argparse import Namespace
import seaborn as sns
import random
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.utils.data as Data
from torch.utils.data.dataset import random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup
from sklearn.model_selection import cross_val_score, learning_curve, cross_val_predict, KFold, train_test_split
from sklearn.metrics import accuracy_score, roc_curve, auc, precision_recall_fscore_support
from sklearn.metrics import f1_score

In [None]:
# Central configuration: every tunable value used by the cells below lives here.
args = Namespace(
    variable='label',     # name of the label column in the .csv
    batch_size=16,        # typically 16, 32 or 64, depending on GPU memory
    epochs=2,             # 2-4 epochs are usually enough when fine-tuning BERT
    max_length=128,       # max sequence length (BERT's limit is 512); usually a power of two
    random_state=42,      # seed used for sampling and splitting
    test_size=0.2,        # fraction of the data held out as the test set
    learning_rate=5e-5,   # fine-tuning learning rate; 2e-5, 3e-5 or 5e-5 are the usual choices
    # cv_num=5,           # number of folds for n-fold cross-validation (disabled)
)

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the labelled data set from disk (path is relative to the notebook's working directory).
df = pd.read_csv(
    'round3/sideEffect_for_training.csv',
    encoding = 'unicode_escape',
)

In [None]:
#The following chunk is to under-sample the majority class by randomly selecting samples,
#and make the dataset balanced.
def undersample(dataframe, binary, random_state = None):
    """Balance a binary-labelled frame by randomly down-sampling the majority class.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data; it is not modified.
    binary : str
        Name of the column holding the 0/1 class label.
    random_state : int, optional
        Seed for the random sampling. When omitted, ``args.random_state``
        from the notebook configuration is used.

    Returns
    -------
    pandas.DataFrame
        The sampled majority rows followed by all minority rows, with a fresh
        0..n-1 index. When both classes have the same size, class 0 is the one
        that is sampled (i.e. shuffled) and class 1 is appended unchanged.

    Raises
    ------
    KeyError
        If the label column contains no rows for class 0 or for class 1.
    """
    if random_state is None:
        # Only fall back to the notebook-level config when the caller gave no seed.
        random_state = args.random_state

    countdict = dataframe[binary].value_counts().to_dict()
    absent = [cls for cls in (0, 1) if cls not in countdict]
    if absent:
        # Same exception type as the plain dict lookup would raise, but with a usable message.
        raise KeyError(
            'column {!r} has no rows for class(es) {}; cannot balance'.format(binary, absent)
        )

    class0 = dataframe[dataframe[binary] == 0]
    class1 = dataframe[dataframe[binary] == 1]
    if countdict[1] > countdict[0]:
        majority, minority = class1, class0
    else:
        majority, minority = class0, class1

    majority_under = majority.sample(len(minority), random_state = random_state)
    return pd.concat([majority_under, minority], axis = 0, sort = False).reset_index(drop = True)

In [None]:
# Balance the loaded data. The frame read from the CSV is `df`; the previously
# referenced `dfpre` is not defined anywhere in this notebook, so a fresh
# "Restart & Run All" raised NameError here.
underdf = undersample(df, args.variable)
print(underdf[args.variable].value_counts())

In [None]:
# split dataset into train, and test
# First separate the balanced frame by class so each class can be split on its own.
negdf = underdf.loc[underdf[args.variable].eq(0)].reset_index(drop = True)
posdf = underdf.loc[underdf[args.variable].eq(1)].reset_index(drop = True)
def split_train_test(posdf, negdf):
    """Split each class separately into train/test parts and recombine them.

    Both frames are split with ``args.test_size`` and ``args.random_state``
    (shuffled). The positive part always precedes the negative part in the
    returned frames, and both results get a fresh 0..n-1 index.

    Returns
    -------
    (traindf, testdf) : tuple of pandas.DataFrame
    """
    split_options = dict(
        test_size = args.test_size,
        shuffle = True,
        random_state = args.random_state,
    )
    train_parts, test_parts = [], []
    for class_frame in (posdf, negdf):
        train_part, test_part = train_test_split(class_frame, **split_options)
        train_parts.append(train_part)
        test_parts.append(test_part)

    testdf = pd.concat(test_parts, axis = 0, sort = False).reset_index(drop = True)
    traindf = pd.concat(train_parts, axis = 0, sort = False).reset_index(drop = True)
    return traindf, testdf

# Build the train/test frames, then print the per-class row counts of each
# split as a quick sanity check.
train, test = split_train_test(posdf, negdf)
print(train[args.variable].value_counts())
print(test[args.variable].value_counts())

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Parameters
    ----------
    preds : array-like
        Per-class scores; the arg-max is taken along axis 1.
    labels : numpy.ndarray
        True class ids; flattened before the comparison.
    """
    predicted_classes = np.argmax(preds, axis = 1).ravel()
    true_classes = labels.ravel()
    hits = np.sum(predicted_classes == true_classes)
    return hits / len(true_classes)

In [None]:
def seed_torch(seed = 42):
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    Also stores the seed in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Every seeding function below takes the same single integer argument.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True
    
def _init_fn(seed = 42):
    np.random.seed(int(seed))

In [None]:
###training
def trainbert(sentences, labels):
    """Fine-tune the notebook-level ``model`` on the given sentences and labels.

    Relies on the globals ``tokenizer``, ``model``, ``optimizer``, ``device``
    and ``args`` created in other cells.

    Parameters
    ----------
    sentences : iterable
        Raw texts; every item is passed through ``str()`` before tokenizing.
    labels : numpy.ndarray
        Class labels aligned with ``sentences``; converted with
        ``torch.from_numpy(...).long()``.

    Returns
    -------
    float or None
        Mean training loss of the last epoch, or ``None`` when
        ``args.epochs`` is 0 and therefore no epoch ran.
    """
    training_input_ids = []
    training_attention_masks = []
    for sent in sentences:
        training_encoded_dict = tokenizer.encode_plus(
                            str(sent),
                            add_special_tokens = True,  # was misspelled `add_spicial_tokens`
                            max_length = args.max_length, #512
                            # NOTE(review): newer transformers releases deprecate
                            # `pad_to_max_length` in favour of `padding='max_length'`
                            # plus `truncation=True` - switch once the pinned
                            # version is confirmed.
                            pad_to_max_length = True,
                            return_attention_mask = True,
                            return_tensors = 'pt',
        )
        training_input_ids.append(training_encoded_dict['input_ids'])
        training_attention_masks.append(training_encoded_dict['attention_mask'])
    training_input_ids = torch.cat(training_input_ids, dim = 0)
    training_attention_masks = torch.cat(training_attention_masks, dim = 0)
    training_labels = torch.from_numpy(labels).long()

    train_dataset = Data.TensorDataset(training_input_ids, training_attention_masks, training_labels)

    train_dataloader = Data.DataLoader(
                train_dataset,
                sampler = Data.RandomSampler(train_dataset),
                batch_size = args.batch_size,
                worker_init_fn = _init_fn
    )

    # The linear schedule needs the total number of optimizer steps up front.
    total_steps = len(train_dataloader) * args.epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps
                                           )

    # Defined before the loop so the return statement is valid even with zero epochs.
    avg_train_loss = None
    for i in range(0, args.epochs):
        print('Epoch {:} / {:}'.format(i + 1, args.epochs))
        total_train_loss = 0
        model.train()
        for batch in train_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            model.zero_grad()
            outputs = model(b_input_ids,
                            token_type_ids = None,
                            attention_mask = b_input_mask,
                            labels = b_labels
                           )
            # Index rather than tuple-unpack: indexing works for a plain tuple
            # and for a ModelOutput object, whereas unpacking a ModelOutput
            # would not yield the tensors.
            loss = outputs[0]
            total_train_loss += loss.item()
            loss.backward()
            # Clip the gradient norm to 1.0 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_train_loss / len(train_dataloader)
        print("average training loss: {0:.2f}".format(avg_train_loss))

    return avg_train_loss

In [None]:
####test
def testacc(testdf):
    """Evaluate the notebook-level ``model`` on ``testdf``.

    Relies on the globals ``tokenizer``, ``model``, ``device``, ``args`` and
    the helper ``flat_accuracy`` defined in other cells.

    Parameters
    ----------
    testdf : pandas.DataFrame
        Must contain a ``'tweet'`` text column and the label column named by
        ``args.variable``.

    Returns
    -------
    (accuracy, probs) : tuple
        ``accuracy`` is the fraction of correctly classified rows over the
        whole test set; ``probs`` is a NumPy array with the softmax of the
        logits, one row per test row in the original order.
    """
    testsen = testdf['tweet'].values
    testlabels = testdf[args.variable].values
    testinput_ids = []
    testattention_masks = []
    for sent in testsen:
        encoded_dict = tokenizer.encode_plus(
                          str(sent),  # same str() coercion as in trainbert
                          add_special_tokens = True,
                          max_length = args.max_length,
                          pad_to_max_length = True,
                          return_attention_mask = True,
                          return_tensors = 'pt'
      )
        testinput_ids.append(encoded_dict['input_ids'])
        testattention_masks.append(encoded_dict['attention_mask'])
    testinput_ids = torch.cat(testinput_ids, dim = 0)
    testattention_masks = torch.cat(testattention_masks, dim = 0)
    testlabels = torch.tensor(testlabels).long()
    testdata = Data.TensorDataset(testinput_ids, testattention_masks, testlabels)
    testdataloader = Data.DataLoader(
                testdata,
                # Sequential order keeps the logits aligned with `testlabels`.
                sampler = Data.SequentialSampler(testdata),
                batch_size = args.batch_size,
                worker_init_fn = _init_fn
    )

    model.eval()
    all_logits = []
    for batch in testdataloader:
        test_input_ids = batch[0].to(device)
        test_input_mask = batch[1].to(device)
        with torch.no_grad():
            # No labels are passed, so the first output element is the logits
            # (no loss is computed; it was never used here).
            outputs = model(
                               test_input_ids,
                               token_type_ids = None,
                               attention_mask = test_input_mask
            )
        all_logits.append(outputs[0].detach().cpu())

    all_logits = torch.cat(all_logits, dim = 0)
    probs = F.softmax(all_logits, dim = 1).numpy()
    # Accuracy over all samples at once: averaging per-batch accuracies
    # over-weights a smaller final batch.
    avg_testacc = flat_accuracy(all_logits.numpy(), testlabels.numpy())
    return avg_testacc, probs

In [None]:
#training
# Seed every RNG from the shared configuration (instead of seed_torch's own
# hard-coded default) before the model is built, so one setting controls all
# randomness in the notebook.
seed_torch(args.random_state)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2 #num_labels = 2, binary classification
)
model.to(device)

# NOTE(review): `AdamW` is imported at the top of the notebook but plain Adam
# is used here - confirm this is intentional.
optimizer = torch.optim.Adam(model.parameters(),
                lr = args.learning_rate
)

# trainbert reads `tokenizer`, `model` and `optimizer` from the notebook namespace.
trainloss = trainbert(train['tweet'].values, train[args.variable].values)

In [None]:
def evaluate_roc(probs, y_true):
    """Compute ROC/AUC and thresholded accuracy from class probabilities.

    Parameters
    ----------
    probs : numpy.ndarray
        Two-dimensional array; column 1 is used as the positive-class score.
    y_true : array-like
        True binary labels.

    Returns
    -------
    tuple
        ``(fpr, tpr, roc_auc, accuracy, y_pred)`` where ``y_pred`` is 1 for
        scores of at least 0.5 and 0 otherwise. Accuracy and AUC are also
        printed.
    """
    positive_scores = probs[:, 1]
    fpr, tpr, _thresholds = roc_curve(y_true, positive_scores)
    roc_auc = auc(fpr, tpr)

    # Hard predictions at the 0.5 decision threshold.
    y_pred = (positive_scores >= 0.5).astype(int)
    accuracy = accuracy_score(y_true, y_pred)

    print('acc on test set: {0:.2f}'.format(accuracy))
    print('auc: {0:.2f}'.format(roc_auc))
    return fpr, tpr, roc_auc, accuracy, y_pred

In [None]:
def plot_roc(fpr, tpr, roc_auc):
    """Draw the ROC curve with its AUC in the legend, plus the diagonal reference line."""
    ax = plt.gca()
    ax.set_title('Receiver Operating Characteristic')
    ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    ax.legend(loc='upper left')
    # Dashed red diagonal from (0, 0) to (1, 1) for reference.
    ax.plot([0, 1], [0, 1], 'r--')
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')
#     plt.savefig('rocprovax.png')
    plt.show()

In [None]:
def compute_metrics(labels, preds):
    """Return accuracy, F1, precision and recall (binary average), rounded to 2 decimals.

    The result is a dict with the keys ``'accuracy'``, ``'f1'``,
    ``'precision'`` and ``'recall'``.
    """
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _support = precision_recall_fscore_support(labels, preds, average='binary')
    scores = (
        ('accuracy', acc),
        ('f1', f1),
        ('precision', precision),
        ('recall', recall),
    )
    return {name: round(value, 2) for name, value in scores}

In [None]:
#get test result
# Store the accuracy under its own name: assigning it to `testacc` replaced the
# `testacc` function with a float, so re-running this cell raised TypeError.
test_accuracy, probs = testacc(test)

In [None]:
#get test results: prints accuracy and AUC, and returns the ROC curve points
#(fpr, tpr), the AUC, the accuracy and the 0/1 predictions at the 0.5 threshold
fpr, tpr, roc_auc, accuracy, y_pred = evaluate_roc(probs, test[args.variable].values)

In [None]:
# Accuracy, F1, precision and recall of the thresholded predictions on the test set.
compute_metrics(test[args.variable].values, y_pred)

In [None]:
#plot ROC curve of the test-set predictions computed above
plot_roc(fpr, tpr, roc_auc)

In [None]:
##save trained model
output_dir = 'models/round3/finetune_bert_liberty_under/'
# Create the target directory first so saving also works on a fresh checkout;
# exist_ok keeps the cell re-runnable.
os.makedirs(output_dir, exist_ok = True)
# Unwrap the model if it is wrapped in an object exposing `.module`.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

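The following (commented-out) cell is an alternative `trainbert` that trains BERT with 5-fold cross-validation. To use it, uncomment it and enable `cv_num` in `args`.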

In [None]:
# def trainbert(sentences, labels):
#     splits = list(KFold(n_splits = args.cv_num, shuffle = True, random_state = args.random_state).split(sentences, labels)) #, random_state = 42
#     avg_train_losses = []
#     avg_val_losses = []
#     avg_val_acc = []
    
#     whole_train_stats = []
#     for i, (train_idx, valid_idx) in enumerate(splits):
#         seed_torch()

#         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

#         model = BertForSequenceClassification.from_pretrained(
#             'bert-base-uncased',

#             num_labels = 2
#         )
#         model.to(device)


#         optimizer = AdamW(model.parameters(),
#                           lr = 2e-5,
#                           eps = 1e-8
#         )
        
#         training_input_ids = []
#         training_attention_masks = []
#         for sent in sentences[train_idx]:
#             training_encoded_dict = tokenizer.encode_plus(
#                                 sent,
#                                 add_spicial_tokens = True,
#                                 max_length = args.max_length, #512
#                                 pad_to_max_length = True,
#                                 return_attention_mask = True,
#                                 return_tensors = 'pt',

#             )
#             training_input_ids.append(training_encoded_dict['input_ids'])
#             training_attention_masks.append(training_encoded_dict['attention_mask'])
#         training_input_ids = torch.cat(training_input_ids, axis = 0)
#         training_attention_masks = torch.cat(training_attention_masks, axis = 0)
#         training_labels = torch.from_numpy(labels[train_idx])


#         valid_input_ids = []
#         valid_attention_masks = []
#         for sent in sentences[valid_idx]:
#             valid_encoded_dict = tokenizer.encode_plus(
#                                 sent,
#                                 add_spicial_tokens = True,
#                                 max_length = args.max_length, #512
#                                 pad_to_max_length = True,
#                                 return_attention_mask = True,
#                                 return_tensors = 'pt',

#             )
#             valid_input_ids.append(valid_encoded_dict['input_ids'])
#             valid_attention_masks.append(valid_encoded_dict['attention_mask'])
#         valid_input_ids = torch.cat(valid_input_ids, axis = 0)
#         valid_attention_masks = torch.cat(valid_attention_masks, axis = 0)
#         valid_labels = torch.from_numpy(labels[valid_idx])

#         train_dataset = Data.TensorDataset(training_input_ids, training_attention_masks, training_labels)
#         valid_dataset = Data.TensorDataset(valid_input_ids, valid_attention_masks, valid_labels)


#         train_dataloader = Data.DataLoader(
#                     train_dataset,
#                     sampler = Data.RandomSampler(train_dataset),
#                     batch_size = args.batch_size,
#                     worker_init_fn = _init_fn
#         )

#         validation_dataloader = Data.DataLoader(
#                     valid_dataset,
#                     sampler = Data.SequentialSampler(valid_dataset),
#                     batch_size = args.batch_size, 
#                     worker_init_fn = _init_fn
#         )




#         total_steps = len(train_dataloader) * args.epochs
#         scheduler = get_linear_schedule_with_warmup(optimizer,
#                                                 num_warmup_steps = 0,
#                                                 num_training_steps = total_steps                                            
#                                                )

#         print(f'Fold{i + 1}')

#         training_stats = []
#         for i in range(0, args.epochs):
#             print('Epoch {:} / {:}'.format(i + 1, args.epochs))
#             total_train_loss = 0
#             model.train()
#             for step, batch in enumerate(train_dataloader):
#                 b_input_ids = batch[0].to(device)
#                 b_input_mask = batch[1].to(device)
#                 b_labels = batch[2].to(device)

#                 model.zero_grad()

#                 loss, logits = model(b_input_ids,
#                                      token_type_ids = None,
#                                      attention_mask = b_input_mask,
#                                      labels = b_labels                            
#                                     )
#                 total_train_loss += loss.item()
#                 loss.backward()
#                 torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#                 optimizer.step()
#                 scheduler.step()
#             avg_train_loss = total_train_loss / len(train_dataloader)
#             print("average training loss: {0:.2f}".format(avg_train_loss))

#             print("running validation")
#             model.eval()
            
#             total_eval_accuracy = 0
#             total_eval_loss = 0
#             nb_eval_steps = 0
#             for batch in validation_dataloader:
#                 val_input_ids = batch[0].to(device)
#                 val_input_mask = batch[1].to(device)
#                 val_labels = batch[2].to(device)
#                 with torch.no_grad():
#                     (valloss, vallogits) = model(val_input_ids,
#                                            token_type_ids = None,
#                                            attention_mask = val_input_mask,
#                                            labels = val_labels                              
#                                           )
#                 total_eval_loss += valloss.item()
#                 vallogits = vallogits.detach().cpu().numpy()
#                 label_ids = val_labels.to('cpu').numpy()
#                 total_eval_accuracy += flat_accuracy(vallogits, label_ids)
            
#             avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
#             print("accuracy: {0:.2f}".format(avg_val_accuracy))

#             avg_val_loss = total_eval_loss / len(validation_dataloader)
#             print("average val loss: {0:.2f}".format(avg_val_loss))
            
#             avg_train_losses.append(avg_train_loss)
#             avg_val_losses.append(avg_val_loss)
#             avg_val_acc.append(avg_val_accuracy)
            

#             training_stats.append(
#             {
#                 'epoch': i + 1,
#                 'training loss': avg_train_loss,
#                 'valid loss': avg_val_loss,
#                 'valid acc': avg_val_accuracy
#             }
#             )
            
        
#         whole_train_stats.append(training_stats)

#     print('All \t train_loss={:.2f} \t val_loss={:.2f} \t val_acc={:.2f}'.format(np.mean(avg_train_losses), np.mean(avg_val_losses), np.mean(avg_val_acc)))
#     return whole_train_stats, avg_train_losses, avg_val_losses, avg_val_acc

The following (commented-out) cell is an alternative `trainbert` that further splits the training set into training and validation (20%) subsets, and then trains and validates the model.

In [None]:
# def trainbert(sentences, labels):
#     xtrain, xtest, ytrain, ytest = train_test_split(sentences, labels, test_size=0.2, shuffle = True,
#                                                         random_state=42)
#     avg_train_losses = []
#     avg_val_losses = []
#     avg_val_acc = []
#     training_input_ids = []
#     training_attention_masks = []
#     training_stats = []
#     for sent in xtrain:
#         training_encoded_dict = tokenizer.encode_plus(
#                             sent,
#                             add_spicial_tokens = True,
#                             max_length = args.max_length, #512
#                             pad_to_max_length = True,
#                             return_attention_mask = True,
#                             return_tensors = 'pt',

#         )
#         training_input_ids.append(training_encoded_dict['input_ids'])
#         training_attention_masks.append(training_encoded_dict['attention_mask'])
#     training_input_ids = torch.cat(training_input_ids, axis = 0)
#     training_attention_masks = torch.cat(training_attention_masks, axis = 0)
#     training_labels = torch.from_numpy(ytrain).long()

#     train_dataset = Data.TensorDataset(training_input_ids, training_attention_masks, training_labels)

#     train_dataloader = Data.DataLoader(
#                 train_dataset,
#                 sampler = Data.RandomSampler(train_dataset),
#                 batch_size = args.batch_size,
#                 worker_init_fn = _init_fn
#     )
    
#     valid_input_ids = []
#     valid_attention_masks = []
#     for valsent in xtest:
#         val_encoded_dict = tokenizer.encode_plus(
#                             valsent,
#                             add_spicial_tokens = True,
#                             max_length = args.max_length, #512
#                             pad_to_max_length = True,
#                             return_attention_mask = True,
#                             return_tensors = 'pt',

#         )
#         valid_input_ids.append(val_encoded_dict['input_ids'])
#         valid_attention_masks.append(val_encoded_dict['attention_mask'])
#     valid_input_ids = torch.cat(valid_input_ids, axis = 0)
#     valid_attention_masks = torch.cat(valid_attention_masks, axis = 0)
#     val_labels = torch.from_numpy(ytest).long()

#     val_dataset = Data.TensorDataset(valid_input_ids, valid_attention_masks, val_labels)

#     val_dataloader = Data.DataLoader(
#                 val_dataset,
#                 sampler = Data.RandomSampler(val_dataset),
#                 batch_size = args.batch_size,
#                 worker_init_fn = _init_fn
#     )

#     total_steps = len(train_dataloader) * args.epochs
#     scheduler = get_linear_schedule_with_warmup(optimizer,
#                                             num_warmup_steps = 0,
#                                             num_training_steps = total_steps                                            
#                                            )

#     for i in range(0, args.epochs):
#         print('Epoch {:} / {:}'.format(i + 1, args.epochs))
#         total_train_loss = 0
#         model.train()
#         for step, batch in enumerate(train_dataloader):
#             b_input_ids = batch[0].to(device)
#             b_input_mask = batch[1].to(device)
#             b_labels = batch[2].to(device)
#             model.zero_grad()
#             loss, logits = model(b_input_ids,
#                                  token_type_ids = None,
#                                  attention_mask = b_input_mask,
#                                  labels = b_labels                            
#                                 )
#             total_train_loss += loss.item()
#             loss.backward()
#             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#             optimizer.step()
#             scheduler.step()
#         avg_train_loss = total_train_loss / len(train_dataloader)
#         print("average training loss: {0:.2f}".format(avg_train_loss))
        
#         print("running validation")
#         model.eval()
#         total_eval_accuracy = 0
#         total_eval_loss = 0
#         nb_eval_steps = 0
#         for batch in val_dataloader:
#             val_input_ids = batch[0].to(device)
#             val_input_mask = batch[1].to(device)
#             val_labels = batch[2].to(device)
#             with torch.no_grad():
#                 (valloss, vallogits) = model(val_input_ids,
#                                             token_type_ids = None,
#                                             attention_mask = val_input_mask,
#                                             labels = val_labels                              
#                                         )
#             total_eval_loss += valloss.item()
#             vallogits = vallogits.detach().cpu().numpy()
#             label_ids = val_labels.to('cpu').numpy()
#             total_eval_accuracy += flat_accuracy(vallogits, label_ids)
            
#         avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
#         print("accuracy: {0:.2f}".format(avg_val_accuracy))

#         avg_val_loss = total_eval_loss / len(val_dataloader)
#         print("average val loss: {0:.2f}".format(avg_val_loss))
            
#         avg_train_losses.append(avg_train_loss)
#         avg_val_losses.append(avg_val_loss)
#         avg_val_acc.append(avg_val_accuracy)


#         training_stats.append(
#         {
#             'epoch': i + 1,
#             'training loss': avg_train_loss,
#             'valid loss': avg_val_loss,
#             'valid acc': avg_val_accuracy
#         }
#         )
            
#     print('All \t train_loss={:.2f} \t val_loss={:.2f} \t val_acc={:.2f}'.format(np.mean(avg_train_losses), np.mean(avg_val_losses), np.mean(avg_val_acc)))
# #     return whole_train_stats, avg_train_losses, avg_val_losses, avg_val_acc

#     return avg_train_loss, training_stats

In [None]:
###plot the learning curve; it can only be plotted if the model was trained with cross-validation or a validation set
# pd.set_option('precision', 2)

# # Create a DataFrame from our training statistics.
# df_stats = pd.DataFrame(data=training_stats)

# # Use the 'epoch' as the row index.
# df_stats = df_stats.set_index('epoch')

# # A hack to force the column headers to wrap.
# #df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# # Display the table.
# df_stats

# sns.set(style='darkgrid')

# # # Increase the plot size and font size.
# # sns.set(font_scale=1.5)
# # plt.rcParams["figure.figsize"] = (12,6)

# # Plot the learning curve.
# plt.plot(df_stats['training loss'], 'b-o', label="Training")
# plt.plot(df_stats['valid loss'], 'g-o', label="Validation")

# # Label the plot.
# plt.title("Training & Validation Loss")
# plt.xlabel("Epoch")
# plt.ylabel("Loss")
# plt.legend()
# plt.xticks([1, 2])

# plt.show()