# Import definitions

In [2]:
!pip3 install --upgrade pip torch torchvision torchaudio pandas numpy sklearn transformers ipywidgets matplotlib seaborn

Collecting torchvision
  Downloading torchvision-0.15.2-cp311-cp311-macosx_11_0_arm64.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchaudio
  Downloading torchaudio-2.0.2-cp311-cp311-macosx_11_0_arm64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/e5/cd/c941b51e95992968e3e8abc7180f33b952478abd6943062051517a808db7/pandas-2.1.0-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading pandas-2.1.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (18 kB)
Collecting sklearn
  Using cached sklearn-0.0.post7.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython 

In [3]:
import torch
import pandas as pd
import numpy as np

from preprocess_indexqual import load_html_transcripts
from preprocess_sentiment import preprocess_sent
from preprocess_sentiment_test import preprocess_sentiment_test
from torch.nn import BCEWithLogitsLoss

In [4]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [5]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

In [6]:
from torch.optim import AdamW

In [7]:
from tqdm import tqdm, trange

In [8]:
import os
import gc
import shutil

# use CUDA when available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("using:", device)

# clean
torch.cuda.empty_cache()
gc.collect()

# define max length
max_length = 512

def model_init():
    torch.cuda.empty_cache()
    gc.collect()
    model = RobertaForSequenceClassification.from_pretrained('pdelobelle/robbert-v2-dutch-base', num_labels=len(classes))
    return model

using: cpu


# Preprocessing

In [9]:
# print start info
print("starting to train BERT with sentiment data")
print("preprocessing ...")

# load preprocessed data
df = preprocess_sent()

# to csv
df.to_csv('preprocessed_sentiment.csv', index = False, header=True)
    
# show data
print(df.head())
    
# select label columns
cols = df.columns
label_cols = list(cols[1:])
num_labels = len(label_cols)
print('Label columns: ', label_cols)
classes = label_cols

# set header for all label columns
df['labels'] = list(df[label_cols].values)
df.head()
print(len(df))

# get input and outputs
labels = list(df.labels.values)
sentences = list(df.text.values)

starting to train BERT with sentiment data
preprocessing ...
                                                text  negative  neutral   
0  Weinig activiteitenbegeleiding voor zo een men...         1        0  \
1                                   Nee dat is mooi.         0        0   
2  Je daar heeft ze het toch ook wel ze spreekt d...         1        0   
3                             Dat moet je niet doen.         1        0   
4                             Begon ze te vertellen.         0        0   

   positive  
0         0  
1         1  
2         0  
3         0  
4         1  
Label columns:  ['negative', 'neutral', 'positive']
5861


In [10]:
# tokenize data
tokenizer = RobertaTokenizer.from_pretrained('pdelobelle/robbert-v2-dutch-base')  # tokenizer
encodings = tokenizer.batch_encode_plus(sentences, truncation=True,
                                    max_length=max_length,
                                    padding=True)
print('tokenizer outputs: ', encodings.keys())

# preparing data format for training
input_ids = encodings['input_ids']  # tokenized and encoded sentences
attention_masks = encodings['attention_mask']  # attention 

# Use train_test_split to split our data into train and validation sets
train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = train_test_split(
    input_ids, labels, attention_masks, random_state=2020, test_size=0.10, stratify = labels)

tokenizer outputs:  dict_keys(['input_ids', 'attention_mask'])


In [11]:
# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
train_masks = torch.tensor(train_masks)
train_labels = torch.tensor(train_labels)

validation_inputs = torch.tensor(validation_inputs)
validation_masks = torch.tensor(validation_masks)
validation_labels = torch.tensor(validation_labels)

  train_labels = torch.tensor(train_labels)


In [12]:
# Select a batch size for training. For fine-tuning with XLNet, the authors recommend a batch size of 32, 48,
# or 128. We will use 32 here to avoid memory issues.
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because,
# unlike a for loop, with an iterator the entire dataset does not need to be loaded into memory
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_labels)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_labels)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Training

In [13]:
# Init model
# TODO add seed constant
model = model_init()
model.to(device)

# Store our loss and accuracy for plotting
train_loss_set = []
train_loss_per_epoch = []
valid_loss_per_epoch = []
valid_acc_set = []
best_valid_f1 = 0
best_name = ""

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pdelobelle/robbert-v2-dutch-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def save_model():
    best_name = 'bert_model_sentiment_' + str(val_f1_accuracy)
    dic = zip(range(0, len(classes)), classes)
    torch.save(model.state_dict(), best_name)
    
    return best_name

In [15]:
# Number of training epochs (authors recommend between 2 and 4)
epochs = 2

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables
    tr_loss = 0  # running loss
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train the data for one epoch
    with tqdm(total=len(train_dataloader), position=0, leave=True) as pbar:
        for step, batch in enumerate(tqdm(train_dataloader, position=0, leave=True)):
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)

            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch

            # Clear out the gradients (by default they accumulate)
            optimizer.zero_grad()

            # Forward pass for multilabel classification
            logits = model(b_input_ids, b_input_mask)
            loss_func = BCEWithLogitsLoss()

            loss = loss_func(logits[0].view(-1, num_labels),
                             b_labels.type_as(logits[0]).view(-1, num_labels))  # convert labels to float for calculation
            train_loss_set.append(loss.item())

            # Backward pass
            loss.backward()
            # Update parameters and take a step using the computed gradient
            optimizer.step()
            # scheduler.step()
            # Update tracking variables
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

            pbar.update()

    print("Train loss: {}".format(tr_loss / nb_tr_steps))
    train_loss_per_epoch.append(tr_loss / nb_tr_steps)
    
    ###############################################################################

    # Validation

    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()

    # Variables to gather full output
    logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []

    # Tracking variables
    vd_loss = 0  # running loss
    nb_vd_steps = 0
    
    # Predict
    with tqdm(total=len(validation_dataloader), position=0, leave=True) as pbar:
        for i, batch in enumerate(tqdm(validation_dataloader, position=0, leave=True)):
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch

            with torch.no_grad():
                # Forward pass
                b_logit_pred = model(b_input_ids, b_input_mask)

                loss = loss_func(b_logit_pred[0].view(-1, num_labels),
                     b_labels.type_as(b_logit_pred[0]).view(-1, num_labels))  # convert labels to float for calculation
                vd_loss += loss.item()
                nb_vd_steps += 1

                pred_label = torch.sigmoid(b_logit_pred[0])
                b_logit_pred = b_logit_pred[0].detach().cpu().numpy()
                pred_label = pred_label.to('cpu').numpy()
                b_labels = b_labels.to('cpu').numpy()

            tokenized_texts.append(b_input_ids)
            logit_preds.append(b_logit_pred)
            true_labels.append(b_labels)
            pred_labels.append(pred_label)
            
            pbar.update()
    
    print("Validation loss: {}".format(vd_loss / nb_vd_steps))
    valid_loss_per_epoch.append(vd_loss / nb_vd_steps)
        
    # Flatten outputs
    pred_labels = [item for sublist in pred_labels for item in sublist]
    true_labels = [item for sublist in true_labels for item in sublist]
    
    # Calculate Accuracy
    threshold = 0.50
    pred_bools = [pl > threshold for pl in pred_labels]
    true_bools = [tl == 1 for tl in true_labels]
    val_f1_accuracy = f1_score(true_bools, pred_bools, average='micro') * 100
    val_flat_accuracy = accuracy_score(true_bools, pred_bools) * 100

    valid_acc_set.append(val_f1_accuracy)
    print('F1 Validation Accuracy: ', val_f1_accuracy)
    print('Flat Validation Accuracy: ', val_flat_accuracy)   
    
    if val_f1_accuracy > best_valid_f1:
        if os.path.exists(best_name) and os.path.isdir(best_name):
            shutil.rmtree(best_name)
        best_valid_f1 = val_f1_accuracy
        best_name = save_model()

100%|██████████| 165/165 [05:51<00:00,  2.13s/it]
100%|██████████| 165/165 [05:51<00:00,  2.13s/it]


Train loss: 0.5554554628603386


100%|██████████| 19/19 [00:13<00:00,  1.41it/s]
100%|██████████| 19/19 [00:13<00:00,  1.41it/s]


Validation loss: 0.5042711543409448
F1 Validation Accuracy:  55.47445255474454
Flat Validation Accuracy:  45.31516183986371


  6%|▌         | 10/165 [00:22<05:46,  2.23s/it]s/it]
  6%|▌         | 10/165 [00:22<05:46,  2.23s/it]
Epoch:  50%|█████     | 1/2 [06:28<06:28, 388.06s/it]


KeyboardInterrupt: 

# Simple predictions

In [None]:
# add samples

# Confusion matrix

In [1]:
from sklearn.metrics import multilabel_confusion_matrix, confusion_matrix

codes, test_df = preprocess_sentiment_test()

# select label columns
cols = test_df.columns
label_cols = list(cols[1:])
num_labels = len(label_cols)
print('Label columns: ', label_cols)
classes = label_cols

# set header for all label columns
test_df['labels'] = list(test_df[label_cols].values)

# Gathering input data
test_labels = list(test_df.labels.values)
test_comments = list(test_df.sentence.values)


# Encoding input data
test_encodings = tokenizer.batch_encode_plus(test_comments,max_length=max_length,pad_to_max_length=True)
test_input_ids = test_encodings['input_ids']
test_attention_masks = test_encodings['attention_mask']

# Make tensors out of data
test_input_ids = torch.tensor(test_input_ids)
test_attention_masks = torch.tensor(test_attention_masks)
test_labels = torch.tensor(test_labels)

test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = SequentialSampler(test_labels)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Variables to gather full output
logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []

# Use original distribution for evaluation (instead of a balanced distribution)
# validation_sampler = SequentialSampler(validation_labels)
# validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Predict
for i, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        # Forward pass
        b_logit_pred = model(b_input_ids, b_input_mask)
        pred_label = torch.sigmoid(b_logit_pred[0])

        b_logit_pred = b_logit_pred[0].detach().cpu().numpy()
        pred_label = pred_label.to('cpu').numpy()
        b_labels = b_labels.to('cpu').numpy()

    tokenized_texts.append(b_input_ids)
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)

# Flatten outputs
true_labels = [item for sublist in true_labels for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]

# Calculate Accuracy
threshold = 0.50
pred_bools = [pl > threshold for pl in pred_labels]
true_bools = [tl == 1 for tl in true_labels]
val_f1_accuracy = f1_score(true_bools, pred_bools, average='micro') * 100
val_flat_accuracy = accuracy_score(true_bools, pred_bools) * 100

print('F1 Validation Accuracy: ', val_f1_accuracy)
print('Flat Validation Accuracy: ', val_flat_accuracy)   

# calculate predicted class for single-label CFM
true_labels_single = np.argmax(true_labels, axis=1)
pred_labels_single = np.argmax(pred_labels, axis=1)

cm = confusion_matrix(true_labels_single, pred_labels_single)
cm_rot = np.fliplr(np.rot90(cm))

cm_rot_df = pd.DataFrame(cm_rot, index=classes[::-1], columns=classes[::-1])
cm_rot_df.index.name = 'As predicted by text mining'
cm_rot_df.columns.name = 'Manually coded'

print(cm_rot_df)

NameError: name 'preprocess_sentiment_test' is not defined

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

figsize = (8, 8)

fig = plt.figure(figsize=figsize)

# plt.subplot(1, 2, 1)
# plt.plot(train_loss_per_epoch, 'g')
# plt.plot(valid_loss_per_epoch, 'b')
# plt.grid(True)

plt.subplot(1, 1, 1)
sns.heatmap(cm_rot_df, annot=cm_rot, fmt='', cmap="Blues", annot_kws={"size": 20}, cbar=False)
plt.show()

In [None]:
import json

# load transcripts
_, transcripts, filenames = load_html_transcripts()

for j in range(0, len(transcripts)):
    
    if '.html' not in filenames[j]:
        continue
    
    context_text = ''

    obj_list = []
    
    lines = transcripts[j]
    
    for i in range(0, len(lines)):
        sentence = tokenizer(lines[i], truncation=True,
                                        max_length=max_length,
                                        padding=True, return_tensors='pt')

        sentence.to(device)

        result = model(sentence.input_ids, sentence.attention_mask)

        pred_labels = result.logits.detach().cpu().numpy()
        pred_labels_single = np.argmax(pred_labels, axis=1)
        
        obj = {
            "text": lines[i],
            "label": int(pred_labels_single[0])
        }

        obj_list.append(obj)

        
    pos = [segment for segment in obj_list if segment['label'] == 2]
    neg = [segment for segment in obj_list if segment['label'] == 0]
    pos_len = len(pos)
    neg_len = len(neg)
        
    print(filenames[j], ":", str(round(pos_len / (pos_len + neg_len) * 100, 1)))
        
    obj = {
        "filename": filenames[j],
        "segments": obj_list
    }
            
    json_dump = json.dumps(obj)

    with open("output/" + str(filenames[j]) + ".json", "w") as f:
        f.write(json_dump)

print("Done!")