In [1]:
# Reference: https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, classification_report

import torch
from torch import nn, optim

from collections import defaultdict
import seaborn as sns

In [3]:
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup

In [16]:
# Checkpoint names that can be passed to `from_pretrained`; one of them is
# selected by position below.
pre_trained_model_selection = [
    'bert-base-uncased',
    'distilbert-base-uncased',
    'albert-base-v2',
    'roberta-base',
    'xlnet-base-cased',
    'gpt2',
]

# Position 4 in the list above is 'xlnet-base-cased'.
PRE_TRAINED_MODEL_NAME = pre_trained_model_selection[4]
print(PRE_TRAINED_MODEL_NAME)

xlnet-base-cased


In [14]:
# Load the XLNet tokenizer and base model explicitly by checkpoint name.
# NOTE(review): the configuration cell that follows rebinds both `tokenizer`
# and `model` through AutoTokenizer/AutoModel, so in top-to-bottom execution
# the objects created here are replaced -- confirm whether this cell is still
# needed.
from transformers import XLNetTokenizer, XLNetModel

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')

In [5]:
# --- Run configuration -------------------------------------------------------
MAX_LEN = 100    # token length every example is padded to by the dataset

BATCH_SIZE = 4   # examples per DataLoader batch
EPOCHS = 3       # training passes per target variable

# Tokenizer and bare encoder for the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
model = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

# NOTE(review): this optimizer only tracks the parameters of the bare encoder
# `model`, not of any classifier that is built later from the same checkpoint.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# Only fall back to the EOS token when the tokenizer has no padding token of
# its own (e.g. GPT-2). Overwriting unconditionally would replace the native
# pad token of models that define one, and would set it to None for
# tokenizers that have no EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Seed for the train/validation/test splits so that re-running the notebook
# reproduces the same partitions.
RANDOM_SEED = 42

full_df = pd.read_csv('data/kokil dec 6 reprepare/conf_pc_worker_sem.csv')
full_df = full_df.dropna() # dataset contains NaN values, dropping NaNs here

# Target columns; a separate classifier is trained for each of them.
y_variables = ['Answer.1gamemove.yes_label', 'Answer.2reasoning.yes_label',
               'Answer.3rapport.yes_label', 'Answer.4shareinformation.yes_label',
               'Input.deception_quadrant']

# Replace every target column with integer class ids. The encoder is re-fit
# per column, so after the loop `le` only remembers the classes of the last
# column in `y_variables`.
le = LabelEncoder()
for y_column in y_variables:
    full_df[y_column] = le.fit_transform(full_df[y_column])

# 80% train; the held-out 20% is split again into validation and test.
# NOTE(review): with test_size=0.2 on the second split, validation receives
# 80% of the held-out rows and test only 20% (about 16% / 4% of the data) --
# confirm this is the intended proportion.
df_train, df_test = train_test_split(full_df, test_size=0.2, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.2, random_state=RANDOM_SEED)

In [7]:
from torch.utils.data import Dataset, DataLoader

class Dataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    NOTE: the class deliberately keeps its original name, which shadows the
    imported ``torch.utils.data.Dataset`` base class after definition.

    Args:
        full_texts: indexable collection of raw texts.
        y_vars: indexable collection of integer labels, aligned with
            ``full_texts``.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: fixed token length every example is padded/truncated to.
    """

    def __init__(self, full_texts, y_vars, tokenizer, max_len):
        self.full_texts = full_texts
        self.y_vars = y_vars
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.full_texts)

    def __getitem__(self, item):
        """Tokenize example ``item`` and return it as a dict of tensors.

        Returns:
            dict with the raw ``full_text``, flattened ``input_ids`` and
            ``attention_mask`` tensors, and the label as a ``torch.long``
            tensor under ``y_vars``.
        """
        full_text = str(self.full_texts[item])
        y = self.y_vars[item]
        encoding = self.tokenizer.encode_plus(
            full_text,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit padding/truncation replaces the deprecated
            # `pad_to_max_length=True` and guarantees every item has exactly
            # `max_len` tokens, which default batch collation requires.
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'full_text': full_text,
            # encode_plus returns (1, max_len) tensors; flatten drops the
            # leading batch dimension so the DataLoader can stack items.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'y_vars': torch.tensor(y, dtype=torch.long)
        }

def create_data_loader(df, tokenizer, y_var_name, max_len, batch_size,
                       shuffle=False, num_workers=0):
    """Build a DataLoader over the texts and one label column of ``df``.

    Args:
        df: DataFrame with an ``'Input.full_text'`` column and a
            ``y_var_name`` label column.
        tokenizer: tokenizer handed to the dataset.
        y_var_name: name of the label column to use as target.
        max_len: fixed token length per example.
        batch_size: number of examples per batch.
        shuffle: whether to reshuffle the data every epoch. Defaults to
            ``False``, which keeps the previous (sequential) behaviour; pass
            ``True`` for training loaders.
        num_workers: number of loader worker processes (default ``0``, as
            before).

    Returns:
        A ``DataLoader`` yielding dict batches as produced by the dataset.
    """
    ds = Dataset(
        # to_numpy() drops the DataFrame index so items are addressed by
        # position 0..len-1 regardless of how `df` was split.
        full_texts=df['Input.full_text'].to_numpy(),
        y_vars=df[y_var_name].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers
    )

In [8]:
class QuadrantClassifier(nn.Module):
    """Pre-trained encoder followed by dropout and a linear classification head.

    Args:
        n_classes: number of output classes (size of the logit vector).
    """

    def __init__(self, n_classes):
        super(QuadrantClassifier, self).__init__()
        # Load a fresh encoder through AutoModel instead of going through the
        # notebook-global `model` instance, so the class does not depend on
        # which object that global currently holds.
        self.modelitem = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.modelitem.config.hidden_size, n_classes)

    @staticmethod
    def _pool(outputs, attention_mask):
        """Reduce encoder outputs to one vector per example.

        Uses the encoder's own pooled output when it provides one (either a
        ``pooler_output`` attribute or a 2-D tensor in second position of a
        tuple). Encoders without a pooler return something else in that
        position (or nothing at all), so for those the last hidden states are
        averaged over the non-padding positions given by ``attention_mask``.
        """
        pooled = getattr(outputs, 'pooler_output', None)
        if pooled is None and len(outputs) > 1:
            candidate = outputs[1]
            # A genuine pooled output is a (batch, hidden) tensor; caches such
            # as memories / past key values are not 2-D tensors.
            if torch.is_tensor(candidate) and candidate.dim() == 2:
                pooled = candidate
        if pooled is not None:
            return pooled

        hidden = outputs[0]
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        # clamp avoids a division by zero for an all-padding row.
        return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores (logits) for a batch."""
        outputs = self.modelitem(
          input_ids=input_ids,
          attention_mask=attention_mask
        )
        pooled_output = self._pool(outputs, attention_mask)
        output = self.drop(pooled_output)
        return self.out(output)

In [9]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one optimization pass over ``data_loader``.

    For every batch the model is run forward, the loss is back-propagated,
    gradients are clipped to a norm of 1.0, and then the optimizer and the
    scheduler are stepped before gradients are reset.

    Args:
        model: module called with ``input_ids`` and ``attention_mask``.
        data_loader: iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``y_vars`` tensors.
        loss_fn: callable taking ``(outputs, targets)``.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.
        n_examples: denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``: the number of correct predictions
        divided by ``n_examples`` (double tensor) and the mean batch loss.
    """
    model = model.train()
    batch_losses = []
    correct_predictions = 0

    for batch in data_loader:
        tensors = {
            key: batch[key].to(device)
            for key in ("input_ids", "attention_mask", "y_vars")
        }
        targets = tensors["y_vars"]

        logits = model(
            input_ids=tensors["input_ids"],
            attention_mask=tensors["attention_mask"],
        )
        predicted = torch.max(logits, dim=1).indices
        batch_loss = loss_fn(logits, targets)

        correct_predictions += (predicted == targets).sum()
        batch_losses.append(batch_loss.item())

        # Backward pass, clip, then update weights and learning rate.
        batch_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(batch_losses)

In [10]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: module called with ``input_ids`` and ``attention_mask``.
        data_loader: iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``y_vars`` tensors.
        loss_fn: callable taking ``(outputs, targets)``.
        device: device the batch tensors are moved to.
        n_examples: denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``: the number of correct predictions
        divided by ``n_examples`` (double tensor) and the mean batch loss.
    """
    model = model.eval()
    batch_losses = []
    correct_predictions = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in data_loader:
            tensors = {
                key: batch[key].to(device)
                for key in ("input_ids", "attention_mask", "y_vars")
            }
            targets = tensors["y_vars"]

            logits = model(
                input_ids=tensors["input_ids"],
                attention_mask=tensors["attention_mask"],
            )
            predicted = torch.max(logits, dim=1).indices

            batch_loss = loss_fn(logits, targets)
            correct_predictions += (predicted == targets).sum()
            batch_losses.append(batch_loss.item())

    return correct_predictions.double() / n_examples, np.mean(batch_losses)

In [11]:
def get_predictions(model, data_loader, device=None):
    """Collect texts, predicted classes, class probabilities and true labels.

    Args:
        model: module called with ``input_ids`` and ``attention_mask`` that
            returns one row of class scores per example.
        data_loader: iterable of dict batches with ``full_text``,
            ``input_ids``, ``attention_mask`` and ``y_vars``.
        device: device the inputs are moved to. When omitted it is taken from
            the model's first parameter (CPU if the model has none), so the
            function no longer depends on a notebook-global ``device``.

    Returns:
        Tuple ``(full_texts, predictions, prediction_probs, real_values)``:
        a list of texts and three CPU tensors. ``prediction_probs`` holds the
        softmax of the model outputs, so each row sums to 1.
    """
    model = model.eval()
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    full_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            _, preds = torch.max(outputs, dim=1)
            # Convert raw scores to probabilities; the arg-max is unchanged.
            probs = torch.softmax(outputs, dim=1)

            full_texts.extend(d["full_text"])
            # Move each batch to the CPU right away so results do not
            # accumulate in accelerator memory.
            predictions.append(preds.cpu())
            prediction_probs.append(probs.cpu())
            real_values.append(d["y_vars"].cpu())

    if not predictions:
        # Empty loader: return empty tensors instead of failing on concat.
        return (full_texts, torch.empty(0, dtype=torch.long),
                torch.empty(0, 0), torch.empty(0, dtype=torch.long))

    predictions = torch.cat(predictions)
    prediction_probs = torch.cat(prediction_probs)
    real_values = torch.cat(real_values)
    return full_texts, predictions, prediction_probs, real_values

In [12]:
def show_confusion_matrix(confusion_matrix):
    """Draw ``confusion_matrix`` as an annotated blue heatmap.

    Cell values are formatted as integers. Row tick labels are kept
    horizontal, column tick labels are rotated by 30 degrees, and the axes
    are labelled 'True' (rows) and 'Predicted' (columns).
    """
    heatmap_axes = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    # (axis, rotation) pairs: y tick labels first, then x tick labels.
    for axis, angle in ((heatmap_axes.yaxis, 0), (heatmap_axes.xaxis, 30)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')
    plt.ylabel('True')
    plt.xlabel('Predicted')

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('device: ', device)

# Train, evaluate and report one independent classifier per target column.
for y_var in y_variables:
    print()
    print(y_var)

    # Create data splits
    train_data_loader = create_data_loader(df_train, tokenizer, y_var, MAX_LEN, BATCH_SIZE)
    test_data_loader = create_data_loader(df_test, tokenizer, y_var, MAX_LEN, BATCH_SIZE)
    val_data_loader = create_data_loader(df_val, tokenizer, y_var, MAX_LEN, BATCH_SIZE)

    # Sanity checks
    print('sanity checks')
    data = next(iter(train_data_loader))
    print(data.keys())
    print(data['input_ids'].shape)
    print(data['attention_mask'].shape)

    # Take the classes from the full frame (not only the training split) and
    # sort them: this sizes the output layer for every label that can occur
    # in validation/test, and matches the sorted label order scikit-learn
    # uses for the confusion matrix.
    class_names = np.sort(full_df[y_var].unique())

    # Create classifier model
    new_model = QuadrantClassifier(len(class_names))
    new_model = new_model.to(device)

    input_ids = data['input_ids'].to(device)
    attention_mask = data['attention_mask'].to(device)

    # Smoke test: one forward pass through the classifier plus softmax. The
    # result is discarded; this only verifies that shapes line up.
    with torch.no_grad():
        nn.functional.softmax(new_model(input_ids, attention_mask), dim=1)

    # The optimizer must track the parameters of the classifier being trained
    # in this iteration; an optimizer created earlier over another module
    # would never update `new_model`.
    task_optimizer = AdamW(new_model.parameters(), lr=2e-5, correct_bias=False)

    # Linear learning-rate decay over all training steps of this task
    total_steps = len(train_data_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
      task_optimizer,
      num_warmup_steps=0,
      num_training_steps=total_steps
    )
    loss_fn = nn.CrossEntropyLoss().to(device)

    # Do training
    history = defaultdict(list)
    best_accuracy = 0

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)
        train_acc, train_loss = train_epoch(
            new_model,
            train_data_loader,
            loss_fn,
            task_optimizer,
            device,
            scheduler,
            len(df_train)
        )

        print(f'Train loss {train_loss} accuracy {train_acc}')

        val_acc, val_loss = eval_model(
            new_model,
            val_data_loader,
            loss_fn,
            device,
            len(df_val)
          )

        # Store plain Python floats: accuracy tensors may live on the GPU and
        # cannot be plotted directly.
        history['train_acc'].append(train_acc.item())
        history['train_loss'].append(train_loss)
        history['val_acc'].append(val_acc.item())
        history['val_loss'].append(val_loss)

        if val_acc.item() > best_accuracy:
            # Save the classifier that is being trained.
            # NOTE(review): the file name is shared by all target variables,
            # so each task overwrites the previous task's checkpoint.
            torch.save(new_model.state_dict(), 'best_model_state.bin')
            best_accuracy = val_acc.item()

    # Plot values on a fresh figure so tasks do not draw over each other
    plt.figure()
    plt.plot(history['train_acc'], label='train accuracy')
    plt.plot(history['val_acc'], label='validation accuracy')
    plt.title('Training history')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()
    plt.ylim([0, 1]);
    plt.show()

    test_acc, _ = eval_model(
      new_model,
      test_data_loader,
      loss_fn,
      device,
      len(df_test)
    )
    print('Test accuracy: ', test_acc.item())

    # Get predictions
    y_full_texts, y_pred, y_pred_probs, y_test = get_predictions(
      new_model,
      test_data_loader
    )
    y_pred = y_pred.numpy()
    y_test = y_test.numpy()

    # Classification report and confusion matrix
    report = classification_report(y_test, y_pred, output_dict=True)
    print(pd.DataFrame(report).transpose())

    # Passing `labels` fixes the matrix to one row/column per known class,
    # even when a class is missing from the (small) test split.
    cm = confusion_matrix(y_test, y_pred, labels=class_names)
    df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
    plt.figure()
    show_confusion_matrix(df_cm)
    plt.show()

    # Clear GPU memory before the next task
    del input_ids
    del attention_mask
    del new_model
    del loss_fn
    del task_optimizer
    del scheduler

    torch.cuda.empty_cache()

device:  cuda:0

Answer.1gamemove.yes_label
sanity checks
dict_keys(['full_text', 'input_ids', 'attention_mask', 'y_vars'])
torch.Size([4, 100])
torch.Size([4, 100])
