Reference: https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, classification_report

from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup

import torch
from torch import nn, optim

from collections import defaultdict
import seaborn as sns

In [2]:
full_df = pd.read_csv('data/kokil dec 6 reprepare/conf_pc_worker_sem.csv')
full_df = full_df.dropna() # dataset contains NaN values, dropping NaNs here

y_variables = ['Answer.1gamemove.yes_label', 'Answer.2reasoning.yes_label', 
               'Answer.3rapport.yes_label', 'Answer.4shareinformation.yes_label',
               'Input.deception_quadrant']

In [3]:
pre_trained_model_selection = ['bert-base-uncased', 'distilbert-base-uncased', 
                               'albert-base-v2', 'roberta-base']

PRE_TRAINED_MODEL_NAME = pre_trained_model_selection[3]
print(PRE_TRAINED_MODEL_NAME)

xlnet-base-cased


In [5]:
MAX_LEN = 100

BATCH_SIZE = 4
EPOCHS = 3

tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
model = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

tokenizer.pad_token = tokenizer.eos_token

In [2]:
le = LabelEncoder()
full_df['Answer.1gamemove.yes_label'] = le.fit_transform(full_df['Answer.1gamemove.yes_label'])
full_df['Answer.2reasoning.yes_label'] = le.fit_transform(full_df['Answer.2reasoning.yes_label'])
full_df['Answer.3rapport.yes_label'] = le.fit_transform(full_df['Answer.3rapport.yes_label'])
full_df['Answer.4shareinformation.yes_label'] = le.fit_transform(full_df['Answer.4shareinformation.yes_label'])
full_df['Input.deception_quadrant'] = le.fit_transform(full_df['Input.deception_quadrant'])

df_train, df_test = train_test_split(full_df, test_size=0.2)
df_val, df_test = train_test_split(df_test, test_size=0.2)

In [7]:
from torch.utils.data import Dataset, DataLoader

class Dataset(Dataset):
    def __init__(self, full_texts, y_vars, tokenizer, max_len):
        self.full_texts = full_texts
        self.y_vars = y_vars
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.full_texts)
    
    def __getitem__(self, item):
        full_text = str(self.full_texts[item])
        y = self.y_vars[item]
        encoding = self.tokenizer.encode_plus(
          full_text,
          add_special_tokens=True,
          max_length=self.max_len,
          return_token_type_ids=False,
          pad_to_max_length=True,
          return_attention_mask=True,
          return_tensors='pt',
        )
        return {
          'full_text': full_text,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'y_vars': torch.tensor(y, dtype=torch.long)
        }

def create_data_loader(df, tokenizer, y_var_name, max_len, batch_size):
    ds = Dataset(
        full_texts=df['Input.full_text'].to_numpy(),
        y_vars=df[y_var_name].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=0
    )

In [8]:
class QuadrantClassifier(nn.Module):
    def __init__(self, n_classes):
        super(QuadrantClassifier, self).__init__()
        self.modelitem = model.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.modelitem.config.hidden_size, n_classes)
        
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.modelitem (
          input_ids=input_ids,
          attention_mask=attention_mask
        )
        output = self.drop(pooled_output)
        return self.out(output)

In [10]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        y_var = d["y_vars"].to(device)
        outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, y_var)
        correct_predictions += torch.sum(preds == y_var)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)

In [11]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            y_var = d["y_vars"].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            
            loss = loss_fn(outputs, y_var)
            correct_predictions += torch.sum(preds == y_var)
            losses.append(loss.item())
            
    return correct_predictions.double() / n_examples, np.mean(losses)

In [12]:
def get_predictions(model, data_loader):
    model = model.eval()
    full_texts = []
    predictions = []
    prediction_probs = []
    real_values = []
    
    with torch.no_grad():
        for d in data_loader:
            texts = d["full_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            y_var = d["y_vars"].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
                )
            
            _, preds = torch.max(outputs, dim=1)
            full_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(outputs)
            real_values.extend(y_var)
            
    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()
    return full_texts, predictions, prediction_probs, real_values

In [13]:
def show_confusion_matrix(confusion_matrix):
    hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
    plt.ylabel('True')
    plt.xlabel('Predicted')

In [14]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('device: ', device)

for y_var in y_variables:
    print()
    print(y_var)
    
    # Create data splits
    train_data_loader = create_data_loader(df_train, tokenizer, y_var, MAX_LEN, BATCH_SIZE)
    test_data_loader = create_data_loader(df_test, tokenizer, y_var, MAX_LEN, BATCH_SIZE)
    val_data_loader = create_data_loader(df_val, tokenizer, y_var, MAX_LEN, BATCH_SIZE)
    
    # Sainty checks
    print('sanity checks')
    data = next(iter(train_data_loader))
    print(data.keys())
    print(data['input_ids'].shape)
    print(data['attention_mask'].shape)
    
    class_names = df_train[y_var].unique()
    
    # Create classifier model
    new_model = QuadrantClassifier(len(class_names))
    new_model = new_model.to(device)
    
    input_ids = data['input_ids'].to(device)
    attention_mask = data['attention_mask'].to(device)

    # Add softmax final layer 
    nn.functional.softmax(new_model(input_ids, attention_mask), dim=1)

    # Compile model for number of epochs
    total_steps = len(train_data_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=0,
      num_training_steps=total_steps
    )
    loss_fn = nn.CrossEntropyLoss().to(device)
    
    # Do training
    history = defaultdict(list)
    best_accuracy = 0

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)
        train_acc, train_loss = train_epoch(
            new_model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            len(df_train)
        )

        print(f'Train loss {train_loss} accuracy {train_acc}')

        val_acc, val_loss = eval_model(
            new_model,
            val_data_loader,
            loss_fn,
            device,
            len(df_val)
          )

        history['train_acc'].append(train_acc)
        history['train_loss'].append(train_loss)
        history['val_acc'].append(val_acc)
        history['val_loss'].append(val_loss)

        if val_acc > best_accuracy:
            torch.save(model.state_dict(), 'best_model_state.bin')
            best_accuracy = val_acc
    
    # Plot values 
    plt.plot(history['train_acc'], label='train accuracy')
    plt.plot(history['val_acc'], label='validation accuracy')
    plt.title('Training history')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()
    plt.ylim([0, 1]);
    
    test_acc, _ = eval_model(
      new_model,
      test_data_loader,
      loss_fn,
      device,
      len(df_test)
    )
    print('Test accuracy: ', test_acc.item())
    
    # Get predictions
    y_full_texts, y_pred, y_pred_probs, y_test = get_predictions(
      new_model,
      test_data_loader
    )
    y_pred = y_pred.numpy()
    y_test = y_test.numpy()

    # Classification report and confusion matrix
    report = classification_report(y_test, y_pred, output_dict=True)
    print(pd.DataFrame(report).transpose())
    
#     cm = confusion_matrix(y_test, y_pred)
#     df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
#     show_confusion_matrix(df_cm)
    
    # Clear GPU memory
    del input_ids
    del attention_mask
    del new_model
    del loss_fn
    
    print(torch.cuda.empty_cache())

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


device:  cuda:0

Answer.1gamemove.yes_label
sanity checks
dict_keys(['full_text', 'input_ids', 'attention_mask', 'y_vars'])
torch.Size([4, 100])
torch.Size([4, 100])




TypeError: super(type, obj): obj must be an instance or subtype of type

### TRYING XLNET

In [2]:
from transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm, trange

epochs = 4

In [3]:
full_df = pd.read_csv('data/kokil dec 6 reprepare/conf_pc_worker_sem.csv')
full_df = full_df.dropna() # dataset contains NaN values, dropping NaNs here

y_variables = ['Answer.1gamemove.yes_label', 'Answer.2reasoning.yes_label', 
               'Answer.3rapport.yes_label', 'Answer.4shareinformation.yes_label',
               'Input.deception_quadrant']

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'GeForce RTX 3090'

In [5]:
le = LabelEncoder()
full_df['Answer.1gamemove.yes_label'] = le.fit_transform(full_df['Answer.1gamemove.yes_label'])
full_df['Answer.2reasoning.yes_label'] = le.fit_transform(full_df['Answer.2reasoning.yes_label'])
full_df['Answer.3rapport.yes_label'] = le.fit_transform(full_df['Answer.3rapport.yes_label'])
full_df['Answer.4shareinformation.yes_label'] = le.fit_transform(full_df['Answer.4shareinformation.yes_label'])
full_df['Input.deception_quadrant'] = le.fit_transform(full_df['Input.deception_quadrant'])

labels = full_df['Input.deception_quadrant'].values

In [6]:
sentences = full_df['Input.full_text'].values

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

In [7]:
MAX_LEN = 100
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [8]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [10]:
for y_var in y_variables:
    print()
    print(y_var)
    
    model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=56, test_size=0.2)
    train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                                 random_state=56, test_size=0.2)
    
    train_inputs = torch.tensor(train_inputs)
    validation_inputs = torch.tensor(validation_inputs)
    train_labels = torch.tensor(train_labels)
    validation_labels = torch.tensor(validation_labels)
    train_masks = torch.tensor(train_masks)
    validation_masks = torch.tensor(validation_masks)

    batch_size = 32

    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
    
    train_loss_set = []
    for _ in trange(epochs, desc="Epoch"):  
        model.train()

        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            b_input_ids = b_input_ids.type(torch.LongTensor)
            b_input_mask = b_input_mask.type(torch.LongTensor)
            b_labels = b_labels.type(torch.LongTensor)
            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            b_labels = b_labels.to(device)

            optimizer.zero_grad()
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]
            logits = outputs[1]
            train_loss_set.append(loss.item())    
            loss.backward()
            optimizer.step()


        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        print("Train loss: {}".format(tr_loss/nb_tr_steps))
        
    model = model.eval()
    full_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for i, batch in enumerate(validation_dataloader):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            
            b_input_ids = b_input_ids.type(torch.LongTensor)
            b_input_mask = b_input_mask.type(torch.LongTensor)
            b_labels = b_labels.type(torch.LongTensor)
            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            b_labels = b_labels.to(device)
            
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

            preds = torch.argmax(outputs[0],dim=1)

            predictions.extend(preds)
            prediction_probs.extend(outputs)
            real_values.extend(b_labels)

    predictions = torch.stack(predictions).cpu()
    # prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()

    y_pred = predictions.numpy()
    y_test = real_values.numpy()

    report = classification_report(y_test, y_pred, output_dict=True)
    print(pd.DataFrame(report).transpose())
    
    del b_input_ids
    del b_input_mask
    del b_labels
    del model


Answer.1gamemove.yes_label


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Train loss: 0.03820500522851944


Epoch:  50%|██████████████████████████████████████▌                                      | 2/4 [01:16<01:16, 38.16s/it]

Train loss: 0.05567534267902374


Epoch:  75%|█████████████████████████████████████████████████████████▊                   | 3/4 [01:55<00:38, 38.57s/it]

Train loss: 0.7819858193397522


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 4/4 [02:33<00:00, 38.40s/it]

Train loss: 0.5832027196884155



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score      support
0              0.000000  0.000000  0.000000    95.000000
1              0.958223  1.000000  0.978666  2179.000000
accuracy       0.958223  0.958223  0.958223     0.958223
macro avg      0.479112  0.500000  0.489333  2274.000000
weighted avg   0.918192  0.958223  0.937781  2274.000000

Answer.2reasoning.yes_label


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Train loss: 0.08188541978597641


Epoch:  50%|██████████████████████████████████████▌                                      | 2/4 [01:15<01:15, 37.87s/it]

Train loss: 1.8367595672607422


Epoch:  75%|█████████████████████████████████████████████████████████▊                   | 3/4 [01:53<00:37, 37.82s/it]

Train loss: 0.06808080524206161


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 4/4 [02:31<00:00, 37.80s/it]

Train loss: 0.0577768050134182



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score      support
0              0.000000  0.000000  0.000000    95.000000
1              0.958223  1.000000  0.978666  2179.000000
accuracy       0.958223  0.958223  0.958223     0.958223
macro avg      0.479112  0.500000  0.489333  2274.000000
weighted avg   0.918192  0.958223  0.937781  2274.000000

Answer.3rapport.yes_label


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Train loss: 0.04303280636668205


Epoch:  50%|██████████████████████████████████████▌                                      | 2/4 [01:15<01:15, 37.74s/it]

Train loss: 0.04848571866750717


Epoch:  75%|█████████████████████████████████████████████████████████▊                   | 3/4 [01:53<00:37, 37.74s/it]

Train loss: 0.7880520224571228


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 4/4 [02:30<00:00, 37.75s/it]

Train loss: 0.06522811949253082



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score      support
0              0.000000  0.000000  0.000000    95.000000
1              0.958223  1.000000  0.978666  2179.000000
accuracy       0.958223  0.958223  0.958223     0.958223
macro avg      0.479112  0.500000  0.489333  2274.000000
weighted avg   0.918192  0.958223  0.937781  2274.000000

Answer.4shareinformation.yes_label


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Train loss: 0.050744131207466125


Epoch:  50%|██████████████████████████████████████▌                                      | 2/4 [01:15<01:15, 37.76s/it]

Train loss: 0.06017736718058586


Epoch:  75%|█████████████████████████████████████████████████████████▊                   | 3/4 [01:53<00:37, 37.75s/it]

Train loss: 0.0448482483625412


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 4/4 [02:31<00:00, 37.75s/it]

Train loss: 0.05373286455869675



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score      support
0              0.000000  0.000000  0.000000    95.000000
1              0.958223  1.000000  0.978666  2179.000000
accuracy       0.958223  0.958223  0.958223     0.958223
macro avg      0.479112  0.500000  0.489333  2274.000000
weighted avg   0.918192  0.958223  0.937781  2274.000000

Input.deception_quadrant


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Train loss: 0.043095409870147705


Epoch:  50%|██████████████████████████████████████▌                                      | 2/4 [01:15<01:15, 37.72s/it]

Train loss: 0.0407903678715229


Epoch:  75%|█████████████████████████████████████████████████████████▊                   | 3/4 [01:53<00:37, 37.73s/it]

Train loss: 0.04533509910106659


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 4/4 [02:30<00:00, 37.74s/it]

Train loss: 0.03251977264881134





              precision    recall  f1-score      support
0              0.000000  0.000000  0.000000    95.000000
1              0.958223  1.000000  0.978666  2179.000000
accuracy       0.958223  0.958223  0.958223     0.958223
macro avg      0.479112  0.500000  0.489333  2274.000000
weighted avg   0.918192  0.958223  0.937781  2274.000000


  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=56, test_size=0.2)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=56, test_size=0.2)

In [11]:
# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [12]:
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [13]:
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)
model.cuda()

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=760.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=467042463.0), HTML(value='')))




Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e

In [14]:
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

In [32]:
from tqdm import tqdm, trange

train_loss_set = []

epochs = 4

for _ in trange(epochs, desc="Epoch"):  
    model.train()

    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        b_input_ids = b_input_ids.type(torch.LongTensor)
        b_input_mask = b_input_mask.type(torch.LongTensor)
        b_labels = b_labels.type(torch.LongTensor)
        b_input_ids = b_input_ids.to(device)
        b_input_mask = b_input_mask.to(device)
        b_labels = b_labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        logits = outputs[1]
        train_loss_set.append(loss.item())    
        loss.backward()
        optimizer.step()
    
    
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))

Epoch:   0%|                                                                                     | 0/4 [00:00<?, ?it/s]


RuntimeError: Input, output and indices must be on the current device

In [24]:
model = model.eval()
full_texts = []
predictions = []
prediction_probs = []
real_values = []

with torch.no_grad():
    for i, batch in enumerate(validation_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        b_input_ids = torch.tensor(b_input_ids).to(device).long()
        
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        preds = torch.argmax(outputs[0],dim=1)
        
        predictions.extend(preds)
        prediction_probs.extend(outputs)
        real_values.extend(b_labels)

predictions = torch.stack(predictions).cpu()
# prediction_probs = torch.stack(prediction_probs).cpu()
real_values = torch.stack(real_values).cpu()

y_pred = predictions.numpy()
y_test = real_values.numpy()

report = classification_report(y_test, y_pred, output_dict=True)
print(pd.DataFrame(report).transpose())

  # This is added back by InteractiveShellApp.init_path()


              precision    recall  f1-score      support
0              0.000000  0.000000  0.000000    95.000000
1              0.958223  1.000000  0.978666  2179.000000
accuracy       0.958223  0.958223  0.958223     0.958223
macro avg      0.479112  0.500000  0.489333  2274.000000
weighted avg   0.918192  0.958223  0.937781  2274.000000


  _warn_prf(average, modifier, msg_start, len(result))
