# Notebook to review using language models in a classifier with non-text data
This notebook loads the relevant libraries and a review dataset (Amazon reviews), and demonstrates how to combine text, categorical data, and numeric data with a transformer model in two ways:  
- Path 1 (XGBOOST): using a transformer model to interpret the text data, as an input to an xgboost classifier with other supporting numeric or categorical information
- Path 2 (BERT): supplementing the text input with categorical or numeric information at the time of classification within the transformer model itself

In [None]:
%%capture
## capture with jupyter magic to suppress output

## install needed libraries
!pip install xgboost datasets transformers sentence_transformers nltk accelerate evaluate

In [None]:
%%capture

from transformers import pipeline
from datasets import load_dataset,ReadInstruction

## assets used to process the data: a transformer sentiment classifier
sentiment_pipe = pipeline(
    task="text-classification",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

## a small subset of a review dataset: the first 0.05% of the test split (200 reviews)
subset_instruction = ReadInstruction('test', to=0.05, unit='%')
dataset = load_dataset("amazon_polarity", split=subset_instruction)

## Path 1 (XGBOOST):
### numerical/categorical and transformer output into XGBOOST

In [None]:
import pandas as pd

# an xgboost classifier takes tabular input, so start by converting to a pandas dataframe
df_revs = pd.DataFrame(dataset)

## as a toy example, derive one numeric and one categorical feature from the review text
review_text = df_revs['content']

# numeric: how many characters are in the review? (more is better?)
df_revs['content_len'] = review_text.str.len()

# categorical: does the review have an exclamation point? (excitement can be good! or bad!)
contains_exclamation = review_text.str.contains('!')
df_revs['has_exclamation_cat'] = contains_exclamation.astype('category')

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# once the toy data are prepared, let's split our data into training and eval subsets
# (a fixed random_state keeps the split, and so the reported accuracy, reproducible)
train, test = train_test_split(df_revs, test_size=0.2, random_state=42)
feature_cols_toy = ['content_len', 'has_exclamation_cat']
dtrain = xgb.DMatrix(train[feature_cols_toy], label=train['label'], enable_categorical=True)
dtest = xgb.DMatrix(test[feature_cols_toy], label=test['label'], enable_categorical=True)

# set a few xgboost params
# 'binary:logistic' makes predict() return the probability of label 1,
# rather than an unbounded squared-error regression value
param = {'max_depth': 2, 'eta': 0.3, 'objective': 'binary:logistic'}

# and train our toy model (10 boosting rounds)
model = xgb.train(param, dtrain, 10)

# in evaluation this toy model typically does poorly (accuracy between 45-65% across splits)
y_pred = model.predict(dtest)
# threshold the probabilities at 0.5 so every prediction is exactly 0 or 1
predictions = (y_pred > 0.5).astype(int)
accuracy = accuracy_score(test['label'], predictions)
print("Accuracy: ")
print(" %.2f%%" % (accuracy * 100.0))

Accuracy: 
 65.00%


In [None]:
## to improve, let's add additional variables as calculated from text data
# below we use a transformer-based sentiment analysis model, trained on reviews
# (output: the numeric variable bert_score and the categorical variable bert_stars)
# however converting text to numeric data can also include vector representations of each review (e.g. from bert / mpnet / word2vec embeddings, tf-idf vectors)
# or could be through categorization of the data otherwise (e.g. topic id as a categorical variable from a topic model)

# calculate bert-based sentiment scores
# takes about 2.5min to process 200 records on a standard colab notebook
# truncation=True clips any review that tokenizes to more than the model's maximum
# sequence length; without it such a review makes the pipeline call fail
x = sentiment_pipe(dataset['content'], batch_size=8, truncation=True)

### then - use the transformer-based sentiment output to update our training and evaluation data
# each element of x is a dict with a 'label' and a 'score' entry for one review;
# building the Series on df_revs.index keeps rows aligned even if the index is not 0..n-1
df_revs['bert_stars'] = pd.Series([review['label'] for review in x], index=df_revs.index).astype('category')
df_revs['bert_score'] = pd.Series([review['score'] for review in x], index=df_revs.index).astype('float')

In [None]:
## finally - retrain a model with our initial variables and sentiment
# include the bert sentiment outputs (bert_score, bert_stars) in the training data
# (a fixed random_state keeps the split, and so the reported accuracy, reproducible)
train, test = train_test_split(df_revs, test_size=0.2, random_state=42)
feature_cols_bert = ['content_len', 'has_exclamation_cat', 'bert_score', 'bert_stars']
dtrain = xgb.DMatrix(train[feature_cols_bert], label=train['label'], enable_categorical=True)
dtest = xgb.DMatrix(test[feature_cols_bert], label=test['label'], enable_categorical=True)

# train a new model (same params as the toy model, 10 boosting rounds)
model = xgb.train(param, dtrain, 10)

# evaluate the output, in terms of accuracy of predicting the input labels
y_pred = model.predict(dtest)
# threshold at 0.5: unlike round(), this can never yield a class outside {0, 1}
predictions = (y_pred > 0.5).astype(int)
accuracy = accuracy_score(test['label'], predictions)
print("Accuracy: ")
print("  %.2f%%" % (accuracy * 100.0)) #averages between 85-90% in runs

Accuracy: 
  87.50%


## Path 2 (BERT):
### everything (pre-trained transformer's embedding + numerical/categorical data) as an input into last layer of a transformer-based classifier

In [None]:
import torch
from torch import nn
from transformers import AutoConfig, BertModel, BertForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput
from typing import Optional, Union, Tuple

### Create a custom model that predicts labels based on a final hidden state which
### combines the initial model's embedding plus additional numeric/categorical data

# A custom classification head (last layer)
# is required to match the dimensionality
# of the final representation
# combining both text embedding and additional data
# and defined as input num_extra_dims

class ClassificationHead(nn.Module):
    """Classification head for sentence-level tasks on an extended feature vector.

    The input width is ``config.hidden_size + num_extra_dims``: the text
    embedding followed by the additional numeric/categorical features.
    """

    def __init__(self, config, num_extra_dims):
        super().__init__()
        # width of the concatenated [text embedding | extra features] vector
        combined_width = config.hidden_size + num_extra_dims
        self.dense = nn.Linear(combined_width, combined_width)

        # use the classifier-specific dropout when configured, else the hidden dropout
        dropout_prob = config.classifier_dropout
        if dropout_prob is None:
            dropout_prob = config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_prob)

        self.out_proj = nn.Linear(combined_width, config.num_labels)

    def forward(self, features, **kwargs):
        """Return logits for ``features``: dropout -> dense -> tanh -> dropout -> projection."""
        hidden = torch.tanh(self.dense(self.dropout(features)))
        return self.out_proj(self.dropout(hidden))

# A custom model class applying
# the custom classification head (with a size to accept our expanded input),
# whose extra width is user-specified through the input num_extra_dims

class CustomSequenceClassification(BertForSequenceClassification):
    """BERT sequence classifier whose head also receives extra per-example features.

    The embedding of the first token of the encoder output is concatenated with
    ``extra_data`` and the combined vector is classified by ``ClassificationHead``.

    Args:
        config: model configuration; ``num_labels`` is read here, and the
            classification head reads ``hidden_size`` and the dropout settings.
        num_extra_dims: number of extra feature columns appended to each
            text embedding.
    """

    def __init__(self, config, num_extra_dims):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        # This needs to be renamed based on base model used
        # e.g. BertModel, RobertaModel, etc
        self.bert =  BertModel(config)

        # and the classifier set to use the ClassificationHead as specified above
        self.classifier = ClassificationHead(config, num_extra_dims)
        self.post_init()

    # note addition of extra_data
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        extra_data: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        extra_data (`torch.FloatTensor` of shape `(batch_size, num_extra_dims)`):
            Additional numeric/categorical features concatenated to the text embedding before classification.
            Although the signature gives it a default of `None`, it is required: a `ValueError` is raised when it
            is missing.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # fail fast with a clear message instead of an opaque error from torch.cat below
        if extra_data is None:
            raise ValueError(
                "extra_data is required: pass a tensor of shape (batch_size, num_extra_dims)"
            )

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # sequence_output will be (batch_size, seq_length, hidden_size)
        sequence_output = outputs[0]

        # embedding of the first token of every sequence: (batch_size, hidden_size)
        cls_embedding = sequence_output[:, 0, :]

        # additional data should be (batch_size, num_extra_dims);
        # align dtype and device with the embedding so the concatenation is well defined
        extra_data = extra_data.to(device=cls_embedding.device, dtype=cls_embedding.dtype)

        # note the addition of extra_data concatenated to the sentence embedding
        combined = torch.cat((cls_embedding, extra_data), dim=-1)

        logits = self.classifier(combined)

        loss = None
        if labels is not None:
            # infer the problem type once from num_labels and the label dtype
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # tuple output: (loss?, logits, *encoder outputs from index 2 onward)
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [None]:
from transformers import AutoTokenizer

## reuse the pretrained checkpoint from the sentiment pipeline above: load its tokenizer,
## and load its weights into the custom classifier class specified in the above cell
checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
new_model = CustomSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    num_extra_dims=2,
)

In [None]:
import numpy as np
from datasets import Dataset

# let's make a customized dataset based on the inputs to xgboost

# first standardize the numeric variable,
# encode the (binary) categorical variable as 0/1,
# and bring both together in one np.array of shape (n_reviews, 2)
# (note: the mean/std are computed over all rows, i.e. before the train/test split)

content_len = df_revs.content_len.values
extra_numeric = (content_len - np.mean(content_len)) / np.std(content_len)

extra_cat = df_revs.has_exclamation_cat.astype(int).values

extra_data = np.stack((extra_numeric,
                       extra_cat)).T

# load this dataset (text, plus 'extra data') from a dict and tokenize with our bert tokenizer
ds = Dataset.from_dict({"text": df_revs.content, "extra_data": extra_data, "labels": df_revs.label})
# truncation=True clips reviews that tokenize to more than the model's maximum
# sequence length, which the model cannot process
tokenized_ds = ds.map(lambda x: tokenizer(x["text"], truncation=True))

# hold out 20% for validation; this is a new 80/20 split (not the xgboost one),
# seeded so that the same rows are held out on every run
ds_train_test = tokenized_ds.train_test_split(test_size=0.2, seed=42)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from a (logits, labels) pair.

    The predicted class of each example is the argmax over the last axis
    of the logits; the result is whatever ``metric.compute`` returns.
    """
    raw_scores, gold_labels = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_classes)

# arguments for the trainer: where the model output is stored ("./")
# and evaluation on the held-out data at the end of every epoch
# NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
# (to `eval_strategy`) and Trainer's `tokenizer` argument - confirm against the installed version
args = TrainingArguments(
    output_dir="./",
    eval_delay=0,
    evaluation_strategy="epoch",
)

# the trainer itself, using the train/test split for training and evaluation data
trainer = Trainer(
    model=new_model,
    args=args,
    train_dataset=ds_train_test['train'],
    eval_dataset=ds_train_test['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# execute the trainer
# note with CPU alone this takes about 20m
trainer.train()

# perhaps unsurprisingly - this tends to produce similar (but better!) accuracies
# as compared to xgboost with numeric, categorical, and model output
# (this is averaging between 90-100% accuracy while xgb is ~85-95%)

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,7.3e-05,1.0
2,No log,0.270622,0.975
3,No log,0.282162,0.975


TrainOutput(global_step=60, training_loss=9.801937267184258e-05, metrics={'train_runtime': 7.4584, 'train_samples_per_second': 64.357, 'train_steps_per_second': 8.045, 'total_flos': 47249469641088.0, 'train_loss': 9.801937267184258e-05, 'epoch': 3.0})