In [1]:
import sys
import os
sys.path.append("CompuBERT")

In [2]:
# First you have to install the modified version of sentence-transformers
# You can find this version if you git clone the CompuBERT Repo
# https://github.com/MIR-MU/CompuBERT.git
# All credit goes to MIR-MU team that developed CompuBERT

# Next, install these packages (CompuBERT can optionally use TensorBoard)
# !pip install lxml annoy pathos gensim==3.4.0
# !pip install git+https://github.com/hbldh/xmlr

# This package also relies on a custom package arqmath_eval
# Download the package via git below
# !pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval

# Uncomment these lines below to download the modified package
# !pip install sentence-transformers

# RESTART KERNEL AFTER INSTALL

In [3]:
import torch
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
from sentence_transformers import SentenceTransformer
from sentence_transformers import models, losses, SentencesDataset
from ARQMathCode.post_reader_record import DataReaderRecord
from scipy.stats import zscore
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
from question_answer.utils import examples_from_questions_tup
from question_answer.utils import dataloader_from_examples
from sklearn.model_selection import train_test_split
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from torch.utils.data import DataLoader
from preproc.question_answer.blank_substituer import BlankSubstituer
import pickle
from sentence_transformers.datasets import SentenceLabelDataset
from ArqmathEvaluator import ArqmathEvaluator
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import MPNetModel, AutoModel, MPNetTokenizerFast
from transformers import AutoModelForQuestionAnswering, TrainingArguments
from transformers import DefaultDataCollator, DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer
from transformers import AdamW
import evaluate
from tqdm.notebook import tqdm_notebook as tqdm
from datasets import load_metric
from pytorch_optimizer import Nero, Adan

### Process Data Records from XML Dataset

In [4]:
# ARQMath provides a function to read the XML files
# This code runs the provided DataReaderRecord
# Data from XML is stored in a DataReaderRecord object
# Data is automatically loaded based on what files are found in the folder
# Please use the CompuBERT augmented code for the ARQMath dataset
# Found in their github repo: https://github.com/MIR-MU/CompuBERT/

# # UNCOMMENT THIS CODE TO RUN
# data_path = "./data"
# data_records = DataReaderRecord(data_path)

In [5]:
# data_records.post_parser.map_questions[5].body

### Process Posts into Q+A Pairs and Apply Stemming/Pre-Tokenizing

In [6]:
"""
Process Description: 
The Blank Substituter removes the <math> and other html tags from the text
The postprocessor is found in the CompuBERT module.

This function call splits the data_records and preprocesses the text.
It also creates a label or weight for each q+a pair.
The weight is calculated via a normalized score based on the number of up
and down votes the answer received.

NOTE: One change was made to the post processor, the labels had to be
coerced into a float 
"""

# # UNCOMMENT THIS CODE TO RUN
# postprocessor = BlankSubstituer()
# postproc_parser = postprocessor.process_parser(data_records.post_parser)

'\nProcess Description: \nThe Blank Substituter removes the <math> and other html tags from the text\nThe postprocessor is found in the CompuBERT module.\n\nThis function call splits the data_records and preprocesses the text.\nIt also creates a label or weight for each q+a pair.\nThe weight is calculated via a normalized score based on the number of up\nand down votes the answer received.\n\nNOTE: One change was made to the post processor, the labels had to be\ncoerced into a float \n'

### Load pre-saved Data (Pickled)

Simply for ease of use.

In [7]:
# with open('postprocessor.pickle', 'wb') as handle:
#     _=pickle.dump(postproc_parser, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# with open('postprocessor.pickle', 'rb') as handle:
#     postproc_parser = pickle.load(handle)

In [9]:
# This places all the q+a posts into a tuple of (question-text, answer-text)
# This allows the encoder to vectorize and relate the q+a data samples
# all_data = list(examples_from_questions_tup(postproc_parser.map_questions.items()))

In [10]:
# with open("all_data.pickle", "wb") as handle:
#     _=pickle.dump(all_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# Load the pre-computed (question, answer) examples from disk.
# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only load an "all_data.pickle" that you produced yourself with the cells above.
with open("all_data.pickle", "rb") as handle:
    all_data = pickle.load(handle)
dataset_size = len(all_data)
# Fail fast with a readable message instead of an obscure error in the split below.
assert dataset_size > 0, "all_data.pickle contains no examples"

### Split Data into Dev and Train set

In [12]:
# Split the examples into a train set (80%) and a dev set (the remainder).
# The fixed random_state keeps the shuffled split reproducible across runs.
X_train, X_dev = train_test_split(
    all_data,
    train_size=0.8,
    shuffle=True,
    random_state=22,
)

### Load Model from HuggingFace

In [13]:
# # CompuBERT Base Model
# device = "cuda"
# model = SentenceTransformer("bert-base-wikipedia-sections-mean-tokens", 
#                             device="cuda")

In [14]:
# This is a new updated model that is said to perform better than
# the base model above
# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook does not crash in `model.to(device)` on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
# model_name = "distilbert-base-uncased"
# tokenizer_name = "AnReu/albert-for-arqmath-3"
tokenizer_name = "AnReu/math_pretrained_bert"
model_name = "AnReu/math_pretrained_bert"

# # This model was used in a more recent paper for ARQMath that had a much better
# # performance. This model is pretrained and I wanted to work on training my own
# # model
# model = AutoModelForSequenceClassification.from_pretrained("AnReu/albert-for-arqmath-3")

# # Tokenizer: Tokenizers are a great way to preprocess data
# # In reference to the paper above, a tokenizer was created for the ARQmath
# # dataset. I use that tokenizer below here in some of my later testing
# math_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Load the tokenizer and a sequence-classification model with a 2-class head.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Keep the embedding matrix the same size as the tokenizer's vocabulary.
model.resize_token_embeddings(len(tokenizer))
_ = model.to(device)

Some weights of the model checkpoint at AnReu/math_pretrained_bert were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not

### Create a PyTorch Dataset

In [15]:
def preprocess_data(question, answer, max_length=512, encoder=None):
    """Tokenize a question/answer pair as one fixed-length sequence pair.

    Args:
        question: Question text, passed to the encoder as the first sequence.
        answer: Answer text, passed to the encoder as the second sequence.
        max_length: Length every encoding is padded or truncated to.
            Defaults to 512, the value this function previously hard-coded.
        encoder: Callable accepting ``(question, answer, padding=...,
            truncation=..., max_length=...)``. Defaults to the notebook-level
            ``tokenizer``; passing one explicitly removes the dependency on
            that global (useful for experiments and tests).

    Returns:
        Whatever ``encoder`` returns for the pair.
    """
    if encoder is None:
        # Resolved at call time so re-running the tokenizer cell is picked up.
        encoder = tokenizer
    return encoder(question, answer,
                   padding='max_length',
                   truncation=True,
                   max_length=max_length)

class MathIRDataset(torch.utils.data.Dataset):
    """In-memory dataset of tokenized question/answer pairs with class labels.

    Every item is a dict holding ``input_ids``, ``token_type_ids``,
    ``attention_mask`` and ``labels`` as ``torch.long`` tensors.
    """

    # Fields copied from each encoder output into the stored item, in order.
    _FEATURE_KEYS = ("input_ids", "token_type_ids", "attention_mask")

    def __init__(self, dataset, label_threshold=0.5, encode_fn=None):
        """Tokenize every example once and keep the tensors in memory.

        Args:
            dataset: Iterable of examples exposing ``texts`` (question at
                index 0, answer at index 1) and ``label`` (anything
                ``float()`` accepts).
            label_threshold: Scores greater than or equal to this value become
                class 1, everything else class 0. Previously the float score
                was cast straight to ``torch.long``, which truncates toward
                zero: a score of 0.9 became class 0 and a negative score such
                as -1.5 became the invalid class id -1 for a 2-label head.
            encode_fn: Callable ``(question, answer) -> mapping`` providing the
                keys in ``_FEATURE_KEYS``. Defaults to ``preprocess_data``.
        """
        if encode_fn is None:
            encode_fn = preprocess_data

        self.encodings = []
        for example in dataset:
            features = encode_fn(example.texts[0], example.texts[1])
            item = {
                key: torch.tensor(features[key], dtype=torch.long)
                for key in self._FEATURE_KEYS
            }
            # Binarize explicitly so every label is a valid class index (0 or 1).
            is_positive = float(example.label) >= label_threshold
            item["labels"] = torch.tensor(int(is_positive), dtype=torch.long)
            self.encodings.append(item)

    def __getitem__(self, idx):
        """Return the pre-computed tensor dict stored at position ``idx``."""
        return self.encodings[idx]

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.encodings)

In [16]:
# Subsetting the data
# Only a leading slice of each split is used; change the slice bounds
# (or drop the slice entirely) to train/evaluate on more examples.
X_train_set = X_train[:30000]
X_dev_set = X_dev[:10000]

In [17]:
# Tokenize both subsets up front: train first, then dev.
train_dataset, val_dataset = map(MathIRDataset, (X_train_set, X_dev_set))

# train_loader = DataLoader(train_dataset, batch_size=4, shuffle=False)
# val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

In [18]:
# Drop the names of the raw example lists; only the subsets and the
# tokenized datasets are used from here on.
# Re-running this cell without re-running the load/split cells raises NameError.
del all_data, X_train, X_dev

### Initialize the Model Parameters

In [20]:
# Trying out different Optimizers
# optim = Adan(model.parameters(), lr=5e-7)
# optim_sched = torch.optim.lr_scheduler.ExponentialLR(optim, 0.9)


# Bundle the three metrics so a single compute() call reports all of them.
metrics = evaluate.combine(["accuracy", "precision", "f1"])


def compute_metrics(eval_pred):
    """Turn raw evaluation output into the combined metric scores.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metrics.compute`` for the arg-max class of each row
        of ``logits`` against ``labels``.
    """
    logits, labels = eval_pred
    # The highest-scoring entry along the last axis is the predicted class.
    predicted_classes = np.asarray(logits).argmax(axis=-1)
    return metrics.compute(references=labels, predictions=predicted_classes)


# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Put the model in training mode.
_ = model.train()

### Initialize Training Parameters and Fine-Tune (Train) Model

In [21]:
# Run index; it only appears in the output directory name below.
num = 1
num_epochs = 50

# Replace the "/" of the hub id so the run name stays a single directory.
model_name_path = model_name.replace("/", "_")

# NOTE(review): save_strategy="epoch" writes one checkpoint per epoch
# (50 in total with the settings below) — check available disk space.
training_args = TrainingArguments(
    output_dir=f"./Math-IR-Adamw-5e-5-{model_name_path}-{num}",
    num_train_epochs=num_epochs,
    learning_rate=5e-5,
    weight_decay=0.1,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    seed=22,
    # warmup_steps = 500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # optimizers=(optim, optim_sched),
)

trainer.train()

***** Running training *****
  Num examples = 30000
  Num Epochs = 50
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 250000
  Number of trainable parameters = 108696578


RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

### Old Model Training Code

In [None]:
# This is Unused code to train using the sentence-transformers module


# Create the Sentence label dataset
# Built from the training subset (X_train_set) defined earlier in the notebook.
# NOTE(review): the meaning of the positional ``2`` depends on the installed
# sentence-transformers version (presumably samples per label in recent
# releases) — TODO confirm against the CompuBERT-modified package.
train_data = SentenceLabelDataset(X_train_set, 2)

# # Create the sentences for the evaluator
# # The dataset we use for the evaluator is the dev set
# # The format for the evaluator changed since the last CompuBERT update
# # I augmented the code to account for this change in the new
# # sentence-transformers package
# sentences_1 = [tokenizer(x.texts[0]) for x in X_dev_set]
# sentences_2 = [tokenizer(x.texts[1]) for x in X_dev_set]
# scores = [float(x.label) for x in X_dev_set]

# Load the data loader for pytorch training
# train_loader = DataLoader(dataset, batch_size=10, shuffle=False)
# train_loss = losses.CosineSimilarityLoss(model=model)

# # Init the evaluator
# # The evaluator will automatically write results to a folder
# evaluator = ArqmathEvaluator(model, sentences_1, sentences_2, scores,
#                              batch_size=10,
#                              post_parser_postproc=postproc_parser,
#                              name = "ARQMathEvaluator",
#                              show_progress_bar=True,
#                              write_csv=True
#                             )

# # Run the experiment
# experiment_num = 2
# model.compile(optimizer=Adam(3e-5))
# model.fit(train_objectives=[(train_loader, train_loss)],
#           evaluator=evaluator,
#           epochs=10,
#           evaluation_steps=50,
#           output_path=f"Math-IR-System-Experiment-{experiment_num}",
#           optimizer_params={'lr': 2e-5, 'eps': 1e-6})