In [1]:
import sys
import os
# Make the cloned CompuBERT checkout importable (path is relative to the
# notebook's working directory). Guarded so that re-running this cell does
# not keep appending duplicate entries to sys.path.
if "CompuBERT" not in sys.path:
    sys.path.append("CompuBERT")

In [2]:
# First you have to install the modified version of sentence-transformers
# You can find this version if you git clone the CompuBERT Repo
# https://github.com/MIR-MU/CompuBERT.git
# All credit goes to MIR-MU team that developed CompuBERT

# First you must install these packages (CompuBERT uses Tensorboard if wanted)
# !pip install lxml annoy pathos gensim==3.4.0
# !pip install git+https://github.com/hbldh/xmlr

# This package also relies on a custom package arqmath_eval
# Download the package via git below
# !pip install --force-reinstall git+https://github.com/MIR-MU/ARQMath-eval

# Uncomment these lines below to download the modified package
# !pip install sentence-transformers

# RESTART KERNEL AFTER INSTALL

In [3]:
import torch
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
from sentence_transformers import SentenceTransformer
from sentence_transformers import models, losses, SentencesDataset
from ARQMathCode.post_reader_record import DataReaderRecord
from scipy.stats import zscore
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
from question_answer.utils import examples_from_questions_tup
from question_answer.utils import dataloader_from_examples
from sklearn.model_selection import train_test_split
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from torch.utils.data import DataLoader
from preproc.question_answer.blank_substituer import BlankSubstituer
import pickle
from sentence_transformers.datasets import SentenceLabelDataset
from ArqmathEvaluator import ArqmathEvaluator

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# ARQMath provides a function to read the XML files
# This code runs the provided DataReaderRecord
# Data from XML is stored in a DataReaderRecord object
# Data is automatically loaded based on what files are found in the folder
# Please use the CompuBERT augmented code for the ARQMath dataset
# Found in their github repo: https://github.com/MIR-MU/CompuBERT/

# NOTE: this code is active (the lines below are already uncommented).
# data_path is the folder handed to DataReaderRecord; the reader object is
# kept in data_records, whose post_parser attribute is used by a later cell.
data_path = "./data"
data_records = DataReaderRecord(data_path)

reading posts


In [5]:
"""
Process Description: 
The Blank Substituter removes the <math> and other html tags from the text
The postprocessor is found in the CompuBERT module.

This function call splits the data_records and preprocesses the text.
It also creates a label or weight for each q+a pair.
The weight is calculated via a normalized score based on the number of up
and down votes the answer received.

NOTE: One change was made to the post processor, the labels had to be
coerced into a float 
"""

# NOTE: this code is active (the lines below are already uncommented).
# The progress output recorded for this cell shows the question-parsing
# pass alone taking about five minutes; the commented-out pickle cells
# that follow can be used to cache postproc_parser between sessions.
postprocessor = BlankSubstituer()
postproc_parser = postprocessor.process_parser(data_records.post_parser)

Parsing questions: Original math notation: 100%|█████████████████████████████████████████████████████████████████████████████| 1020585/1020585 [05:01<00:00, 3385.91it/s]
Parsing answers: Original math notation: 100%|██████████████████████████████████████████████████████████████████████████████| 1445495/1445495 [00:40<00:00, 35479.98it/s]
Replacing questions answers: Original math notation: 100%|███████████████████████████████████████████████████████████████████| 868130/868130 [00:01<00:00, 736033.20it/s]


In [6]:
# with open('postprocessor.pickle', 'wb') as handle:
#     _=pickle.dump(postproc_parser, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# with open('postprocessor.pickle', 'rb') as handle:
#     postproc_parser = pickle.load(handle)

In [9]:
# Materialise every question/answer example produced from the parsed
# questions into one list. Later cells read `.texts[0]` (question text),
# `.texts[1]` (answer text) and `.label` from each element.
all_data = [
    *examples_from_questions_tup(postproc_parser.map_questions.items())
]

In [8]:
# with open("all_data.pickle", "wb") as handle:
#     _=pickle.dump(all_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [23]:
# with open("all_data.pickle", "rb") as handle:
#     all_data = pickle.load(handle)
# dataset_size = len(all_data)

In [24]:
# Split the examples into a training set (90%) and a dev set (the rest).
# The fixed random_state makes the shuffled split repeatable across runs.
X_train, X_dev = train_test_split(
    all_data,
    shuffle=True,
    random_state=22,
    train_size=0.9,
)

In [12]:
# # CompuBERT Base Model
# device = "cuda"
# model = SentenceTransformer("bert-base-wikipedia-sections-mean-tokens", 
#                             device="cuda")

In [25]:
# Newer pretrained model that is said to perform better than the
# CompuBERT base model.
# Use the GPU when one is available and fall back to the CPU otherwise,
# so the notebook does not fail on machines without CUDA. The same
# `device` value is used for both the constructor and the explicit move.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1", device=device)
# .to() returns the model, so as the cell's last expression it also
# displays the module structure.
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [37]:
# Work on the first 1000 examples of each split to keep the run short;
# widen (or drop) the slices to train/evaluate on more of the data.
X_train_set = X_train[:1000]
X_dev_set = X_dev[:1000]

# Wrap the training subset in a SentenceLabelDataset.
# NOTE(review): the positional 2 is presumably samples_per_label - confirm
# against the installed sentence-transformers version.
train_data = SentenceLabelDataset(X_train_set, 2)

In [38]:
# Loss used for training: cosine similarity computed with `model`.
train_loss = losses.CosineSimilarityLoss(model=model)

# PyTorch loader that feeds the training dataset in batches of 10,
# in the order the dataset yields them (no shuffling by the loader).
train_loader = DataLoader(
    train_data,
    shuffle=False,
    batch_size=10,
)

In [39]:
# Build the evaluator inputs from the dev subset as three parallel lists:
# first texts, second texts, and the labels converted to float.
# The evaluator input format changed after the last CompuBERT update, so
# this follows the layout expected by the newer sentence-transformers
# package.
sentences_1 = [
    dev_example.texts[0] for dev_example in X_dev_set
]
sentences_2 = [
    dev_example.texts[1] for dev_example in X_dev_set
]
scores = [
    float(dev_example.label) for dev_example in X_dev_set
]

In [40]:
# Build the ARQMath evaluator over the dev sentences and their scores.
# write_csv=True asks the evaluator to persist its results
# (presumably under the training output folder - TODO confirm).
evaluator = ArqmathEvaluator(
    model,
    sentences_1,
    sentences_2,
    scores,
    name="ARQMathEvaluator",
    post_parser_postproc=postproc_parser,
    batch_size=10,
    write_csv=True,
    show_progress_bar=True,
)

In [21]:
## Removing unnecessary variables
# Drop the references to the full dataset and the full splits; the
# subsets created earlier keep their own lists.
del all_data, X_train, X_dev

In [41]:
# Run the experiment
experiment_num = 1
num_epochs = 10

# A fixed warmup of 5000 steps is longer than this whole run: the recorded
# progress output shows 95 iterations per epoch, i.e. 950 optimizer steps
# over 10 epochs, so with fit's warmup-based scheduler the learning rate
# would never reach the configured 2e-5. Cap the warmup at 10% of the total
# number of steps; for large runs the value stays at the original 5000.
total_train_steps = len(train_loader) * num_epochs
warmup_steps = min(5000, int(0.1 * total_train_steps))

model.fit(train_objectives=[(train_loader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          # NOTE(review): 1280 is larger than the 95 steps per epoch seen in
          # the output, so step-based evaluation presumably never triggers
          # on this subset - confirm evaluation still runs once per epoch.
          evaluation_steps=1280,
          warmup_steps=warmup_steps,
          output_path=f"Math-IR-System-Experiment-{experiment_num}",
          optimizer_params={'lr': 2e-5, 'eps': 1e-6})

Epoch:   0%|                                                                                                                                      | 0/10 [00:00<?, ?it/s]
Iteration:   0%|                                                                                                                                  | 0/95 [00:00<?, ?it/s][A
Iteration:   1%|█▎                                                                                                                        | 1/95 [00:00<00:44,  2.12it/s][A
Iteration:   2%|██▌                                                                                                                       | 2/95 [00:00<00:27,  3.34it/s][A
Iteration:   3%|███▊                                                                                                                      | 3/95 [00:00<00:23,  3.99it/s][A
Iteration:   4%|█████▏                                                                                                                    