In [1]:
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers import models, losses
from sentence_transformers.util import cos_sim
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from glob import glob
import re
import pickle, gzip
import sys
from nltk.tokenize import sent_tokenize

sys.path.insert(0,'/home/sulcan/Documents/ipac-logbook/code/')
from mmd import *

# --- Run configuration ------------------------------------------------------
# Tag identifying this run; it becomes the last component of model_folder.
i = '_min_max_uncased_'

data_folder = '/home/sulcan/Documents/ipac-logbook/data/data_acc/'
model_folder = f'/home/sulcan/Documents/ipac-logbook/models/simcse/{i}'

# Sequence-length bounds, casing switch and the device used for training.
max_seq_length = 512
min_seq_length = 16
uncase = True
device = 'cuda:1'

# Pick the SciBERT checkpoint that matches the casing switch.
model_name = (
    "allenai/scibert_scivocab_uncased"
    if uncase
    else "allenai/scibert_scivocab_cased"
)

# One sub-directory per corpus; every *.mmd file directly inside is collected.
folders = [f'{data_folder}/{corpus}/' for corpus in ('arxiv', 'jacow', 'books')]

files = []
for folder in folders:
    files += glob(folder + '*.mmd')

In [2]:
# Assemble the sentence encoder: a transformer backbone producing token
# embeddings, followed by a pooling layer created with its default settings
# (no pooling mode is passed explicitly).
word_embedding_model = models.Transformer(
    model_name,
    max_seq_length=max_seq_length,
)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension()
)
model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model],
    device=device,
)

## Data Preparation

Opening the mmd files, filtering equations, and chunking the text into sentences (`sent_tokenize`).

In [3]:
# Set to True to rebuild the prepared corpus from the raw .mmd files;
# otherwise the previously pickled result is loaded from disk.
REBUILD_PREPARED_DATA = False

# NOTE: despite the ".gzip" suffix this file is written and read as a plain,
# uncompressed pickle. The name is kept so existing caches stay loadable.
prepared_data_path = f'{data_folder}/simcse_prepared_data.pickle.gzip'

if REBUILD_PREPARED_DATA:
    # loading data
    print('loading data...')
    data_mmd = {}
    for file in tqdm(sorted(files)):
        with open(file, 'r') as f:
            data_mmd[file] = f.read()
    print('... data loaded')

    print('preparing data ...')
    # preparing equations and removing tables
    data_mmd = prepare_mmd_eqations_and_tables_for_simcse(data_mmd)
    print('... data prepared')

    print('chunking sentences ...')
    # chunking sentences: paragraphs are separated by blank lines
    train_sentences = []
    for k in tqdm(data_mmd):
        for par in data_mmd[k].split('\n\n'):
            # Replace runs of '#' with a space, then collapse whitespace.
            # Raw strings keep '\s' a regex escape instead of an invalid
            # Python string escape.
            par = re.sub(r'#+', ' ', par)
            par = re.sub(r'\s+', ' ', par)
            train_sentences.extend(sent_tokenize(par))

    with open(prepared_data_path, 'wb') as f:
        pickle.dump({'data_mmd': data_mmd, 'train_sentences': train_sentences}, f)
    print('...sentences chunked.')

else:
    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only load caches that were produced by this notebook.
    with open(prepared_data_path, 'rb') as f:
        data = pickle.load(f)
    data_mmd = data['data_mmd']
    train_sentences = data['train_sentences']

Filtering out sentences that are too long or too short, measured in tokens (very short sentences usually don't contain anything informative).

In [4]:
# Keep only sentences whose token count, as returned by
# model.tokenizer.encode, lies strictly between min_seq_length and
# max_seq_length. Kept sentences are lower-cased when `uncase` is set.
train_sentences_filtered = []

# Iterate over the sentences directly rather than by index, so the loop does
# not rebind `i`, which holds the run tag set in the configuration cell.
for sent in tqdm(train_sentences):
    length = len(model.tokenizer.encode(sent))
    if min_seq_length < length < max_seq_length:
        if uncase:
            sent = sent.lower()

        train_sentences_filtered.append(sent)
train_sentences = train_sentences_filtered

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7809227/7809227 [12:40<00:00, 10267.62it/s]


In [5]:
# len(train_sentences)
# import pickle
# with open('/home/sulcan/train_sentences.pickle','rb') as f:
#     train_sentences = pickle.load(f)

In [6]:
# Training hyper-parameters.
BATCH_SIZE = 16
NUM_EPOCHS = 3

# Pair every sentence with itself: each training example holds the same text
# twice.
train_data = [InputExample(texts=[s, s]) for s in tqdm(train_sentences)]

# DataLoader to batch and shuffle the pairs
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

# MultipleNegativesRankingLoss is the loss used for unsupervised SimCSE in
# sentence-transformers (this is not a denoising auto-encoder loss).
train_loss = losses.MultipleNegativesRankingLoss(model)

# Call the fit method
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=NUM_EPOCHS,
    show_progress_bar=True,
)

# Persist the fine-tuned model to the configured output folder.
model.save(model_folder)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 4989936/4989936 [00:22<00:00, 224308.71it/s]


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/311871 [00:00<?, ?it/s]

Iteration:   0%|          | 0/311871 [00:00<?, ?it/s]

Iteration:   0%|          | 0/311871 [00:00<?, ?it/s]

### Testing / Evaluation

In [8]:
# Rebuild the encoder from the weights stored in model_folder for evaluation.
# NOTE(review): this uses max_seq_length=128 and 'cuda:0', whereas the
# configuration cell sets max_seq_length=512 and device='cuda:1' — confirm
# the difference is intentional.
word_embedding_model = models.Transformer(
    model_folder,
    max_seq_length=128,
)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension()
)
model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model],
    device='cuda:0',
)

In [9]:
# A second, pretrained model loaded by name; it is passed to _eval further
# down as a point of comparison.
from sentence_transformers import SentenceTransformer

model2 = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [10]:
def _eval(model, sentences):
    """Return the pairwise cosine similarities between the given sentences.

    Args:
        model: object exposing ``encode(sentences)``, e.g. a
            SentenceTransformer.
        sentences: the sentences to embed and compare with one another.

    Returns:
        The result of ``cos_sim`` applied to the embeddings against
        themselves.
    """
    embeddings = model.encode(sentences)
    similarity = cos_sim(embeddings, embeddings)
    return similarity

In [28]:
# Compare how similar model2 rates the dental and the technical uses of
# the word "cavity".
cavity_sentences = [
    'ouch, I have a cavity in my tooth',
    'superconducting cavity',
    'cavity detuned',
]
print(_eval(model2, cavity_sentences))

tensor([[1.0000, 0.3864, 0.6862],
        [0.3864, 1.0000, 0.4369],
        [0.6862, 0.4369, 1.0000]])
