In [3]:
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers import models, losses
from sentence_transformers.util import cos_sim
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from glob import glob
import re
import pickle, gzip
import sys
from nltk.tokenize import sent_tokenize

sys.path.insert(0,'/home/sulcan/Documents/ipac-logbook/code/')
from mmd import *

i = '_min_max_uncased_'

data_folder = '/home/sulcan/Documents/ipac-logbook/data/data_acc/'
model_folder = f'/home/sulcan/Documents/ipac-logbook/models/simcse/{i}'
max_seq_length = 512
min_seq_length = 16
uncase = True
device = 'cuda:0'

if uncase:
    model_name = "allenai/scibert_scivocab_uncased"
else:
    model_name = "allenai/scibert_scivocab_cased"
    
folders = [f'{data_folder}/arxiv/',\
           f'{data_folder}/jacow/',\
           f'{data_folder}/books/']

files = []
for folder in folders:
    files.extend(glob(folder + '*.mmd'))

In [4]:
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device = device)

## Data Preparation

Opening mmd files, filtering equations, and chining sentences (sent_tokenize)

In [3]:
if False:
    # loading data
    print('loading data...')
    data_mmd = {}
    for file in tqdm(sorted(files)):
        with open(file, 'r') as f:
            data_mmd[file] = f.read()
    print('... data loaded')
    
    print('preparing data ...')
    # preparing equations and removing tables
    data_mmd = prepare_mmd_eqations_and_tables_for_simcse(data_mmd)
    print('... data prepared')
    
    print('chunking sentences ...')
    # chunking sentences
    train_sentences = []
    for k in tqdm(data_mmd):
        for par in data_mmd[k].split('\n\n'):
            par = re.sub('#+',' ',par)
            par = re.sub('\s+',' ', par)
            train_sentences.extend(sent_tokenize(par))
    
    with open(f'{data_folder}/simcse_prepared_data.pickle.gzip','wb') as f:
        pickle.dump({'data_mmd' : data_mmd, 'train_sentences' : train_sentences}, f)
    print('...sentences chunked.')
    
else:
    with open(f'{data_folder}/simcse_prepared_data.pickle.gzip','rb') as f:
        data = pickle.load(f)# 
        data_mmd = data['data_mmd']
        train_sentences = data['train_sentences']

Filtering too long or too short tokens (short usually don't contain anything informative)

In [4]:
train_sentences_filtered = []

for i in tqdm(range(len(train_sentences))):
    sent = train_sentences[i]
    length = len(model.tokenizer.encode(sent))
    if length < max_seq_length and length > min_seq_length:
        if uncase:
            sent = sent.lower()
            
        train_sentences_filtered.append(sent)
train_sentences = train_sentences_filtered

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7809227/7809227 [12:40<00:00, 10267.62it/s]


In [5]:
# len(train_sentences)
# import pickle
# with open('/home/sulcan/train_sentences.pickle','rb') as f:
#     train_sentences = pickle.load(f)

In [6]:
# Convert train sentences to sentence pairs
train_data = [InputExample(texts=[s, s]) for s in tqdm(train_sentences)]

# DataLoader to batch your data
train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)

# Use the denoising auto-encoder loss
train_loss = losses.MultipleNegativesRankingLoss(model)

# Call the fit method
model.fit(
    train_objectives=[(train_dataloader, train_loss)], epochs=3, show_progress_bar=True
)

# model.save("output/simcse-model")
if True:
    model.save(model_folder)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 4989936/4989936 [00:22<00:00, 224308.71it/s]


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/311871 [00:00<?, ?it/s]

Iteration:   0%|          | 0/311871 [00:00<?, ?it/s]

Iteration:   0%|          | 0/311871 [00:00<?, ?it/s]

### Testing / Evaluation

In [12]:
word_embedding_model = models.Transformer(model_folder, max_seq_length=128)
ooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model1 = SentenceTransformer(modules=[word_embedding_model, pooling_model], device = 'cuda:0')

In [13]:
from sentence_transformers import SentenceTransformer
# model2 = SentenceTransformer("all-MiniLM-L6-v2")
model2 = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

In [33]:
def _eval(model, sents):
    e = model.encode(sents)
    return cos_sim(e,e)

In [16]:
sentences = ['ouch, I have a cavity in my tooth', 'superconducting cavity', 'cavity detuned']
print('ours')
print(_eval(model1,sentences))
print('their')
print(_eval(model2,sentences))

ours
tensor([[1.0000, 0.1714, 0.1250],
        [0.1714, 1.0000, 0.6175],
        [0.1250, 0.6175, 1.0000]])
their
tensor([[1.0000, 0.4815, 0.7118],
        [0.4815, 1.0000, 0.6446],
        [0.7118, 0.6446, 1.0000]])


In [22]:
sentences = ['radiation', 'synchroton', 'problem']
print('ours')
print(_eval(model1,sentences))
print('their')
print(_eval(model2,sentences))

ours
tensor([[1.0000, 0.2472, 0.1704],
        [0.2472, 1.0000, 0.1264],
        [0.1704, 0.1264, 1.0000]])
their
tensor([[1.0000, 0.5968, 0.5626],
        [0.5968, 1.0000, 0.5414],
        [0.5626, 0.5414, 1.0000]])


In [24]:
sentences = [
    'Accelerating metal tube making electrons running faster?',
    'Superconducting cavity',
    'RF Cavity',
    'Accelerator',
    'Beam Tube']

print('ours')
print(_eval(model1,sentences))
print('their')
print(_eval(model2,sentences))

ours
tensor([[1.0000, 0.3202, 0.1785, 0.1457, 0.1953],
        [0.3202, 1.0000, 0.4263, 0.1973, 0.3299],
        [0.1785, 0.4263, 1.0000, 0.3675, 0.4771],
        [0.1457, 0.1973, 0.3675, 1.0000, 0.4892],
        [0.1953, 0.3299, 0.4771, 0.4892, 1.0000]])
their
tensor([[1.0000, 0.6057, 0.5634, 0.6982, 0.6396],
        [0.6057, 1.0000, 0.8295, 0.5831, 0.6187],
        [0.5634, 0.8295, 1.0000, 0.5835, 0.6225],
        [0.6982, 0.5831, 0.5835, 1.0000, 0.6210],
        [0.6396, 0.6187, 0.6225, 0.6210, 1.0000]])


In [37]:
sentences = [
    'BESSY',
    'DESY',
    'European XFEL',
    'PETRA'
    'HZB',
    'synchroton',
    'linac']
print('ours')
print(_eval(model1,sentences))
print('their')
print(_eval(model2,sentences))

ours
tensor([[1.0000, 0.8607, 0.4715, 0.4525, 0.3635, 0.3478],
        [0.8607, 1.0000, 0.3727, 0.3972, 0.3129, 0.3295],
        [0.4715, 0.3727, 1.0000, 0.3588, 0.2108, 0.1905],
        [0.4525, 0.3972, 0.3588, 1.0000, 0.1495, 0.2868],
        [0.3635, 0.3129, 0.2108, 0.1495, 1.0000, 0.2724],
        [0.3478, 0.3295, 0.1905, 0.2868, 0.2724, 1.0000]])
their
tensor([[1.0000, 0.6652, 0.3914, 0.4542, 0.4560, 0.4527],
        [0.6652, 1.0000, 0.4791, 0.5385, 0.4785, 0.5301],
        [0.3914, 0.4791, 1.0000, 0.5250, 0.5296, 0.6150],
        [0.4542, 0.5385, 0.5250, 1.0000, 0.5355, 0.5744],
        [0.4560, 0.4785, 0.5296, 0.5355, 1.0000, 0.6121],
        [0.4527, 0.5301, 0.6150, 0.5744, 0.6121, 1.0000]])
