In [1]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

import re

from newdataset import BilingualDataset, causal_mask


import warnings
from tqdm import tqdm
import os
from pathlib import Path

In [11]:

def prepro_feyn(data):

    for r in (('(', '('), (')', ')'), ('  ', ' '), (' e(', 'e(m_e,-1,' ),(' mu(', 'mu(m_mu,-1,') ,(' u(', ' u(m_u,2/3,'), (' d(', 'd(m_d,-1/3,'), (' t(', ' t(m_t,-1,') ,(' s(', 's(m_s,-1/3,'), (' tt(', ' tt(m_tt,-1,'), (' c(', 'c(m_c,2/3,'),(' b(', 'b(m_b,-1/3,'), ('Anti ', 'Anti,'), ('Off ', 'Off,'), ('  ', ' ') ):
        data = data.replace(*r)

    return data

#preprocessing for the squared amplitudes:
def prepro_squared_ampl(data):

    for r in (('*', '*'), (',', ' , '), ('*(', ' *( ') , ('([', '[ '), ('])', ' ]'), ('[', '[ '), (']', ' ]'), ('[ start ]', '[start]'), ('[ end ]', '[end]'), (' - ', ' -'), (' + ',' +' ) ,('/', ' / ') ,('  ', ' ')) :
        data = data.replace(*r)
    data = re.sub(r"\*(s_\d+\*s_\d+)", r"* \1", data)
    data = re.sub(r"\*(s_\d+\^\d+\*s_\d+)", r"* \1", data)
    data = re.sub(r"\*(m_\w+\^\d+\*s_\d+)", r"* \1", data)
    data = re.sub(r"(m_\w+\^\d+)", r" \1 ", data)
    data = data.replace('  ', ' ')


    return data

def max_len(sq_data):
    l = len(sq_data[sq_data.index(max(sq_data, key=len))].split())
    return l



with open("DATA/fey_qed_order.txt", 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
text_pairs =[]

for line in lines[: min(len(lines), len(lines)-1)]:
    intr, amp, sqamp, t  = line.split('>')
    #sqamp = "[start] " + sqamp + " [end]"
    text_pairs.append((intr, amp,sqamp, float(t) ))

text_pairs = list(set(text_pairs))
print('data size: ', len(text_pairs))

text_pairs_prep = []
for i in range(100):
    text_pairs_prep.append((text_pairs[i][0], prepro_feyn(text_pairs[i][1]),prepro_squared_ampl(text_pairs[i][2] ) , text_pairs[i][3]))

text_pairs = text_pairs_prep

feyn = [pair[1] for pair in text_pairs]
sq_ampl= [pair[2] for pair in text_pairs]

print( 'Maximum sequence length of Feynman diagram        :' ,max_len(feyn))
print( 'Maximum sequence length of squared amplitudes:' ,max_len(sq_ampl))
print('Example sqamp:' , sq_ampl[1])
string=""
for i in sq_ampl[1]:
     string+=i


data size:  257991
Maximum sequence length of Feynman diagram        : 27
Maximum sequence length of squared amplitudes: 137
Example sqamp: [ m_e^6 , m_e^4 , m_e^2 , 1 ] , [ -32 , 8 *( 4*s_13 +3*s_14 +2*s_25 -4*s_34) , -8 *( s_11*s_14 -2* s_11*s_34 +3* s_12*s_45 +2* s_13*s_25 +2* s_13*s_34 +3* s_15*s_24 -4* s_23*s_45 -4* s_24*s_35) , 8 *( s_11* s_12*s_45 +s_11* s_15*s_24 -2* s_11*s_23*s_45 -2* s_11*s_24*s_35 +2* s_13*s_23*s_45 +2* s_13*s_24*s_35) ] , [ ( m_e^2 -s_11 +2*s_13)^2 *( s_22 -2*s_25 +s_55)^2 ]


str

In [None]:
config={
        "batch_size": 8,
        "num_epochs": 20,
        "lr": 10**-4,
        "seq_len": 350,
        "d_model": 512,
        "datasource": './DATA/fey_qed_order.txt',
        "lang_src": 1,
        "lang_tgt": 2,
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": "latest",
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel"
    }
ds_raw =text_pairs

In [48]:
def get_all_sentences(ds, lang):
    #for item in ds:
    #   yield item[lang]
    for text  in ds:
        for token in text[lang]:
            yield token.split()


def get_or_build_tokenizer(config, ds, lang):
    # giving a language it creates the  config['tokenizer_file']='../tokenizers/tokenizer_{0}.json'
    tokenizer_path = Path(config['tokenizer_file'].format(lang))
    if not Path.exists(tokenizer_path):
        # Most code taken from: https://huggingface.co/docs/tokenizers/quicktour
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
        tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
        tokenizer.save(str(tokenizer_path))
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    return tokenizer



tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])
tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config['lang_tgt'])

In [50]:
print('Vocab size source: ',tokenizer_src.get_vocab_size())
print('Vocab size target: ',tokenizer_tgt.get_vocab())

Vocab size source:  33
Vocab size target:  {'b': 28, '/': 32, 's': 5, '3': 11, '5': 10, '[EOS]': 3, '4': 8, '-': 12, '[': 19, 'c': 29, ']': 20, '[SOS]': 2, '(': 17, '8': 23, '*': 6, '1': 9, 't': 21, 'e': 26, 'd': 27, '2': 7, '6': 22, '^': 14, 'm': 15, '7': 30, '[PAD]': 1, '0': 31, '9': 25, '_': 4, '+': 13, ',': 16, ')': 18, 'u': 24, '[UNK]': 0}


In [53]:
import torchtext.datasets as datasets
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import LambdaLR
import numpy as np

train_ds_size = int(0.9 * len(ds_raw))
val_ds_size = len(ds_raw) - train_ds_size
train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])

In [56]:
train_ds.ds

<torch.utils.data.dataset.Subset at 0x22ef1df2490>

2