In [27]:
# Load the decay file as a list of lines; each entry keeps its trailing newline.
with open("DECAY_CLASSIC.DEC") as dec_file:
    txt = list(dec_file)

Useful resources:
* https://tomassetti.me/parsing-in-python/
* https://github.com/lark-parser/lark/blob/master/docs/lark_cheatsheet.pdf
* https://github.com/lark-parser/lark/blob/9f666a74595ff8ac0f550abbb687a517fe5d495d/lark/grammars/common.lark
* https://gitlab.cern.ch/lhcb-datapkg/Gen/DecFiles/tree/master/dkfiles
* https://github.com/PMunkes/evtgen/blob/master/DECAY.DEC


In [90]:
# Build a small test input: the first occurrence of every distinct leading
# token among the first 222 lines, skipping blank and comment-only lines.
seen = set()
sample_lines = []
for lineno, raw in enumerate(txt[:222]):
    stripped = raw.strip()
    if not stripped or stripped[0] == "#":
        continue
    keyword = stripped.split()[0]
    if keyword in seen:
        continue
    # Echo the 0-based index so the sample can be located in the file.
    print(lineno, stripped)
    seen.add(keyword)
    sample_lines.append(stripped + "\n")
testlines = "".join(sample_lines)

4 Define qoverp_incohMix_B_s0 1.0
51 Alias K*L       K*0
62 ChargeConj K*L    K*BL
187 yesPhotos
195 Decay Upsilon(4S)
196 0.515122645 B+      B-                                      VSS; #[Reconstructed PDG2011]
197 0.483122645 B0      anti-B0                                 VSS_BMIX dm; #[Reconstructed PDG2011]
198 0.000015583 e+      e-                                      PHOTOS  VLL; #[Reconstructed PDG2011]
199 0.000015766 mu+     mu-                                     PHOTOS  VLL; #[Reconstructed PDG2011]
201 0.000084099 Upsilon(2S) pi+     pi-                         VVPIPI; #[Reconstructed PDG2011]
202 0.000044342 Upsilon(2S) pi0     pi0                         VVPIPI; #[Reconstructed PDG2011]
203 0.000080123 Upsilon pi+     pi-                             VVPIPI; #[Reconstructed PDG2011]
205 0.000194392 Upsilon eta                                     PARTWAVE 0.0 0.0 1.0 0.0 0.0 0.0; #[Reconstructed PDG2011]
208 0.000092625 gamma   chi_b0(3P)                              HEL

In [145]:
def _looks_like_particle(token):
    """Heuristic: True when ``token`` looks like a particle name, not a model.

    Model names in the file are upper-case identifiers; anything with
    lower-case letters, a charge sign, a prime, a ``*`` or one of the few
    all-caps particle names listed below is treated as a particle.
    """
    return (
        token.upper() != token
        or "+" in token
        or "'" in token
        or token.endswith("-")
        or "*" in token
        or token.startswith("K_")
        or token in ("D0", "K0", "B0", "D_10", "D(2S)0")
    )


def _is_number(token):
    """True for plain signed integers/decimals such as ``1``, ``-0.5`` or ``1.``."""
    # Turning the first "." into a digit lets str.isdigit() accept decimals.
    return token.replace(".", "1", 1).lstrip("-").lstrip("+").isdigit()


# Survey which models occur in the file and what their argument lists look like.
# `examples` holds "MODEL arg arg ..." patterns with numeric arguments collapsed
# to the word "number"; `models` holds just the leading model token.
examples = set()
models = set()
for raw_line in txt:
    statement = raw_line.split("#")[0].strip()  # drop trailing comment
    if ";" not in statement:
        continue  # only ';'-terminated lines are decay lines

    # Drop the leading branching fraction, then the daughter particles.
    tokens = statement.strip(";").split()[1:]
    while tokens and _looks_like_particle(tokens[0]):
        del tokens[0]
    if tokens and tokens[0] == "PHOTOS":
        del tokens[0]
    if not tokens:
        # Nothing left that could be a model name; previously this raised
        # IndexError and aborted the whole survey.
        continue

    tokens = ["number" if _is_number(tok) else tok for tok in tokens]
    examples.add(" ".join(tokens))
    models.add(tokens[0])

In [146]:
# Display the set of model/argument patterns gathered from the decay lines.
examples

{'BTO3PI_CP dm alpha',
 'BTOSLLALI',
 'BTOSLLBALL',
 'BTOXSGAMMA number',
 'BTOXSLL number number number number',
 'CB3PI-MPP alpha',
 'CB3PI-P00 alpha',
 'D_DALITZ',
 'ETA_DALITZ',
 'GOITY_ROBERTS',
 'HELAMP number number number number',
 'HELAMP number number number number number number number number',
 'HELAMP number number number number number number number number number number number number',
 'HQET number number number',
 'ISGW2',
 'OMEGA_DALITZ',
 'PARTWAVE number number number number number number',
 'PHSP',
 'PI0_DALITZ',
 'PYTHIA number',
 'SLN',
 'STS',
 'SVP_HELAMP number number number number',
 'SVS',
 'SVV_HELAMP PKHminus PKphHminus PKHzero PKphHzero PKHplus PKphHplus',
 'SVV_HELAMP PKHplus PKphHplus PKHzero PKphHzero PKHminus PKphHminus',
 'SVV_HELAMP number number number number number number',
 'TAUHADNU number number number number number',
 'TAUHADNU number number number number number number number',
 'TAULNUNU',
 'TAUSCALARNU',
 'TAUVECTORNU',
 'TSS',
 'TVS_PWAVE numb

In [217]:
from lark import Lark

In [218]:
# Print every model name as a quoted literal followed by "|", in sorted order
# and without a trailing newline, so the text can be pasted into a grammar.
print("".join(f'"{name}"|' for name in sorted(models)), end="")

"BTO3PI_CP"|"BTOSLLALI"|"BTOSLLBALL"|"BTOXSGAMMA"|"BTOXSLL"|"CB3PI-MPP"|"CB3PI-P00"|"D_DALITZ"|"ETA_DALITZ"|"GOITY_ROBERTS"|"HELAMP"|"HQET"|"ISGW2"|"OMEGA_DALITZ"|"PARTWAVE"|"PHSP"|"PI0_DALITZ"|"PYTHIA"|"SLN"|"STS"|"SVP_HELAMP"|"SVS"|"SVV_HELAMP"|"TAUHADNU"|"TAULNUNU"|"TAUSCALARNU"|"TAUVECTORNU"|"TSS"|"TVS_PWAVE"|"VLL"|"VSP_PWAVE"|"VSS"|"VSS_BMIX"|"VUB"|"VVP"|"VVPIPI"|"VVS_PWAVE"|

In [528]:
# LALR parser for EvtGen-style .DEC decay files.
# Top-level statements: Define / Alias / ChargeConj / yesPhotos / noPhotos /
# Decay ... Enddecay / CDecay / SetLineshapePW, optionally followed by "End".
# Comments (#...) and inline whitespace are ignored; newlines are significant.
l = Lark(
    r"""
start : (line | _NEWLINE)+ (_END _NEWLINE*)?
line : (define | alias | chargeconj | commands | decay | cdecay | setlspw) _NEWLINE

_END : "End"

setlspw : "SetLineshapePW" label label label SIGNED_NUMBER

cdecay : "CDecay" label

define : "Define" label SIGNED_NUMBER

alias : "Alias" label label

chargeconj : "ChargeConj" label label

commands : yesphotos | nophotos
yesphotos : "yesPhotos"
nophotos : "noPhotos"

decay : "Decay" particle _NEWLINE (decayline | _NEWLINE )+ "Enddecay"
decayline : (partdecay | wholedecay) ";" _NEWLINE
partdecay : SIGNED_NUMBER particle+ photos? model
wholedecay : SIGNED_NUMBER photos? model
photos : "PHOTOS"

label : LABEL
particle : LABEL // Add full particle parsing here


model : model_generic
model_generic : MODEL_NAME model_options?
model_options : (SIGNED_NUMBER | LABEL)+

// model_helamp : "HELAMP" (SIGNED_NUMBER SIGNED_NUMBER)+


// Terminal definitions
// To use a fast parser, we need to avoid conflicts

%import common.WS_INLINE
%import common.SIGNED_NUMBER
%import common.NEWLINE -> _NEWLINE  // _ means filter this out of the tree

// We must set priorities here to use lalr - match model name above label, and label above something else
// A name that is a prefix of another must come AFTER the longer one
// (VSS_BMIX before VSS, VVPIPI before VVP): a regex alternation takes the
// first alternative that matches, so "VSS" listed first would split
// "VSS_BMIX" into MODEL_NAME "VSS" plus LABEL "_BMIX".
MODEL_NAME.2 : "BTO3PI_CP"|"BTOSLLALI"|"BTOSLLBALL"|"BTOXSGAMMA"|"BTOXSLL"|"CB3PI-MPP"|"CB3PI-P00"|"D_DALITZ"|"ETA_DALITZ"|"GOITY_ROBERTS"|"HELAMP"|"HQET"|"ISGW2"|"OMEGA_DALITZ"|"PARTWAVE"|"PHSP"|"PI0_DALITZ"|"PYTHIA"|"SLN"|"STS"|"SVP_HELAMP"|"SVS"|"SVV_HELAMP"|"TAUHADNU"|"TAULNUNU"|"TAUSCALARNU"|"TAUVECTORNU"|"TSS"|"TVS_PWAVE"|"VLL"|"VSP_PWAVE"|"VSS_BMIX"|"VSS"|"VUB"|"VVPIPI"|"VVP"|"VVS_PWAVE"
LABEL : /[a-zA-Z0-9\/\-+*_()']+/
COMMENT : /[#][^\n]*/

// We should ignore comments
%ignore COMMENT

// Disregard spaces in text
%ignore WS_INLINE
""",
    debug=True,  # ask lark for extra diagnostics while building the parser
    parser="lalr",
    lexer="auto",  # let lark choose the lexer that suits the LALR parser
)

In [529]:
# Echo the sample statements, then parse them and pretty-print the tree.
print(testlines)

ll = l.parse(testlines)
print(ll.pretty())

Define qoverp_incohMix_B_s0 1.0
Alias K*L       K*0
ChargeConj K*L    K*BL
yesPhotos
Decay Upsilon(4S)
0.515122645 B+      B-                                      VSS; #[Reconstructed PDG2011]
0.483122645 B0      anti-B0                                 VSS_BMIX dm; #[Reconstructed PDG2011]
0.000015583 e+      e-                                      PHOTOS  VLL; #[Reconstructed PDG2011]
0.000015766 mu+     mu-                                     PHOTOS  VLL; #[Reconstructed PDG2011]
0.000084099 Upsilon(2S) pi+     pi-                         VVPIPI; #[Reconstructed PDG2011]
0.000044342 Upsilon(2S) pi0     pi0                         VVPIPI; #[Reconstructed PDG2011]
0.000080123 Upsilon pi+     pi-                             VVPIPI; #[Reconstructed PDG2011]
0.000194392 Upsilon eta                                     PARTWAVE 0.0 0.0 1.0 0.0 0.0 0.0; #[Reconstructed PDG2011]
0.000092625 gamma   chi_b0(3P)                              HELAMP 1. 0. 1. 0.; #[Reconstructed PDG2011]
0.00013893

In [460]:
# 1-based line numbers of every line that mentions "Enddecay".
enddecay_lines = [
    lineno for lineno, text in enumerate(txt, start=1) if "Enddecay" in text
]
print(*enddecay_lines)

193 222 1485 2702 3709 4711 4727 4730 4733 4736 4739 4742 5215 5662 5680 5697 5763 5773 5778 5782 5786 5791 5795 5942 6084 6088 6276 6281 6462 6563 6662 6669 6673 6677 6681 6696 6700 6704 6708 6713 6717 6721 6725 6742 6748 6754 6760 6768 6773 6779 6785 6789 6793 6799 6805 6813 6817 6821 6825 6831 6837 6843 6849 6865 6872 6876 6882 6888 6894 6900 6908 6911 6914 6917 6920 6923 6929 6934 6939 6943 6948 6952 6964 6976 6988 7000 7010 7020 7030 7040 7053 7066 7079 7092 7105 7114 7123 7132 7144 7153 7162 7171 7175 7179 7183 7187 7192 7196 7206 7218 7222 7225 7228 7242 7254 7278 7283 7287 7291 7301 7311 7323 7328 7332 7336 7343 7347 7351 7359 7369 7389 7396 7409 7421 7431 7437 7444 7450 7455 7461 7484 7507 7530 7553 7576 7599 7622 7645 7682 7690 7842 7940 7965 7993 8028 8086 8131 8183 8196 8200 8207 8231 8253 8278 8289 8297 8314 8332 8341 8364 8403 8407 8416 8427 8459 8472 8489 8497 8512 8531 8541 8545 8548 8551 8554 8661 8666 8669 8676 8681 8684 8688 8691 8711 8730 8734 8737 8743 8748 8752 87

In [461]:
# List every SetLineshapePW statement (comment stripped) with its 0-based index.
for index, raw in enumerate(txt):
    statement = raw.partition("#")[0].strip()
    if "SetLineshapePW" in statement:
        print(index, statement)

6683 SetLineshapePW D_1+ D*+ pi0 2
6684 SetLineshapePW D_1+ D*0 pi+ 2
6685 SetLineshapePW D_1- D*- pi0 2
6686 SetLineshapePW D_1- anti-D*0 pi- 2
6687 SetLineshapePW D_10 D*0 pi0 2
6688 SetLineshapePW D_10 D*+ pi- 2
6689 SetLineshapePW anti-D_10 anti-D*0 pi0 2
6690 SetLineshapePW anti-D_10 D*- pi+ 2
6727 SetLineshapePW D_2*+ D*+ pi0 2
6728 SetLineshapePW D_2*+ D*0 pi+ 2
6729 SetLineshapePW D_2*- D*- pi0 2
6730 SetLineshapePW D_2*- anti-D*0 pi- 2
6731 SetLineshapePW D_2*0 D*0 pi0 2
6732 SetLineshapePW D_2*0 D*+ pi- 2
6733 SetLineshapePW anti-D_2*0 anti-D*0 pi0 2
6734 SetLineshapePW anti-D_2*0 D*- pi+ 2


In [462]:
# Show the last five lines of the file verbatim (each already ends in a newline).
print(*txt[-5:], sep="")

#0.000000007 mu+     mu-     gamma   gamma                   PHSP;  #[New mode added] #[Reconstructed PDG2011]
Enddecay

End




In [530]:
%%time
# Parse the whole file in one go; joining the lines restores the original text
# because readlines() kept each line's newline.
parsed = l.parse("".join(txt))
# Last expression of the cell: displays the truthiness of the parse result.
bool(parsed)

CPU times: user 1.21 s, sys: 17.9 ms, total: 1.23 s
Wall time: 1.25 s


In [531]:
# Inspect the last top-level child of the parse tree.
parsed.children[-1]

Tree(line, [Tree(decay, [Tree(particle, [Token(LABEL, 'K_L0')])])])