In [1]:
# @title
# %% [markdown]
# # Parser MT950 în Google Colab
# **Obiectiv:** Construim un pipeline hibrid pentru mesaje SWIFT **MT950**:
# - Extragem câmpurile standard (:20:, :25:, :28C:, :60F/M:, :62F/M:, :64:) prin **reguli deterministe** (regex);
# - Antrenăm o **rețea neuronală** (BiLSTM pentru etichetare la nivel de caracter) care parsează înregistrările complicate **`:61:`** (data, D/C, sumă, cod, referință).
#
# Notebook-ul este self‑contained: are date sintetice de antrenare, dar poți încărca ușor propriile mesaje.
#
# ---

# %%
# !pip install tensorflow==2.16.1 scikit-learn pandas numpy regex==2024.4.16 --quiet

import re
import json
import math
import random
import numpy as np
import pandas as pd
from typing import List, Dict, Any

import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.model_selection import train_test_split

print(tf.__version__)

# %% [markdown]
# ## 1) Mesaje MT950 – date demo (poți înlocui cu ale tale)
#
# *Notă:* Structura MT950 e apropiată de MT940. În practică, etichetele pot varia ușor între bănci.

# %%
DEMO_MESSAGES = [
    (
        """
        {1:F01AAAAZZZZAXXX0000000000}{2:I950BBBBZZZZXXXXN}{4:
        :20:REF123456
        :25:RO49AAAA1B31007593840000
        :28C:00001/001
        :60F:C250923EUR1234,56
        :61:2509240924D23,45NTRFNONREF//REF1
        :61:2509240924C120,00NTRFNONREF//REF2
        :62F:C250924EUR1331,11
        :64:C250924EUR1331,11
        -}
        """.strip()
    ),
    (
        """
        {1:F01CCCCZZZZAXXX0000000000}{2:I950DDDDZZZZXXXXN}{4:
        :20:ABCD98765
        :25:GB12BARC20040123456789
        :28C:00002/001
        :60M:C250901USD5000,
        :61:2509020902C250,50NCHKNONREF//INV001
        :61:2509030903D75,00NCHKNONREF//INV002
        :62M:C250903USD5175,50
        :64:C250903USD5175,50
        -}
        """.strip()
    ),
    (
        """
        {1:F01EEEEZZZZAXXX0000000000}{2:I950FFFFZZZZXXXXN}{4:
        :20:XYZ000111
        :25:DE89370400440532013000
        :28C:00003/001
        :60F:C250930EUR100,00
        :61:2510011001C900,00NTRFNONREF//PAYR001
        :62F:C251001EUR1000,00
        :64:C251001EUR1000,00
        -}
        """.strip()
    ),
]

# %% [markdown]
# ## 2) Parser bazat pe reguli pentru tag-urile standard
# Extragem câmpuri cheie într-un dict. Înregistrările `:61:` le păstrăm separat pentru rețeaua neuronală.

# %%
TAG_PATTERN = re.compile(r"^:(\d{2}[A-Z]?):(.*)$")

balance_pat = re.compile(r"^(?P<sign>[CD])(?P<date>\d{6})(?P<ccy>[A-Z]{3})(?P<amt>[0-9,]+)$")

# Structura minimală pentru o linie :61:
#  YYMMDD[entry]D/CamountN<code><ref>//<ref2>
line61_pat = re.compile(r"^(?P<valdate>\d{6})(?P<entry>\d{4})?(?P<dc>[DC])(?P<amt>[0-9,]+)N(?P<tx>\w+)(?P<rest>.*)$")

def parse_mt950_rules(raw: str) -> Dict[str, Any]:
    lines = [ln.strip() for ln in raw.splitlines()]
    in_block4 = False
    tags = []
    current = None
    for ln in lines:
        if ln.startswith(":"):
            m = TAG_PATTERN.match(ln)
            if m:
                if current:
                    tags.append(current)
                current = {"tag": m.group(1), "value": m.group(2)}
            else:
                if current:
                    current["value"] += "\n" + ln
        elif ln.startswith("{4:"):
            in_block4 = True
        elif ln.startswith("-}"):
            in_block4 = False
            if current:
                tags.append(current)
                current = None
        else:
            if in_block4 and current:
                current["value"] += "\n" + ln
    if current:
        tags.append(current)

    result = {
        "transaction_reference": None,
        "account": None,
        "statement_number": None,
        "opening_balance": None,
        "closing_balance": None,
        "available_balance": None,
        "records_61": [],  # brut, pentru NN
    }

    for t in tags:
        tag = t["tag"].upper()
        val = t["value"].strip()
        if tag == "20":
            result["transaction_reference"] = val
        elif tag == "25":
            result["account"] = val
        elif tag == "28C":
            result["statement_number"] = val
        elif tag in ("60F", "60M"):
            m = balance_pat.match(val)
            if m:
                result["opening_balance"] = m.groupdict()
        elif tag in ("62F", "62M"):
            m = balance_pat.match(val)
            if m:
                result["closing_balance"] = m.groupdict()
        elif tag == "64":
            m = balance_pat.match(val)
            if m:
                result["available_balance"] = m.groupdict()
        elif tag == "61":
            # colectăm pentru NN (una pe linie)
            for sub in val.splitlines():
                sub = sub.strip()
                if sub:
                    result["records_61"].append(sub)
    return result

# Test rapid
for i, msg in enumerate(DEMO_MESSAGES, 1):
    parsed = parse_mt950_rules(msg)
    print(f"MSG {i}: acct={parsed['account']} open={parsed['opening_balance']} close={parsed['closing_balance']} 61_count={len(parsed['records_61'])}")

# %% [markdown]
# ## 3) Rețea neuronală pentru etichetarea caracterelor din `:61:`
#
# Vom face NER la nivel de **caracter** cu schema BIO pe următoarele entități:
# - `DATE` (YYMMDD), `ENTRY` (opțional), `DC` (D/C), `AMT` (sumă), `CODE` (ex. NTRF), `REF` (restul).
#
# Setul de antrenare e **sintetic** (bazat pe tipare uzuale). Înlocuiește/completează cu date reale anonimizate pentru rezultate mai bune.

# %%
LABELS = ["O", "B-DATE", "I-DATE", "B-ENTRY", "I-ENTRY", "B-DC", "I-DC", "B-AMT", "I-AMT", "B-CODE", "I-CODE", "B-REF", "I-REF"]
label2id = {l:i for i,l in enumerate(LABELS)}
id2label = {i:l for l,i in label2id.items()}

# Generator simplu de linii :61: și etichete BIO (caracter-level)

def synth61_samples(n=400):
    codes = ["NTRF","NCHQ","NCHK","NMSC","NCMZ"]
    refs  = ["NONREF//REF1","NONREF//INV001","PAYR001","FEES//F123","SAL//OCT"]
    samples = []
    for _ in range(n):
        yymmdd = f"25{random.randint(1,12):02d}{random.randint(1,28):02d}"
        entry  = f"{random.randint(1,12):02d}{random.randint(1,28):02d}"
        dc     = random.choice(["D","C"])
        amt    = f"{random.randint(1,2000)},{random.randint(0,99):02d}"
        code   = random.choice(codes)
        ref    = random.choice(refs)
        line   = f"{yymmdd}{entry}{dc}{amt}{code}{ref}"
        labels = ["O"]*len(line)
        def tag_span(start, end, b, i):
            labels[start] = b
            for k in range(start+1, end):
                labels[k] = i
        # mark spans determinist prin regex-uri
        m = re.match(r"(\d{6})(\d{4})([DC])([0-9,]+)(\w+)(.*)$", line)
        if not m:
            continue
        s0 = 0; e0 = 6
        s1 = 6; e1 = 10
        s2 = 10; e2 = 11
        # amount: după D/C până la terminarea cifrelor/virgulei
        amt_m = re.match(r"[0-9,]+", line[e2:])
        s3 = e2; e3 = e2 + (amt_m.end() if amt_m else 0)
        # code: [A-Za-z]+
        code_m = re.match(r"[A-Za-z]+", line[e3:])
        s4 = e3; e4 = e3 + (code_m.end() if code_m else 0)
        # rest = referință
        s5 = e4; e5 = len(line)

        tag_span(s0,e0,"B-DATE","I-DATE")
        tag_span(s1,e1,"B-ENTRY","I-ENTRY")
        tag_span(s2,e2,"B-DC","I-DC")
        tag_span(s3,e3,"B-AMT","I-AMT")
        tag_span(s4,e4,"B-CODE","I-CODE")
        if s5 < e5:
            tag_span(s5,e5,"B-REF","I-REF")
        samples.append((line, labels))
    return samples

samples = synth61_samples(600)
len(samples)

# %%
# Vocabular de caractere
chars = sorted({ch for s,_ in samples for ch in s} | set(" :/ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789,.'+-_"))
char2id = {c:i+1 for i,c in enumerate(chars)}  # 0 = PAD
id2char = {i:c for c,i in char2id.items()}

MAX_LEN = max(len(s) for s,_ in samples)

X = np.zeros((len(samples), MAX_LEN), dtype=np.int32)
Y = np.zeros((len(samples), MAX_LEN), dtype=np.int32)
for i,(s,labels) in enumerate(samples):
    ids = [char2id.get(ch,0) for ch in s]
    X[i,:len(ids)] = ids
    Y[i,:len(labels)] = [label2id[l] for l in labels]

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.15, random_state=42)

# %% [markdown]
# ### Model: Embedding + BiLSTM + TimeDistributed

# %%
EMB = 64
HID = 96
NUM_LABELS = len(LABELS)
VOCAB = len(char2id) + 1

inp = layers.Input(shape=(MAX_LEN,), dtype="int32")
emb = layers.Embedding(VOCAB, EMB, mask_zero=True)(inp)
bi  = layers.Bidirectional(layers.LSTM(HID, return_sequences=True))(emb)
out = layers.TimeDistributed(layers.Dense(NUM_LABELS, activation="softmax"))(bi)
model = Model(inp, out)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

# pregătim țintele ca (batch, seq, 1)
Y_train_ = np.expand_dims(Y_train, -1)
Y_val_   = np.expand_dims(Y_val, -1)

hist = model.fit(X_train, Y_train_, validation_data=(X_val, Y_val_), epochs=8, batch_size=32, verbose=1)

# %% [markdown]
# ## 4) Inference pentru o linie `:61:`

# %%

def predict_tags(line: str):
    arr = np.zeros((1, MAX_LEN), dtype=np.int32)
    ids = [char2id.get(ch,0) for ch in line]
    arr[0,:len(ids)] = ids
    probs = model.predict(arr, verbose=0)[0]
    pred = probs.argmax(-1)[:len(ids)]
    return [id2label[i] for i in pred]


def spans_from_bio(text: str, tags: List[str]) -> Dict[str, str]:
    out = {"DATE":"", "ENTRY":"", "DC":"", "AMT":"", "CODE":"", "REF":""}
    cur = None; buf = []
    def flush():
        nonlocal cur, buf
        if cur and buf:
            out[cur] += "".join(buf)
        cur = None; buf = []
    for ch, tg in zip(text, tags):
        if tg.startswith("B-"):
            flush()
            cur = tg[2:]
            buf = [ch]
        elif tg.startswith("I-") and cur == tg[2:]:
            buf.append(ch)
        else:
            flush()
    flush()
    for k in out:
        out[k] = out[k].strip()
    return out

# Test pe liniile :61: din mesajele demo
for msg in DEMO_MESSAGES:
    p = parse_mt950_rules(msg)
    for ln in p["records_61"]:
        tg = predict_tags(ln)
        spans = spans_from_bio(ln, tg)
        print(ln)
        print(spans)
        print("---")

# %% [markdown]
# ## 5) Asamblare end‑to‑end: din mesaj MT950 în JSON structurat

# %%

def parse_amount_to_float(s: str) -> float:
    s = s.replace(".", "").replace(",", ".")
    try:
        return float(s)
    except:
        return math.nan


def enrich_61_with_nn(records: List[str]) -> List[Dict[str, Any]]:
    out = []
    for ln in records:
        tags = predict_tags(ln)
        spans = spans_from_bio(ln, tags)
        item = {
            "raw": ln,
            "value_date": spans.get("DATE") or None,
            "entry_date": spans.get("ENTRY") or None,
            "dc": spans.get("DC") or None,
            "amount": parse_amount_to_float(spans.get("AMT") or ""),
            "code": spans.get("CODE") or None,
            "reference": spans.get("REF") or None,
        }
        out.append(item)
    return out


def parse_mt950_full(raw: str) -> Dict[str, Any]:
    base = parse_mt950_rules(raw)
    tx = enrich_61_with_nn(base["records_61"]) if base.get("records_61") else []
    base["transactions"] = tx
    return base

# Rulează pe toate mesajele demo
structured = [parse_mt950_full(m) for m in DEMO_MESSAGES]
print(json.dumps(structured, indent=2, ensure_ascii=False))

# %% [markdown]
# ## 6) Cum încarci propriile tale mesaje
#
# - Dacă ai un **fișier** cu mai multe mesaje concatenate, citește-l și separă-le după delimiterul `-}`.
# - Alternativ, pune fiecare mesaj într-un element al unei liste și rulează `parse_mt950_full`.

# %%
# Exemplu de încărcare din fișier (decomentează și adaptează calea):
# with open('/content/mt950.txt', 'r', encoding='utf-8') as f:
#     raw_all = f.read()
# raw_msgs = [blk.strip()+"\n-}" for blk in raw_all.split('-}') if blk.strip()]
# parsed = [parse_mt950_full(m) for m in raw_msgs]
# print(json.dumps(parsed, indent=2, ensure_ascii=False))

# %% [markdown]
# ## 7) Observații și extensii
# - **Antrenare pe date reale**: Calitatea parse‑ului la `:61:` depinde de datele de antrenare. Înlocuiește `synth61_samples` cu linii reale anonimizate.
# - **Câmpuri suplimentare**: Poți adăuga etichete BIO noi (ex. contrapartidă, text liber după `//`).
# - **Generalizare**: Pentru performanță mai bună, crește `epochs`, diversifică pattern‑urile și normalizează sumele/valutele.
# - **Validare**: Adaugă teste unitare pentru fiecare tag.
# - **Export**: Salvează JSON-ul rezultat în fișiere sau într-o bază de date.


2.19.0
MSG 1: acct=RO49AAAA1B31007593840000 open={'sign': 'C', 'date': '250923', 'ccy': 'EUR', 'amt': '1234,56'} close={'sign': 'C', 'date': '250924', 'ccy': 'EUR', 'amt': '1331,11'} 61_count=2
MSG 2: acct=GB12BARC20040123456789 open={'sign': 'C', 'date': '250901', 'ccy': 'USD', 'amt': '5000,'} close={'sign': 'C', 'date': '250903', 'ccy': 'USD', 'amt': '5175,50'} 61_count=2
MSG 3: acct=DE89370400440532013000 open={'sign': 'C', 'date': '250930', 'ccy': 'EUR', 'amt': '100,00'} close={'sign': 'C', 'date': '251001', 'ccy': 'EUR', 'amt': '1000,00'} 61_count=1


Epoch 1/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 69ms/step - accuracy: 0.3697 - loss: 2.4197 - val_accuracy: 0.5133 - val_loss: 1.5664
Epoch 2/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.5940 - loss: 1.3457 - val_accuracy: 0.6769 - val_loss: 0.8327
Epoch 3/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step - accuracy: 0.6968 - loss: 0.7198 - val_accuracy: 0.7151 - val_loss: 0.4827
Epoch 4/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.7496 - loss: 0.4234 - val_accuracy: 0.8068 - val_loss: 0.3034
Epoch 5/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.8262 - loss: 0.2701 - val_accuracy: 0.8318 - val_loss: 0.2125
Epoch 6/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.8532 - loss: 0.1873 - val_accuracy: 0.8466 - val_loss: 0.1580
Epoch 7/8
[1m16/16[0m [32m━━━━━━━━━━

In [2]:
# @title
# %% [markdown]
# # Parser MT950 în Google Colab
# **Obiectiv:** Construim un pipeline hibrid pentru mesaje SWIFT **MT950**:
# - Extragem câmpurile standard (:20:, :25:, :28C:, :60F/M:, :62F/M:, :64:) prin **reguli deterministe** (regex);
# - Antrenăm o **rețea neuronală** (BiLSTM pentru etichetare la nivel de caracter) care parsează înregistrările complicate **`:61:`** (data, D/C, sumă, cod, referință).
#
# Notebook-ul este self‑contained: are date sintetice de antrenare, dar poți încărca ușor propriile mesaje.
#
# ---

# %%
# !pip install tensorflow==2.16.1 scikit-learn pandas numpy regex==2024.4.16 --quiet

import re
import json
import math
import random
import numpy as np
import pandas as pd
from typing import List, Dict, Any

import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.model_selection import train_test_split

print(tf.__version__)

# %% [markdown]
# ## 1) Mesaje MT950 – date demo (poți înlocui cu ale tale)
#
# *Notă:* Structura MT950 e apropiată de MT940. În practică, etichetele pot varia ușor între bănci.

# %%
DEMO_MESSAGES = [
    (
        """
        {1:F01AAAAZZZZAXXX0000000000}{2:I950BBBBZZZZXXXXN}{4:
        :20:REF123456
        :25:RO49AAAA1B31007593840000
        :28C:00001/001
        :60F:C250923EUR1234,56
        :61:2509240924D23,45NTRFNONREF//REF1
        :61:2509240924C120,00NTRFNONREF//REF2
        :62F:C250924EUR1331,11
        :64:C250924EUR1331,11
        -}
        """.strip()
    ),
    (
        """
        {1:F01CCCCZZZZAXXX0000000000}{2:I950DDDDZZZZXXXXN}{4:
        :20:ABCD98765
        :25:GB12BARC20040123456789
        :28C:00002/001
        :60M:C250901USD5000,
        :61:2509020902C250,50NCHKNONREF//INV001
        :61:2509030903D75,00NCHKNONREF//INV002
        :62M:C250903USD5175,50
        :64:C250903USD5175,50
        -}
        """.strip()
    ),
    (
        """
        {1:F01EEEEZZZZAXXX0000000000}{2:I950FFFFZZZZXXXXN}{4:
        :20:XYZ000111
        :25:DE89370400440532013000
        :28C:00003/001
        :60F:C250930EUR100,00
        :61:2510011001C900,00NTRFNONREF//PAYR001
        :62F:C251001EUR1000,00
        :64:C251001EUR1000,00
        -}
        """.strip()
    ),
]

# %% [markdown]
# ## 2) Parser bazat pe reguli pentru tag-urile standard
# Extragem câmpuri cheie într-un dict. Înregistrările `:61:` le păstrăm separat pentru rețeaua neuronală.

# %%
TAG_PATTERN = re.compile(r"^:(\d{2}[A-Z]?):(.*)$")

balance_pat = re.compile(r"^(?P<sign>[CD])(?P<date>\d{6})(?P<ccy>[A-Z]{3})(?P<amt>[0-9,]+)$")

# Structura minimală pentru o linie :61:
#  YYMMDD[entry]D/CamountN<code><ref>//<ref2>
line61_pat = re.compile(r"^(?P<valdate>\d{6})(?P<entry>\d{4})?(?P<dc>[DC])(?P<amt>[0-9,]+)N(?P<tx>\w+)(?P<rest>.*)$")

def parse_mt950_rules(raw: str) -> Dict[str, Any]:
    lines = [ln.strip() for ln in raw.splitlines()]
    in_block4 = False
    tags = []
    current = None
    for ln in lines:
        if ln.startswith(":"):
            m = TAG_PATTERN.match(ln)
            if m:
                if current:
                    tags.append(current)
                current = {"tag": m.group(1), "value": m.group(2)}
            else:
                if current:
                    current["value"] += "\n" + ln
        elif ln.startswith("{4:"):
            in_block4 = True
        elif ln.startswith("-}"):
            in_block4 = False
            if current:
                tags.append(current)
                current = None
        else:
            if in_block4 and current:
                current["value"] += "\n" + ln
    if current:
        tags.append(current)

    result = {
        "transaction_reference": None,
        "account": None,
        "statement_number": None,
        "opening_balance": None,
        "closing_balance": None,
        "available_balance": None,
        "records_61": [],  # brut, pentru NN
    }

    for t in tags:
        tag = t["tag"].upper()
        val = t["value"].strip()
        if tag == "20":
            result["transaction_reference"] = val
        elif tag == "25":
            result["account"] = val
        elif tag == "28C":
            result["statement_number"] = val
        elif tag in ("60F", "60M"):
            m = balance_pat.match(val)
            if m:
                result["opening_balance"] = m.groupdict()
        elif tag in ("62F", "62M"):
            m = balance_pat.match(val)
            if m:
                result["closing_balance"] = m.groupdict()
        elif tag == "64":
            m = balance_pat.match(val)
            if m:
                result["available_balance"] = m.groupdict()
        elif tag == "61":
            # colectăm pentru NN (una pe linie)
            for sub in val.splitlines():
                sub = sub.strip()
                if sub:
                    result["records_61"].append(sub)
    return result

# Test rapid
for i, msg in enumerate(DEMO_MESSAGES, 1):
    parsed = parse_mt950_rules(msg)
    print(f"MSG {i}: acct={parsed['account']} open={parsed['opening_balance']} close={parsed['closing_balance']} 61_count={len(parsed['records_61'])}")

# %% [markdown]
# ## 3) Rețea neuronală pentru etichetarea caracterelor din `:61:`
#
# Vom face NER la nivel de **caracter** cu schema BIO pe următoarele entități:
# - `DATE` (YYMMDD), `ENTRY` (opțional), `DC` (D/C), `AMT` (sumă), `CODE` (ex. NTRF), `REF` (restul).
#
# Setul de antrenare e **sintetic** (bazat pe tipare uzuale). Înlocuiește/completează cu date reale anonimizate pentru rezultate mai bune.

# %%
LABELS = ["O", "B-DATE", "I-DATE", "B-ENTRY", "I-ENTRY", "B-DC", "I-DC", "B-AMT", "I-AMT", "B-CODE", "I-CODE", "B-REF", "I-REF"]
label2id = {l:i for i,l in enumerate(LABELS)}
id2label = {i:l for l,i in label2id.items()}

# Generator simplu de linii :61: și etichete BIO (caracter-level)

def synth61_samples(n=400):
    codes = ["NTRF","NCHQ","NCHK","NMSC","NCMZ"]
    refs  = ["NONREF//REF1","NONREF//INV001","PAYR001","FEES//F123","SAL//OCT"]
    samples = []
    for _ in range(n):
        yymmdd = f"25{random.randint(1,12):02d}{random.randint(1,28):02d}"
        entry  = f"{random.randint(1,12):02d}{random.randint(1,28):02d}"
        dc     = random.choice(["D","C"])
        amt    = f"{random.randint(1,2000)},{random.randint(0,99):02d}"
        code   = random.choice(codes)
        ref    = random.choice(refs)
        line   = f"{yymmdd}{entry}{dc}{amt}{code}{ref}"
        labels = ["O"]*len(line)
        def tag_span(start, end, b, i):
            labels[start] = b
            for k in range(start+1, end):
                labels[k] = i
        # mark spans determinist prin regex-uri
        m = re.match(r"(\d{6})(\d{4})([DC])([0-9,]+)(\w+)(.*)$", line)
        if not m:
            continue
        s0 = 0; e0 = 6
        s1 = 6; e1 = 10
        s2 = 10; e2 = 11
        # amount: după D/C până la terminarea cifrelor/virgulei
        amt_m = re.match(r"[0-9,]+", line[e2:])
        s3 = e2; e3 = e2 + (amt_m.end() if amt_m else 0)
        # code: [A-Za-z]+
        code_m = re.match(r"[A-Za-z]+", line[e3:])
        s4 = e3; e4 = e3 + (code_m.end() if code_m else 0)
        # rest = referință
        s5 = e4; e5 = len(line)

        tag_span(s0,e0,"B-DATE","I-DATE")
        tag_span(s1,e1,"B-ENTRY","I-ENTRY")
        tag_span(s2,e2,"B-DC","I-DC")
        tag_span(s3,e3,"B-AMT","I-AMT")
        tag_span(s4,e4,"B-CODE","I-CODE")
        if s5 < e5:
            tag_span(s5,e5,"B-REF","I-REF")
        samples.append((line, labels))
    return samples

samples = synth61_samples(600)
len(samples)

# %%
# Vocabular de caractere
chars = sorted({ch for s,_ in samples for ch in s} | set(" :/ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789,.'+-_"))
char2id = {c:i+1 for i,c in enumerate(chars)}  # 0 = PAD
id2char = {i:c for c,i in char2id.items()}

MAX_LEN = max(len(s) for s,_ in samples)

X = np.zeros((len(samples), MAX_LEN), dtype=np.int32)
Y = np.zeros((len(samples), MAX_LEN), dtype=np.int32)
for i,(s,labels) in enumerate(samples):
    ids = [char2id.get(ch,0) for ch in s]
    X[i,:len(ids)] = ids
    Y[i,:len(labels)] = [label2id[l] for l in labels]

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.15, random_state=42)

# %% [markdown]
# ### Model: Embedding + BiLSTM + TimeDistributed

# %%
EMB = 64
HID = 96
NUM_LABELS = len(LABELS)
VOCAB = len(char2id) + 1

inp = layers.Input(shape=(MAX_LEN,), dtype="int32")
emb = layers.Embedding(VOCAB, EMB, mask_zero=True)(inp)
bi  = layers.Bidirectional(layers.LSTM(HID, return_sequences=True))(emb)
out = layers.TimeDistributed(layers.Dense(NUM_LABELS, activation="softmax"))(bi)
model = Model(inp, out)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

# pregătim țintele ca (batch, seq, 1)
Y_train_ = np.expand_dims(Y_train, -1)
Y_val_   = np.expand_dims(Y_val, -1)

hist = model.fit(X_train, Y_train_, validation_data=(X_val, Y_val_), epochs=8, batch_size=32, verbose=1)

# %% [markdown]
# ## 4) Inference pentru o linie `:61:`

# %%

def predict_tags(line: str):
    arr = np.zeros((1, MAX_LEN), dtype=np.int32)
    ids = [char2id.get(ch,0) for ch in line]
    arr[0,:len(ids)] = ids
    probs = model.predict(arr, verbose=0)[0]
    pred = probs.argmax(-1)[:len(ids)]
    return [id2label[i] for i in pred]


def spans_from_bio(text: str, tags: List[str]) -> Dict[str, str]:
    out = {"DATE":"", "ENTRY":"", "DC":"", "AMT":"", "CODE":"", "REF":""}
    cur = None; buf = []
    def flush():
        nonlocal cur, buf
        if cur and buf:
            out[cur] += "".join(buf)
        cur = None; buf = []
    for ch, tg in zip(text, tags):
        if tg.startswith("B-"):
            flush()
            cur = tg[2:]
            buf = [ch]
        elif tg.startswith("I-") and cur == tg[2:]:
            buf.append(ch)
        else:
            flush()
    flush()
    for k in out:
        out[k] = out[k].strip()
    return out

# Test pe liniile :61: din mesajele demo
for msg in DEMO_MESSAGES:
    p = parse_mt950_rules(msg)
    for ln in p["records_61"]:
        tg = predict_tags(ln)
        spans = spans_from_bio(ln, tg)
        print(ln)
        print(spans)
        print("---")

# %% [markdown]
# ## 5) Asamblare end‑to‑end: din mesaj MT950 în JSON structurat

# %%

def enrich_61_with_nn_v2(records: List[str]) -> List[Dict[str, Any]]:
    """Versiune robustă: NN + reguli deterministe fără regex fragil.
    - YYMMDD (obligatoriu) + MMDD (opțional) sunt forțate din începutul liniei;
    - D/C + sumă sunt extrase prin parcurgere de caractere;
    - CODE = primele 4 litere după 'N'; restul = referință (fără prefixul '//').
    """
    def is_digits(s: str) -> bool:
        return all(ch.isdigit() for ch in s) and len(s) > 0

    def read_amount(s: str, start: int) -> (str, int):
        i = start
        buf = []
        while i < len(s) and (s[i].isdigit() or s[i] in ",."):
            buf.append(s[i]); i += 1
        return ("".join(buf), i)

    out: List[Dict[str, Any]] = []
    for ln in records:
        tags = predict_tags(ln)
        spans = spans_from_bio(ln, tags)

        # 1) DATE + ENTRY din capul liniei
        if len(ln) >= 6 and is_digits(ln[:6]):
            spans["DATE"] = ln[:6]
            if len(ln) >= 10 and is_digits(ln[6:10]):
                spans["ENTRY"] = ln[6:10]
            else:
                spans["ENTRY"] = spans.get("ENTRY", "")

        # 2) DC + AMT — găsim primul D/C după poziția 6
        pos_D = ln.find("D", 6)
        pos_C = ln.find("C", 6)
        candidates = [p for p in [pos_D, pos_C] if p != -1]
        dc_pos = min(candidates) if candidates else -1
        if dc_pos != -1:
            spans["DC"] = ln[dc_pos]
            amt, end_amt = read_amount(ln, dc_pos + 1)
            if amt:
                spans["AMT"] = amt
            after_amt = end_amt
        else:
            after_amt = 0

        # 3) CODE + REF — căutăm 'N' după sumă
        n_pos = ln.find("N", max(after_amt, 0))
        if n_pos != -1 and n_pos + 5 <= len(ln):
            cand = ln[n_pos + 1:n_pos + 5]
            if cand.isalpha() and cand.upper() == cand:
                spans["CODE"] = cand
                ref = ln[n_pos + 5:].strip()
                if ref.startswith("//"):
                    ref = ref[2:]
                spans["REF"] = ref or None

        item = {
            "raw": ln,
            "value_date": spans.get("DATE") or None,
            "entry_date": spans.get("ENTRY") or None,
            "dc": spans.get("DC") or None,
            "amount": parse_amount_to_float(spans.get("AMT") or ""),
            "code": spans.get("CODE") or None,
            "reference": spans.get("REF")
        }
        out.append(item)
    return out

# %%

def parse_amount_to_float(s: str) -> float:
    s = s.replace(".", "").replace(",", ".")
    try:
        return float(s)
    except:
        return math.nan


def enrich_61_with_nn(records: List[str]) -> List[Dict[str, Any]]:
    out = []
    for ln in records:
        tags = predict_tags(ln)
        spans = spans_from_bio(ln, tags)
        item = {
            "raw": ln,
            "value_date": spans.get("DATE") or None,
            "entry_date": spans.get("ENTRY") or None,
            "dc": spans.get("DC") or None,
            "amount": parse_amount_to_float(spans.get("AMT") or ""),
            "code": spans.get("CODE") or None,
            "reference": spans.get("REF") or None,
        }
        out.append(item)
    return out


def parse_mt950_full(raw: str) -> Dict[str, Any]:
    base = parse_mt950_rules(raw)
    tx = enrich_61_with_nn_v2(base["records_61"]) if base.get("records_61") else []
    base["transactions"] = tx
    return base

# Rulează pe toate mesajele demo
structured = [parse_mt950_full(m) for m in DEMO_MESSAGES]
print(json.dumps(structured, indent=2, ensure_ascii=False))

# %% [markdown]
# ## 6) Cum încarci propriile tale mesaje
#
# - Dacă ai un **fișier** cu mai multe mesaje concatenate, citește-l și separă-le după delimiterul `-}`.
# - Alternativ, pune fiecare mesaj într-un element al unei liste și rulează `parse_mt950_full`.

# %%
# Exemplu de încărcare din fișier (decomentează și adaptează calea):
# with open('/content/mt950.txt', 'r', encoding='utf-8') as f:
#     raw_all = f.read()
# raw_msgs = [blk.strip()+"\n-}" for blk in raw_all.split('-}') if blk.strip()]
# parsed = [parse_mt950_full(m) for m in raw_msgs]
# print(json.dumps(parsed, indent=2, ensure_ascii=False))

# %% [markdown]
# ## 7) Observații și extensii
# - **Antrenare pe date reale**: Calitatea parse‑ului la `:61:` depinde de datele de antrenare. Înlocuiește `synth61_samples` cu linii reale anonimizate.
# - **Câmpuri suplimentare**: Poți adăuga etichete BIO noi (ex. contrapartidă, text liber după `//`).
# - **Generalizare**: Pentru performanță mai bună, crește `epochs`, diversifică pattern‑urile și normalizează sumele/valutele.
# - **Validare**: Adaugă teste unitare pentru fiecare tag.
# - **Export**: Salvează JSON-ul rezultat în fișiere sau într-o bază de date.


2.19.0
MSG 1: acct=RO49AAAA1B31007593840000 open={'sign': 'C', 'date': '250923', 'ccy': 'EUR', 'amt': '1234,56'} close={'sign': 'C', 'date': '250924', 'ccy': 'EUR', 'amt': '1331,11'} 61_count=2
MSG 2: acct=GB12BARC20040123456789 open={'sign': 'C', 'date': '250901', 'ccy': 'USD', 'amt': '5000,'} close={'sign': 'C', 'date': '250903', 'ccy': 'USD', 'amt': '5175,50'} 61_count=2
MSG 3: acct=DE89370400440532013000 open={'sign': 'C', 'date': '250930', 'ccy': 'EUR', 'amt': '100,00'} close={'sign': 'C', 'date': '251001', 'ccy': 'EUR', 'amt': '1000,00'} 61_count=1


Epoch 1/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 64ms/step - accuracy: 0.4394 - loss: 2.4157 - val_accuracy: 0.5114 - val_loss: 1.6215
Epoch 2/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.5928 - loss: 1.3923 - val_accuracy: 0.6796 - val_loss: 0.8866
Epoch 3/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.6834 - loss: 0.7760 - val_accuracy: 0.7262 - val_loss: 0.5158
Epoch 4/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.7335 - loss: 0.4613 - val_accuracy: 0.7849 - val_loss: 0.3255
Epoch 5/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.8005 - loss: 0.3028 - val_accuracy: 0.8349 - val_loss: 0.2444
Epoch 6/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.8457 - loss: 0.2200 - val_accuracy: 0.8620 - val_loss: 0.1640
Epoch 7/8
[1m16/16[0m [32m━━━━━━━━━━━

In [3]:
# @title
# %% [markdown]
# # Parser MT950 în Google Colab
# **Obiectiv:** Construim un pipeline hibrid pentru mesaje SWIFT **MT950**:
# - Extragem câmpurile standard (:20:, :25:, :28C:, :60F/M:, :62F/M:, :64:) prin **reguli deterministe** (regex);
# - Antrenăm o **rețea neuronală** (BiLSTM pentru etichetare la nivel de caracter) care parsează înregistrările complicate **`:61:`** (data, D/C, sumă, cod, referință).
#
# Notebook-ul este self‑contained: are date sintetice de antrenare, dar poți încărca ușor propriile mesaje.
#
# ---

# %%
# !pip install tensorflow==2.16.1 scikit-learn pandas numpy regex==2024.4.16 --quiet

import re
import json
import math
import random
import numpy as np
import pandas as pd
from typing import List, Dict, Any

import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.model_selection import train_test_split

print(tf.__version__)

# %% [markdown]
# ## 1) Mesaje MT950 – date demo (poți înlocui cu ale tale)
#
# *Notă:* Structura MT950 e apropiată de MT940. În practică, etichetele pot varia ușor între bănci.

# %%
DEMO_MESSAGES = [
    (
        """
        {1:F01AAAAZZZZAXXX0000000000}{2:I950BBBBZZZZXXXXN}{4:
        :20:REF123456
        :25:RO49AAAA1B31007593840000
        :28C:00001/001
        :60F:C250923EUR1234,56
        :61:2509240924D23,45NTRFNONREF//REF1
        :61:2509240924C120,00NTRFNONREF//REF2
        :62F:C250924EUR1331,11
        :64:C250924EUR1331,11
        -}
        """.strip()
    ),
    (
        """
        {1:F01CCCCZZZZAXXX0000000000}{2:I950DDDDZZZZXXXXN}{4:
        :20:ABCD98765
        :25:GB12BARC20040123456789
        :28C:00002/001
        :60M:C250901USD5000,
        :61:2509020902C250,50NCHKNONREF//INV001
        :61:2509030903D75,00NCHKNONREF//INV002
        :62M:C250903USD5175,50
        :64:C250903USD5175,50
        -}
        """.strip()
    ),
    (
        """
        {1:F01EEEEZZZZAXXX0000000000}{2:I950FFFFZZZZXXXXN}{4:
        :20:XYZ000111
        :25:DE89370400440532013000
        :28C:00003/001
        :60F:C250930EUR100,00
        :61:2510011001C900,00NTRFNONREF//PAYR001
        :62F:C251001EUR1000,00
        :64:C251001EUR1000,00
        -}
        """.strip()
    ),
]

# %% [markdown]
# ## 2) Parser bazat pe reguli pentru tag-urile standard
# Extragem câmpuri cheie într-un dict. Înregistrările `:61:` le păstrăm separat pentru rețeaua neuronală.

# %%
TAG_PATTERN = re.compile(r"^:(\d{2}[A-Z]?):(.*)$")

balance_pat = re.compile(r"^(?P<sign>[CD])(?P<date>\d{6})(?P<ccy>[A-Z]{3})(?P<amt>[0-9,]+)$")

# Structura minimală pentru o linie :61:
#  YYMMDD[entry]D/CamountN<code><ref>//<ref2>
line61_pat = re.compile(r"^(?P<valdate>\d{6})(?P<entry>\d{4})?(?P<dc>[DC])(?P<amt>[0-9,]+)N(?P<tx>\w+)(?P<rest>.*)$")

def parse_mt950_rules(raw: str) -> Dict[str, Any]:
    lines = [ln.strip() for ln in raw.splitlines()]
    in_block4 = False
    tags = []
    current = None
    for ln in lines:
        if ln.startswith(":"):
            m = TAG_PATTERN.match(ln)
            if m:
                if current:
                    tags.append(current)
                current = {"tag": m.group(1), "value": m.group(2)}
            else:
                if current:
                    current["value"] += "\n" + ln
        elif ln.startswith("{4:"):
            in_block4 = True
        elif ln.startswith("-}"):
            in_block4 = False
            if current:
                tags.append(current)
                current = None
        else:
            if in_block4 and current:
                current["value"] += "\n" + ln
    if current:
        tags.append(current)

    result = {
        "transaction_reference": None,
        "account": None,
        "statement_number": None,
        "opening_balance": None,
        "closing_balance": None,
        "available_balance": None,
        "records_61": [],  # brut, pentru NN
    }

    for t in tags:
        tag = t["tag"].upper()
        val = t["value"].strip()
        if tag == "20":
            result["transaction_reference"] = val
        elif tag == "25":
            result["account"] = val
        elif tag == "28C":
            result["statement_number"] = val
        elif tag in ("60F", "60M"):
            m = balance_pat.match(val)
            if m:
                result["opening_balance"] = m.groupdict()
        elif tag in ("62F", "62M"):
            m = balance_pat.match(val)
            if m:
                result["closing_balance"] = m.groupdict()
        elif tag == "64":
            m = balance_pat.match(val)
            if m:
                result["available_balance"] = m.groupdict()
        elif tag == "61":
            # colectăm pentru NN (una pe linie)
            for sub in val.splitlines():
                sub = sub.strip()
                if sub:
                    result["records_61"].append(sub)
    return result

# Test rapid
for i, msg in enumerate(DEMO_MESSAGES, 1):
    parsed = parse_mt950_rules(msg)
    print(f"MSG {i}: acct={parsed['account']} open={parsed['opening_balance']} close={parsed['closing_balance']} 61_count={len(parsed['records_61'])}")

# %% [markdown]
# ## 3) Rețea neuronală pentru etichetarea caracterelor din `:61:`
#
# Vom face NER la nivel de **caracter** cu schema BIO pe următoarele entități:
# - `DATE` (YYMMDD), `ENTRY` (opțional), `DC` (D/C), `AMT` (sumă), `CODE` (ex. NTRF), `REF` (restul).
#
# Setul de antrenare e **sintetic** (bazat pe tipare uzuale). Înlocuiește/completează cu date reale anonimizate pentru rezultate mai bune.

# %%
LABELS = ["O", "B-DATE", "I-DATE", "B-ENTRY", "I-ENTRY", "B-DC", "I-DC", "B-AMT", "I-AMT", "B-CODE", "I-CODE", "B-REF", "I-REF"]
label2id = {l:i for i,l in enumerate(LABELS)}
id2label = {i:l for l,i in label2id.items()}

# Generator simplu de linii :61: și etichete BIO (caracter-level)

def synth61_samples(n=400):
    codes = ["NTRF","NCHQ","NCHK","NMSC","NCMZ"]
    refs  = ["NONREF//REF1","NONREF//INV001","PAYR001","FEES//F123","SAL//OCT"]
    samples = []
    for _ in range(n):
        yymmdd = f"25{random.randint(1,12):02d}{random.randint(1,28):02d}"
        entry  = f"{random.randint(1,12):02d}{random.randint(1,28):02d}"
        dc     = random.choice(["D","C"])
        amt    = f"{random.randint(1,2000)},{random.randint(0,99):02d}"
        code   = random.choice(codes)
        ref    = random.choice(refs)
        line   = f"{yymmdd}{entry}{dc}{amt}{code}{ref}"
        labels = ["O"]*len(line)
        def tag_span(start, end, b, i):
            labels[start] = b
            for k in range(start+1, end):
                labels[k] = i
        # mark spans determinist prin regex-uri
        m = re.match(r"(\d{6})(\d{4})([DC])([0-9,]+)(\w+)(.*)$", line)
        if not m:
            continue
        s0 = 0; e0 = 6
        s1 = 6; e1 = 10
        s2 = 10; e2 = 11
        # amount: după D/C până la terminarea cifrelor/virgulei
        amt_m = re.match(r"[0-9,]+", line[e2:])
        s3 = e2; e3 = e2 + (amt_m.end() if amt_m else 0)
        # code: [A-Za-z]+
        code_m = re.match(r"[A-Za-z]+", line[e3:])
        s4 = e3; e4 = e3 + (code_m.end() if code_m else 0)
        # rest = referință
        s5 = e4; e5 = len(line)

        tag_span(s0,e0,"B-DATE","I-DATE")
        tag_span(s1,e1,"B-ENTRY","I-ENTRY")
        tag_span(s2,e2,"B-DC","I-DC")
        tag_span(s3,e3,"B-AMT","I-AMT")
        tag_span(s4,e4,"B-CODE","I-CODE")
        if s5 < e5:
            tag_span(s5,e5,"B-REF","I-REF")
        samples.append((line, labels))
    return samples

samples = synth61_samples(600)
len(samples)

# %%
# Vocabular de caractere
chars = sorted({ch for s,_ in samples for ch in s} | set(" :/ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789,.'+-_"))
char2id = {c:i+1 for i,c in enumerate(chars)}  # 0 = PAD
id2char = {i:c for c,i in char2id.items()}

MAX_LEN = max(len(s) for s,_ in samples)

X = np.zeros((len(samples), MAX_LEN), dtype=np.int32)
Y = np.zeros((len(samples), MAX_LEN), dtype=np.int32)
for i,(s,labels) in enumerate(samples):
    ids = [char2id.get(ch,0) for ch in s]
    X[i,:len(ids)] = ids
    Y[i,:len(labels)] = [label2id[l] for l in labels]

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.15, random_state=42)

# %% [markdown]
# ### Model: Embedding + BiLSTM + TimeDistributed

# %%
EMB = 64
HID = 96
NUM_LABELS = len(LABELS)
VOCAB = len(char2id) + 1

inp = layers.Input(shape=(MAX_LEN,), dtype="int32")
emb = layers.Embedding(VOCAB, EMB, mask_zero=True)(inp)
bi  = layers.Bidirectional(layers.LSTM(HID, return_sequences=True))(emb)
out = layers.TimeDistributed(layers.Dense(NUM_LABELS, activation="softmax"))(bi)
model = Model(inp, out)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

# pregătim țintele ca (batch, seq, 1)
Y_train_ = np.expand_dims(Y_train, -1)
Y_val_   = np.expand_dims(Y_val, -1)

hist = model.fit(X_train, Y_train_, validation_data=(X_val, Y_val_), epochs=8, batch_size=32, verbose=1)

# %% [markdown]
# ## 4) Inference pentru o linie `:61:`

# %%

def predict_tags(line: str):
    arr = np.zeros((1, MAX_LEN), dtype=np.int32)
    ids = [char2id.get(ch,0) for ch in line]
    arr[0,:len(ids)] = ids
    probs = model.predict(arr, verbose=0)[0]
    pred = probs.argmax(-1)[:len(ids)]
    return [id2label[i] for i in pred]


def spans_from_bio(text: str, tags: List[str]) -> Dict[str, str]:
    out = {"DATE":"", "ENTRY":"", "DC":"", "AMT":"", "CODE":"", "REF":""}
    cur = None; buf = []
    def flush():
        nonlocal cur, buf
        if cur and buf:
            out[cur] += "".join(buf)
        cur = None; buf = []
    for ch, tg in zip(text, tags):
        if tg.startswith("B-"):
            flush()
            cur = tg[2:]
            buf = [ch]
        elif tg.startswith("I-") and cur == tg[2:]:
            buf.append(ch)
        else:
            flush()
    flush()
    for k in out:
        out[k] = out[k].strip()
    return out

# Test pe liniile :61: din mesajele demo
for msg in DEMO_MESSAGES:
    p = parse_mt950_rules(msg)
    for ln in p["records_61"]:
        tg = predict_tags(ln)
        spans = spans_from_bio(ln, tg)
        print(ln)
        print(spans)
        print("---")

# %% [markdown]
# ## 5) Asamblare end‑to‑end: din mesaj MT950 în JSON structurat

# %%

def enrich_61_with_nn_v2(records: List[str]) -> List[Dict[str, Any]]:
    """Versiune robustă: NN + reguli deterministe fără regex fragil.
    - YYMMDD (obligatoriu) + MMDD (opțional) sunt forțate din începutul liniei;
    - D/C + sumă sunt extrase prin parcurgere de caractere;
    - CODE = primele 4 litere după 'N'; restul = referință (fără prefixul '//').
    """
    def is_digits(s: str) -> bool:
        return all(ch.isdigit() for ch in s) and len(s) > 0

    def read_amount(s: str, start: int) -> (str, int):
        i = start
        buf = []
        while i < len(s) and (s[i].isdigit() or s[i] in ",."):
            buf.append(s[i]); i += 1
        return ("".join(buf), i)

    out: List[Dict[str, Any]] = []
    for ln in records:
        tags = predict_tags(ln)
        spans = spans_from_bio(ln, tags)

        # 1) DATE + ENTRY din capul liniei
        if len(ln) >= 6 and is_digits(ln[:6]):
            spans["DATE"] = ln[:6]
            if len(ln) >= 10 and is_digits(ln[6:10]):
                spans["ENTRY"] = ln[6:10]
            else:
                spans["ENTRY"] = spans.get("ENTRY", "")

        # 2) DC + AMT — găsim primul D/C după poziția 6
        pos_D = ln.find("D", 6)
        pos_C = ln.find("C", 6)
        candidates = [p for p in [pos_D, pos_C] if p != -1]
        dc_pos = min(candidates) if candidates else -1
        if dc_pos != -1:
            spans["DC"] = ln[dc_pos]
            amt, end_amt = read_amount(ln, dc_pos + 1)
            if amt:
                spans["AMT"] = amt
            after_amt = end_amt
        else:
            after_amt = 0

        # 3) CODE + REF — căutăm 'N' după sumă
        n_pos = ln.find("N", max(after_amt, 0))
        if n_pos != -1 and n_pos + 4 <= len(ln):
            # conform standardului MT :61:, codul are 3 litere dupa 'N' (3!c)
            cand3 = ln[n_pos + 1:n_pos + 4]
            if cand3.isalpha() and cand3.upper() == cand3:
                # pastram atat varianta de 3 litere cat si forma uzuala cu 'N' in fata
                spans["CODE3"] = cand3
                spans["CODE"] = "N" + cand3
                ref = ln[n_pos + 4:].strip()
                if ref.startswith("//"):
                    ref = ref[2:]
                spans["REF"] = ref or None

        item = {
            "raw": ln,
            "value_date": spans.get("DATE") or None,
            "entry_date": spans.get("ENTRY") or None,
            "dc": spans.get("DC") or None,
            "amount": parse_amount_to_float(spans.get("AMT") or ""),
            "code": spans.get("CODE") or None,
            "code3": spans.get("CODE3") or None,
            "reference": spans.get("REF")
        }
        out.append(item)
    return out

# %%

def parse_amount_to_float(s: str) -> float:
    s = s.replace(".", "").replace(",", ".")
    try:
        return float(s)
    except:
        return math.nan


def enrich_61_with_nn(records: List[str]) -> List[Dict[str, Any]]:
    out = []
    for ln in records:
        tags = predict_tags(ln)
        spans = spans_from_bio(ln, tags)
        item = {
            "raw": ln,
            "value_date": spans.get("DATE") or None,
            "entry_date": spans.get("ENTRY") or None,
            "dc": spans.get("DC") or None,
            "amount": parse_amount_to_float(spans.get("AMT") or ""),
            "code": spans.get("CODE") or None,
            "reference": spans.get("REF") or None,
        }
        out.append(item)
    return out


def parse_mt950_full(raw: str) -> Dict[str, Any]:
    base = parse_mt950_rules(raw)
    tx = enrich_61_with_nn_v2(base["records_61"]) if base.get("records_61") else []
    base["transactions"] = tx
    return base

# Rulează pe toate mesajele demo
structured = [parse_mt950_full(m) for m in DEMO_MESSAGES]
print(json.dumps(structured, indent=2, ensure_ascii=False))

# %% [markdown]
# ## 6) Cum încarci propriile tale mesaje
#
# - Dacă ai un **fișier** cu mai multe mesaje concatenate, citește-l și separă-le după delimiterul `-}`.
# - Alternativ, pune fiecare mesaj într-un element al unei liste și rulează `parse_mt950_full`.

# %%
# Exemplu de încărcare din fișier (decomentează și adaptează calea):
# with open('/content/mt950.txt', 'r', encoding='utf-8') as f:
#     raw_all = f.read()
# raw_msgs = [blk.strip()+"\n-}" for blk in raw_all.split('-}') if blk.strip()]
# parsed = [parse_mt950_full(m) for m in raw_msgs]
# print(json.dumps(parsed, indent=2, ensure_ascii=False))

# %% [markdown]
# ## 7) Observații și extensii
# - **Antrenare pe date reale**: Calitatea parse‑ului la `:61:` depinde de datele de antrenare. Înlocuiește `synth61_samples` cu linii reale anonimizate.
# - **Câmpuri suplimentare**: Poți adăuga etichete BIO noi (ex. contrapartidă, text liber după `//`).
# - **Generalizare**: Pentru performanță mai bună, crește `epochs`, diversifică pattern‑urile și normalizează sumele/valutele.
# - **Validare**: Adaugă teste unitare pentru fiecare tag.
# - **Export**: Salvează JSON-ul rezultat în fișiere sau într-o bază de date.

2.19.0
MSG 1: acct=RO49AAAA1B31007593840000 open={'sign': 'C', 'date': '250923', 'ccy': 'EUR', 'amt': '1234,56'} close={'sign': 'C', 'date': '250924', 'ccy': 'EUR', 'amt': '1331,11'} 61_count=2
MSG 2: acct=GB12BARC20040123456789 open={'sign': 'C', 'date': '250901', 'ccy': 'USD', 'amt': '5000,'} close={'sign': 'C', 'date': '250903', 'ccy': 'USD', 'amt': '5175,50'} 61_count=2
MSG 3: acct=DE89370400440532013000 open={'sign': 'C', 'date': '250930', 'ccy': 'EUR', 'amt': '100,00'} close={'sign': 'C', 'date': '251001', 'ccy': 'EUR', 'amt': '1000,00'} 61_count=1


Epoch 1/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 65ms/step - accuracy: 0.3995 - loss: 2.4143 - val_accuracy: 0.4951 - val_loss: 1.4985
Epoch 2/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5763 - loss: 1.2973 - val_accuracy: 0.6793 - val_loss: 0.7945
Epoch 3/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.6879 - loss: 0.6963 - val_accuracy: 0.7173 - val_loss: 0.4574
Epoch 4/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.7517 - loss: 0.4076 - val_accuracy: 0.7957 - val_loss: 0.3181
Epoch 5/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.8205 - loss: 0.2788 - val_accuracy: 0.8457 - val_loss: 0.2095
Epoch 6/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8497 - loss: 0.1930 - val_accuracy: 0.8509 - val_loss: 0.1600
Epoch 7/8
[1m16/16[0m [32m━━━━━━━━━━━

In [27]:
import re
import json
import math
import random
import numpy as np
import pandas as pd
from typing import List, Dict, Any

import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.model_selection import train_test_split
import json


print(tf.__version__)

2.19.0


In [7]:
DEMO_MESSAGES = [
    (
        """
        {1:F01AAAAZZZZAXXX0000000000}{2:I950BBBBZZZZXXXXN}{4:
        :20:REF123456
        :25:RO49AAAA1B31007593840000
        :28C:00001/001
        :60F:C250923EUR1234,56
        :61:2509240924D23,45NTRFNONREF//REF1
        :61:2509240924C120,00NTRFNONREF//REF2
        :62F:C250924EUR1331,11
        :64:C250924EUR1331,11
        -}
        """.strip()
    ),
    (
        """
        {1:F01CCCCZZZZAXXX0000000000}{2:I950DDDDZZZZXXXXN}{4:
        :20:ABCD98765
        :25:GB12BARC20040123456789
        :28C:00002/001
        :60M:C250901USD5000,
        :61:2509020902C250,50NCHKNONREF//INV001
        :61:2509030903D75,00NCHKNONREF//INV002
        :62M:C250903USD5175,50
        :64:C250903USD5175,50
        -}
        """.strip()
    ),
    (
        """
        {1:F01EEEEZZZZAXXX0000000000}{2:I950FFFFZZZZXXXXN}{4:
        :20:XYZ000111
        :25:DE89370400440532013000
        :28C:00003/001
        :60F:C250930EUR100,00
        :61:2510011001C900,00NTRFNONREF//PAYR001
        :62F:C251001EUR1000,00
        :64:C251001EUR1000,00
        -}
        """.strip()
    ),
]

In [8]:
# ## 2) Parser bazat pe reguli pentru tag-urile standard
# Extragem câmpuri cheie într-un dict. Înregistrările `:61:` le păstrăm separat pentru rețeaua neuronală.

# %%
TAG_PATTERN = re.compile(r"^:(\d{2}[A-Z]?):(.*)$")

balance_pat = re.compile(r"^(?P<sign>[CD])(?P<date>\d{6})(?P<ccy>[A-Z]{3})(?P<amt>[0-9,]+)$")

# Structura minimală pentru o linie :61:
#  YYMMDD[entry]D/CamountN<code><ref>//<ref2>
line61_pat = re.compile(r"^(?P<valdate>\d{6})(?P<entry>\d{4})?(?P<dc>[DC])(?P<amt>[0-9,]+)N(?P<tx>\w+)(?P<rest>.*)$")

In [9]:
def parse_mt950_rules(raw: str) -> Dict[str, Any]:
    lines = [ln.strip() for ln in raw.splitlines()]
    in_block4 = False
    tags = []
    current = None
    for ln in lines:
        if ln.startswith(":"):
            m = TAG_PATTERN.match(ln)
            if m:
                if current:
                    tags.append(current)
                current = {"tag": m.group(1), "value": m.group(2)}
            else:
                if current:
                    current["value"] += "\n" + ln
        elif ln.startswith("{4:"):
            in_block4 = True
        elif ln.startswith("-}"):
            in_block4 = False
            if current:
                tags.append(current)
                current = None
        else:
            if in_block4 and current:
                current["value"] += "\n" + ln
    if current:
        tags.append(current)

    result = {
        "transaction_reference": None,
        "account": None,
        "statement_number": None,
        "opening_balance": None,
        "closing_balance": None,
        "available_balance": None,
        "records_61": [],  # brut, pentru NN
    }

    for t in tags:
        tag = t["tag"].upper()
        val = t["value"].strip()
        if tag == "20":
            result["transaction_reference"] = val
        elif tag == "25":
            result["account"] = val
        elif tag == "28C":
            result["statement_number"] = val
        elif tag in ("60F", "60M"):
            m = balance_pat.match(val)
            if m:
                result["opening_balance"] = m.groupdict()
        elif tag in ("62F", "62M"):
            m = balance_pat.match(val)
            if m:
                result["closing_balance"] = m.groupdict()
        elif tag == "64":
            m = balance_pat.match(val)
            if m:
                result["available_balance"] = m.groupdict()
        elif tag == "61":
            # colectăm pentru NN (una pe linie)
            for sub in val.splitlines():
                sub = sub.strip()
                if sub:
                    result["records_61"].append(sub)
    return result

In [10]:
for i, msg in enumerate(DEMO_MESSAGES, 1):
    parsed = parse_mt950_rules(msg)
    print(f"MSG {i}: acct={parsed['account']} open={parsed['opening_balance']} close={parsed['closing_balance']} 61_count={len(parsed['records_61'])}")

LABELS = ["O", "B-DATE", "I-DATE", "B-ENTRY", "I-ENTRY", "B-DC", "I-DC", "B-AMT", "I-AMT", "B-CODE", "I-CODE", "B-REF", "I-REF"]
label2id = {l:i for i,l in enumerate(LABELS)}
id2label = {i:l for l,i in label2id.items()}

MSG 1: acct=RO49AAAA1B31007593840000 open={'sign': 'C', 'date': '250923', 'ccy': 'EUR', 'amt': '1234,56'} close={'sign': 'C', 'date': '250924', 'ccy': 'EUR', 'amt': '1331,11'} 61_count=2
MSG 2: acct=GB12BARC20040123456789 open={'sign': 'C', 'date': '250901', 'ccy': 'USD', 'amt': '5000,'} close={'sign': 'C', 'date': '250903', 'ccy': 'USD', 'amt': '5175,50'} 61_count=2
MSG 3: acct=DE89370400440532013000 open={'sign': 'C', 'date': '250930', 'ccy': 'EUR', 'amt': '100,00'} close={'sign': 'C', 'date': '251001', 'ccy': 'EUR', 'amt': '1000,00'} 61_count=1


In [11]:

def synth61_samples(n=400):
    codes = ["NTRF","NCHQ","NCHK","NMSC","NCMZ"]
    refs  = ["NONREF//REF1","NONREF//INV001","PAYR001","FEES//F123","SAL//OCT"]
    samples = []
    for _ in range(n):
        yymmdd = f"25{random.randint(1,12):02d}{random.randint(1,28):02d}"
        entry  = f"{random.randint(1,12):02d}{random.randint(1,28):02d}"
        dc     = random.choice(["D","C"])
        amt    = f"{random.randint(1,2000)},{random.randint(0,99):02d}"
        code   = random.choice(codes)
        ref    = random.choice(refs)
        line   = f"{yymmdd}{entry}{dc}{amt}{code}{ref}"
        labels = ["O"]*len(line)
        def tag_span(start, end, b, i):
            labels[start] = b
            for k in range(start+1, end):
                labels[k] = i
        # mark spans determinist prin regex-uri
        m = re.match(r"(\d{6})(\d{4})([DC])([0-9,]+)(\w+)(.*)$", line)
        if not m:
            continue
        s0 = 0; e0 = 6
        s1 = 6; e1 = 10
        s2 = 10; e2 = 11
        # amount: după D/C până la terminarea cifrelor/virgulei
        amt_m = re.match(r"[0-9,]+", line[e2:])
        s3 = e2; e3 = e2 + (amt_m.end() if amt_m else 0)
        # code: [A-Za-z]+
        code_m = re.match(r"[A-Za-z]+", line[e3:])
        s4 = e3; e4 = e3 + (code_m.end() if code_m else 0)
        # rest = referință
        s5 = e4; e5 = len(line)

        tag_span(s0,e0,"B-DATE","I-DATE")
        tag_span(s1,e1,"B-ENTRY","I-ENTRY")
        tag_span(s2,e2,"B-DC","I-DC")
        tag_span(s3,e3,"B-AMT","I-AMT")
        tag_span(s4,e4,"B-CODE","I-CODE")
        if s5 < e5:
            tag_span(s5,e5,"B-REF","I-REF")
        samples.append((line, labels))
    return samples

In [12]:
samples = synth61_samples(600)
len(samples)

chars = sorted({ch for s,_ in samples for ch in s} | set(" :/ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789,.'+-_"))
char2id = {c:i+1 for i,c in enumerate(chars)}  # 0 = PAD
id2char = {i:c for c,i in char2id.items()}

MAX_LEN = max(len(s) for s,_ in samples)

X = np.zeros((len(samples), MAX_LEN), dtype=np.int32)
Y = np.zeros((len(samples), MAX_LEN), dtype=np.int32)
for i,(s,labels) in enumerate(samples):
    ids = [char2id.get(ch,0) for ch in s]
    X[i,:len(ids)] = ids
    Y[i,:len(labels)] = [label2id[l] for l in labels]

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.15, random_state=42)

In [13]:
EMB = 64
HID = 96
NUM_LABELS = len(LABELS)
VOCAB = len(char2id) + 1

inp = layers.Input(shape=(MAX_LEN,), dtype="int32")
emb = layers.Embedding(VOCAB, EMB, mask_zero=True)(inp)
bi  = layers.Bidirectional(layers.LSTM(HID, return_sequences=True))(emb)
out = layers.TimeDistributed(layers.Dense(NUM_LABELS, activation="softmax"))(bi)
model = Model(inp, out)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

# pregătim țintele ca (batch, seq, 1)
Y_train_ = np.expand_dims(Y_train, -1)
Y_val_   = np.expand_dims(Y_val, -1)

hist = model.fit(X_train, Y_train_, validation_data=(X_val, Y_val_), epochs=8, batch_size=32, verbose=1)

Epoch 1/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 91ms/step - accuracy: 0.4055 - loss: 2.4241 - val_accuracy: 0.4843 - val_loss: 1.6107
Epoch 2/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.5694 - loss: 1.3968 - val_accuracy: 0.6781 - val_loss: 0.8290
Epoch 3/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.6881 - loss: 0.7271 - val_accuracy: 0.7321 - val_loss: 0.4745
Epoch 4/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.7389 - loss: 0.4234 - val_accuracy: 0.8028 - val_loss: 0.3036
Epoch 5/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.8046 - loss: 0.2828 - val_accuracy: 0.8586 - val_loss: 0.2139
Epoch 6/8
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.8438 - loss: 0.2034 - val_accuracy: 0.8722 - val_loss: 0.1525
Epoch 7/8
[1m16/16[0m [32m━━━━━━━━━━━

In [14]:
def predict_tags(line: str):
    arr = np.zeros((1, MAX_LEN), dtype=np.int32)
    ids = [char2id.get(ch,0) for ch in line]
    arr[0,:len(ids)] = ids
    probs = model.predict(arr, verbose=0)[0]
    pred = probs.argmax(-1)[:len(ids)]
    return [id2label[i] for i in pred]

In [15]:
def spans_from_bio(text: str, tags: List[str]) -> Dict[str, str]:
    out = {"DATE":"", "ENTRY":"", "DC":"", "AMT":"", "CODE":"", "REF":""}
    cur = None; buf = []
    def flush():
        nonlocal cur, buf
        if cur and buf:
            out[cur] += "".join(buf)
        cur = None; buf = []
    for ch, tg in zip(text, tags):
        if tg.startswith("B-"):
            flush()
            cur = tg[2:]
            buf = [ch]
        elif tg.startswith("I-") and cur == tg[2:]:
            buf.append(ch)
        else:
            flush()
    flush()
    for k in out:
        out[k] = out[k].strip()
    return out

In [16]:
for msg in DEMO_MESSAGES:
    p = parse_mt950_rules(msg)
    for ln in p["records_61"]:
        tg = predict_tags(ln)
        spans = spans_from_bio(ln, tg)
        print(ln)
        print(spans)
        print("---")

2509240924D23,45NTRFNONREF//REF1
{'DATE': '250924', 'ENTRY': '', 'DC': 'D', 'AMT': '', 'CODE': 'NTRFNONREF', 'REF': '//REF1'}
---
2509240924C120,00NTRFNONREF//REF2
{'DATE': '250924', 'ENTRY': '0924', 'DC': 'C', 'AMT': '', 'CODE': 'NTRFNONREF', 'REF': '//REF2'}
---
2509020902C250,50NCHKNONREF//INV001
{'DATE': '250902', 'ENTRY': '0902', 'DC': 'C', 'AMT': '250,50', 'CODE': 'NCHKNONREF', 'REF': '//INV001'}
---
2509030903D75,00NCHKNONREF//INV002
{'DATE': '25090', 'ENTRY': '30903', 'DC': 'D', 'AMT': '', 'CODE': 'NCHKNONREF', 'REF': '//INV002'}
---
2510011001C900,00NTRFNONREF//PAYR001
{'DATE': '251001', 'ENTRY': '1001', 'DC': 'C', 'AMT': '900,00', 'CODE': 'NTRFNONREF/', 'REF': ''}
---


In [1]:
def enrich_61_with_nn_v2(records: List[str]) -> List[Dict[str, Any]]:
    """Versiune robustă: NN + reguli deterministe fără regex fragil.
    - YYMMDD (obligatoriu) + MMDD (opțional) sunt forțate din începutul liniei;
    - D/C + sumă sunt extrase prin parcurgere de caractere;
    - CODE = primele 4 litere după 'N'; restul = referință (fără prefixul '//').
    """
    def is_digits(s: str) -> bool:
        return all(ch.isdigit() for ch in s) and len(s) > 0

    def read_amount(s: str, start: int) -> (str, int):
        i = start
        buf = []
        while i < len(s) and (s[i].isdigit() or s[i] in ",."):
            buf.append(s[i]); i += 1
        return ("".join(buf), i)

    out: List[Dict[str, Any]] = []
    for ln in records:
        tags = predict_tags(ln)
        spans = spans_from_bio(ln, tags)

        # 1) DATE + ENTRY din capul liniei
        if len(ln) >= 6 and is_digits(ln[:6]):
            spans["DATE"] = ln[:6]
            if len(ln) >= 10 and is_digits(ln[6:10]):
                spans["ENTRY"] = ln[6:10]
            else:
                spans["ENTRY"] = spans.get("ENTRY", "")

        # 2) DC + AMT — găsim primul D/C după poziția 6
        pos_D = ln.find("D", 6)
        pos_C = ln.find("C", 6)
        candidates = [p for p in [pos_D, pos_C] if p != -1]
        dc_pos = min(candidates) if candidates else -1
        if dc_pos != -1:
            spans["DC"] = ln[dc_pos]
            amt, end_amt = read_amount(ln, dc_pos + 1)
            if amt:
                spans["AMT"] = amt
            after_amt = end_amt
        else:
            after_amt = 0

        # 3) CODE + REF — căutăm 'N' după sumă
        n_pos = ln.find("N", max(after_amt, 0))
        if n_pos != -1 and n_pos + 4 <= len(ln):
            # conform standardului MT :61:, codul are 3 litere dupa 'N' (3!c)
            cand3 = ln[n_pos + 1:n_pos + 4]
            if cand3.isalpha() and cand3.upper() == cand3:
                spans["CODE3"] = cand3
                spans["CODE"] = "N" + cand3
                ref_full = ln[n_pos + 4:].strip()
                if ref_full.startswith("//"):
                    ref_full = ref_full[2:]
                # split in reference and own_reference by first '//'
                if "//" in ref_full:
                    left, right = ref_full.split("//", 1)
                    spans["REF"] = left.strip() or None
                    spans["OWN_REF"] = right.strip() or None
                else:
                    spans["REF"] = ref_full or None
                    spans["OWN_REF"] = None

        item = {
            "raw": ln,
            "value_date": spans.get("DATE") or None,
            "entry_date": spans.get("ENTRY") or None,
            "dc": spans.get("DC") or None,
            "amount": parse_amount_to_float(spans.get("AMT") or ""),
            "code": spans.get("CODE") or None,
            "code3": spans.get("CODE3") or None,
            "reference": spans.get("REF"),
            "own_reference": spans.get("OWN_REF")
        }
        out.append(item)
    return out


NameError: name 'List' is not defined

In [24]:
def parse_amount_to_float(s: str) -> float:
    s = s.replace(".", "").replace(",", ".")
    try:
        return float(s)
    except:
        return math.nan

In [25]:
def enrich_61_with_nn(records: List[str]) -> List[Dict[str, Any]]:
    out = []
    for ln in records:
        tags = predict_tags(ln)
        spans = spans_from_bio(ln, tags)
        item = {
            "raw": ln,
            "value_date": spans.get("DATE") or None,
            "entry_date": spans.get("ENTRY") or None,
            "dc": spans.get("DC") or None,
            "amount": parse_amount_to_float(spans.get("AMT") or ""),
            "code": spans.get("CODE") or None,
            "reference": spans.get("REF") or None,
        }
        out.append(item)
    return out

In [33]:
def parse_mt950_full(raw: str) -> Dict[str, Any]:
  base = parse_mt950_rules(raw)
  tx = enrich_61_with_nn_v2(base["records_61"]) if base.get("records_61") else []
  base["transactions"] = tx
  return base

In [None]:
structured = [parse_mt950_full(m) for m in DEMO_MESSAGES]
print(json.dumps(structured, indent=2, ensure_ascii=False))

In [38]:
DEMO_MESSAGES_TEST = [
    (
        """
        {1:F01AAAAZZZZAXXX0000000000}{2:I950BBBBZZZZXXXXN}{4:
        :20:REF123456
        :25:RO49AAAA1B31007593840000
        :28C:00001/001
        :60F:C250923EUR1234,56
        :61:2509240924D23,45NTRFFT434E434//REFcsd1
        :61:2509240924C120,00NTRFFT434EFW//REsdvF2
        :62F:C250924EUR1331,11
        :64:C250924EUR1331,11
        -}
        """.strip()
    ),
    (
        """
        {1:F01AAAAZZZZAXXX0000000000}{2:I950BBBBZZZZXXXXN}{4:
        :20:REF123456
        :25:RO49AAAA1B31007593840000
        :28C:00001/001
        :60F:C250923EUR1234,56
        :61:2509240924C5000,NTRFNONREF//BIGTRANS1
        :61:2509240924D999,99NTRFINCOM//EXTRAREF
        :62F:C250924EUR1331,11
        :64:C250924EUR1331,11
        -}
        """.strip()
    ),
    (
        """
        {1:F01CCCCZZZZAXXX0000000000}{2:I950DDDDZZZZXXXXN}{4:
        :20:ABCD98765
        :25:GB12BARC20040123456789
        :28C:00002/001
        :60M:C250901USD5000,
        :61:2509240924C450,55NCHKNONREF//INV001
        :61:2509240924D70,00NCMSPOS//POSREFX1
        :62M:C250903USD5175,50
        :64:C250903USD5175,50
        -}
        """.strip()
    )
]

structuredTest = [parse_mt950_full(m) for m in DEMO_MESSAGES_TEST]
print(json.dumps(structuredTest, indent=2, ensure_ascii=False))


ValueError: could not broadcast input array from shape (38,) into shape (36,)