In [69]:
"""
Condensation‑polymer simulation & minimal‑segment analysis
(反転同一視バージョン)
"""

import random
from collections import Counter
from typing import List, Tuple

# ────────────── 0. PARAMETERS ──────────────
N_CHAINS  = 10000
CHAIN_LEN = 100

feed_ratios = {'A': 0.5, 'B': 0.25, 'C': 0.2, 'D': 0.05}

bias_end_b = {'A': 1.0, 'C': 2.0, 'D': 2.0}      # 末端 b
bias_end_a = {'A_rev': 1.0, 'B': 2.0}            # 末端 a

MONOMER_DEF = {
    'A'     : ('a', 'b'),
    'A_rev' : ('b', 'a'),
    'B'     : ('b', 'b'),
    'C'     : ('a', 'a'),
    'D'     : ('a', 'a'),
}
UPPER_SET = {'A', 'B', 'C', 'D'}

# ────────────── 1. CHAIN GROWTH ──────────────
def _pick(cands: List[str], weights: List[float]) -> str:
    r, acc = random.random() * sum(weights), 0.0
    for c, w in zip(cands, weights):
        acc += w
        if r <= acc:
            return c
    return cands[-1]

def grow_chain() -> Tuple[List[str], List[bool]]:
    first = random.choices(
        ['A','B','C','D'],
        weights=[feed_ratios[m] for m in ['A','B','C','D']]
    )[0]
    chain, ori = [first], [False]
    end_fg = MONOMER_DEF[first][1]

    while len(chain) < CHAIN_LEN:
        if end_fg == 'b':
            cand = ['A','C','D']
            base = [feed_ratios[m] * bias_end_b[m] for m in cand]
        else:                  # end_fg == 'a'
            cand = ['A_rev','B']
            base = [feed_ratios['A'] * bias_end_a['A_rev'],
                    feed_ratios['B'] * bias_end_a['B']]
        ch = _pick(cand, base)
        chain.append('A' if ch == 'A_rev' else ch)
        ori  .append(ch == 'A_rev')
        end_fg = MONOMER_DEF[ch][1]
    return chain, ori

# ────────────── 2. TOKENISATION ──────────────
def chain_to_tokens(chain: List[str], ori: List[bool]) -> List[str]:
    toks = []
    for m, rev in zip(chain, ori):
        tag = 'A_rev' if (m == 'A' and rev) else m
        l, r = MONOMER_DEF[tag]
        toks += [l, m, r]
    return toks

# ────────────── 2.5  CANONICALISATION ──────────────
def canonicalize(tokens: List[str]) -> str:
    """正順と逆順のうち辞書順で小さい文字列を採用"""
    fwd = '-'.join(tokens)
    rev = '-'.join(tokens[::-1])
    return min(fwd, rev)

# ────────────── 3. MINIMAL‑SEGMENT EXTRACTION ──────────────
def extract_segments(tokens: List[str]) -> List[str]:
    segs, i, N = [], 0, len(tokens)
    while i < N - 3:                           # 最低でも a A … A b で長さ≥4
        if tokens[i] not in {'a','b'}:         # 開始は小文字
            i += 1; continue
        if tokens[i+1] not in UPPER_SET:       # ★開始直後は大文字
            i += 1; continue

        start, want = i, ('b' if tokens[i]=='a' else 'a')
        upper_cnt, j = 0, i+1
        found = False
        while j < N:
            t = tokens[j]
            if t in UPPER_SET:
                upper_cnt += 1
            if (t == want and upper_cnt >= 2 and tokens[j-1] in UPPER_SET):
                seg = tokens[start:j+1]
                segs.append(canonicalize(seg))
                i = j                      # 次探索は区切り文字から
                found = True
                break
            j += 1
        if not found:
            break
    return segs

# ────────────── 4. SIMULATION ──────────────
random.seed(None)
all_segments = Counter()
mono_totals  = Counter()

for _ in range(N_CHAINS):
    chain, ori = grow_chain()
    mono_totals.update(chain)
    segs = extract_segments(chain_to_tokens(chain, ori))
    all_segments.update(segs)

# ────────────── 5. OUTPUT ──────────────
print("── Monomer composition check ──")
avg = {m: mono_totals[m] / (N_CHAINS * CHAIN_LEN) for m in ['A', 'B', 'C', 'D']}
for m in ['A', 'B', 'C', 'D']:
    print(f"{m}: expected {feed_ratios[m]:.2f}  |  observed {avg[m]:.4f}")

print("\n── Pattern probabilities (cumulative ≥ 99 %) ──")
total_seg = sum(all_segments.values())

cum = 0.0
for seg, n in all_segments.most_common():          # 件数制限を外す
    p = n / total_seg
    cum += p
    print(f"{seg:55s} : {n:8d}  ({p:6.3%})  cum={cum:6.3%}")
    if cum >= 0.99:                                # 99 % を超えたら終了
        break


── Monomer composition check ──
A: expected 0.50  |  observed 0.5000
B: expected 0.25  |  observed 0.2488
C: expected 0.20  |  observed 0.2009
D: expected 0.05  |  observed 0.0503

── Pattern probabilities (cumulative ≥ 99 %) ──
a-A-b-a-A-b                                             :    83192  (25.157%)  cum=25.157%
a-C-a-b-B-b                                             :    66549  (20.124%)  cum=45.281%
a-C-a-b-A-a-b-B-b                                       :    33182  (10.034%)  cum=55.315%
a-A-b-a-C-a-b-B-b                                       :    25015  (7.564%)  cum=62.879%
a-D-a-b-B-b                                             :    16695  (5.048%)  cum=67.928%
a-C-a-b-A-a-b-A-a-b-B-b                                 :    16491  (4.987%)  cum=72.915%
a-A-b-a-C-a-b-A-a-b-B-b                                 :    12426  (3.758%)  cum=76.672%
a-D-a-b-A-a-b-B-b                                       :     8303  (2.511%)  cum=79.183%
a-C-a-b-B-b-a-A-b                               

In [70]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
polysegment2smiles_working.py
-----------------------------
(1) 事前に集計済みの segment と出現確率を受け取り
(2) segment → モノマー列（向きを保持）
(3) “カルボン酸 + フェノール OH → エステル” のみを順次縮合
(4) 95 % 累積までのパターンを SMILES 付きで表示
"""

from rdkit import Chem
from rdkit.Chem import AllChem, rdChemReactions
from collections import Counter
import pandas as pd

# ──────────────────────────────────────────────
# 0. モノマー定義（向きも別キーで保持）
# ──────────────────────────────────────────────
MONOMERS = {
    # A  (非対称)
    "a-A-b": {"smiles": "O=C(O)c1ccc(O)cc1", "ends": ["COOH", "PhOH"]},
    "b-A-a": {"smiles": "O=C(O)c1ccc(O)cc1", "ends": ["PhOH", "COOH"]},
    # B  (フェノール‐フェノール)
    "b-B-b": {"smiles": "Oc1ccc(O)cc1",       "ends": ["PhOH", "PhOH"]},
    # C / D  (酸‐酸)
    "a-C-a": {"smiles": "O=C(O)c1ccc(C(=O)O)cc1",  "ends": ["COOH", "COOH"]},
    "a-D-a": {"smiles": "C1=C2C(C=C(C(O)=O)C=C2)=CC=C1C(O)=O", "ends": ["COOH", "COOH"]},
}

# ──────────────────────────────────────────────
# 1. “酸 + フェノール OH → エステル” だけを許可
#    H1 属性 + マップ番号は末尾  の順に書く
# ──────────────────────────────────────────────
rxn_phen_ester = rdChemReactions.ReactionFromSmarts(
    "[C:1](=O)[O&H1:2].[c:3][O&H1:4]>>[C:1](=O)[O:4][c:3]"
)

# ──────────────────────────────────────────────
# 2. 集計済み segment (累積95 % 分)
# ──────────────────────────────────────────────
SEG_FREQ = [
    ("a-A-b-a-A-b",                         0.2530),
    ("a-C-a-b-B-b",                         0.2018),
    ("a-C-a-b-A-a-b-B-b",                   0.1001),
    ("a-A-b-a-C-a-b-B-b",                   0.0758),
    ("a-D-a-b-B-b",                         0.0505),
    ("a-C-a-b-A-a-b-A-a-b-B-b",             0.0493),
    ("a-A-b-a-C-a-b-A-a-b-B-b",             0.0373),
    ("a-D-a-b-A-a-b-B-b",                   0.0250),
    ("a-C-a-b-A-a-b-A-a-b-A-a-b-B-b",       0.0244),
    ("a-C-a-b-B-b-a-A-b",                   0.0242),
    ("a-A-b-a-D-a-b-B-b",                   0.0190),
]

# ──────────────────────────────────────────────
# 3. segment → モノマー列（向きを保持）
# ──────────────────────────────────────────────
def seg_to_monomers(seg: str):
    t = seg.split('-')
    return ['-'.join(t[i:i+3]) for i in range(0, len(t)-2, 3)]

# ──────────────────────────────────────────────
# 4. 端基カウンタ用ユーティリティ
# ──────────────────────────────────────────────
def remove_one(counter: Counter, fg: str):
    if counter[fg] > 0:
        counter[fg] -= 1
        if counter[fg] == 0:
            del counter[fg]

def biggest_fragment(mol):
    frags = Chem.GetMolFrags(mol, asMols=True)
    return max(frags, key=lambda m: m.GetNumAtoms())

def build_polymer(mseq):
    # 初期
    base = Chem.MolFromSmiles(MONOMERS[mseq[0]]["smiles"])
    if base is None:
        return f"Failed (bad SMILES {mseq[0]})"
    ends = Counter(MONOMERS[mseq[0]]["ends"])

    for mk in mseq[1:]:
        nxt = Chem.MolFromSmiles(MONOMERS[mk]["smiles"])
        if nxt is None:
            return f"Failed (bad SMILES {mk})"
        n_end = Counter(MONOMERS[mk]["ends"])
        reacted = False

        # 酸 → フェノール の方向とその逆
        combos = [(base, nxt, ends.copy(), n_end.copy()),
                  (nxt, base, n_end.copy(), ends.copy())]

        for acid_mol, phen_mol, acid_end, ph_end in combos:
            if "COOH" not in acid_end or "PhOH" not in ph_end:
                continue
            prod_sets = rxn_phen_ester.RunReactants((acid_mol, phen_mol))
            if prod_sets:
                prod = biggest_fragment(prod_sets[0][0])
                base = prod
                remove_one(acid_end, "COOH")
                remove_one(ph_end,  "PhOH")
                ends = acid_end + ph_end
                reacted = True
                break
        if not reacted:
            return "Failed"
    return Chem.MolToSmiles(base)

def to_psmiles(smi: str):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return "Invalid"

    # 反応：カルボン酸 OH の H → [*]
    rxn_acid = rdChemReactions.ReactionFromSmarts(
        "[C:1](=O)[O&H1:2]>>[C:1](=O)[O:2][*:3]"
    )
    # 反応：フェノール OH の H → [*]
    rxn_ph = rdChemReactions.ReactionFromSmarts(
        "[c:1][O&H1:2]>>[c:1][*:3]"
    )

    for _ in range(2):  # 2 step: 酸→フェノール
        prods = rxn_acid.RunReactants((mol,))
        if prods:
            mol = biggest_fragment(prods[0][0])
    for _ in range(2):
        prods = rxn_ph.RunReactants((mol,))
        if prods:
            mol = biggest_fragment(prods[0][0])

    return Chem.MolToSmiles(mol)



# ──────────────────────────────────────────────
# 5. 実行：累積95 % まで
# ──────────────────────────────────────────────
rows, cum = [], 0.0
for seg, prob in SEG_FREQ:
    cum += prob
    monomers = seg_to_monomers(seg)
    smi = build_polymer(monomers)
    psmi = to_psmiles(smi) if not smi.startswith("Failed") else "Failed"
    rows.append([seg, f"{prob:.2%}", f"{cum:.2%}", monomers, smi, psmi])
    if cum >= 0.95:
        break

df = pd.DataFrame(rows,
                  columns=["Pattern","Prob.","Cum.","Monomer Seq","SMILES","p-SMILES"])
print(df.to_string(index=False))


                      Pattern  Prob.   Cum.                         Monomer Seq                                                                            SMILES                                                                           p-SMILES
                  a-A-b-a-A-b 25.30% 25.30%                      [a-A-b, a-A-b]                                                 O=C(O)c1ccc(OC(=O)c2ccc(O)cc2)cc1                                                 *OC(=O)c1ccc(OC(=O)c2ccc(*)cc2)cc1
                  a-C-a-b-B-b 20.18% 45.48%                      [a-C-a, b-B-b]                                                 O=C(O)c1ccc(C(=O)Oc2ccc(O)cc2)cc1                                                 *OC(=O)c1ccc(C(=O)Oc2ccc(*)cc2)cc1
            a-C-a-b-A-a-b-B-b 10.01% 55.49%               [a-C-a, b-A-a, b-B-b]                                 O=C(O)c1ccc(OC(=O)c2ccc(C(=O)Oc3ccc(O)cc3)cc2)cc1                                 *OC(=O)c1ccc(OC(=O)c2ccc(C(=O)Oc3ccc(*)cc3)cc2)cc1
            a-A-b-a-

[00:15:58] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 2 
[00:15:58] product atom-mapping number 3 not found in reactants.
[00:15:58] product atom-mapping number 3 not found in reactants.
[00:15:58] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 2 
[00:15:58] product atom-mapping number 3 not found in reactants.
[00:15:58] product atom-mapping number 3 not found in reactants.
[00:15:58] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 2 
[00:15:58] product atom-mapping number 3 not found in reactants.
[00:15:58] product atom-mapping number 3 not found in reactants.
[00:15:58] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 2 
[00:15:58] product atom-mapping number 3 not found in reactants.
[00:15:58] product atom-mapping number 3 not found in reactants.
[00:15:58] mapped atoms in the reactants were not mapped in the products.
  

In [72]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
auto_polysegment_with_pSMILES.py
--------------------------------
与えるものは
・MONOMERS（端基リスト付き SMILES）
・feed_ratios（A,B,C,D の元混合比）
だけ。残りの派生構造・バイアスは自動生成。
"""

import random, pandas as pd
from collections import Counter
from typing import List, Tuple
from rdkit import Chem
from rdkit.Chem import rdChemReactions

# ────────────────────────────────────
# 0. ユーザ入力
# ────────────────────────────────────
MONOMERS = {
    "a-A-b": {"smiles": "O=C(O)c1ccc(O)cc1", "ends": ["COOH", "PhOH"]},
    "b-A-a": {"smiles": "O=C(O)c1ccc(O)cc1", "ends": ["PhOH", "COOH"]},
    "b-B-b": {"smiles": "Oc1ccc(O)cc1",       "ends": ["PhOH", "PhOH"]},
    "a-C-a": {"smiles": "O=C(O)c1ccc(C(=O)O)cc1", "ends": ["COOH", "COOH"]},
    "a-D-a": {"smiles": "C1=C2C(C=C(C(O)=O)C=C2)=CC=C1C(O)=O","ends": ["COOH","COOH"]},
}
feed_ratios = {'A': 0.5, 'B': 0.25, 'C': 0.2, 'D': 0.05}

# ────────────────────────────────────
# 1. 自動派生：MONOMER_DEF / バイアス
# ────────────────────────────────────
def end_token(fg):          # COOH→a, PhOH→b
    return 'a' if fg == "COOH" else 'b'

MONOMER_DEF = {}
bias_end_b, bias_end_a = {}, {}
for key, data in MONOMERS.items():
    l, r = end_token(data["ends"][0]), end_token(data["ends"][1])
    MONOMER_DEF[key] = (l, r)
    base = key.split('-')[1]  # A, B, C …
    # 末端 b → 次は左端 a
    if l == 'a':
        bias_end_b[key] = 2 if l == r else 1
    # 末端 a → 次は左端 b
    if l == 'b':
        bias_end_a[key] = 2 if l == r else 1

# feed_ratio も reversed 版へコピー
for k in MONOMER_DEF:
    mid = k.split('-')[1].replace('_rev','')
    if mid not in feed_ratios:  # b-A-a のような reverse の feed は元と同じ
        feed_ratios[mid] = feed_ratios['A']

# ────────────────────────────────────
# 2. シミュレーション用関数
# ────────────────────────────────────
def _pick(cands: List[str], weights: List[float]) -> str:
    r, acc = random.random() * sum(weights), 0.0
    for c, w in zip(cands, weights):
        acc += w
        if r <= acc:
            return c
    return cands[-1]

def grow_chain(N=100):
    first = random.choices(
        list(feed_ratios.keys()),
        weights=list(feed_ratios.values())
    )[0]
    first_key = "a-A-b" if first == 'A' else f"a-{first}-a" if first in ['C','D'] else "b-B-b"
    chain, ori = [first_key], [False]
    end_fg = MONOMER_DEF[first_key][1]

    while len(chain) < N:
        if end_fg == 'b':
            cand = [k for k,v in MONOMER_DEF.items() if v[0]=='a']
            base = [feed_ratios[c.split('-')[1].replace('_rev','')] * bias_end_b[c] for c in cand]
        else:
            cand = [k for k,v in MONOMER_DEF.items() if v[0]=='b']
            base = [feed_ratios[c.split('-')[1].replace('_rev','')] * bias_end_a[c] for c in cand]

        nxt = _pick(cand, base)
        chain.append(nxt); ori.append(False)
        end_fg = MONOMER_DEF[nxt][1]
    return chain

def chain_to_tokens(chain):
    toks = []
    for key in chain:
        l,m,r = key.split('-')
        toks.extend([l, m, r])
    return toks

def canonicalize(tok):
    return min('-'.join(tok), '-'.join(tok[::-1]))

def extract_segments(tokens):
    U = {'A','B','C','D'}
    segs = []; i=0; N=len(tokens)
    while i<N-3:
        if tokens[i] not in {'a','b'} or tokens[i+1] not in U:
            i+=1; continue
        want = 'b' if tokens[i]=='a' else 'a'
        up, j = 0, i+1
        while j<N:
            up += tokens[j] in U
            if tokens[j]==want and up>=2 and tokens[j-1] in U:
                segs.append(canonicalize(tokens[i:j+1]))
                i=j; break
            j+=1
        else: break
    return segs

# ────────────────────────────────────
# 3. RDKit 反応 & p‑SMILES 生成
# ────────────────────────────────────
rxn_phen_ester = rdChemReactions.ReactionFromSmarts(
    "[C:1](=O)[O&H1:2].[c:3][O&H1:4]>>[C:1](=O)[O:4][c:3]"
)
rxn_acid = rdChemReactions.ReactionFromSmarts("[C:1](=O)[O&H1:2]>>[C:1](=O)[O:2][*:3]")
rxn_ph   = rdChemReactions.ReactionFromSmarts("[c:1][O&H1:2]>>[c:1][*:3]")

def biggest(m): return max(Chem.GetMolFrags(m,asMols=True), key=lambda x:x.GetNumAtoms())

def build_polymer(seq):
    mol = Chem.MolFromSmiles(MONOMERS[seq[0]]["smiles"])
    ends = Counter(MONOMERS[seq[0]]["ends"])
    for key in seq[1:]:
        nxt = Chem.MolFromSmiles(MONOMERS[key]["smiles"])
        n_end=Counter(MONOMERS[key]["ends"])
        for acid,phen,a_end,p_end in [(mol,nxt,ends,n_end),(nxt,mol,n_end,ends)]:
            if "COOH" in a_end and "PhOH" in p_end:
                prod=rxn_phen_ester.RunReactants((acid,phen))
                if prod:
                    mol=biggest(prod[0][0]); a_end["COOH"]-=1; p_end["PhOH"]-=1
                    ends=a_end+p_end; break
    return Chem.MolToSmiles(mol)

def to_psmiles(s):
    m=Chem.MolFromSmiles(s)
    for rxn in (rxn_acid,rxn_ph):
        while True:
            p=rxn.RunReactants((m,))
            if not p: break
            m=biggest(p[0][0])
    return Chem.MolToSmiles(m)

# ────────────────────────────────────
# 4. メイン
# ────────────────────────────────────
N_CHAINS, CHAIN_LEN = 10000, 100
all_seg=Counter()
for _ in range(N_CHAINS):
    ch=grow_chain(CHAIN_LEN)
    segs=extract_segments(chain_to_tokens(ch))
    all_seg.update(segs)

total=sum(all_seg.values()); cum=0; rows=[]
for seg,n in all_seg.most_common():
    p=n/total; cum+=p
    seq=seg.split('-'); mon=['-'.join(seq[i:i+3]) for i in range(0,len(seq)-2,3)]
    smi=build_polymer(mon); psmi=to_psmiles(smi)
    rows.append([seg,f"{p:.3%}",f"{cum:.3%}",mon,smi,psmi])
    if cum>=0.95: break

print(pd.DataFrame(rows,columns=["Pattern","Prob","Cum","Monomers","SMILES","p‑SMILES"]).to_string(index=False))


                                  Pattern    Prob     Cum                                          Monomers                                                                                                            SMILES                                                                                                           p‑SMILES
                              a-A-b-a-A-b 25.322% 25.322%                                    [a-A-b, a-A-b]                                                                                 O=C(O)c1ccc(OC(=O)c2ccc(O)cc2)cc1                                                                                 *OC(=O)c1ccc(OC(=O)c2ccc(*)cc2)cc1
                              a-C-a-b-B-b 20.149% 45.471%                                    [a-C-a, b-B-b]                                                                                 O=C(O)c1ccc(C(=O)Oc2ccc(O)cc2)cc1                                                                                 *OC(=O)c1ccc(C(=O)Oc2ccc

[00:27:12] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 2 
[00:27:12] product atom-mapping number 3 not found in reactants.
[00:27:12] product atom-mapping number 3 not found in reactants.
[00:27:12] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 2 


In [74]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
auto_polysegment_with_pSMILES.py
入力:
    base_smiles = {'A':smilesA, 'B':smilesB, ...}
    feed_ratios = {'A':0.5,'B':0.25,'C':0.2,'D':0.05}
ほかは自動生成で
    (1) ランダム鎖 → 最小部分構造
    (2) 累積95 % までを SMILES / p‑SMILES 付き表示
"""

import random, pandas as pd, sys
from collections import Counter
from rdkit import Chem
from rdkit.Chem import rdChemReactions

# ───────────────────────────────
# 0. 入力（ここだけ書き換えればよい）
# ───────────────────────────────
base_smiles = {
    'A': "O=C(O)c1ccc(O)cc1",                          # COOH‑PhOH
    'B': "Oc1ccc(O)cc1",                               # PhOH‑PhOH
    'C': "O=C(O)c1ccc(C(=O)O)cc1",                     # COOH‑COOH
    'D': "C1=C2C(C=C(C(O)=O)C=C2)=CC=C1C(O)=O"         # COOH‑COOH (異性体)
}
feed_ratios = {'A':0.5,'B':0.25,'C':0.2,'D':0.05}

N_CHAINS, CHAIN_LEN = 10000, 100       # シミュレーション規模
TARGET_CUM = 0.95                      # 表示する累積確率閾値

# ───────────────────────────────
# 1. 関数：末端官能基判定 & モノマー辞書生成
# ───────────────────────────────
def ends_from_smiles(smi):
    mol = Chem.MolFromSmiles(smi)
    patt_acid = Chem.MolFromSmarts("C(=O)O")
    patt_ph   = Chem.MolFromSmarts("cO")
    hits=[]
    for patt,label in [(patt_acid,'a'),(patt_ph,'b')]:
        for match in mol.GetSubstructMatches(patt):
            hits.append((match[1], label))   # OH の O を index とみなす
    if len(hits)<2:
        raise ValueError("末端官能基を 2個 検出できません")
    hits.sort(key=lambda x:x[0])             # index 昇順＝“左→右”
    return hits[0][1], hits[-1][1]           # token 左, token 右

MONOMERS, MONOMER_DEF = {}, {}
bias_end_a, bias_end_b = {}, {}

for letter,smi in base_smiles.items():
    l_token, r_token = ends_from_smiles(smi)
    key_fwd = f"{l_token}-{letter}-{r_token}"
    key_rev = f"{r_token}-{letter}-{l_token}"

    # forward
    MONOMERS[key_fwd]   = {"smiles": smi}
    MONOMER_DEF[key_fwd]= (l_token, r_token)
    # reverse （同 SMILES だが向き区別のため key だけ反転）
    MONOMERS[key_rev]   = {"smiles": smi}
    MONOMER_DEF[key_rev]= (r_token, l_token)

    # バイアス (同じ端なら 2, 異端なら 1)
    val_fwd = 2 if l_token==r_token else 1
    if l_token=='b': bias_end_b[key_fwd]=val_fwd
    else:            bias_end_a[key_fwd]=val_fwd
    if r_token=='b': bias_end_b[key_rev]=val_fwd
    else:            bias_end_a[key_rev]=val_fwd

# token→官能基マップ
token2fg = {'a':'COOH', 'b':'PhOH'}
for k,(l,r) in MONOMER_DEF.items():
    MONOMERS[k]["ends"]=[token2fg[l], token2fg[r]]

# ───────────────────────────────
# 2. チェーン成長
# ───────────────────────────────
def _pick(cands, weights):
    r= random.random()*sum(weights); acc=0
    for c,w in zip(cands,weights):
        acc+=w
        if r<=acc: return c
    return cands[-1]

def grow_chain():
    firstL = random.choices(list(feed_ratios), list(feed_ratios.values()))[0]
    # pick forward orientation by default
    for k in MONOMERS:
        if k.split('-')[1]==firstL:
            first=k; break
    chain=[first]; end_token = MONOMER_DEF[first][1]

    while len(chain)<CHAIN_LEN:
        if end_token=='b':   # 次は左端 a
            cand=[k for k,v in MONOMER_DEF.items() if v[0]=='a']
            base=[feed_ratios[c.split('-')[1]]*bias_end_b.get(c,1) for c in cand]
        else:                # 次は左端 b
            cand=[k for k,v in MONOMER_DEF.items() if v[0]=='b']
            base=[feed_ratios[c.split('-')[1]]*bias_end_a.get(c,1) for c in cand]
        nxt=_pick(cand,base)
        chain.append(nxt); end_token=MONOMER_DEF[nxt][1]
    return chain

# ───────────────────────────────
# 3. 最小部分構造抽出
# ───────────────────────────────
U={'A','B','C','D'}
def chain_to_tokens(chain):
    tok=[]
    for k in chain: tok+=k.split('-')
    return tok

def canon(tok):
    return min('-'.join(tok), '-'.join(tok[::-1]))

def extract(tokens):
    segs=[]; i=0; N=len(tokens)
    while i<N-3:
        if tokens[i] not in {'a','b'} or tokens[i+1] not in U: i+=1; continue
        want='b' if tokens[i]=='a' else 'a'; up=0; j=i+1
        while j<N:
            up+=tokens[j] in U
            if tokens[j]==want and up>=2 and tokens[j-1] in U:
                segs.append(canon(tokens[i:j+1])); i=j; break
            j+=1
        else: break
    return segs

# ───────────────────────────────
# 4. RDKit 反応設定
# ───────────────────────────────
rxn_ester = rdChemReactions.ReactionFromSmarts(
    "[C:1](=O)[O&H1:2].[c:3][O&H1:4]>>[C:1](=O)[O:4][c:3]"
)
rxn_acid = rdChemReactions.ReactionFromSmarts("[C:1](=O)[O&H1:2]>>[C:1](=O)[O:2][*:3]")
rxn_ph   = rdChemReactions.ReactionFromSmarts("[c:1][O&H1:2]>>[c:1][*:3]")

def biggest(m): return max(Chem.GetMolFrags(m,asMols=True), key=lambda x:x.GetNumAtoms())

def build_poly(seq):
    m=Chem.MolFromSmiles(MONOMERS[seq[0]]["smiles"])
    ends=Counter(MONOMERS[seq[0]]["ends"])
    for key in seq[1:]:
        n=Chem.MolFromSmiles(MONOMERS[key]["smiles"])
        n_end=Counter(MONOMERS[key]["ends"]); done=False
        for acid,phen,a_end,p_end in [(m,n,ends,n_end),(n,m,n_end,ends)]:
            if "COOH" in a_end and "PhOH" in p_end:
                prod=rxn_ester.RunReactants((acid,phen))
                if prod:
                    m=biggest(prod[0][0])
                    a_end["COOH"]-=1; p_end["PhOH"]-=1; ends=a_end+p_end; done=True; break
        if not done: return "Failed"
    return Chem.MolToSmiles(m)

def to_psmi(s):
    m=Chem.MolFromSmiles(s)
    for rx in (rxn_acid,rxn_ph):
        while True:
            p=rx.RunReactants((m,)); 
            if not p: break
            m=biggest(p[0][0])
    return Chem.MolToSmiles(m)

# ───────────────────────────────
# 5. シミュレーション & 集計
# ───────────────────────────────
random.seed(0)
seg_cnt, mono_tot=Counter(), Counter()
for _ in range(N_CHAINS):
    ch=grow_chain()
    mono_tot.update([k.split('-')[1] for k in ch])
    seg_cnt.update(extract(chain_to_tokens(ch)))

# 組成チェック
print("── Monomer composition check ──")
for m in feed_ratios:
    obs=mono_tot[m]/(N_CHAINS*CHAIN_LEN)
    print(f"{m}: expected {feed_ratios[m]:.2f} | observed {obs:.4f}")

# 最小部分構造テーブル
total=sum(seg_cnt.values()); cum=0; rows=[]
for seg,n in seg_cnt.most_common():
    p=n/total; cum+=p
    seq=['-'.join(seg.split('-')[i:i+3]) for i in range(0,len(seg.split('-'))-2,3)]
    smi=build_poly(seq); ps=to_psmi(smi) if smi!="Failed" else "N/A"
    rows.append([seg,f"{p:.3%}",f"{cum:.3%}",seq,smi,ps])
    if cum>=TARGET_CUM: break

print("\n── Pattern probabilities (cum ≥ 95 %) ──")
print(pd.DataFrame(rows,columns=["Pattern","Prob.","Cum.","Monomer Seq","SMILES","p‑SMILES"]).to_string(index=False))


── Monomer composition check ──
A: expected 0.50 | observed 0.6654
B: expected 0.25 | observed 0.1660
C: expected 0.20 | observed 0.1349
D: expected 0.05 | observed 0.0337

── Pattern probabilities (cum ≥ 95 %) ──
                                                    Pattern   Prob.    Cum.                                                            Monomer Seq                                                                                                                                                                SMILES                                                                                                                                                               p‑SMILES
                                                a-A-b-a-A-b 44.698% 44.698%                                                         [a-A-b, a-A-b]                                                                                                                                     O=C(O)c1ccc(OC(=O)c2ccc(O)c

[00:27:35] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 2 
[00:27:35] product atom-mapping number 3 not found in reactants.
[00:27:35] product atom-mapping number 3 not found in reactants.
[00:27:35] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 2 


In [5]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
auto_polysegment_with_pSMILES_fixed.py
--------------------------------------
入力:
    base_smiles … A,B,C,D それぞれ 1 本
    feed_ratios … {'A':…, 'B':…, 'C':…, 'D':…}
出力:
    ・モノマー組成チェック (expected ≈ observed)
    ・累積 95 % までの Pattern / SMILES / p‑SMILES
"""

import random, pandas as pd
from collections import Counter
from rdkit import Chem
from rdkit.Chem import rdChemReactions

# ───── 0. ユーザ入力 ─────
base_smiles = {
    'A': "O=C(O)c1ccc(O)cc1",
    'B': "Oc1ccc(O)cc1",
    'C': "O=C(O)c1ccc(C(=O)O)cc1",
    'D': "C1=C2C(C=C(C(O)=O)C=C2)=CC=C1C(O)=O"
}
feed_ratios = {'A':0.5,'B':0.25,'C':0.2,'D':0.05}

N_CHAINS, CHAIN_LEN = 10000, 100
TARGET_CUM = 0.95
random.seed(0)

# ───── 1. 末端 token 判定 & モノマー辞書 ─────
def ends_from_smiles(smi):
    mol=Chem.MolFromSmiles(smi)
    acid = Chem.MolFromSmarts("C(=O)O")
    phen = Chem.MolFromSmarts("cO")
    hits=[]
    for patt,lab in [(acid,'a'),(phen,'b')]:
        for m in mol.GetSubstructMatches(patt):
            hits.append((m[1],lab))           # –OH の O index
    hits.sort(key=lambda x:x[0])
    if len(hits)<2: raise ValueError("端基2つ検出できず")
    return hits[0][1], hits[-1][1]

MONOMERS, MON_DEF={},{}
sym_factor={}       # 対称性 1 or 2
for L,smi in base_smiles.items():
    l,r=ends_from_smiles(smi)
    fwd=f"{l}-{L}-{r}"; rev=f"{r}-{L}-{l}"
    MONOMERS[fwd]=MONOMERS[rev]={"smiles":smi}
    MON_DEF[fwd]=(l,r); MON_DEF[rev]=(r,l)
    sym_factor[L]=2 if l==r else 1            # a-a or b-b は重み ×2

token2fg={'a':'COOH','b':'PhOH'}
for k,(l,r) in MON_DEF.items():
    MONOMERS[k]["ends"]=[token2fg[l],token2fg[r]]

# ───── 2. チェーン成長 ─────
def pick_letter(weights):
    r=random.random()*sum(weights.values()); acc=0
    for L,w in weights.items():
        acc+=w
        if r<=acc: return L
    return L

def grow_chain():
    chain=[]
    # 初期モノマー (向きは固定: a‑L‑a or b‑B‑b)
    firstL = pick_letter(feed_ratios)
    if firstL=='A': key="a-A-b"
    elif firstL in ['C','D']: key=f"a-{firstL}-a"
    else: key="b-B-b"
    chain.append(key); end_token=MON_DEF[key][1]

    while len(chain)<CHAIN_LEN:
        if end_token=='b':   # 次は左端 a（A,C,D）
            cand={'A':feed_ratios['A']*sym_factor['A'],
                  'C':feed_ratios['C']*sym_factor['C'],
                  'D':feed_ratios['D']*sym_factor['D']}
            L=pick_letter(cand)
            key="a-A-b" if L=='A' else f"a-{L}-a"
        else:                # 次は左端 b（A_rev,B）
            cand={'A':feed_ratios['A']*sym_factor['A'],
                  'B':feed_ratios['B']*sym_factor['B']}
            L=pick_letter(cand)
            key="b-A-a" if L=='A' else "b-B-b"
        chain.append(key); end_token=MON_DEF[key][1]
    return chain

# ― 3. segment 抽出（以前と同じ） ―
U={'A','B','C','D'}
def tok_chain(chain):
    tok=[]
    for k in chain: tok+=k.split('-')
    return tok
def canon(t): return min('-'.join(t), '-'.join(t[::-1]))
def extract(tok):
    segs=[]; i=0; N=len(tok)
    while i<N-3:
        if tok[i] not in {'a','b'} or tok[i+1] not in U: i+=1; continue
        want='b' if tok[i]=='a' else 'a'; up=0; j=i+1
        while j<N:
            up+=tok[j] in U
            if tok[j]==want and up>=2 and tok[j-1] in U:
                segs.append(canon(tok[i:j+1])); i=j; break
            j+=1
        else: break
    return segs

# ― 4. RDKit 反応 & p‑SMILES ―
rxn_ester=rdChemReactions.ReactionFromSmarts(
    "[C:1](=O)[O&H1:2].[c:3][O&H1:4]>>[C:1](=O)[O:4][c:3]"
)
rx_acid =rdChemReactions.ReactionFromSmarts("[C:1](=O)[O&H1:2]>>[C:1](=O)[O:2][*:3]")
rx_ph   =rdChemReactions.ReactionFromSmarts("[c:1][O&H1:2]>>[c:1][*:3]")
def big(m): return max(Chem.GetMolFrags(m,asMols=True), key=lambda x:x.GetNumAtoms())
def build(seq):
    m=Chem.MolFromSmiles(MONOMERS[seq[0]]["smiles"]); ends=Counter(MONOMERS[seq[0]]["ends"])
    for k in seq[1:]:
        n=Chem.MolFromSmiles(MONOMERS[k]["smiles"]); ne=Counter(MONOMERS[k]["ends"])
        for acid,phen,a_end,p_end in [(m,n,ends,ne),(n,m,ne,ends)]:
            if "COOH" in a_end and "PhOH" in p_end:
                pro=rxn_ester.RunReactants((acid,phen))
                if pro:
                    m=big(pro[0][0]); a_end["COOH"]-=1; p_end["PhOH"]-=1; ends=a_end+p_end; break
    return Chem.MolToSmiles(m)
def psmiles(s):
    m=Chem.MolFromSmiles(s)
    for rx in (rx_acid,rx_ph):
        while True:
            p=rx.RunReactants((m,)); 
            if not p: break
            m=big(p[0][0])
    return Chem.MolToSmiles(m)

# ― 5. シミュレーション ―
seg_cnt, mono_cnt=Counter(), Counter()
for _ in range(N_CHAINS):
    ch=grow_chain(); mono_cnt.update([k.split('-')[1] for k in ch])
    seg_cnt.update(extract(tok_chain(ch)))

print("── Monomer composition check ──")
for L in ['A','B','C','D']:
    obs=mono_cnt[L]/(N_CHAINS*CHAIN_LEN)
    print(f"{L}: expected {feed_ratios[L]:.2f} | observed {obs:.4f}")

tot=sum(seg_cnt.values()); cum=0; rows=[]
for seg,n in seg_cnt.most_common():
    p=n/tot; cum+=p
    mon=['-'.join(seg.split('-')[i:i+3]) for i in range(0,len(seg.split('-'))-2,3)]
    smi=build(mon); ps=psmiles(smi)
    rows.append([seg,f"{p:.3%}",f"{cum:.3%}",mon,smi,ps])
    if cum>=TARGET_CUM: break

print("\n── Pattern probabilities (cum ≥ 95 %) ──")
print(pd.DataFrame(rows,columns=["Pattern","Prob","Cum","Monomers","SMILES","p‑SMILES"]).to_string(index=False))


── Monomer composition check ──
A: expected 0.50 | observed 0.5004
B: expected 0.25 | observed 0.2485
C: expected 0.20 | observed 0.2009
D: expected 0.05 | observed 0.0502

── Pattern probabilities (cum ≥ 95 %) ──
                                  Pattern    Prob     Cum                                          Monomers                                                                                                            SMILES                                                                                                           p‑SMILES
                              a-A-b-a-A-b 25.353% 25.353%                                    [a-A-b, a-A-b]                                                                                 O=C(O)c1ccc(OC(=O)c2ccc(O)cc2)cc1                                                                                 *OC(=O)c1ccc(OC(=O)c2ccc(*)cc2)cc1
                              a-C-a-b-B-b 20.156% 45.509%                                    [a-C-a, b-B-b]     

[11:37:12] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 2 
[11:37:12] product atom-mapping number 3 not found in reactants.
[11:37:12] product atom-mapping number 3 not found in reactants.
[11:37:12] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 2 


In [6]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
auto_polysegment_with_pSMILES_optimized.py
"""

import random, pandas as pd
from collections import Counter
from rdkit import Chem
from rdkit.Chem import rdChemReactions

# ───── 0. ユーザ入力 ─────
base_smiles = {
    'A': "O=C(O)c1ccc(O)cc1",
    'B': "Oc1ccc(O)cc1",
    'C': "O=C(O)c1ccc(C(=O)O)cc1",
    'D': "C1=C2C(C=C(C(O)=O)C=C2)=CC=C1C(O)=O"
}
feed_ratios = {'A':0.5,'B':0.25,'C':0.2,'D':0.05}

N_CHAINS, CHAIN_LEN = 10000, 100
TARGET_CUM = 0.95
random.seed(0)

# ───── 1. ends_from_smiles キャッシュ ─────
def ends_from_smiles(smi):
    mol = Chem.MolFromSmiles(smi)
    acid = Chem.MolFromSmarts("C(=O)O")
    phen = Chem.MolFromSmarts("cO")
    hits = []
    for patt, lab in [(acid, 'a'), (phen, 'b')]:
        for m in mol.GetSubstructMatches(patt):
            hits.append((m[1], lab))  # –OH の O index
    hits.sort(key=lambda x: x[0])
    if len(hits) < 2:
        raise ValueError("端基2つ検出できず")
    return hits[0][1], hits[-1][1]

monomer_names = list(base_smiles.keys())
ends_cache = {L: ends_from_smiles(base_smiles[L]) for L in monomer_names}

MONOMERS, MON_DEF, sym_factor = {}, {}, {}
for L in monomer_names:
    l, r = ends_cache[L]
    smi = base_smiles[L]
    fwd = f"{l}-{L}-{r}"
    rev = f"{r}-{L}-{l}"
    MONOMERS[fwd] = MONOMERS[rev] = {"smiles": smi}
    MON_DEF[fwd] = (l, r)
    MON_DEF[rev] = (r, l)
    sym_factor[L] = 2 if l == r else 1

token2fg = {'a': 'COOH', 'b': 'PhOH'}
for k, (l, r) in MON_DEF.items():
    MONOMERS[k]["ends"] = [token2fg[l], token2fg[r]]

# a, b 対応モノマー事前抽出
a_monomers = [L for L in monomer_names if 'a' in ends_cache[L]]
b_monomers = [L for L in monomer_names if 'b' in ends_cache[L]]

# ───── 2. チェーン成長 ─────
def pick_letter(weights):
    r = random.random() * sum(weights.values())
    acc = 0
    for L, w in weights.items():
        acc += w
        if r <= acc:
            return L
    return list(weights.keys())[-1]

def grow_chain():
    chain = []
    firstL = pick_letter(feed_ratios)
    l, r = ends_cache[firstL]
    key = f"{l}-{firstL}-{r}"
    chain.append(key)
    end_token = MON_DEF[key][1]

    while len(chain) < CHAIN_LEN:
        if end_token == 'b':
            cand = {L: feed_ratios[L] * sym_factor[L] for L in a_monomers}
        else:
            cand = {L: feed_ratios[L] * sym_factor[L] for L in b_monomers}
        L = pick_letter(cand)
        l0, r0 = ends_cache[L]
        if end_token == 'b':
            key = f"a-{L}-{r0}" if l0 == 'a' else f"a-{L}-{l0}"
        else:
            key = f"b-{L}-{r0}" if l0 == 'b' else f"b-{L}-{l0}"
        chain.append(key)
        end_token = MON_DEF[key][1]

    return chain

# ───── 3. segment 抽出 ─────
U = set(monomer_names)
def tok_chain(chain):
    tok = []
    for k in chain:
        tok += k.split('-')
    return tok

def canon(t):
    return min('-'.join(t), '-'.join(t[::-1]))

def extract(tok):
    segs = []
    i = 0
    N = len(tok)
    while i < N - 3:
        if tok[i] not in {'a', 'b'} or tok[i+1] not in U:
            i += 1
            continue
        want = 'b' if tok[i] == 'a' else 'a'
        up = 0
        j = i + 1
        while j < N:
            if tok[j] in U:
                up += 1
            if tok[j] == want and up >= 2 and tok[j-1] in U:
                segs.append(canon(tok[i:j+1]))
                i = j
                break
            j += 1
        else:
            break
    return segs

# ───── 4. RDKit 反応 & p-SMILES ─────
rxn_ester = rdChemReactions.ReactionFromSmarts("[C:1](=O)[O&H1:2].[c:3][O&H1:4]>>[C:1](=O)[O:4][c:3]")
rx_acid = rdChemReactions.ReactionFromSmarts("[C:1](=O)[O&H1:2]>>[C:1](=O)[O:2][*:3]")
rx_ph = rdChemReactions.ReactionFromSmarts("[c:1][O&H1:2]>>[c:1][*:3]")

def big(m):
    return max(Chem.GetMolFrags(m, asMols=True), key=lambda x: x.GetNumAtoms())

def build(seq):
    if not seq:
        return ""
    m = Chem.MolFromSmiles(MONOMERS[seq[0]]["smiles"])
    ends = Counter(MONOMERS[seq[0]]["ends"])
    for k in seq[1:]:
        n = Chem.MolFromSmiles(MONOMERS[k]["smiles"])
        ne = Counter(MONOMERS[k]["ends"])
        for acid, phenol, a_end, p_end in [(m, n, ends, ne), (n, m, ne, ends)]:
            if "COOH" in a_end and "PhOH" in p_end:
                prods = rxn_ester.RunReactants((acid, phenol))
                if prods:
                    m = big(prods[0][0])
                    a_end["COOH"] -= 1
                    p_end["PhOH"] -= 1
                    ends = a_end + p_end
                    break
    return Chem.MolToSmiles(m)

def psmiles(s):
    if not s:
        return ""
    m = Chem.MolFromSmiles(s)
    if m is None:
        return ""
    for rx in (rx_acid, rx_ph):
        while True:
            prods = rx.RunReactants((m,))
            if not prods:
                break
            m = big(prods[0][0])
    return Chem.MolToSmiles(m)

# ───── 5. シミュレーション ─────
seg_cnt, mono_cnt = Counter(), Counter()
for _ in range(N_CHAINS):
    ch = grow_chain()
    mono_cnt.update([k.split('-')[1] for k in ch])
    seg_cnt.update(extract(tok_chain(ch)))

print("── Monomer composition check ──")
total_monomers = N_CHAINS * CHAIN_LEN
for L in monomer_names:
    expected = feed_ratios.get(L, 0)
    observed = mono_cnt.get(L, 0) / total_monomers
    print(f"{L}: expected {expected:.2f} | observed {observed:.4f}")

tot = sum(seg_cnt.values())
cum = 0.0
rows = []

for seg, n in seg_cnt.most_common():
    p = n / tot
    cum += p
    parts = seg.split('-')
    monomers_in_seg = []
    for i in range(0, len(parts) - 2, 3):
        mono_key = '-'.join(parts[i:i+3])
        monomers_in_seg.append(mono_key)
    smi = build(monomers_in_seg)
    ps = psmiles(smi) if smi else ""
    rows.append({
        "Pattern": seg,
        "Prob": f"{p:.3%}",
        "Cum": f"{cum:.3%}",
        "Monomers": monomers_in_seg,
        "SMILES": smi,
        "p‑SMILES": ps
    })
    if cum >= TARGET_CUM:
        break

print(f"\n── Top {len(rows)} Pattern probabilities (cum ≥ {TARGET_CUM*100:.0f}%) ──")
df = pd.DataFrame(rows)
print(df.to_string(index=False))


── Monomer composition check ──
A: expected 0.50 | observed 0.5004
B: expected 0.25 | observed 0.2485
C: expected 0.20 | observed 0.2009
D: expected 0.05 | observed 0.0502

── Top 20 Pattern probabilities (cum ≥ 95%) ──
                                  Pattern    Prob     Cum                                          Monomers                                                                                                            SMILES                                                                                                           p‑SMILES
                              a-A-b-a-A-b 25.353% 25.353%                                    [a-A-b, a-A-b]                                                                                 O=C(O)c1ccc(OC(=O)c2ccc(O)cc2)cc1                                                                                 *OC(=O)c1ccc(OC(=O)c2ccc(*)cc2)cc1
                              a-C-a-b-B-b 20.156% 45.509%                                    [a-C-a, b-B-b

[11:38:47] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 2 
[11:38:47] product atom-mapping number 3 not found in reactants.
[11:38:47] product atom-mapping number 3 not found in reactants.
[11:38:47] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 2 


In [43]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Optimized auto_polysegment_with_pSMILES_fixed_amide.py
セグメント抽出ルールを強化したバージョン（修正版）
"""

import random, pandas as pd
from collections import Counter
from rdkit import Chem
from rdkit.Chem import rdChemReactions

# --- 0. ユーザー定義 ---
base_smiles = {
    'A': "O=C(O)c1ccc(O)cc1",
    'B': "Oc1ccc(O)cc1",
    'C': "O=C(O)c1ccc(C(=O)O)cc1",
    'D': "C1=C2C(C=C(C(O)=O)C=C2)=CC=C1C(O)=O",
    'E': "Oc1ccc(N)cc1"

}
feed_ratios = {'A':0.5,'B':0.125,'C':0.2,'D':0.05,'E':0.125}

N_CHAINS, CHAIN_LEN = 10000, 100
TARGET_CUM = 0.95
random.seed(0)

# --- 1. SMILESと官能基キャッシュ ---
def ends_from_smiles(smi):
    mol = Chem.MolFromSmiles(smi)
    acid = Chem.MolFromSmarts("[CX3](=O)[OX2H1]")
    phen = Chem.MolFromSmarts("[c]O")
    amine = Chem.MolFromSmarts("[NH2]")
    hits = []
    for patt, lab in [(acid, 'a'), (phen, 'b'), (amine, 'c')]:
        for match in mol.GetSubstructMatches(patt):
            if lab == 'a':  # カルボン酸: 炭素原子
                idx = match[0]
            elif lab == 'b':  # フェノール: 酸素原子
                idx = match[1]
            else:  # アミン: 窒素原子
                idx = match[0]
            hits.append((idx, lab))
    hits.sort(key=lambda x: x[0])
    if len(hits) < 2:
        raise ValueError(f"端基検出失敗: {smi}")
    return hits[0][1], hits[-1][1]

monomer_names = list(base_smiles.keys())
ends_cache = {}
for L, smi in base_smiles.items():
    try:
        ends_cache[L] = ends_from_smiles(smi)
    except Exception as e:
        print(f"エラー: {e}")
        ends_cache[L] = ('a', 'a')

MONOMERS, MON_DEF, sym_factor = {}, {}, {}
token2fg = {'a': 'COOH', 'b': 'PhOH', 'c': 'NH2'}

for L in monomer_names:
    l, r = ends_cache[L]
    smi = base_smiles[L]
    
    for key in [f"{l}-{L}-{r}", f"{r}-{L}-{l}"]:
        MONOMERS[key] = {
            "smiles": smi,
            "ends": [token2fg[key[0]], token2fg[key[-1]]]
        }
        MON_DEF[key] = (key[0], key[-1])
    
    sym_factor[L] = 2 if l == r else 1

REACTION_RULES = {'a': ['b', 'c'], 'b': ['a'], 'c': ['a']}

# --- 2. チェーン生成 ---
def pick_letter(weights):
    total = sum(weights.values())
    r = random.random() * total
    acc = 0
    for L, w in weights.items():
        acc += w
        if r <= acc:
            return L
    return list(weights.keys())[-1]

def grow_chain():
    chain = []
    first_weights = {L: feed_ratios[L] * sym_factor[L] for L in feed_ratios}
    firstL = pick_letter(first_weights)
    l, r = ends_cache[firstL]
    
    if l == r and random.random() < 0.5:
        key = f"{r}-{firstL}-{l}"
    else:
        key = f"{l}-{firstL}-{r}"
    
    chain.append(key)
    end_token = MON_DEF[key][1]
    
    while len(chain) < CHAIN_LEN:
        cand = {}
        for L in base_smiles:
            l0, r0 = ends_cache[L]
            w = feed_ratios[L] * sym_factor[L]
            
            if l0 in REACTION_RULES[end_token]:
                cand[f"{l0}-{L}-{r0}"] = w
            if r0 in REACTION_RULES[end_token] and l0 != r0:
                cand[f"{r0}-{L}-{l0}"] = w
        
        if not cand:
            end_token = 'a' if end_token != 'a' else 'b'
            continue
            
        key = pick_letter(cand)
        chain.append(key)
        end_token = MON_DEF[key][1]
    
    return chain

# --- 3. セグメント抽出（修正版）---
# --- 3. セグメント抽出（修正版）---
def tok_chain(chain):
    return [tok for seg in chain for tok in seg.split('-')]

def canon(tokens):
    return min('-'.join(tokens), '-'.join(tokens[::-1]))

def extract(tokens):
    segs = []
    n = len(tokens)
    
    # モノマー境界を検出
    mono_starts = []
    for i in range(0, n-2, 1):
        if tokens[i] in ['a','b','c'] and tokens[i+1] in monomer_names and tokens[i+2] in ['a','b','c']:
            mono_starts.append(i)
    
    # 各モノマー開始点からセグメントを抽出
    for start_idx in mono_starts:
        start_tok = tokens[start_idx]
        
        # 開始トークンに基づいて終了トークンを決定
        if start_tok in ['b', 'c']:
            end_target = 'a'
        else:  # 'a'
            # 修正: a開始の場合、終端はbまたはc
            end_target = ['b', 'c']
        
        # 次のモノマー境界から探索
        for next_start in mono_starts:
            if next_start <= start_idx:
                continue
                
            # 終端トークン位置 (モノマーの終端)
            end_idx = next_start + 2
            
            if end_idx >= n:
                break
                
            # 修正: 終端トークンのチェックを柔軟に
            if (isinstance(end_target, str) and tokens[end_idx] == end_target) or \
               (isinstance(end_target, list) and tokens[end_idx] in end_target):
                # 少なくとも2つの完全なモノマーを含む
                seg_tokens = tokens[start_idx:end_idx+1]
                segs.append(canon(seg_tokens))
                break
    
    return segs
# --- 4. 化学反応処理 ---
rxn_ester = rdChemReactions.ReactionFromSmarts("[C:1](=O)[O&H1:2].[c:3][O&H1:4]>>[C:1](=O)[O:4][c:3]")
rxn_amide = rdChemReactions.ReactionFromSmarts("[C:1](=O)[O&H1:2].[NH2:3]>>[C:1](=O)[NH:3]")
rx_acid = rdChemReactions.ReactionFromSmarts("[C:1](=O)[O&H1:2]>>[C:1](=O)[O:2][*:3]")
rx_ph = rdChemReactions.ReactionFromSmarts("[c:1][O&H1:2]>>[c:1][O:2][*:3]")
rx_amine = rdChemReactions.ReactionFromSmarts("[NH2:1]>>[NH:1][*:2]")

def get_largest_fragment(mol):
    frags = Chem.GetMolFrags(mol, asMols=True)
    return max(frags, key=lambda x: x.GetNumAtoms()) if frags else mol

# --- 4. 化学反応処理（結合順序を考慮した修正版）---
def build_polymer(seq):
    if not seq:
        return ""
    
    # モノマー列から分子オブジェクトと端基情報を取得
    mols = []
    left_ends = []
    right_ends = []
    
    for key in seq:
        mol = Chem.MolFromSmiles(MONOMERS[key]["smiles"])
        if mol is None:
            return ""
        mols.append(mol)
        
        # モノマーの左端と右端の官能基を取得
        left_token, right_token = MON_DEF[key]
        left_ends.append(token2fg[left_token])
        right_ends.append(token2fg[right_token])
    
    # 最初のモノマーから開始
    polymer = mols[0]
    current_right = right_ends[0]
    
    # モノマーを順次結合
    for i in range(1, len(seq)):
        next_mol = mols[i]
        next_left = left_ends[i]
        next_right = right_ends[i]
        
        reacted = False
        
        # 現在のポリマーの右端と次のモノマーの左端に基づいて反応
        if current_right == 'COOH' and next_left == 'PhOH':
            prods = rxn_ester.RunReactants((polymer, next_mol))
            if prods:
                polymer = get_largest_fragment(prods[0][0])
                reacted = True
                
        elif current_right == 'COOH' and next_left == 'NH2':
            prods = rxn_amide.RunReactants((polymer, next_mol))
            if prods:
                polymer = get_largest_fragment(prods[0][0])
                reacted = True
                
        elif current_right == 'PhOH' and next_left == 'COOH':
            prods = rxn_ester.RunReactants((next_mol, polymer))
            if prods:
                polymer = get_largest_fragment(prods[0][0])
                reacted = True
                
        elif current_right == 'NH2' and next_left == 'COOH':
            prods = rxn_amide.RunReactants((next_mol, polymer))
            if prods:
                polymer = get_largest_fragment(prods[0][0])
                reacted = True
        
        if not reacted:
            # 反応しない場合は単純結合
            polymer = get_largest_fragment(Chem.CombineMols(polymer, next_mol))
        
        # 現在の右端を更新
        current_right = next_right
    
    return Chem.MolToSmiles(polymer)

def generate_psmiles(smiles):
    if not smiles:
        return ""
    
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return ""
    
    for rx in [rx_acid, rx_ph, rx_amine]:
        while True:
            prods = rx.RunReactants((mol,))
            if not prods:
                break
            mol = get_largest_fragment(prods[0][0])
    
    return Chem.MolToSmiles(mol)

# --- 5. 実行＆出力 ---
print("シミュレーション開始...")
segment_counter = Counter()
monomer_counter = Counter()

for i in range(N_CHAINS):
    chain = grow_chain()
    monomer_counter.update([key.split('-')[1] for key in chain])
    tokens = tok_chain(chain)
    segments = extract(tokens)
    segment_counter.update(segments)

# モノマー組成チェック
print("\n── モノマー組成チェック ──")
total_monomers = N_CHAINS * CHAIN_LEN
for monomer in monomer_names:
    expected = feed_ratios[monomer]
    observed = monomer_counter[monomer] / total_monomers
    print(f"{monomer}: expected {expected:.4f} | observed {observed:.4f}")

# セグメント処理
print("\nセグメント処理中...")
total_segments = sum(segment_counter.values())
cumulative_prob = 0.0
results = []

for segment, count in segment_counter.most_common():
    prob = count / total_segments
    cumulative_prob += prob
    
    # セグメントからモノマー列を再構築
    tokens = segment.split('-')
    mono_sequence = []
    for i in range(0, len(tokens), 3):
        if i+3 > len(tokens):
            break
        mono_key = '-'.join(tokens[i:i+3])
        if mono_key in MONOMERS:
            mono_sequence.append(mono_key)
    
    # ポリマー構築とpSMILES生成
    try:
        smiles = build_polymer(mono_sequence)
        psmiles_val = generate_psmiles(smiles) if smiles else ""
    except Exception as e:
        print(f"エラー: {e}, セグメント: {segment}")
        smiles, psmiles_val = "", ""
    
    results.append({
        "Pattern": segment,
        "Prob": f"{prob:.3%}",
        "Cum": f"{cumulative_prob:.3%}",
        "Monomers": mono_sequence,
        "SMILES": smiles,
        "p-SMILES": psmiles_val
    })
    
    if cumulative_prob >= TARGET_CUM:
        break

# 結果表示
print(f"\n── 上位 {len(results)} パターン (累積 {TARGET_CUM*100:.0f}%) ──")
df = pd.DataFrame(results)
pd.set_option('display.max_colwidth', None)
print(df.to_string(index=False))

シミュレーション開始...

── モノマー組成チェック ──
A: expected 0.5000 | observed 0.4991
B: expected 0.1250 | observed 0.1251
C: expected 0.2000 | observed 0.2012
D: expected 0.0500 | observed 0.0504
E: expected 0.1250 | observed 0.1243

セグメント処理中...

── 上位 58 パターン (累積 95%) ──
                                  Pattern    Prob     Cum                                          Monomers                                                                                                            SMILES                                                                                                            p-SMILES
                              a-A-b-a-A-b 25.219% 25.219%                                    [a-A-b, a-A-b]                                                                                 O=C(O)c1ccc(OC(=O)c2ccc(O)cc2)cc1                                                                                 *OC(=O)c1ccc(OC(=O)c2ccc(O*)cc2)cc1
                              a-C-a-b-B-b 10.107% 35.326%          

[13:52:15] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 2 
[13:52:15] product atom-mapping number 3 not found in reactants.
[13:52:15] product atom-mapping number 3 not found in reactants.
[13:52:15] product atom-mapping number 2 not found in reactants.
[13:52:15] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 2 
