In [144]:
"""
Condensation‑polymer simulation & minimal‑segment analysis

ルール要約
---------
1. モノマー      : A,B,C,D   （A は向き反転 A_rev も使用）
2. 末端官能基が b → 候補 {A,C,D}、バイアス bias_end_b
               a → 候補 {A_rev,B}、バイアス bias_end_a
3. 最小部分構造  : 
       start = a|b,  end = b|a (反対)
       - 区間内に大文字 ≥2
       - ★start 直後のトークンは必ず大文字  （小文字小文字で始まらない）
       - end 直前のトークンも大文字        （小文字小文字で終わらない）
4. 期待モノマー比 feed_ratios とシミュレーション平均を比較
5. 各鎖の最小部分構造をカウントし，全鎖合算で頻度上位を表示
"""

import random
from collections import Counter
from typing import List, Tuple

# ────────────── 0. PARAMETERS ──────────────
N_CHAINS  = 100000
CHAIN_LEN = 100

feed_ratios = {'A': 0.5, 'B': 0.25, 'C': 0.2, 'D': 0.05}

bias_end_b = {'A': 1.0, 'C': 2.0, 'D': 2.0}      # 末端 b
bias_end_a = {'A_rev': 1.0, 'B': 2.0}            # 末端 a

MONOMER_DEF = {
    'A'     : ('a', 'b'),
    'A_rev' : ('b', 'a'),
    'B'     : ('b', 'b'),
    'C'     : ('a', 'a'),
    'D'     : ('a', 'a'),
}
UPPER_SET = {'A', 'B', 'C', 'D'}

# ────────────── 1. CHAIN GROWTH ──────────────
def _pick(cands: List[str], weights: List[float]) -> str:
    r, acc = random.random() * sum(weights), 0.0
    for c, w in zip(cands, weights):
        acc += w
        if r <= acc:
            return c
    return cands[-1]

def grow_chain() -> Tuple[List[str], List[bool]]:
    first = random.choices(
        ['A','B','C','D'],
        weights=[feed_ratios[m] for m in ['A','B','C','D']]
    )[0]
    chain, ori = [first], [False]
    end_fg = MONOMER_DEF[first][1]

    while len(chain) < CHAIN_LEN:
        if end_fg == 'b':
            cand = ['A','C','D']
            base = [feed_ratios[m] * bias_end_b[m] for m in cand]
        else:                  # end_fg == 'a'
            cand = ['A_rev','B']
            base = [feed_ratios['A'] * bias_end_a['A_rev'],
                    feed_ratios['B'] * bias_end_a['B']]
        ch = _pick(cand, base)
        chain.append('A' if ch == 'A_rev' else ch)
        ori  .append(ch == 'A_rev')
        end_fg = MONOMER_DEF[ch][1]
    return chain, ori

# ────────────── 2. TOKENISATION ──────────────
def chain_to_tokens(chain: List[str], ori: List[bool]) -> List[str]:
    toks = []
    for m, rev in zip(chain, ori):
        tag = 'A_rev' if (m == 'A' and rev) else m
        l, r = MONOMER_DEF[tag]
        toks += [l, m, r]
    return toks

# ────────────── 3. MINIMAL‑SEGMENT EXTRACTION ──────────────
def extract_segments(tokens: List[str]) -> List[str]:
    segs, i, N = [], 0, len(tokens)
    while i < N - 3:                           # 最低でも a A … A b で長さ≥4
        if tokens[i] not in {'a','b'}:         # 開始は小文字
            i += 1; continue
        # ★開始直後が小文字なら不適 ―― skip
        if tokens[i+1] not in UPPER_SET:
            i += 1; continue

        start, want = i, ('b' if tokens[i]=='a' else 'a')
        upper_cnt, j = 0, i+1
        found = False
        while j < N:
            t = tokens[j]
            if t in UPPER_SET:
                upper_cnt += 1
            if (t == want and upper_cnt >= 2
                    and tokens[j-1] in UPPER_SET):   # ★終了直前も大文字
                segs.append('-'.join(tokens[start:j+1]))
                i = j                      # 次探索は区切り文字から
                found = True
                break
            j += 1
        if not found:
            break
    return segs

# ────────────── 4. SIMULATION ──────────────
random.seed(None)
all_segments = Counter()
mono_totals  = Counter()

for _ in range(N_CHAINS):
    chain, ori = grow_chain()
    mono_totals.update(chain)
    segs = extract_segments(chain_to_tokens(chain, ori))
    all_segments.update(segs)

# ────────────── 5. OUTPUT ──────────────
print("── Monomer composition check ──")
avg = {m: mono_totals[m] / (N_CHAINS*CHAIN_LEN) for m in ['A','B','C','D']}
for m in ['A','B','C','D']:
    print(f"{m}: expected {feed_ratios[m]:.2f}  |  observed {avg[m]:.4f}")

print("\n── Pattern probabilities (top‑20) ──")
total_seg = sum(all_segments.values())
for seg, n in all_segments.most_common(20):
    p = n / total_seg
    print(f"{seg:55s} : {n:6d}  ({p:6.3%})")


── Monomer composition check ──
A: expected 0.50  |  observed 0.5001
B: expected 0.25  |  observed 0.2487
C: expected 0.20  |  observed 0.2010
D: expected 0.05  |  observed 0.0502

── Pattern probabilities (top‑20) ──
a-A-b-a-A-b                                             : 635322  (19.188%)
a-C-a-b-B-b                                             : 497837  (15.036%)
a-A-b-a-C-a-b-B-b                                       : 251424  (7.594%)
a-C-a-b-A-a-b-B-b                                       : 246669  (7.450%)
b-A-a-b-A-a                                             : 201208  (6.077%)
b-B-b-a-C-a                                             : 170828  (5.159%)
a-A-b-a-C-a-b-A-a-b-B-b                                 : 124568  (3.762%)
a-D-a-b-B-b                                             : 123888  (3.742%)
a-C-a-b-A-a-b-A-a-b-B-b                                 : 122177  (3.690%)
b-B-b-a-A-b-a-C-a                                       :  84461  (2.551%)
b-A-a-b-B-b-a-C-a             

In [157]:
# -*- coding: utf-8 -*-
"""
pattern_polymer_builder.py
  - パターン文字列 → エステル縮合 → 末端[*] 置換 → SMILES
  - RDKit 2023.09 以降推奨
"""
from rdkit import Chem
from rdkit.Chem import AllChem

# ───────────────────────────────
# 0. モノマー雛形（ラベル付き）
#    :a = COOH の C (carbonyl carbon)
#    :o = COOH の O(OH)
#    :b = Phenolic O
# ───────────────────────────────
MONOMER_MAP = {
    "A": "O[C:2]c1ccc([O:3])cc1C(=O)[O:1]",   # 4‑hydroxy‑benzoic acid
    "B": "[O:3]c1ccc([O:4])cc1",              # hydroquinone
    "C": "O[C:2]C1=CC=C(C=C1)C(=O)[O:1]",     # terephthalic acid
    "D": "O[C:2]C1=CC=CC(C(=O)[O:1])=C1"      # isophthalic acid
}

# a = map 1‑(C=O) と 2‑(O=C‑OH) の *酸* 部位
# b = map 3 or 4 の *フェノール O* 部位
# 末端置換のため、酸の ‑OH の酸素は map#:1、フェノール O は map#:3/4

# ───────────────────────────────
# 1. 反応 SMARTS （エステル形成, 水除去）
RXNS = [
    AllChem.ReactionFromSmarts("[C:1](=[O:2])[OH:3].[OH:4][C:5]>>[C:1](=[O:2])[O:4][C:5]"),
    AllChem.ReactionFromSmarts("[OH:4][C:5].[C:1](=[O:2])[OH:3]>>[C:1](=[O:2])[O:4][C:5]")
]

# ───────────────────────────────
def monomer_mol(tag: str) -> Chem.Mol:
    """ラベル付き SMILES → Mol（キープアトムマップ）"""
    m = Chem.MolFromSmiles(MONOMER_MAP[tag], sanitize=False)
    Chem.SanitizeMol(m)
    return m

# ───────────────────────────────
def condense_pair(left: Chem.Mol, right: Chem.Mol) -> Chem.Mol:
    for rxn in RXNS:
        prod_sets = rxn.RunReactants((left, right))
        if prod_sets:
            prod = prod_sets[0][0]
            Chem.SanitizeMol(prod)
            return prod
    raise RuntimeError("Esterification failed")

# ───────────────────────────────
# --- replace_terminal_atoms 修正版 ---
def replace_terminal_atoms(mol: Chem.Mol) -> Chem.Mol:
    rw = Chem.RWMol(mol)
    for atom in mol.GetAtoms():
        if atom.GetAtomicNum() != 8:          # Oxygen only
            continue
        if atom.GetTotalDegree() != 1:
            continue
        bond = list(atom.GetBonds())[0]
        if bond.GetBondType() != Chem.BondType.SINGLE:
            continue                          # ← DOUBLE を除外
        nbr = bond.GetOtherAtom(atom)
        # フェノール
        if nbr.GetIsAromatic():
            rw.ReplaceAtom(atom.GetIdx(), Chem.Atom(0)); continue
        # 酸 (C(=O)O‑H)
        if nbr.GetAtomicNum() == 6 and any(
              b.GetBondType() == Chem.BondType.DOUBLE and
              b.GetOtherAtom(nbr).GetAtomicNum() == 8 for b in nbr.GetBonds()):
            rw.ReplaceAtom(atom.GetIdx(), Chem.Atom(0))
    Chem.SanitizeMol(rw)
    return rw.GetMol()


# ───────────────────────────────
def pattern_to_smiles(pattern: str) -> str:
    """例: a-A-b-a-A-b → SMILES (末端[*] 置換済)"""
    tokens = pattern.split("-")
    # 1 モノマー目
    m = monomer_mol(tokens[1])    # tokens = [a, A, b, ...]
    i = 2  # index 指す位置 (= b)
    while i + 2 < len(tokens):
        # tokens[i] = 'b', tokens[i+1] = 'a', tokens[i+2] = monomer
        next_monomer = monomer_mol(tokens[i+2])
        m = condense_pair(m, next_monomer)
        i += 3
    m = replace_terminal_atoms(m)
    return Chem.MolToSmiles(m,
                            isomericSmiles=False,
                            canonical=False)


# ───────────────────────────────
if __name__ == "__main__":
    SAMPLE = [
        "a-A-b-a-A-b",
        "a-C-a-b-B-b",
        "a-A-b-a-C-a-b-B-b",
        "b-A-a-b-A-a",
    ]
    for p in SAMPLE:
        try:
            print(f"{p:30s} → {pattern_to_smiles(p)}")
        except RuntimeError as e:
            print(f"{p:30s} → Error: {e}")


a-A-b-a-A-b                    → Error: Esterification failed
a-C-a-b-B-b                    → Error: Esterification failed
a-A-b-a-C-a-b-B-b              → Error: Esterification failed
b-A-a-b-A-a                    → Error: Esterification failed


[01:08:09] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 3 
[01:08:09] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 3 
