In [1]:
# Load IPython's autoreload extension and select mode 2, which reloads
# modules before each cell executes so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# 1. 環境変数取得
.env.sampleをコピーして.envを作成し、各ファイルへのパスを記載

In [2]:
import os
from pathlib import Path

from dotenv import load_dotenv

load_dotenv()


def _env_path(name: str) -> Path:
    """Return the filesystem path stored in environment variable ``name``.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The variable's value wrapped in ``pathlib.Path``.

    Raises:
        RuntimeError: If the variable is unset or empty. Reporting the
            variable by name replaces the opaque ``TypeError`` that
            ``Path(None)`` would raise for a missing ``.env`` entry.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            'Environment variable {!r} is not set; '
            'copy .env.sample to .env and fill in the path.'.format(name)
        )
    return Path(value)


# Input files read by the dictionary and MeSH loaders in later cells.
TEMU_FILE = _env_path('TEMU_FILE')
MEDUTX_FILE = _env_path('MEDUTX_FILE')
MESHD_FILE = _env_path('MESHD_FILE')
MESHC_FILE = _env_path('MESHC_FILE')

---

# 2. 辞書読み込み
*ここで"辞書"は日英辞書など一般的な意味での辞書を意味し、Pythonデータ構造のdictを意味しないことに注意  
utils.data.Dictionaryクラスを継承すれば任意の辞書を追加可能

In [3]:
from utils.dictionary import MEDUTX, TEMU

In [4]:
# Create a TEMU dictionary object, have it read the file at TEMU_FILE, and
# print it; the printed form reports the number of word pairs and of unique
# English / Japanese words.
temu = TEMU()
temu.read(TEMU_FILE)
print(temu)

dictionary name: temu
-- 27,668 word pairs
---- 10,738 unique English words	
---- 16,756 unique Japanese words	


In [5]:
# Create a MEDUTX dictionary object, have it read the file at MEDUTX_FILE,
# and print it; the printed form reports the number of word pairs and of
# unique English / Japanese words.
medutx = MEDUTX()
medutx.read(MEDUTX_FILE)
print(medutx)

dictionary name: medutx
-- 27,122 word pairs
---- 22,276 unique English words	
---- 21,821 unique Japanese words	


## 2-1. それぞれの日英辞書、英日辞書を取得

In [6]:
# `.ja` is keyed by Japanese term and `.en` by English term. In the sample
# output each value is a dict holding the counterpart names
# ('names_en' or 'names_ja') and a 'mesh_ids' set.
temu_ja = temu.ja
temu_en = temu.en
medutx_ja = medutx.ja
medutx_en = medutx.en

In [7]:
# Sample: show the first `num` entries of each lookup table.
num = 3

sample_tables = [
    ('# TEMU ja sample', temu_ja),
    ('# TEMU en sample', temu_en),
    ('# MEDUTX ja sample', medutx_ja),
    ('# MEDUTX en sample', medutx_en),
]

for position, (heading, table) in enumerate(sample_tables):
    # A blank line separates consecutive samples; none follows the last one.
    if position:
        print()
    print(heading)
    # zip() stops as soon as range(num) is exhausted, so at most `num`
    # items are taken from the table.
    for _, (term, entry) in zip(range(num), table.items()):
        print(term, entry)

# TEMU ja sample
1回仕事量 {'names_en': {'stroke work'}, 'mesh_ids': set()}
1回心拍出係数 {'names_en': {'SI (stroke index)', 'SVI (stroke volume index)'}, 'mesh_ids': set()}
1回心拍出量 {'names_en': {'SV (stroke volume)'}, 'mesh_ids': set()}

# TEMU en sample
11-beta-hydroxylase deficiency {'names_ja': {'１１βヒドロキシラーゼ欠損症'}, 'mesh_ids': set()}
17 ketosteroids urine {'names_ja': {'尿中１７ケトステロイド'}, 'mesh_ids': set()}
17 ketosteroids urine decreased {'names_ja': {'尿中１７ケトステロイド減少'}, 'mesh_ids': set()}

# MEDUTX ja sample
 受身移入 {'names_en': {'passive transfer'}, 'mesh_ids': set()}
 心室全体の駆出分画 {'names_en': {'global ejection fraction'}, 'mesh_ids': set()}
 振子様心 {'names_en': {'cor pendulum'}, 'mesh_ids': set()}

# MEDUTX en sample
1/2 T vector {'names_ja': {'1/2Tベクトル'}, 'mesh_ids': set()}
1/2FF {'names_ja': {'1/2拡張分画'}, 'mesh_ids': set()}
1/3 ER mean {'names_ja': {'駆出早期1/3での平均駆出速度'}, 'mesh_ids': set()}


## 2-2. TEMU辞書とMEDUTX辞書を結合
utils.dictionary.combine_dict関数に辞書のリストを渡すことで結合した辞書を取得

In [8]:
from utils.dictionary import combine_dict

# Merge the TEMU and MEDUTX dictionaries into a single dictionary object and
# print it; the printed name of the result is "temu + medutx".
combined_dict = combine_dict([temu, medutx])
print(combined_dict)

dictionary name: temu + medutx
-- 54,790 word pairs
---- 30,853 unique English words	
---- 35,903 unique Japanese words	


---

# 3. MeSH読み込み

In [9]:
from utils.mesh import MeshDescriptor, MeshSupplementary

In [10]:
# Create a MeshDescriptor object, have it read the file at MESHD_FILE, and
# print it; the printed form reports the number of unique MeSH terms and IDs.
meshd = MeshDescriptor()
meshd.read(MESHD_FILE)
print(meshd)

dictionary name: MeSH Descriptor
-- 242,205 unique MeSH terms	
-- 29,640 unique MeSH IDs	


In [11]:
# Create a MeshSupplementary object, have it read the file at MESHC_FILE, and
# print it; the printed form reports the number of unique MeSH terms and IDs.
meshc = MeshSupplementary()
meshc.read(MESHC_FILE)
print(meshc)

dictionary name: MeSH Supplementary
-- 649,372 unique MeSH terms	
-- 268,838 unique MeSH IDs	


## 3-1. それぞれのID-term辞書、term-ID辞書を取得

In [12]:
# `id2terms` is keyed by MeSH ID and `term2ids` by term. In the sample output
# the values are sets (of terms and of IDs respectively).
meshd_id2terms = meshd.id2terms
meshd_term2ids = meshd.term2ids
meshc_id2terms = meshc.id2terms
meshc_term2ids = meshc.term2ids

In [13]:
# Sample: show the first `num` entries of each MeSH lookup table.
num = 3

# Heading / table pairs. The first heading previously read "MeHS"; it is
# spelled "MeSH" here so all four headings agree.
mesh_sample_tables = [
    ('# MeSH Descriptor id2terms sample', meshd_id2terms),
    ('# MeSH Descriptor term2ids sample', meshd_term2ids),
    ('# MeSH Supplementary id2terms sample', meshc_id2terms),
    ('# MeSH Supplementary term2ids sample', meshc_term2ids),
]

for position, (heading, table) in enumerate(mesh_sample_tables):
    # A blank line separates consecutive samples; none follows the last one.
    if position:
        print()
    print(heading)
    # zip() stops as soon as range(num) is exhausted, so at most `num`
    # items are taken from the table.
    for _, (key, value) in zip(range(num), table.items()):
        print(key, value)


# MeHS Descriptor id2terms sample
D000001 {'A-23187', 'A 23187', 'A23187', 'A23187, Antibiotic', 'Antibiotic A23187', 'Calcimycin'}
D000002 {'Temefos', 'Difos', 'Abate', 'Temephos'}
D000003 {'House, Slaughter', 'Slaughterhouses', 'Slaughterhouse', 'Abattoirs', 'Slaughter House', 'Slaughter Houses', 'Abattoir', 'Houses, Slaughter'}

# MeSH Descriptor term2ids sample
A-23187 {'D000001'}
Calcimycin {'D000001'}
A23187 {'D000001'}

# MeSH Supplementary id2terms sample
C000002 {'piribenzil methyl sulfate', '2-(hydroxymethyl)-N,N-dimethylpiperidinium benzilate', 'Acabel', 'bevonium sulfate (1:1)', 'bevonium methyl sulfate', 'bevonium', 'CG 201', 'bevonium metilsulfate', 'bevonium methylsulfate'}
C000006 {'Actrapid insulin', 'insulin (swine)', 'Novo MC insulin', 'insulin, neutral', 'insulin (ox), 8(A)-L-threonine-10(A)-L-isoleucine-', 'insulin (pork)', 'neutral insulin', 'insulin pork', 'Insulin, 8A-L-threonine-10A-L-isoleucine-'}
C000009 {'AADG', 'N(4)-(2-acetamido-2-deoxy-beta-D-glucopyranos

## 3-2. MeSH DescriptorとMeSH Supplementaryを結合
utils.mesh.combine_mesh関数にMeSH辞書のリストを渡すことで結合したMeSH辞書を取得

In [17]:
from utils.mesh import combine_mesh

# Merge the Descriptor and Supplementary MeSH objects into one and print it;
# the printed name of the result is "MeSH Descriptor + MeSH Supplementary".
combined_mesh = combine_mesh([meshd, meshc])
print(combined_mesh)

dictionary name: MeSH Descriptor + MeSH Supplementary
-- 891,576 unique MeSH terms	
-- 298,478 unique MeSH IDs	


---

# 辞書とMeSH辞書をリンク
Dictionaryクラスのmap_meshメソッドにMeSH辞書を代入することで英単語の文字列が一致するかどうかでMeSH IDを紐づける

In [24]:
# Link dictionary entries to MeSH IDs. Per the notes for this section, an entry
# is linked when its English term string matches a MeSH term. The call prints
# how many English and Japanese words received MeSH IDs.
# NOTE(review): the result appears to be stored on combined_dict itself (later
# lookups show populated 'mesh_ids'); confirm against the Dictionary class.
combined_dict.map_mesh(combined_mesh)

1,781/30,853 English words are mapped to MeSH IDs
2,838/35,903 Japanese words are mapped to MeSH IDs


In [25]:
# Look up by English term
print(combined_dict.en['Miscarriage'])

# Look up by Japanese term
print(combined_dict.ja['麻酔薬'])

{'names_ja': {'自然流産'}, 'mesh_ids': {'D000022'}}
{'names_en': {'anesthetic', 'Anesthesia', 'anesthetic drug', 'anesthetic agent', 'narcotic'}, 'mesh_ids': {'D000758'}}


---

# chain-norm
文字列の変換ルールを連続適用するパッケージ

In [26]:
# Each class below subclasses Rule; pass a string to its apply() method to transform it
from cnorm.rule import Lower, NormalizeJaConv, RomNum2AraNum, Greek2Alpha, Word2Num

## Lower-casing
rule_lower = Lower()
text = rule_lower.apply('FOO-BAR')
print(text)

## Normalization via jaconv (NFKC by default)
rule_njac = NormalizeJaConv()
text = rule_njac.apply('ﾊﾝｶｸ/ゼンカク/HALFWIDTH/ＦＵＬＬＷＩＤＴＨ')
print(text)

## Convert Roman numerals to Arabic numerals
## (the recorded output of this step is the number word "four"; digits are
## produced once Word2Num is applied afterwards)
rule_rn2an = RomNum2AraNum()
text = rule_rn2an.apply('Glycogen storage disease type Ⅳ')
print(text)

## Convert Greek letters to their alphabetic spelling
rule_g2a = Greek2Alpha()
text = rule_g2a.apply('α-glucosidase')
print(text)

## Convert number words to Arabic numerals
rule_w2n = Word2Num()
text = rule_w2n.apply('two hundred apples')
print(text)

foo-bar
ハンカク/ゼンカク/HALFWIDTH/FULLWIDTH
Glycogen storage disease type four
alpha-glucosidase
200 apples


In [27]:
# To apply rules in sequence, pass a List[Rule] to the Chain class
from cnorm.data import Chain

# Chain.apply() returns the transformed text
# * Note that the result depends on the order in which the rules are applied

# NormalizeJaConv converts "Ⅳ" to "IV" first, so RomNum2AraNum no longer takes effect
chain = Chain([rule_njac, rule_rn2an, rule_w2n])
text = chain.apply('type Ⅳ')
print(text)

# RomNum2AraNum converts "Ⅳ" to "four" first, then Word2Num converts "four" to "4"
chain = Chain([rule_rn2an, rule_w2n, rule_njac])
text = chain.apply('type Ⅳ')
print(text)

type IV
type 4


---

# 辞書とMeSH辞書をリンク with 文字列処理
Dictionaryクラスのmap_meshメソッドにMeSH辞書とChainオブジェクトを代入することで文字列処理を加えた上で英単語の文字列が一致するかどうか判定する（TODO: ChainがもつRuleの数に応じて時間がかかるので高速化）

In [32]:
# Baseline: an empty chain, i.e. no string processing before matching.
chain = Chain([])
combined_dict.map_mesh(combined_mesh, chain)

# Inspect the same entries from the English side, then the Japanese side.
for english_key in ('Acacia', 'acacia'):
    print(combined_dict.en[english_key])
for japanese_key in ('アラビアゴム', 'アカシア'):
    print(combined_dict.ja[japanese_key])

1,781/30,853 English words are mapped to MeSH IDs
2,838/35,903 Japanese words are mapped to MeSH IDs
{'names_ja': {'アラビアゴム'}, 'mesh_ids': {'D000045'}}
{'names_ja': {'アカシア'}, 'mesh_ids': set()}
{'names_en': {'Acacia'}, 'mesh_ids': {'D000045'}}
{'names_en': {'acacia'}, 'mesh_ids': set()}


In [34]:
# Lower-casing before matching: the number of mapped words increases.
chain = Chain([rule_lower])
combined_dict.map_mesh(combined_mesh, chain)

# Inspect the same entries from the English side, then the Japanese side.
for english_key in ('Acacia', 'acacia'):
    print(combined_dict.en[english_key])
for japanese_key in ('アラビアゴム', 'アカシア'):
    print(combined_dict.ja[japanese_key])

8,867/30,853 English words are mapped to MeSH IDs
12,406/35,903 Japanese words are mapped to MeSH IDs
{'names_ja': {'アラビアゴム'}, 'mesh_ids': {'D000045'}}
{'names_ja': {'アカシア'}, 'mesh_ids': {'D000045'}}
{'names_en': {'Acacia'}, 'mesh_ids': {'D000045'}}
{'names_en': {'acacia'}, 'mesh_ids': {'D000045'}}


In [35]:
# Normalize with jaconv (NFKC)
# Almost no change in the number of mapped words
chain = Chain([rule_njac])
combined_dict.map_mesh(combined_mesh, chain)

1,782/30,853 English words are mapped to MeSH IDs
2,839/35,903 Japanese words are mapped to MeSH IDs


In [36]:
# Convert Roman numerals to Arabic numerals
# No change in the number of mapped words
# Processing time: 1.5 min
# NOTE(review): the recorded output includes CPU/wall times, but this cell has
# no timing magic such as %%time; presumably it was removed after the run. Confirm.
chain = Chain([rule_rn2an])
combined_dict.map_mesh(combined_mesh, chain)

1,781/30,853 English words are mapped to MeSH IDs
2,838/35,903 Japanese words are mapped to MeSH IDs
CPU times: user 1min 23s, sys: 68.1 ms, total: 1min 23s
Wall time: 1min 23s


In [37]:
# Convert Greek letters to their alphabetic spelling
# Almost no change in the number of mapped words
# Processing time: 8.5 min
chain = Chain([rule_g2a])
combined_dict.map_mesh(combined_mesh, chain)

1,790/30,853 English words are mapped to MeSH IDs
2,857/35,903 Japanese words are mapped to MeSH IDs
CPU times: user 8min 29s, sys: 567 ms, total: 8min 30s
Wall time: 8min 30s


In [None]:
# Apply three rules in sequence: lower-casing, normalization, Greek-letter conversion
# Processing time: 10 min
chain = Chain([rule_lower, rule_njac, rule_g2a])
combined_dict.map_mesh(combined_mesh, chain)

# TODO
LSDと比較