1. hp.oboからMICA行列を作る
2. phenotype.hpoaを祖先タームとも関連づけする
3. 2つのタームのレスニック類似度を求める
4. 既知の疾患と患者さんの症状の類似度を求める

## 1. hp.obo から MICA 行列を作る

In [1]:
# ライブラリ読み込み
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pronto
import copy
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
from graphviz import Digraph
from IPython.display import Image

In [2]:
# Load the miniature HPO ontology from the local file 'mini_hp.obo'.
ont = pronto.Ontology('mini_hp.obo')

# Alternative (disabled): load the full hp.obo straight from the HPO GitHub repository.
# ont = pronto.Ontology('https://raw.githubusercontent.com/obophenotype/human-phenotype-ontology/master/hp.obo')

  ont = pronto.Ontology('mini_hp.obo')


In [3]:
# Show how many entries the loaded ontology contains.
len(ont)

5

In [4]:
# Build a DataFrame holding every ordered pair of term IDs.

import pprint  # pretty-printer for easier-to-read output
from itertools import product  # Cartesian product of iterables

id_list = [
    'HP:0001',
    'HP:0002',
    'HP:0003',
    'HP:0004',
    'HP:0005',
]

# Pair each of the five IDs with every ID (itself included). The rows come
# out in row-major order: id_1 varies slowest, id_2 fastest.
ont_pair_df = pd.DataFrame(
    data=list(product(id_list, repeat=2)),
    columns=['id_1', 'id_2'],
)
ont_pair_df  # 5 x 5 = 25 rows

Unnamed: 0,id_1,id_2
0,HP:0001,HP:0001
1,HP:0001,HP:0002
2,HP:0001,HP:0003
3,HP:0001,HP:0004
4,HP:0001,HP:0005
5,HP:0002,HP:0001
6,HP:0002,HP:0002
7,HP:0002,HP:0003
8,HP:0002,HP:0004
9,HP:0002,HP:0005


In [5]:
# For every (id_1, id_2) pair, collect each term's superclasses and intersect
# them as ordered sets. The first element of each intersection is later read
# as the nearest common ancestor of the pair.
# NOTE(review): this relies on superclasses() yielding the term itself first
# and then its ancestors from nearest to farthest, and on the ordered-set
# intersection keeping the left operand's order - confirm against the
# pronto / orderedset documentation.

from orderedset import OrderedSet  # set type that remembers element order

sup_id_1_lst = []          # superclass list of each row's id_1
sup_id_2_lst = []          # superclass list of each row's id_2
most_recent_sup_list = []  # ordered common ancestors of each pair

for left_id, right_id in ont_pair_df[['id_1', 'id_2']].itertuples(index=False, name=None):
    left_supers = list(ont[left_id].superclasses())
    right_supers = list(ont[right_id].superclasses())

    sup_id_1_lst.append(left_supers)
    sup_id_2_lst.append(right_supers)

    # Ordered intersection: the ancestors shared by both terms.
    most_recent_sup_list.append(OrderedSet(left_supers) & OrderedSet(right_supers))
    

In [6]:
# Pick the nearest common ancestor ID for every term pair.
#
# Each entry of most_recent_sup_list is the ordered intersection of the two
# terms' superclass sets; its first element is taken as the nearest common
# ancestor. A pair with no shared ancestor (empty intersection) yields None
# instead of raising IndexError.

target_id_list = [
    common_ancestors[0].id if len(common_ancestors) > 0 else None
    for common_ancestors in most_recent_sup_list
]

In [7]:
# Add the nearest-common-ancestor column to the pair table.
#
# The list is assigned directly (position by position). Wrapping it in a
# DataFrame first would align on the index and silently fill NaN if the
# indexes ever differed; a plain list raises on a length mismatch instead.

ont_pair_df['直近の共通の祖先ターム'] = target_id_list
ont_pair_df = ont_pair_df[['id_1', 'id_2', '直近の共通の祖先ターム']]
ont_pair_df

Unnamed: 0,id_1,id_2,直近の共通の祖先ターム
0,HP:0001,HP:0001,HP:0001
1,HP:0001,HP:0002,HP:0001
2,HP:0001,HP:0003,HP:0001
3,HP:0001,HP:0004,HP:0001
4,HP:0001,HP:0005,HP:0001
5,HP:0002,HP:0001,HP:0001
6,HP:0002,HP:0002,HP:0002
7,HP:0002,HP:0003,HP:0001
8,HP:0002,HP:0004,HP:0002
9,HP:0002,HP:0005,HP:0002


In [8]:
# Create an empty square table with the distinct id_1 values as the row
# index and the distinct id_2 values as the columns (all cells start as NaN).

mica_term_ance_comm = pd.DataFrame(
    index=list(ont_pair_df["id_1"].unique()),
    columns=list(ont_pair_df["id_2"].unique()),
)
mica_term_ance_comm

Unnamed: 0,HP:0001,HP:0002,HP:0003,HP:0004,HP:0005
HP:0001,,,,,
HP:0002,,,,,
HP:0003,,,,,
HP:0004,,,,,
HP:0005,,,,,


In [9]:
# Copy the nearest-common-ancestor values from the long-format pair table
# (ont_pair_df) into the matrix:
#   rows (index) -> id_1
#   columns      -> id_2

row_ids = ont_pair_df["id_1"].unique().tolist()
col_ids = ont_pair_df["id_2"].unique().tolist()

for col_id in col_ids:
    in_this_column = ont_pair_df["id_2"] == col_id
    for row_id in row_ids:
        # Select the pair-table rows for this (id_1, id_2) combination and
        # store the first match's common-ancestor value in the matrix cell.
        matches = ont_pair_df[in_this_column & (ont_pair_df["id_1"] == row_id)]
        mica_term_ance_comm.loc[row_id, col_id] = matches["直近の共通の祖先ターム"].values[0]


print('↓index　：id_1')
print('→columns：id_2')

mica_term_ance_comm

# Can be cross-checked another way (pivot was the only approach that came to
# mind; groupby might also work):
# ont_pair_df.pivot(index='id_1', columns='id_2', values='直近の共通の祖先ターム')

↓index　：id_1
→columns：id_2


Unnamed: 0,HP:0001,HP:0002,HP:0003,HP:0004,HP:0005
HP:0001,HP:0001,HP:0001,HP:0001,HP:0001,HP:0001
HP:0002,HP:0001,HP:0002,HP:0001,HP:0002,HP:0002
HP:0003,HP:0001,HP:0001,HP:0003,HP:0001,HP:0001
HP:0004,HP:0001,HP:0002,HP:0001,HP:0004,HP:0002
HP:0005,HP:0001,HP:0002,HP:0001,HP:0002,HP:0005


In [10]:
# # Jupyter での画像挿入
# <img src="MICA_array_and_IC_made_from_mini_ontology_1.bmp">

In [11]:
#[This link](https://github.com/sayako-osakabe-20210120/Ontology/blob/main/MICA_array_and_IC_made_from_mini_ontology_1.bmp)

![This link](MICA_array_and_IC_made_from_mini_ontology_1.bmp)

![This link](MICA_array_and_IC_made_from_mini_ontology_2.bmp)

![This link](MICA_array_and_IC_made_from_mini_ontology_3.bmp)

![This link](MICA_array_and_IC_made_from_mini_ontology_4.bmp)

## 2. phenotype.hpoaを祖先タームとも関連づけする

In [12]:
# Load the miniature annotation file 'mini_phenotype.hpoa' (tab-separated).
# header=4 selects zero-based row 4 of the file as the column header row.
mini_phenotype_df = pd.read_csv("mini_phenotype.hpoa", header=4, sep="\t", engine="python")

# Drop the leading '#' from the '#DatabaseID' column name.
mini_phenotype_df = mini_phenotype_df.rename(columns={'#DatabaseID': "DatabaseID"})

# Keep only rows whose DatabaseID contains "OMIM", then only the columns used
# later. na=False treats a missing DatabaseID as "no match"; without it the
# mask would contain NaN and the boolean selection would fail.
mini_phenotype_df = mini_phenotype_df[mini_phenotype_df["DatabaseID"].str.contains("OMIM", na=False)]
mini_phenotype_df = mini_phenotype_df[['DatabaseID','DiseaseName','HPO_ID']]

In [13]:
# Prepare an empty working table: one row per distinct disease name and one
# column per term ID in id_list. Every cell starts as NaN.

sample_phenotype_df = pd.DataFrame(
    index=list(mini_phenotype_df["DiseaseName"].unique()),
    columns=id_list,
)
sample_phenotype_df
# (disabled) df.fillna(0, inplace=True)

Unnamed: 0,HP:0001,HP:0002,HP:0003,HP:0004,HP:0005
Daitou shou,,,,,
Shoutou shou,,,,,
Chiteki shougai,,,,,


In [14]:
# Build the list of [disease name, HPO ID] pairs, one per row of
# mini_phenotype_df, in row order.
pair_from_phenotype_list = [
    [disease_name, hpo_id]
    for disease_name, hpo_id in zip(mini_phenotype_df["DiseaseName"], mini_phenotype_df["HPO_ID"])
]

In [15]:
# For each [disease name, HPO ID] pair, look up the superclass terms of the
# HPO ID in the ontology.
#
# Each result is bound to a module-level variable named
# "sup_of_name_<i>_from_phenotype", and the variable names (not the lists)
# are collected in sup_of_name_from_phenotype, because the later fill step
# resolves them by name. The binding goes through globals() directly instead
# of exec() on assembled source text, so no code string is built or executed.

sup_of_name_from_phenotype = []

for i, phenotype_pair in enumerate(pair_from_phenotype_list):
    variable_name = "sup_of_name_{}_from_phenotype".format(i)
    globals()[variable_name] = list(ont[phenotype_pair[1]].superclasses())
    sup_of_name_from_phenotype.append(variable_name)


In [16]:
# Show the generated variable names (one per [disease name, HPO ID] pair).
sup_of_name_from_phenotype

['sup_of_name_0_from_phenotype',
 'sup_of_name_1_from_phenotype',
 'sup_of_name_2_from_phenotype',
 'sup_of_name_3_from_phenotype']

In [17]:
# Mark, for every disease, each annotated term and all of its superclasses
# with 1 in the disease-by-term table.
for phenotype_pair, sup_variable_name in zip(pair_from_phenotype_list, sup_of_name_from_phenotype):
    disease_name = phenotype_pair[0]
    # The superclass lists live in module-level variables; fetch by name
    # without eval().
    for sup_term in globals()[sup_variable_name]:
        # Only existing columns are written, so the table keeps its shape.
        if sup_term.id in sample_phenotype_df.columns:
            # Single .loc[row, column] write: chained df.loc[row][col] = ...
            # may assign to a temporary copy and leave the table unchanged.
            sample_phenotype_df.loc[disease_name, sup_term.id] = 1

In [18]:
# Show the disease-by-term table after filling in the annotations.
sample_phenotype_df

Unnamed: 0,HP:0001,HP:0002,HP:0003,HP:0004,HP:0005
Daitou shou,1,1.0,,1.0,
Shoutou shou,1,1.0,1.0,,1.0
Chiteki shougai,1,,1.0,,


In [19]:
# Append an 'IC' row: the information content of each term, computed as
#   IC(term) = -ln(number of diseases marked with the term / number of diseases)
all_unique_name_count = mini_phenotype_df["DiseaseName"].nunique()

# Count only the disease rows. If this cell is re-run, a previously appended
# 'IC' row must not be added into the per-term counts. The cast to float
# gives np.log a numeric Series even though the table's columns hold
# object-typed values.
term_disease_counts = sample_phenotype_df.drop(index='IC', errors='ignore').sum().astype(float)

sample_phenotype_df.loc['IC'] = -np.log(term_disease_counts / all_unique_name_count)
# Without a DataFrame, the equivalent is: import math, then -math.log(x, math.e).
sample_phenotype_df

Unnamed: 0,HP:0001,HP:0002,HP:0003,HP:0004,HP:0005
Daitou shou,1,1.0,,1.0,
Shoutou shou,1,1.0,1.0,,1.0
Chiteki shougai,1,,1.0,,
IC,0,0.405465,0.405465,1.09861,1.09861


In [20]:
#[This link](https://github.com/sayako-osakabe-20210120/Ontology/blob/main/MICA_array_and_IC_made_from_mini_ontology_2.bmp)

In [21]:
#[This link](https://github.com/sayako-osakabe-20210120/Ontology/blob/main/MICA_array_and_IC_made_from_mini_ontology_3.bmp)

In [22]:
#[This link](https://github.com/sayako-osakabe-20210120/Ontology/blob/main/MICA_array_and_IC_made_from_mini_ontology_4.bmp)

In [23]:
#sample_phenotype_df

In [24]:
!git add .

The file will have its original line endings in your working directory
The file will have its original line endings in your working directory


In [25]:
!git commit

Aborting commit due to empty commit message.


In [26]:
!git commit -m "recommit" -v

[main a53e678] recommit
 2 files changed, 106 insertions(+), 20 deletions(-)


In [27]:
!git push

To https://github.com/sayako-osakabe-20210120/Ontology.git
   78d1a90..a53e678  main -> main
