# 8_binary_SE_TI_Target.ipynb
Create a file with Path and SE/TI linked.
### input
- 3_Calc_Edit_Distance/output/KEGG_ID_index.csv : A file with Paths, Path IDs and Pathway map ID associated.
- 2_Paths_from_KEGG_Pathway_datafile/Paths_from_KEGG_Pathway.csv : A file with Path and Target associated. In this notebook, only the columns 'KEGG_id' and 'HSA' of this file are used.
- 7_Target_Drug_SE_TI_from_SIDER_DrugBank/tbl_drug_target_TI_SE.pkl : A file with Drug and Target and TI/SE associated. In this notebook, only the columns 'entry_id', 'SE' and 'TI' of this file are used.
- 7_Target_Drug_SE_TI_from_SIDER_DrugBank/Use_SE_177.csv : The list of SEs used for training LGBM.

### output
- 9_Integration_SE_TI_Target_datafile/Y_binary_SE.npz : A file with Path ID and SE linked.
- 9_Integration_SE_TI_Target_datafile/Y_binary_TI.npz : A file with Path ID and TI linked.
- 9_Integration_SE_TI_Target_datafile/Y_ID_name_SE.csv : A file with SE ID and SE name linked.
- 9_Integration_SE_TI_Target_datafile/Y_ID_name_TI.csv : A file with TI ID and TI name linked.

In [1]:
import ast

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, coo_matrix
from scipy.sparse import save_npz

In [2]:
# Load the Path / Path-ID table; its pathway-map column ('hsa_map') is not used here.
df_order = (
    pd.read_csv(
        '../3_Calc_Edit_Distance/output/KEGG_ID_index.csv',
        header=0,
        index_col=0,
    )
    .drop('hsa_map', axis=1)
)

In [3]:
# Load the Path/Target table, discard the two columns that are not needed,
# collapse identical rows and renumber the rows from 0.
df_t = (
    pd.read_csv(
        '../2_Paths_from_KEGG_Pathway_datafile/Paths_from_KEGG_Pathway.csv',
        header=0,
        index_col=0,
    )
    .drop(columns=['hsa_map', 'order1'])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [4]:
# Attach the numeric path ID to every (path, target) row.
# The join is an inner join (pandas default) on the path column 'KEGG_id'.
df = df_order.merge(df_t, on='KEGG_id')

In [5]:
# Preview of the merged table (columns: index, KEGG_id, HSA).
df

Unnamed: 0,index,KEGG_id,HSA
0,0,['hsa:10013 hsa:10014 hsa:3065 hsa:3066 hsa:51...,3065
1,0,['hsa:10013 hsa:10014 hsa:3065 hsa:3066 hsa:51...,3066
2,0,['hsa:10013 hsa:10014 hsa:3065 hsa:3066 hsa:51...,55869
3,0,['hsa:10013 hsa:10014 hsa:3065 hsa:3066 hsa:51...,10013
4,0,['hsa:10013 hsa:10014 hsa:3065 hsa:3066 hsa:51...,9759
...,...,...,...
94772,67476,"['hsa:9453', 'path:map00909', 'hsa:2339 hsa:23...",9453
94773,67477,"['hsa:9453', 'path:map00909', 'hsa:2339 hsa:23...",9453
94774,67478,"['hsa:9453', 'path:map00909', 'hsa:2339 hsa:23...",9453
94775,67479,"['hsa:9453', 'path:map00909', 'path:hsa00100']",9453


In [6]:
import pickle

# Load the Drug / Target / TI / SE table produced by step 7.
# NOTE(review): pickle.load can execute arbitrary code while loading, so this
# must only ever be pointed at a trusted file.
with open('../7_Target_Drug_SE_TI_from_SIDER_DrugBank/tbl_drug_target_TI_SE.pkl', mode='rb') as pkl_file:
    df_SE_TI = pickle.load(pkl_file)

In [7]:
# Strip the literal 'hsa:' prefix from each entry ID and store the remainder as an
# integer, so that it can be joined with the 'HSA' column of the path table.
df_SE_TI['HSA'] = (
    df_SE_TI['entry_id']
    .str.replace('hsa:', '', regex=False)
    .astype('int64')
)

In [8]:
# Split the combined table into one (labels, target) table per label type and
# collapse rows that are exact repeats.
df_SE = df_SE_TI.loc[:, ['SE', 'HSA']].drop_duplicates()
df_TI = df_SE_TI.loc[:, ['TI', 'HSA']].drop_duplicates()

In [9]:
# The 'TI' / 'SE' cells are parsed from text into Python objects (presumably list
# literals -- they are exploded / concatenated as lists further down).
# ast.literal_eval is used instead of eval: it only accepts Python literals, so a
# malformed or malicious cell cannot run code. Values that are not strings (i.e.
# already parsed) are passed through unchanged, so re-running this cell is safe.
def _parse_list_literal(cell):
    """Return the literal encoded by ``cell`` when it is a string, else ``cell`` itself."""
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


df_TI['TI'] = df_TI['TI'].apply(_parse_list_literal)
df_SE['SE'] = df_SE['SE'].apply(_parse_list_literal)

In [10]:
# Load the SEs that are kept as labels (a single 'SE' column).
df_use = pd.read_csv(
    '../7_Target_Drug_SE_TI_from_SIDER_DrugBank/Use_SE_177.csv',
    header=0,
)

In [11]:
# Preview of the SE names that will be kept.
df_use

Unnamed: 0,SE
0,Heart rate abnormal
1,Anaplastic thyroid cancer
2,Breast cancer female
3,Breast cancer invasive NOS
4,Cardiac discomfort
...,...
172,Renal impairment
173,Hepatic function abnormal
174,Renal failure acute
175,Liver function test abnormal


In [12]:
# 1. one row per (target, SE) pair,
# 2. keep only the SEs that appear in df_use (inner join),
# 3. gather the surviving SEs back into one list per target.
# The resulting lists may contain the same SE more than once.
df_SE = (
    df_SE
    .explode('SE')
    .merge(df_use, on='SE')
    .groupby('HSA')['SE']
    .apply(list)
    .reset_index()
)

In [13]:
# Preview: one row per target ('HSA') with its list of retained SEs.
df_SE

Unnamed: 0,HSA,SE
0,2,"[Renal failure acute, Renal failure, Hepatocel..."
1,18,"[Hepatocellular injury, Suicide attempt, Memor..."
2,19,"[Renal failure acute, Hepatitis cholestatic, H..."
3,25,"[Cardiac arrest, Liver injury, Liver injury, A..."
4,26,"[Liver disorder, Hepatic function abnormal, He..."
...,...,...
539,133522,"[Cardiac arrest, Liver function test abnormal,..."
540,134864,[Sudden death]
541,150094,"[Gingival bleeding, Renal failure acute, Renal..."
542,203068,"[Cardiac arrest, Gingival bleeding, Renal fail..."


In [14]:
# Preview: TI lists per target ('HSA'); a target can appear on several rows.
df_TI

Unnamed: 0,TI,HSA
0,"[Multiple sclerosis, Walking disability]",3739
1,"[Multiple sclerosis, Walking disability]",3748
2,"[Multiple sclerosis, Walking disability]",3751
3,"[Arrhythmia, Bundle branch block, Sudden death...",3751
4,"[Alcoholism, Depression, Endogenous depression...",3751
...,...,...
2743,"[Hepato-lenticular degeneration, Liver disorder]",7161
2744,"[Epilepsy, Partial seizures, Sudden unexplaine...",377677
2745,"[Epilepsy, Partial seizures, Sudden unexplaine...",11238
2746,"[Epilepsy, Partial seizures, Sudden unexplaine...",767


In [15]:
# Give every (path, target) row the SE list of its target (inner join on 'HSA'),
# then drop the target ID, which is no longer needed.
df_s = df.merge(df_SE, on='HSA').drop('HSA', axis=1)

In [16]:
# Merge the SE lists of all targets that lie on the same path: summing Python
# lists concatenates them. The result is indexed by the numeric path ID.
df_s1 = (
    df_s
    .groupby(['KEGG_id', 'index'])['SE']
    .sum()
    .reset_index()
    .drop(columns='KEGG_id')
    .set_index(['index'])
)

In [17]:
# Build the path-ID x SE count table: one indicator row per (path, SE) occurrence,
# summed per path ID.
# - Series.explode gives one row per list element directly; the previous
#   .apply(pd.Series).stack() first built a very wide temporary frame.
# - .groupby(level=0).sum() is the supported spelling of .sum(level=0), which was
#   deprecated in pandas 1.3 and removed in pandas 2.0.
# Rows come out sorted by path ID.
df_s_all = (
    pd.get_dummies(df_s1['SE'].explode())
    .groupby(level=0)
    .sum()
)

In [18]:
# Expand the table to one row per path ID 0..67480; paths that had no SE at all
# are missing from df_s_all and become all-zero rows.
# NOTE(review): 67481 is hard-coded -- it has to match the number of paths in the
# index file; confirm whenever the upstream data changes.
df_s_all = (
    pd.DataFrame({'index': range(67481)})
    .merge(df_s_all.reset_index(), on='index', how='left')
    .fillna(0)
    .set_index('index')
)

In [19]:
# Preview of the path-ID x SE table (values are occurrence counts, not yet 0/1).
df_s_all

Unnamed: 0_level_0,Acute hepatic failure,Acute liver damage,Acute liver injury,Acute renal insufficiency,Adrenal haemorrhage,Adrenal insufficiency,Advanced breast cancer,Allergic hepatitis,Anaplastic thyroid cancer,Auditory and visual hallucinations,...,"Sudden death, cause unknown",Sudden infant death syndrome,Sudden unexplained death in epilepsy,Suicidal behaviour,Suicidal ideation,Suicidal tendency,Suicide,Suicide attempt,Uterine cancer,Venoocclusive liver disease
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67476,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67477,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67478,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67479,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Give every (path, target) row the TI list(s) of its target (inner join on 'HSA'),
# then drop the target ID.
# NOTE: this re-binds df_t, which up to here held the Path/Target table.
df_t = df.merge(df_TI, on='HSA').drop('HSA', axis=1)

In [21]:
# Merge the TI lists of all targets that lie on the same path: summing Python
# lists concatenates them. The result is indexed by the numeric path ID.
df_t1 = (
    df_t
    .groupby(['KEGG_id', 'index'])['TI']
    .sum()
    .reset_index()
    .drop(columns='KEGG_id')
    .set_index(['index'])
)

In [22]:
# Build the path-ID x TI count table: one indicator row per (path, TI) occurrence,
# summed per path ID.
# - Series.explode gives one row per list element directly; the previous
#   .apply(pd.Series).stack() first built a very wide temporary frame.
# - .groupby(level=0).sum() is the supported spelling of .sum(level=0), which was
#   deprecated in pandas 1.3 and removed in pandas 2.0.
# - The table is re-indexed to every path ID 0..67480 (all-zero rows for paths
#   without any TI), as is done for the SE table, so that row position in the
#   saved matrix always equals the path ID.
# NOTE(review): 67481 is the same hard-coded path count used for the SE table.
df_t_all = (
    pd.get_dummies(df_t1['TI'].explode())
    .groupby(level=0)
    .sum()
    .reindex(pd.RangeIndex(67481, name='index'), fill_value=0)
)

In [23]:
# Preview of the path-ID x TI table (values are occurrence counts, not yet 0/1).
df_t_all

Unnamed: 0_level_0,ADHF,AV nodal reentrant tachycardia,AV reentrant tachycardia,Abdominal aortic aneurysm,Abdominal bloating,Abdominal cramps,Abdominal distension,Abdominal distress,Abdominal pain,Abnormal behaviour,...,Wernicke's encephalopathy,White blood cell count decreased,Withdrawal symptom,Withdrawal syndrome,Wolff-Parkinson-White syndrome,Worry,Wound sepsis,Xanthoma,Xeroderma pigmentosum,Zollinger-Ellison syndrome
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67476,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67477,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67478,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67479,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Binarise the SE count table. Converting the dense table to COO keeps only the
# non-zero cells; rebuilding a matrix from those coordinates with all-ones data
# turns every count > 0 into 1.0.
matrix_all_s = coo_matrix(df_s_all)
# Take the shape from the sparse matrix itself; going through .toarray() would
# allocate the full dense array just to read two integers.
shape_se = matrix_all_s.shape
se_binary = csr_matrix(
    (np.ones(len(matrix_all_s.data)), (matrix_all_s.row, matrix_all_s.col)),
    shape=shape_se,
)

In [25]:
# Report the (rows, columns) shape of the SE label matrix.
print(f'Size of the matrix of response variables in SE: {shape_se}')

Size of the matrix of response variables in SE: (67481, 177)


In [26]:
# Write the binary SE matrix to disk in SciPy's sparse .npz format.
save_npz('../9_Integration_SE_TI_Target_datafile/Y_binary_SE.npz', se_binary)

In [27]:
# Binarise the TI count table. Converting the dense table to COO keeps only the
# non-zero cells; rebuilding a matrix from those coordinates with all-ones data
# turns every count > 0 into 1.0.
matrix_all_t = coo_matrix(df_t_all)
# Take the shape from the sparse matrix itself; going through .toarray() would
# allocate the full dense array just to read two integers.
shape_ti = matrix_all_t.shape
ti_binary = csr_matrix(
    (np.ones(len(matrix_all_t.data)), (matrix_all_t.row, matrix_all_t.col)),
    shape=shape_ti,
)

In [28]:
# Report the (rows, columns) shape of the TI label matrix.
print(f'Size of the matrix of response variables in TI: {shape_ti}')

Size of the matrix of response variables in TI: (67481, 1684)


In [29]:
# Write the binary TI matrix to disk in SciPy's sparse .npz format.
save_npz('../9_Integration_SE_TI_Target_datafile/Y_binary_TI.npz', ti_binary)

In [30]:
# Lookup table from SE matrix column position (the row label) to SE name,
# written next to the matrix so its columns can be decoded later.
se_name = pd.DataFrame({'SE_name': df_s_all.columns})
se_name.to_csv(
    '../9_Integration_SE_TI_Target_datafile/Y_ID_name_SE.csv',
    encoding='utf-8',
)

In [31]:
# Lookup table from TI matrix column position (the row label) to TI name,
# written next to the matrix so its columns can be decoded later.
ti_name = pd.DataFrame({'TI_name': df_t_all.columns})
ti_name.to_csv(
    '../9_Integration_SE_TI_Target_datafile/Y_ID_name_TI.csv',
    encoding='utf-8',
)