### Language Model Setup

In [2]:
import openai
import os

openai.api_key =  os.getenv('OPENAI_API_KEY')

In [3]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(temperature=0.0, model_name='gpt-3.5-turbo')

### Tokenizer Setup

In [4]:
import tiktoken 
tiktoken.encoding_for_model('gpt-3.5-turbo')

<Encoding 'cl100k_base'>

In [5]:
tokenizer = tiktoken.get_encoding('cl100k_base')

def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

In [6]:
from langchain.embeddings.openai import OpenAIEmbeddings

model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=os.environ['OPENAI_API_KEY']
)

In [7]:
text_examples = ["I am a text example", 
                 "I am another text example"]

In [8]:
result = embed.embed_documents(text_examples)

In [9]:
len(result[0])

1536

### Importing Raw Datasets

In [10]:
kegg_medicus_drug = 'raw_datasets/kegg_medicus_drug_en.csv'
kegg_medicus_disease = 'raw_datasets/kegg_medicus_disease_en.csv'
kegg_medicus_dgroup = 'raw_datasets/kegg_medicus_dgroup_en.csv'
kegg_medicus_environ = 'raw_datasets/kegg_medicus_environ_en.csv'
kegg_medicus_network = 'raw_datasets/kegg_medicus_network.csv'
kegg_medicus_variant = 'raw_datasets/kegg_medicus_variant.csv'

In [11]:
import pandas as pd

raw_drug_df = pd.read_csv(kegg_medicus_drug)
raw_disease_df = pd.read_csv(kegg_medicus_disease)
raw_dgroup_df = pd.read_csv(kegg_medicus_dgroup)
raw_environ_df = pd.read_csv(kegg_medicus_environ)
raw_network_df = pd.read_csv(kegg_medicus_network)
raw_variant_df = pd.read_csv(kegg_medicus_variant)

### Data Cleaning & Preprocessing

In [12]:
# empty variable for preprocessing function
row_to_dict = None

In [13]:
preprocessed_datasets = []

In [14]:
def convert_dataframe_to_list(df):
    result_list = df.apply(row_to_dict, axis=1).tolist()
    return result_list

#### Kegg Medicus Drug Data Preprocessing

In [15]:
raw_drug_df.head()

Unnamed: 0,entry_id,classification,name,component,formula,exact_mass,mol_weight,image,sequence,sequence type,...,dblinks_w_link,comment_w_link,interaction_w_link,target_w_link,source_w_link,metabolism_w_link,sequence_w_link,component_w_link,class_w_link,raw_entry_id
0,D00001,Drug,Water (JP18/USP);\nPurified water (JP18);\nPur...,,H2O,18.0106,18.0153,D00001,,,...,"CAS<a href=""https://identifiers.org/cas:7732-1...",,,,,,,,,D00001
1,D00002,Drug,Nadide (JAN/USAN/INN);\nNicotinamide adenine d...,,C21H28N7O14P2,664.1169,664.433,D00002,,,...,"CAS<a href=""https://identifiers.org/cas:53-84-...",Antagonist (to alcohol and narcotics),,,,,,,,D00002
2,D00003,Drug,Oxygen (JP18/USP),,O2,31.9898,31.9988,D00003,,,...,"CAS<a href=""https://identifiers.org/cas:7782-4...",,,,,,,,,D00003
3,D00004,Drug,Carbon dioxide (JP18/USP);\nCarbon dioxide (TN),,CO2,43.9898,44.0095,D00004,,,...,"CAS<a href=""https://identifiers.org/cas:124-38...",,,,,,,,,D00004
4,D00005,Drug,Flavin adenine dinucleotide (JAN);\nAdeflavin ...,,C27H33N9O15P2,785.1571,785.5497,D00005,,,...,"CAS<a href=""https://identifiers.org/cas:146-14...",Coenzyme form of vitamin B2,,,,,,,,D00005


In [16]:
raw_drug_df.groupby('classification')['classification'].nunique()

classification
Crude Drug      1
Drug            1
Formula Drug    1
Mixture Drug    1
Name: classification, dtype: int64

In [17]:
raw_drug_df.isna().sum()

entry_id                  0
classification            0
name                      1
component             10645
formula                2296
exact_mass             2658
mol_weight             2658
image                     0
sequence              11227
sequence type         11225
source                11325
class                  6430
remark                 4109
efficacy                 24
disease               10516
comment                5738
target                 5507
metabolism            10733
interaction           11591
db_links                244
kcf                    3139
atom                   3139
bond                   3139
bracket               10980
original              10986
repeat                10986
disease_w_link        10522
remark_w_link          4109
efficacy_w_link       11938
dblinks_w_link          244
comment_w_link         5738
interaction_w_link     6208
target_w_link          5507
source_w_link         11325
metabolism_w_link     10733
sequence_w_link     

In [18]:
exclude = ['component', 'sequence', 'source', 'db_links', 'sequence type', 
           'dblinks_w_link', 'comment_w_link', 'target_w_link', 'disease', 'disease_w_link',
           'remark_w_link', 'image', 'raw_entry_id', 'class', 'original', 'repeat',
           'efficacy_w_link','source_w_link','metabolism_w_link','sequence_w_link',
            'interaction_w_link','component_w_link','class_w_link', 'interaction','kcf','atom', 'bond',
            'bracket', 'remark','metabolism','target']
clean_drug_df = raw_drug_df[[col for col in raw_drug_df.columns if col not in exclude]]


In [19]:
clean_drug_df.columns

Index(['entry_id', 'classification', 'name', 'formula', 'exact_mass',
       'mol_weight', 'efficacy', 'comment'],
      dtype='object')

In [20]:
clean_drug_df.head()

Unnamed: 0,entry_id,classification,name,formula,exact_mass,mol_weight,efficacy,comment
0,D00001,Drug,Water (JP18/USP);\nPurified water (JP18);\nPur...,H2O,18.0106,18.0153,Pharmaceutic aid (solvent),
1,D00002,Drug,Nadide (JAN/USAN/INN);\nNicotinamide adenine d...,C21H28N7O14P2,664.1169,664.433,Coenzyme,Antagonist (to alcohol and narcotics)
2,D00003,Drug,Oxygen (JP18/USP),O2,31.9898,31.9988,Medical gases,
3,D00004,Drug,Carbon dioxide (JP18/USP);\nCarbon dioxide (TN),CO2,43.9898,44.0095,Medical gases,
4,D00005,Drug,Flavin adenine dinucleotide (JAN);\nAdeflavin ...,C27H33N9O15P2,785.1571,785.5497,Supplement (vitamin B2),Coenzyme form of vitamin B2


In [21]:
#1
def row_to_dict(row):
    entry_id = row['entry_id']
    name = row['name']
    efficacy = row['efficacy']
    comment = row['comment']
    formula = row['formula']
    exact_mass = row['exact_mass']
    mol_weight = row['mol_weight']
    classification = row['classification']
    text = f"name: {name}; formula: {formula}; efficacy: {efficacy}; comment: {comment}"
    output_dict = {
            "id": entry_id,
            "input": text,
            "metadata": {
                "name": name,
                "text": text,
                "classification": classification,
                "formula": formula,
                "exact mass": exact_mass,
                "mol_weight": mol_weight
            }
        }

    return output_dict

In [22]:
drugs_preprocess = convert_dataframe_to_list(clean_drug_df)
print(len(drugs_preprocess))
drugs_preprocess[0]

11938


{'id': 'D00001',
 'input': 'name: Water (JP18/USP);\nPurified water (JP18);\nPurified water in containers (JP18);\nWater, purified (USP);\nSterile purified water in containers (JP18);\nWater for injection (JP18);\nWater for injection in containers (JP18);\nSterile water (TN); formula: H2O; efficacy: Pharmaceutic aid (solvent); comment: nan',
 'metadata': {'name': 'Water (JP18/USP);\nPurified water (JP18);\nPurified water in containers (JP18);\nWater, purified (USP);\nSterile purified water in containers (JP18);\nWater for injection (JP18);\nWater for injection in containers (JP18);\nSterile water (TN)',
  'text': 'name: Water (JP18/USP);\nPurified water (JP18);\nPurified water in containers (JP18);\nWater, purified (USP);\nSterile purified water in containers (JP18);\nWater for injection (JP18);\nWater for injection in containers (JP18);\nSterile water (TN); formula: H2O; efficacy: Pharmaceutic aid (solvent); comment: nan',
  'classification': 'Drug',
  'formula': 'H2O',
  'exact mass'

In [23]:
drugs_df = pd.DataFrame(drugs_preprocess)
preprocessed_datasets.append(drugs_df)
print(drugs_df.shape)
drugs_df.head()

(11938, 3)


Unnamed: 0,id,input,metadata
0,D00001,name: Water (JP18/USP);\nPurified water (JP18)...,{'name': 'Water (JP18/USP); Purified water (JP...
1,D00002,name: Nadide (JAN/USAN/INN);\nNicotinamide ade...,{'name': 'Nadide (JAN/USAN/INN); Nicotinamide ...
2,D00003,name: Oxygen (JP18/USP); formula: O2; efficacy...,"{'name': 'Oxygen (JP18/USP)', 'text': 'name: O..."
3,D00004,name: Carbon dioxide (JP18/USP);\nCarbon dioxi...,{'name': 'Carbon dioxide (JP18/USP); Carbon di...
4,D00005,name: Flavin adenine dinucleotide (JAN);\nAdef...,{'name': 'Flavin adenine dinucleotide (JAN); A...


#### Kegg Medicus Disease Data Preprocessing

In [24]:
raw_disease_df.head()

Unnamed: 0,ENTRY,NAME,SUBGROUP,SUPERGRP,DESCRIPTION,CATEGORY,NETWORK,GENE,PATHOGEN,ENV_FACTOR,...,DESCRIPTION_link,NETWORK_link,GENE_link,PATHOGEN_link,ENV_FACTOR_link,CARCINOGEN_link,DRUG_link,COMMENT_link,DBLINKS_link,REFERENCE_link
0,H00001,B-cell acute lymphoblastic leukemia;\n B-cell ...,Philadelphia chromosome positive acute lymphob...,Non-Hodgkin lymphoma [DS:H02418],Acute lymphocytic leukemia (ALL) is a clonal s...,Cancer,,BCR-ABL (translocation) [HSA:25] [KO:K06619]\n...,,,...,Acute lymphocytic leukemia (ALL) is a clonal s...,,"BCR-ABL (translocation) [HSA:<a href=""https://...",,,"*Thorium-232 and its decay products, administe...","Cyclophosphamide [DR:<a href=""https://www.geno...",,ICD-11: 2A70\n ICD-10: C83.5 C91.0\n MeSH: <a ...,"PMID:<a href=""https://pubmed.ncbi.nlm.nih.gov/..."
1,H00002,T-cell acute lymphoblastic leukemia;\n T-cell ...,T-cell lymphoblastic lymphoma,,Acute lymphocytic leukemia (ALL) is a clonal s...,Cancer,,NOTCH1 (mutation) [HSA:4851] [KO:K02599]\n TAL...,,,...,Acute lymphocytic leukemia (ALL) is a clonal s...,,"NOTCH1 (mutation) [HSA:<a href=""https://www.ge...",,,"*Thorium-232 and its decay products, administe...","Cyclophosphamide [DR:<a href=""https://www.geno...",,ICD-11: 2A71\n ICD-10: C83.5 C91.0\n MeSH: <a ...,"PMID:<a href=""https://pubmed.ncbi.nlm.nih.gov/..."
2,H00003,Acute myeloid leukemia,Acute promyelocytic leukemia [DS:H02542],,Acute myeloid leukemia (AML) is a disease that...,Cancer,nt06275 Acute myeloid leukemia,PML-RARalpha (translocation) [HSA:5371] [KO:K1...,,,...,Acute myeloid leukemia (AML) is a disease that...,"<a href=""https://www.kegg.jp/network/nt06275"">...","PML-RARalpha (translocation) [HSA:<a href=""htt...",,,"Benzene [CPD:<a href=""https://www.genome.jp/db...","Cyclophosphamide [DR:<a href=""https://www.geno...",,"ICD-11: 2A60\n ICD-10: C92.0\n MeSH: <a href=""...","PMID:<a href=""https://pubmed.ncbi.nlm.nih.gov/..."
3,H00004,Chronic myeloid leukemia,,,Chronic myeloid leukemia (CML) is a clonal mye...,Cancer,nt06276 Chronic myeloid leukemia,BCR-ABL (translocation) [HSA:613 25] [KO:K0887...,,,...,Chronic myeloid leukemia (CML) is a clonal mye...,"<a href=""https://www.kegg.jp/network/nt06276"">...","BCR-ABL (translocation) [HSA:<a href=""https://...",,,"1,3-Butadiene [CPD:<a href=""https://www.genome...","Busulfan [DR:<a href=""https://www.genome.jp/db...",,ICD-11: 2A20.0\n ICD-10: C92.1\n MeSH: <a href...,"PMID:<a href=""https://pubmed.ncbi.nlm.nih.gov/..."
4,H00005,Chronic lymphocytic leukemia,,Non-Hodgkin lymphoma [DS:H02418],Chronic lymphocytic leukemia (CLL) is caused b...,Cancer,,Bcl-2 (overexpression) [HSA:596] [KO:K02161]\n...,,,...,Chronic lymphocytic leukemia (CLL) is caused b...,,"Bcl-2 (overexpression) [HSA:<a href=""https://w...",,,"1,3-Butadiene [CPD:<a href=""https://www.genome...","Cyclophosphamide [DR:<a href=""https://www.geno...",,ICD-11: 2A82.0\n ICD-10: C91.1\n MeSH: <a href...,"PMID:<a href=""https://pubmed.ncbi.nlm.nih.gov/..."


In [25]:
raw_disease_df.shape

(2577, 28)

In [26]:
raw_disease_df.isna().sum()

ENTRY                  0
NAME                   0
SUBGROUP            2168
SUPERGRP            1874
DESCRIPTION            0
CATEGORY               0
NETWORK             2400
GENE                 516
PATHOGEN            2211
ENV_FACTOR          2503
CARCINOGEN          2537
DRUG                2119
COMMENT             2145
DBLINKS               35
REFERENCE              3
ENTRY_link             0
SUBGROUP_link       2168
SUPERGRP_link       1874
DESCRIPTION_link       0
NETWORK_link        2400
GENE_link            516
PATHOGEN_link       2211
ENV_FACTOR_link     2503
CARCINOGEN_link     2537
DRUG_link           2119
COMMENT_link        2145
DBLINKS_link          35
REFERENCE_link         3
dtype: int64

In [27]:
exclude = ['SUBGROUP', 'SUPERGRP', 'NETWORK', 'ENTRY_link',
           'SUBGROUP_link', 'SUPERGRP_link', 'DESCRIPTION_link',
           'NETWORK_link', 'GENE_link', 'PATHOGEN_link', 'ENV_FACTOR_link',
           'CARCINOGEN_link', 'DRUG_link', 'COMMENT_link','DBLINKS_link',
           'REFERENCE_link', 'DBLINKS', 'REFERENCE']
clean_disease_df = raw_disease_df[[col for col in raw_disease_df.columns if col not in exclude]]
clean_disease_df.head()

Unnamed: 0,ENTRY,NAME,DESCRIPTION,CATEGORY,GENE,PATHOGEN,ENV_FACTOR,CARCINOGEN,DRUG,COMMENT
0,H00001,B-cell acute lymphoblastic leukemia;\n B-cell ...,Acute lymphocytic leukemia (ALL) is a clonal s...,Cancer,BCR-ABL (translocation) [HSA:25] [KO:K06619]\n...,,,"*Thorium-232 and its decay products, administe...",Cyclophosphamide [DR:D00287]\n Methotrexate [D...,
1,H00002,T-cell acute lymphoblastic leukemia;\n T-cell ...,Acute lymphocytic leukemia (ALL) is a clonal s...,Cancer,NOTCH1 (mutation) [HSA:4851] [KO:K02599]\n TAL...,,,"*Thorium-232 and its decay products, administe...",Cyclophosphamide [DR:D00287]\n Methotrexate [D...,
2,H00003,Acute myeloid leukemia,Acute myeloid leukemia (AML) is a disease that...,Cancer,PML-RARalpha (translocation) [HSA:5371] [KO:K1...,,,"Benzene [CPD:C01407]\n 1,4-Butanediol dimethan...",Cyclophosphamide [DR:D00287]\n Thioguanine [DR...,
3,H00004,Chronic myeloid leukemia,Chronic myeloid leukemia (CML) is a clonal mye...,Cancer,BCR-ABL (translocation) [HSA:613 25] [KO:K0887...,,,"1,3-Butadiene [CPD:C16450]\n Rubber industry",Busulfan [DR:D00248]\n Thioguanine [DR:D06109]...,
4,H00005,Chronic lymphocytic leukemia,Chronic lymphocytic leukemia (CLL) is caused b...,Cancer,Bcl-2 (overexpression) [HSA:596] [KO:K02161]\n...,,,"1,3-Butadiene [CPD:C16450]\n Ethylene oxide [C...",Cyclophosphamide [DR:D00287]\n Chlorambucil [D...,


In [28]:
#2
def row_to_dict(row):
    entry_id = row['ENTRY']
    name = row['NAME']
    description = row['DESCRIPTION']
    comment = row['COMMENT']
    category = row['CATEGORY']
    gene = row['GENE']
    pathogen = row['PATHOGEN']
    env_factor = row['ENV_FACTOR']
    carcinogen = row['CARCINOGEN']
    drug = row['DRUG']
    text = f"name: {name}; category: {category}; description: {description}; drug: {drug}"

    output_dict = {
            "id": entry_id,
            "input": text,
            "metadata": {
                "name": name,
                "text": text,
                "gene": gene,
                "pathogen": pathogen,
                "env_factor": env_factor,
                "carcinogen": carcinogen,
                "drug": drug,
                "comment": comment,
            }
        }

    return output_dict

In [None]:
disease_preprocess = convert_dataframe_to_list(clean_disease_df)
print(len(disease_preprocess))
disease_preprocess[1]

2577


{'id': 'H00002',
 'input': 'name: T-cell acute lymphoblastic leukemia;\n T-cell acute lymphocytic leukemia; category: Cancer; description: Acute lymphocytic leukemia (ALL) is a clonal stem cell malignancy of excessive lymphoblast proliferation. It is now understood that ALL and lymphoblastic lymphoma are the same disease entities at the morphologic and immunophenotypic levels and classified as either B- and T-cell lymphoblastic leukemia/lymphoma (B-ALL and T-ALL). T-ALL comprises 15% of paediatric and 25% of adult ALL cases. T cell transformation is a multi-step process in which different genetic alterations cooperate to alter the normal mechanisms that control cell growth, proliferation, survival, and differentiation during thymocyte development. In this context, constitutive activation of NOTCH1 signaling is the most prominent oncogenic pathway in T cell transformation. In addition, T-ALLs characteristically show the translocation and aberrant expression of transcription factor oncog

In [30]:
disease_df = pd.DataFrame(disease_preprocess)
preprocessed_datasets.append(disease_df)
print(disease_df.shape)
disease_df.head()

(2577, 3)


Unnamed: 0,id,input,metadata
0,H00001,name: B-cell acute lymphoblastic leukemia;\n B...,{'name': 'B-cell acute lymphoblastic leukemia;...
1,H00002,name: T-cell acute lymphoblastic leukemia;\n T...,{'name': 'T-cell acute lymphoblastic leukemia;...
2,H00003,name: Acute myeloid leukemia; category: Cancer...,"{'name': 'Acute myeloid leukemia', 'text': 'na..."
3,H00004,name: Chronic myeloid leukemia; category: Canc...,"{'name': 'Chronic myeloid leukemia', 'text': '..."
4,H00005,name: Chronic lymphocytic leukemia; category: ...,"{'name': 'Chronic lymphocytic leukemia', 'text..."


#### Kegg Medicus D-Group Data Preprocessing

In [31]:
raw_dgroup_df.shape

(2398, 13)

In [32]:
raw_dgroup_df.head()

Unnamed: 0,ENTRY,CLASSIFICATION,NAME,STEM,IMAGE,MEMBER,CLASS,REMARK,COMMENT,ENTRY_link,MEMBER_link,CLASS_link,REMARK_link
0,DG00001,DGroup,Chlorhexidine\n TYPE Chemical,,,D07668 Chlorhexidine (INN)\n D00858 Chlorhex...,,ATC code: A01AB03 B05CA02 D08AC02 D09AA12 R02A...,,"<a href=""https://www.kegg.jp/dbget-bin/www_bge...","<a href=""http://togodb.biosciencedbc.jp/togodb...",,"ATC code: <a href=""https://www.kegg.jp/kegg-bi..."
1,DG00002,DGroup,Oxyquinoline\n TYPE Chemical,,,D05321 Oxyquinoline (USAN)\n D02414 Oxyquino...,,ATC code: A01AB07 D08AH03 G01AC30 R02AA14,,"<a href=""https://www.kegg.jp/dbget-bin/www_bge...","<a href=""http://togodb.biosciencedbc.jp/togodb...",,"ATC code: <a href=""https://www.kegg.jp/kegg-bi..."
2,DG00003,DGroup,Neomycin\n ABBR NEO\n TYPE Chemical,,,D08260 Neomycin (INN)\n D01618 Neomycin sulf...,Antibacterial\n DG01447 Aminoglycoside antibi...,ATC code: A01AB08 A07AA01 B05CA09 D06AX04 J01G...,,"<a href=""https://www.kegg.jp/dbget-bin/www_bge...","<a href=""http://togodb.biosciencedbc.jp/togodb...","Antibacterial\n <a href=""https://www.kegg.jp/d...","ATC code: <a href=""https://www.kegg.jp/kegg-bi..."
3,DG00004,DGroup,Miconazole\n TYPE Chemical,,,D00416 Miconazole (JP18/USP/INN) <JP/US>\n D...,Antifungal\n DG01883 Imidazole antifungal\n M...,ATC code: A01AB09 A07AC01 D01AC02 G01AF04 J02A...,,"<a href=""https://www.kegg.jp/dbget-bin/www_bge...","<a href=""http://togodb.biosciencedbc.jp/togodb...","Antifungal\n <a href=""https://www.kegg.jp/dbge...","ATC code: <a href=""https://www.kegg.jp/kegg-bi..."
4,DG00005,DGroup,Tetracycline\n ABBR TET\n TYPE Che...,,,D00201 Tetracycline (JAN/USP/INN)\n D02122 T...,Antibacterial\n DG01197 Tetracycline antibiotic,ATC code: A01AB13 D06AA04 J01AA07 S01AA09 S02A...,,"<a href=""https://www.kegg.jp/dbget-bin/www_bge...","<a href=""http://togodb.biosciencedbc.jp/togodb...","Antibacterial\n <a href=""https://www.kegg.jp/d...","ATC code: <a href=""https://www.kegg.jp/kegg-bi..."


In [33]:
exclude = ['CLASSIFICATION','STEM','IMAGE','COMMENT','CLASS_link','ENTRY_link','REMARK_link','MEMBER_link']
clean_dgroup_df = raw_dgroup_df[[col for col in raw_dgroup_df.columns if col not in exclude]]

In [34]:
raw_dgroup_df.isna().sum()

ENTRY                0
CLASSIFICATION       0
NAME                 0
STEM              2232
IMAGE             2398
MEMBER               0
CLASS              570
REMARK             966
COMMENT           2387
ENTRY_link           0
MEMBER_link          0
CLASS_link         570
REMARK_link        966
dtype: int64

In [35]:
clean_dgroup_df.head()

Unnamed: 0,ENTRY,NAME,MEMBER,CLASS,REMARK
0,DG00001,Chlorhexidine\n TYPE Chemical,D07668 Chlorhexidine (INN)\n D00858 Chlorhex...,,ATC code: A01AB03 B05CA02 D08AC02 D09AA12 R02A...
1,DG00002,Oxyquinoline\n TYPE Chemical,D05321 Oxyquinoline (USAN)\n D02414 Oxyquino...,,ATC code: A01AB07 D08AH03 G01AC30 R02AA14
2,DG00003,Neomycin\n ABBR NEO\n TYPE Chemical,D08260 Neomycin (INN)\n D01618 Neomycin sulf...,Antibacterial\n DG01447 Aminoglycoside antibi...,ATC code: A01AB08 A07AA01 B05CA09 D06AX04 J01G...
3,DG00004,Miconazole\n TYPE Chemical,D00416 Miconazole (JP18/USP/INN) <JP/US>\n D...,Antifungal\n DG01883 Imidazole antifungal\n M...,ATC code: A01AB09 A07AC01 D01AC02 G01AF04 J02A...
4,DG00005,Tetracycline\n ABBR TET\n TYPE Che...,D00201 Tetracycline (JAN/USP/INN)\n D02122 T...,Antibacterial\n DG01197 Tetracycline antibiotic,ATC code: A01AB13 D06AA04 J01AA07 S01AA09 S02A...


In [36]:
#3
def row_to_dict(row):
    entry_id = row['ENTRY']
    name = row['NAME']
    class_item = row['CLASS']
    remark = row['REMARK']
    member = row['MEMBER']
    text = f"name: {name}; member: {member}; class: {class_item};  remark: {remark}"

    output_dict = {
            "id": entry_id,
            "input": text,
            "metadata": {
                "text": text,
                "name": name,
                "class": class_item
            }
        }

    return output_dict

In [37]:
dgroup_preprocess = convert_dataframe_to_list(clean_dgroup_df)
print(len(dgroup_preprocess))
dgroup_preprocess[0]

2398


{'id': 'DG00001',
 'input': 'name: Chlorhexidine\n TYPE        Chemical; member: D07668  Chlorhexidine (INN)\n D00858  Chlorhexidine gluconate (JP18/USP)  <JP/US>\n D01345  Chlorhexidine hydrochloride (JP18/USP)\n D03463  Chlorhexidine phosphanilate (USAN)\n D07669  Chlorhexidine acetate (USP); class: nan;  remark: ATC code: A01AB03 B05CA02 D08AC02 D09AA12 R02AA05 S01AX09 S02AA09 S03AA04',
 'metadata': {'text': 'name: Chlorhexidine\n TYPE        Chemical; member: D07668  Chlorhexidine (INN)\n D00858  Chlorhexidine gluconate (JP18/USP)  <JP/US>\n D01345  Chlorhexidine hydrochloride (JP18/USP)\n D03463  Chlorhexidine phosphanilate (USAN)\n D07669  Chlorhexidine acetate (USP); class: nan;  remark: ATC code: A01AB03 B05CA02 D08AC02 D09AA12 R02AA05 S01AX09 S02AA09 S03AA04',
  'name': 'Chlorhexidine\n TYPE        Chemical',
  'class': nan}}

In [38]:
dgroup_df = pd.DataFrame(dgroup_preprocess)
preprocessed_datasets.append(dgroup_df)
print(dgroup_df.shape)
dgroup_df.head()

(2398, 3)


Unnamed: 0,id,input,metadata
0,DG00001,name: Chlorhexidine\n TYPE Chemical; me...,{'text': 'name: Chlorhexidine  TYPE Che...
1,DG00002,name: Oxyquinoline\n TYPE Chemical; mem...,{'text': 'name: Oxyquinoline  TYPE Chem...
2,DG00003,name: Neomycin\n ABBR NEO\n TYPE C...,{'text': 'name: Neomycin  ABBR NEO  TYPE ...
3,DG00004,name: Miconazole\n TYPE Chemical; membe...,{'text': 'name: Miconazole  TYPE Chemic...
4,DG00005,name: Tetracycline\n ABBR TET\n TYPE ...,{'text': 'name: Tetracycline  ABBR TET  T...


#### Kegg Medicus Environ Data Preprocessing

In [39]:
raw_environ_df.head()

Unnamed: 0,E number,Name,Category,Component,Source,Remark,Comment,Other DBs
0,E00001,Coptis rhizome (JP17),Crude drug,"Berberine [CPD:C00757], Palmatine [CPD:C05315]...","Coptis japonica [TAX:3442], Coptis chinensis [...",Same as: D00092,Ranunculaceae (buttercup family) Coptis rhizom...,
1,E00002,Evodia fruit (JP17),Crude drug,"Evodiamine [CPD:C09187], Rutecarpine [CPD:C092...","Evodia rutaecarpa [TAX:354523], Evodia officin...",Same as: D00173,Rutaceae (rue family) Evodia fruit\r\nMajor co...,
2,E00003,Saffron (JP17),Crude drug; Medicinal herb,"Crocin [CPD:C08589], Picrocrocin [CPD:C17055],...",Crocus sativus [TAX:82528],Same as: D01030,Iridaceae (Iris family) Saffron chapiter\r\nMa...,
3,E00004,Agar (JP17/NF),Crude drug,Agarose [CPD:C01399],Gelidium amansii [TAX:2812],Same as: D01032,Gelidiaceae Gelidium amansii mucous (freeze dr...,
4,E00005,Tragacanth (JP17/NF),Crude drug,"D-Galacturonic acid [CPD:C00333], Tragacanthic...","Astragalus gummifer [TAX:339493], Leguminosae ...",Same as: D01033,Fabaceae (pea family) Tragacanth stem exudatio...,


In [40]:
raw_environ_df.shape

(850, 8)

In [41]:
exclude = ['Remark', 'Other DBs']
clean_environ_df = raw_environ_df[[col for col in raw_environ_df.columns if col not in exclude]]

In [42]:
raw_environ_df.isna().sum()

E number       0
Name           0
Category      12
Component    306
Source        22
Remark       556
Comment        9
Other DBs    757
dtype: int64

In [43]:
clean_environ_df.head()

Unnamed: 0,E number,Name,Category,Component,Source,Comment
0,E00001,Coptis rhizome (JP17),Crude drug,"Berberine [CPD:C00757], Palmatine [CPD:C05315]...","Coptis japonica [TAX:3442], Coptis chinensis [...",Ranunculaceae (buttercup family) Coptis rhizom...
1,E00002,Evodia fruit (JP17),Crude drug,"Evodiamine [CPD:C09187], Rutecarpine [CPD:C092...","Evodia rutaecarpa [TAX:354523], Evodia officin...",Rutaceae (rue family) Evodia fruit\r\nMajor co...
2,E00003,Saffron (JP17),Crude drug; Medicinal herb,"Crocin [CPD:C08589], Picrocrocin [CPD:C17055],...",Crocus sativus [TAX:82528],Iridaceae (Iris family) Saffron chapiter\r\nMa...
3,E00004,Agar (JP17/NF),Crude drug,Agarose [CPD:C01399],Gelidium amansii [TAX:2812],Gelidiaceae Gelidium amansii mucous (freeze dr...
4,E00005,Tragacanth (JP17/NF),Crude drug,"D-Galacturonic acid [CPD:C00333], Tragacanthic...","Astragalus gummifer [TAX:339493], Leguminosae ...",Fabaceae (pea family) Tragacanth stem exudatio...


In [44]:
#4
def row_to_dict(row):
    entry_id = row['E number']
    name = row['Name']
    category = row['Category']
    component = row['Component']
    source = row['Source']
    comment = row['Comment']
    text = f"name: {name}; category: {category}; component: {comment} comment: {comment}; source: {source}"

    output_dict = {
            "id": entry_id,
            "input": text,
            "metadata": {
                "name": name, 
                "text": text,
                "component": component,
                
            }
        }

    return output_dict

In [45]:
environ_preprocess = convert_dataframe_to_list(clean_environ_df)
print(len(environ_preprocess))
environ_preprocess[0]

850


{'id': 'E00001',
 'input': 'name: Coptis rhizome (JP17); category: Crude drug; component: Ranunculaceae (buttercup family) Coptis rhizome\r\nMajor component: Berberine [CPD:C00757] comment: Ranunculaceae (buttercup family) Coptis rhizome\r\nMajor component: Berberine [CPD:C00757]; source: Coptis japonica [TAX:3442], Coptis chinensis [TAX:261450], Coptis deltoidea [TAX:261449], Coptis teeta [TAX:261448]',
 'metadata': {'name': 'Coptis rhizome (JP17)',
  'text': 'name: Coptis rhizome (JP17); category: Crude drug; component: Ranunculaceae (buttercup family) Coptis rhizome\r\nMajor component: Berberine [CPD:C00757] comment: Ranunculaceae (buttercup family) Coptis rhizome\r\nMajor component: Berberine [CPD:C00757]; source: Coptis japonica [TAX:3442], Coptis chinensis [TAX:261450], Coptis deltoidea [TAX:261449], Coptis teeta [TAX:261448]',
  'component': 'Berberine [CPD:C00757], Palmatine [CPD:C05315], Jateorrhizine [CPD:C09553], Coptisine [CPD:C16938], Worenine [CPD:C17083], Magnoflorine [C

In [46]:
environ_df = pd.DataFrame(environ_preprocess)
preprocessed_datasets.append(environ_df)
print(environ_df.shape)
environ_df.head()

(850, 3)


Unnamed: 0,id,input,metadata
0,E00001,name: Coptis rhizome (JP17); category: Crude d...,"{'name': 'Coptis rhizome (JP17)', 'text': 'nam..."
1,E00002,name: Evodia fruit (JP17); category: Crude dru...,"{'name': 'Evodia fruit (JP17)', 'text': 'name:..."
2,E00003,name: Saffron (JP17); category: Crude drug; Me...,"{'name': 'Saffron (JP17)', 'text': 'name: Saff..."
3,E00004,name: Agar (JP17/NF); category: Crude drug; co...,"{'name': 'Agar (JP17/NF)', 'text': 'name: Agar..."
4,E00005,name: Tragacanth (JP17/NF); category: Crude dr...,"{'name': 'Tragacanth (JP17/NF)', 'text': 'name..."


#### Kegg Medicus Network Data Preprocessing

In [47]:
raw_network_df.head()

Unnamed: 0,ENTRY,NAME,DEFINITION,EXPANDED,CLASS,TYPE,DISEASE,GENE,VARIANT,METABOLITE,...,ENTRY_link,DEFINITION_link,EXPANDED_link,CLASS_link,DISEASE_link,GENE_link,VARIANT_link,METABOLITE_link,PERTURBANT_link,REFERENCE_link
0,N00001,EGF-EGFR-RAS-ERK signaling pathway,EGF -> EGFR -> GRB2 -> SOS -> RAS -> RAF -> ME...,"1950 -> 1956 -> 2885 -> (6654,6655) -> (3265,3...",nt06210 ERK signaling\nnt06260 Colorectal canc...,Reference,,1950 EGF; epidermal growth factor\n1956 EGFR...,,,...,"<a href=""https://www.kegg.jp/dbget-bin/www_bge...",EGF -> EGFR -> GRB2 -> SOS -> RAS -> RAF -> ME...,"<a href=""http://www.kegg.jp/dbget-bin/www_bget...","<a href=""https://www.kegg.jp/network/nt06210"">...",,"<a href=""http://www.kegg.jp/dbget-bin/www_bget...",,,,"PMID:<a href=""https://pubmed.ncbi.nlm.nih.gov/..."
1,N00002,BCR-ABL fusion kinase to RAS-ERK signaling pat...,BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->...,"(25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38...",nt06210 ERK signaling\nnt06276 Chronic myeloid...,Variant,H00004 Chronic myeloid leukemia,"25 ABL1; ABL proto-oncogene 1, non-receptor t...",25v1 (BCR-ABL) BCR-ABL1 fusion\n25v2 (BCR-ABL...,,...,"<a href=""https://www.kegg.jp/dbget-bin/www_bge...",BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->...,"(<a href=""http://www.kegg.jp/dbget-bin/www_bge...","<a href=""https://www.kegg.jp/network/nt06210"">...","<a href=""http://togodb.biosciencedbc.jp/entry/...","<a href=""http://www.kegg.jp/dbget-bin/www_bget...","<a href=""http://togodb.biosciencedbc.jp/togodb...",,,"PMID:<a href=""https://pubmed.ncbi.nlm.nih.gov/..."
2,N00003,Mutation-activated KIT to RAS-ERK signaling pa...,KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK,"3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48...",nt06210 ERK signaling\nnt06275 Acute myeloid l...,Variant,H00003 Acute myeloid leukemia,3815 KIT; KIT proto-oncogene receptor tyrosin...,3815v1 (KIT*) KIT kinase domain mutation,,...,"<a href=""https://www.kegg.jp/dbget-bin/www_bge...",KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK,"<a href=""http://www.kegg.jp/dbget-bin/www_bget...","<a href=""https://www.kegg.jp/network/nt06210"">...","<a href=""http://togodb.biosciencedbc.jp/entry/...","<a href=""http://www.kegg.jp/dbget-bin/www_bget...","<a href=""http://togodb.biosciencedbc.jp/togodb...",,,"PMID:<a href=""https://pubmed.ncbi.nlm.nih.gov/..."
3,N00004,Duplication or mutation-activated FLT3 to RAS-...,FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK,"(2322v2,2322v1) -> 2885 -> (6654,6655) -> (326...",nt06210 ERK signaling\nnt06275 Acute myeloid l...,Variant,H00003 Acute myeloid leukemia,2322 FLT3; fms related tyrosine kinase 3\n288...,2322v2 (FLT3*) FLT3 internal tandem duplicati...,,...,"<a href=""https://www.kegg.jp/dbget-bin/www_bge...",FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK,"(<a href=""http://www.kegg.jp/dbget-bin/www_bge...","<a href=""https://www.kegg.jp/network/nt06210"">...","<a href=""http://togodb.biosciencedbc.jp/entry/...","<a href=""http://www.kegg.jp/dbget-bin/www_bget...","<a href=""http://togodb.biosciencedbc.jp/togodb...",,,"PMID:<a href=""https://pubmed.ncbi.nlm.nih.gov/..."
4,N00005,Mutation-activated MET to RAS-ERK signaling pa...,MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER...,"4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48...",nt06210 ERK signaling\nnt06263 Hepatocellular ...,Variant,H00048 Hepatocellular carcinoma\nH00021 Rena...,"4233 MET; MET proto-oncogene, receptor tyrosi...",4233v1 (MET*) MET kinase domain mutation,,...,"<a href=""https://www.kegg.jp/dbget-bin/www_bge...",MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER...,"<a href=""http://www.kegg.jp/dbget-bin/www_bget...","<a href=""https://www.kegg.jp/network/nt06210"">...","<a href=""http://togodb.biosciencedbc.jp/entry/...","<a href=""http://www.kegg.jp/dbget-bin/www_bget...","<a href=""http://togodb.biosciencedbc.jp/togodb...",,,"PMID:<a href=""https://pubmed.ncbi.nlm.nih.gov/..."


In [48]:
raw_network_df.shape 

(1241, 22)

In [49]:
exclude = ['ENTRY_link','VARIANT','METABOLITE','PERTURBANT','DEFINITION_link', 'EXPANDED_link', 'CLASS_link', 'DISEASE_link', 'GENE_link', 'VARIANT_link',
           'METABOLITE_link', 'PERTURBANT_link','REFERENCE_link']
clean_network_df = raw_network_df[[col for col in raw_network_df.columns if col not in exclude]]

In [50]:
clean_network_df.head()

Unnamed: 0,ENTRY,NAME,DEFINITION,EXPANDED,CLASS,TYPE,DISEASE,GENE,REFERENCE
0,N00001,EGF-EGFR-RAS-ERK signaling pathway,EGF -> EGFR -> GRB2 -> SOS -> RAS -> RAF -> ME...,"1950 -> 1956 -> 2885 -> (6654,6655) -> (3265,3...",nt06210 ERK signaling\nnt06260 Colorectal canc...,Reference,,1950 EGF; epidermal growth factor\n1956 EGFR...,"PMID:17409820\nAUTHORS Molina JR, Adjei AA\n..."
1,N00002,BCR-ABL fusion kinase to RAS-ERK signaling pat...,BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->...,"(25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38...",nt06210 ERK signaling\nnt06276 Chronic myeloid...,Variant,H00004 Chronic myeloid leukemia,"25 ABL1; ABL proto-oncogene 1, non-receptor t...","PMID:10403855\nAUTHORS Faderl S, Talpaz M, E..."
2,N00003,Mutation-activated KIT to RAS-ERK signaling pa...,KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK,"3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48...",nt06210 ERK signaling\nnt06275 Acute myeloid l...,Variant,H00003 Acute myeloid leukemia,3815 KIT; KIT proto-oncogene receptor tyrosin...,"PMID:23678293\nAUTHORS Liang J, Wu YL, Chen ..."
3,N00004,Duplication or mutation-activated FLT3 to RAS-...,FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK,"(2322v2,2322v1) -> 2885 -> (6654,6655) -> (326...",nt06210 ERK signaling\nnt06275 Acute myeloid l...,Variant,H00003 Acute myeloid leukemia,2322 FLT3; fms related tyrosine kinase 3\n288...,"PMID:12951584\nAUTHORS Stirewalt DL, Radich ..."
4,N00005,Mutation-activated MET to RAS-ERK signaling pa...,MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER...,"4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48...",nt06210 ERK signaling\nnt06263 Hepatocellular ...,Variant,H00048 Hepatocellular carcinoma\nH00021 Rena...,"4233 MET; MET proto-oncogene, receptor tyrosi...","PMID:14685170\nAUTHORS Birchmeier C, Birchme..."


In [51]:
raw_network_df.isna().sum()

ENTRY                0
NAME                 0
DEFINITION           0
EXPANDED             0
CLASS               24
TYPE                 0
DISEASE            361
GENE                 4
VARIANT            867
METABOLITE         924
PERTURBANT         662
REFERENCE           76
ENTRY_link           0
DEFINITION_link      0
EXPANDED_link        0
CLASS_link          24
DISEASE_link       361
GENE_link            4
VARIANT_link       867
METABOLITE_link    924
PERTURBANT_link    662
REFERENCE_link      76
dtype: int64

In [52]:
clean_network_df.columns

Index(['ENTRY', 'NAME', 'DEFINITION', 'EXPANDED', 'CLASS', 'TYPE', 'DISEASE',
       'GENE', 'REFERENCE'],
      dtype='object')

In [53]:
#5
def row_to_dict(row):
    entry_id = row['ENTRY']
    name = row['NAME']
    definition = row['DEFINITION']
    expanded = row['EXPANDED']
    item_class = row['CLASS']
    item_type = row['TYPE']
    disease = row['DISEASE']
    gene = row['GENE']
    text = f"name: {name}; definition: {definition}; expanded: {expanded}; class: {item_class}, type: {item_type}"

    output_dict = {
            "id": entry_id,
            "input": text,
            "metadata": {
                "name": name,
                "text": text,
                "name": name,
                "class": item_class,
                "comment": item_type,
                "disease": disease,
                "gene":gene
            }
        }

    return output_dict

In [54]:
network_preprocess = convert_dataframe_to_list(clean_network_df)
print(len(network_preprocess))
network_preprocess[0]

1241


{'id': 'N00001',
 'input': 'name: EGF-EGFR-RAS-ERK signaling pathway; definition: EGF -> EGFR -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK -> CCND1; expanded: 1950 -> 1956 -> 2885 -> (6654,6655) -> (3265,3845,4893) -> (369,673,5894) -> (5604,5605) -> (5594,5595) -> 595; class: nt06210 ERK signaling\nnt06260 Colorectal cancer\nnt06263 Hepatocellular carcinoma\nnt06265 Bladder cancer\nnt06266 Non-small cell lung cancer\nnt06268 Melanoma\nnt06270 Breast cancer\nnt06271 Endometrial cancer\nnt06273 Glioma\nnt06274 Thyroid cancer\nnt06276 Chronic myeloid leukemia\nnt06110 MAPK signaling (viruses and bacteria)\nnt06162 Hepatitis B virus (HBV)\nnt06163 Hepatitis C virus (HCV)\nnt06170 Influenza A virus (IAV)\nnt06167 Human cytomegalovirus (HCMV)\nnt06166 Human papillomavirus (HPV), type: Reference',
 'metadata': {'name': 'EGF-EGFR-RAS-ERK signaling pathway',
  'text': 'name: EGF-EGFR-RAS-ERK signaling pathway; definition: EGF -> EGFR -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK -> CCND1; expanded:

In [55]:
network_df = pd.DataFrame(network_preprocess)
preprocessed_datasets.append(network_df)
print(network_df.shape)
network_df.head()

(1241, 3)


Unnamed: 0,id,input,metadata
0,N00001,name: EGF-EGFR-RAS-ERK signaling pathway; defi...,"{'name': 'EGF-EGFR-RAS-ERK signaling pathway',..."
1,N00002,name: BCR-ABL fusion kinase to RAS-ERK signali...,{'name': 'BCR-ABL fusion kinase to RAS-ERK sig...
2,N00003,name: Mutation-activated KIT to RAS-ERK signal...,{'name': 'Mutation-activated KIT to RAS-ERK si...
3,N00004,name: Duplication or mutation-activated FLT3 t...,{'name': 'Duplication or mutation-activated FL...
4,N00005,name: Mutation-activated MET to RAS-ERK signal...,{'name': 'Mutation-activated MET to RAS-ERK si...


#### Kegg Medicus Variant Data Preprocessing

In [56]:
raw_variant_df.head()

Unnamed: 0,ENTRY,NAME,GENE,ORGANISM,VARIATION,NETWORK,ELEMENT,REFERENCE,ENTRY_link,GENE_link,VARIATION_link,NETWORK_link,ELEMENT_link,REFERENCE_link
0,10026v1,PIGK deficiency,PIGK phosphatidylinositol glycan anchor biosy...,hsa_var Human gene variants (Homo sapiens),mutations\nOmimVar: 605087,,,"PMID:32220290\nAUTHORS Nguyen TTM, Murakami ...","<a href=""https://www.genome.jp/dbget-bin/www_b...",PIGK phosphatidylinositol glycan anchor biosy...,"mutations\nOmimVar: <a href=""https://omim.org/...",,,"PMID:<a href=""https://pubmed.ncbi.nlm.nih.gov/..."
1,100v1,ADA deficiency,ADA adenosine deaminase [KO:K01488],hsa_var Human gene variants (Homo sapiens),mutations\nOmimVar: 608958\nDRUG_TARGET Elapeg...,,,,"<a href=""https://www.genome.jp/dbget-bin/www_b...","ADA adenosine deaminase [KO:<a href=""http://w...","mutations\nOmimVar: <a href=""https://omim.org/...",,,
2,10133v1,OPTN mutation,OPTN optineurin [KO:K19946],hsa_var Human gene variants (Homo sapiens),mutations\nOmimVar: 10133,nt06421 Mitophagy\nnt06464 Amyotrophic later...,N01138 Mutation-inactivated OPTN to PINK-Park...,"PMID:20428114\nAUTHORS Maruyama H, Morino H,...","<a href=""https://www.genome.jp/dbget-bin/www_b...","OPTN optineurin [KO:<a href=""http://www.kegg....","mutations\nOmimVar: <a href=""https://omim.org/...","<a href=""http://togodb.biosciencedbc.jp/togodb...","<a href=""http://togodb.biosciencedbc.jp/togodb...","PMID:<a href=""https://pubmed.ncbi.nlm.nih.gov/..."
3,10195v1,ALG3 deficiency,"ALG3 ALG3 alpha-1,3- mannosyltransferase [KO:...",hsa_var Human gene variants (Homo sapiens),mutations\nOmimVar: 608750,,,"PMID:15108280\nAUTHORS Denecke J, Kranz C, K...","<a href=""https://www.genome.jp/dbget-bin/www_b...","ALG3 ALG3 alpha-1,3- mannosyltransferase [KO:...","mutations\nOmimVar: <a href=""https://omim.org/...",,,"PMID:<a href=""https://pubmed.ncbi.nlm.nih.gov/..."
4,1019v1,CDK4 amplification,CDK4 cyclin dependent kinase 4 [KO:K02089],hsa_var Human gene variants (Homo sapiens),amplification\nClinVar: 268075 150740\ndbVar: ...,nt06230 Cell cycle\nnt06273 Glioma,N00072 Amplified CDK4 to cell cycle G1/S,"PMID:7987821\nAUTHORS Schmidt EE, Ichimura K...","<a href=""https://www.genome.jp/dbget-bin/www_b...","CDK4 cyclin dependent kinase 4 [KO:<a href=""h...","amplification\nClinVar: <a href=""https://www.n...","<a href=""http://togodb.biosciencedbc.jp/togodb...","<a href=""http://togodb.biosciencedbc.jp/togodb...","PMID:<a href=""https://pubmed.ncbi.nlm.nih.gov/..."


In [57]:
raw_variant_df.shape 

(458, 14)

In [58]:
raw_variant_df.isna().sum()

ENTRY               0
NAME                0
GENE                0
ORGANISM            0
VARIATION           0
NETWORK           210
ELEMENT           198
REFERENCE          20
ENTRY_link          0
GENE_link           0
VARIATION_link      0
NETWORK_link      210
ELEMENT_link      198
REFERENCE_link     20
dtype: int64

In [59]:
exclude = ['ENTRY_link', 'NETWORK_link', 'GENE_link','ORGANISM', 'VARIATION_link',
           'ELEMENT_link', 'REFERENCE_link']
clean_variant_df = raw_variant_df[[col for col in raw_variant_df.columns if col not in exclude]]

In [60]:
clean_variant_df.head()

Unnamed: 0,ENTRY,NAME,GENE,VARIATION,NETWORK,ELEMENT,REFERENCE
0,10026v1,PIGK deficiency,PIGK phosphatidylinositol glycan anchor biosy...,mutations\nOmimVar: 605087,,,"PMID:32220290\nAUTHORS Nguyen TTM, Murakami ..."
1,100v1,ADA deficiency,ADA adenosine deaminase [KO:K01488],mutations\nOmimVar: 608958\nDRUG_TARGET Elapeg...,,,
2,10133v1,OPTN mutation,OPTN optineurin [KO:K19946],mutations\nOmimVar: 10133,nt06421 Mitophagy\nnt06464 Amyotrophic later...,N01138 Mutation-inactivated OPTN to PINK-Park...,"PMID:20428114\nAUTHORS Maruyama H, Morino H,..."
3,10195v1,ALG3 deficiency,"ALG3 ALG3 alpha-1,3- mannosyltransferase [KO:...",mutations\nOmimVar: 608750,,,"PMID:15108280\nAUTHORS Denecke J, Kranz C, K..."
4,1019v1,CDK4 amplification,CDK4 cyclin dependent kinase 4 [KO:K02089],amplification\nClinVar: 268075 150740\ndbVar: ...,nt06230 Cell cycle\nnt06273 Glioma,N00072 Amplified CDK4 to cell cycle G1/S,"PMID:7987821\nAUTHORS Schmidt EE, Ichimura K..."


In [61]:
clean_variant_df.columns

Index(['ENTRY', 'NAME', 'GENE', 'VARIATION', 'NETWORK', 'ELEMENT',
       'REFERENCE'],
      dtype='object')

In [62]:
#6
def row_to_dict(row):
    entry_id = row['ENTRY']
    name = row['NAME']
    gene = row['GENE']
    variation = row['VARIATION']
    network = row['NETWORK']
    element = row['ELEMENT']
    reference = row['REFERENCE']
    text = f"name: {name}; gene: {gene}; variation: {variation}; network{network}; element{element}"

    output_dict = {
            "id": entry_id,
            "input": text,
            "metadata": {
                "name": name,
                "text": text,
                "network": network,
                "element": element,
                "reference": reference
            }
        }

    return output_dict

In [63]:
variant_preprocess = convert_dataframe_to_list(clean_variant_df)
print(len(variant_preprocess))
variant_preprocess[0]

458


{'id': '10026v1',
 'input': 'name: PIGK deficiency; gene: PIGK  phosphatidylinositol glycan anchor biosynthesis class K [KO:K05290]; variation: mutations\nOmimVar: 605087; networknan; elementnan',
 'metadata': {'name': 'PIGK deficiency',
  'text': 'name: PIGK deficiency; gene: PIGK  phosphatidylinositol glycan anchor biosynthesis class K [KO:K05290]; variation: mutations\nOmimVar: 605087; networknan; elementnan',
  'network': nan,
  'element': nan,
  'reference': 'PMID:32220290\nAUTHORS   Nguyen TTM, Murakami Y, Mobilio S, Niceta M, Zampino G, Philippe C, Moutton S, Zaki MS, James KN, Musaev D, Mu W, Baranano K, Nance JR, Rosenfeld JA, Braverman N, Ciolfi A, Millan F, Person RE, Bruel AL, Thauvin-Robinet C, Ververi A, DeVile C, Male A, Efthymiou S, Maroofian R, Houlden H, Maqbool S, Rahman F, Baratang NV, Rousseau J, St-Denis A, Elrick MJ, Anselm I, Rodan LH, Tartaglia M, Gleeson J, Kinoshita T, Campeau PM\nTITLE     Bi-allelic Variants in the GPI Transamidase Subunit PIGK Cause a Neur

In [64]:
variant_df = pd.DataFrame(variant_preprocess)
preprocessed_datasets.append(variant_df)
print(variant_df.shape)
variant_df.head()

(458, 3)


Unnamed: 0,id,input,metadata
0,10026v1,name: PIGK deficiency; gene: PIGK phosphatidy...,"{'name': 'PIGK deficiency', 'text': 'name: PIG..."
1,100v1,name: ADA deficiency; gene: ADA adenosine dea...,"{'name': 'ADA deficiency', 'text': 'name: ADA ..."
2,10133v1,name: OPTN mutation; gene: OPTN optineurin [K...,"{'name': 'OPTN mutation', 'text': 'name: OPTN ..."
3,10195v1,name: ALG3 deficiency; gene: ALG3 ALG3 alpha-...,"{'name': 'ALG3 deficiency', 'text': 'name: ALG..."
4,1019v1,name: CDK4 amplification; gene: CDK4 cyclin d...,"{'name': 'CDK4 amplification', 'text': 'name: ..."


### Data Preprocessing (Creating Embeddings)

In [65]:
index_data = []

In [66]:
def get_embedding(text):
    return embed.embed_documents(text)[0]

In [68]:
for df in preprocessed_datasets:
    print(f"indexing {df.shape}")
    df['values'] = df['input'].apply(get_embedding)

indexing (11938, 3)


Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIError: The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID de0796e13eb9f6a9de50ab52230c7013 in your message.) {
  "error": {
    "message": "The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID de0796e13eb9f6a9de50ab52230c7013 in your message.)",
    "type": "server_error",
    "param": null,
    "code": null
  }
}
 500 {'error': {'message': 'The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID de0796e13eb9

indexing (2577, 3)
indexing (2398, 3)


Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: HTTPSConnectionPool(host='api.openai.com', port=443): Max retries exceeded with url: /v1/embeddings (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)'))).
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: HTTPSConnectionPool(host='api.openai.com', port=443): Max retries exceeded with url: /v1/embeddings (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)'))).
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', RemoteDisconnected('Remote end c

indexing (850, 3)
indexing (1241, 3)
indexing (458, 3)


In [69]:
drugs_index = drugs_df.drop(['input'], axis=1)
drugs_dataset = pd.DataFrame.to_csv(drugs_index)
#with open('index_datasets/drugs.csv', 'w') as f: 
#    f.write(drugs_dataset)
index_data.append(drugs_index)
drugs_index.head()

Unnamed: 0,id,metadata,values
0,D00001,{'name': 'Water (JP18/USP); Purified water (JP...,"[-0.006311894406848522, -0.02489179188784771, ..."
1,D00002,{'name': 'Nadide (JAN/USAN/INN); Nicotinamide ...,"[-0.006311894406848522, -0.02489179188784771, ..."
2,D00003,"{'name': 'Oxygen (JP18/USP)', 'text': 'name: O...","[-0.00636011841067576, -0.024926392883966173, ..."
3,D00004,{'name': 'Carbon dioxide (JP18/USP); Carbon di...,"[-0.006311894406848522, -0.02489179188784771, ..."
4,D00005,{'name': 'Flavin adenine dinucleotide (JAN); A...,"[-0.006311894406848522, -0.02489179188784771, ..."


In [70]:
disease_index = disease_df.drop(['input'], axis=1)
disease_dataset = pd.DataFrame.to_csv(disease_index)
#with open('index_datasets/disease.csv', 'w') as f: 
#    f.write(disease_dataset)
index_data.append(disease_index)
disease_index.head()

Unnamed: 0,id,metadata,values
0,H00001,{'name': 'B-cell acute lymphoblastic leukemia;...,"[-0.00638153408315556, -0.024893257889223787, ..."
1,H00002,{'name': 'T-cell acute lymphoblastic leukemia;...,"[-0.006311894406848522, -0.02489179188784771, ..."
2,H00003,"{'name': 'Acute myeloid leukemia', 'text': 'na...","[-0.006311894406848522, -0.02489179188784771, ..."
3,H00004,"{'name': 'Chronic myeloid leukemia', 'text': '...","[-0.006348727372756122, -0.02492023854670328, ..."
4,H00005,"{'name': 'Chronic lymphocytic leukemia', 'text...","[-0.006311894406848522, -0.02489179188784771, ..."


In [71]:
dgroup_index = dgroup_df.drop(['input'], axis=1)
dgroup_dataset = pd.DataFrame.to_csv(dgroup_index)
#with open('index_datasets/dgroup.csv', 'w') as f: 
#   f.write(dgroup_dataset)
index_data.append(dgroup_index)
dgroup_index.head()

Unnamed: 0,id,metadata,values
0,DG00001,{'text': 'name: Chlorhexidine  TYPE Che...,"[-0.006311894406848522, -0.02489179188784771, ..."
1,DG00002,{'text': 'name: Oxyquinoline  TYPE Chem...,"[-0.006341026706393906, -0.024902941958120584,..."
2,DG00003,{'text': 'name: Neomycin  ABBR NEO  TYPE ...,"[-0.006348727372756122, -0.02492023854670328, ..."
3,DG00004,{'text': 'name: Miconazole  TYPE Chemic...,"[-0.006380894490020517, -0.024877912481013318,..."
4,DG00005,{'text': 'name: Tetracycline  ABBR TET  T...,"[-0.006348727372756122, -0.02492023854670328, ..."


In [72]:
environ_index = environ_df.drop(['input'], axis=1)
environ_dataset = pd.DataFrame.to_csv(environ_index)
#with open('index_datasets/environ.csv', 'w') as f: 
#    f.write(environ_dataset)
index_data.append(environ_index)
environ_index.head()

Unnamed: 0,id,metadata,values
0,E00001,"{'name': 'Coptis rhizome (JP17)', 'text': 'nam...","[-0.006311894406848522, -0.02489179188784771, ..."
1,E00002,"{'name': 'Evodia fruit (JP17)', 'text': 'name:...","[-0.006311894406848522, -0.02489179188784771, ..."
2,E00003,"{'name': 'Saffron (JP17)', 'text': 'name: Saff...","[-0.006311894406848522, -0.02489179188784771, ..."
3,E00004,"{'name': 'Agar (JP17/NF)', 'text': 'name: Agar...","[-0.00636011841067576, -0.024926392883966173, ..."
4,E00005,"{'name': 'Tragacanth (JP17/NF)', 'text': 'name...","[-0.006311894406848522, -0.02489179188784771, ..."


In [73]:
variant_index = variant_df.drop(['input'], axis=1)
variant_dataset = pd.DataFrame.to_csv(variant_index)
#with open('index_datasets/variant.csv', 'w') as f: 
#    f.write(variant_dataset)
index_data.append(variant_index)
variant_index.head()

Unnamed: 0,id,metadata,values
0,10026v1,"{'name': 'PIGK deficiency', 'text': 'name: PIG...","[-0.006311894406848522, -0.02489179188784771, ..."
1,100v1,"{'name': 'ADA deficiency', 'text': 'name: ADA ...","[-0.00636011841067576, -0.024926392883966173, ..."
2,10133v1,"{'name': 'OPTN mutation', 'text': 'name: OPTN ...","[-0.006311894406848522, -0.02489179188784771, ..."
3,10195v1,"{'name': 'ALG3 deficiency', 'text': 'name: ALG...","[-0.00636011841067576, -0.024926392883966173, ..."
4,1019v1,"{'name': 'CDK4 amplification', 'text': 'name: ...","[-0.006311894406848522, -0.02489179188784771, ..."


In [74]:
network_index =  network_df.drop(['input'], axis=1)
network_dataset = pd.DataFrame.to_csv(network_index)
index_data.append
#with open('index_datasets/network.csv', 'w') as f: 
#    f.write(network_dataset)
index_data.append(network_index)
network_index.head()

Unnamed: 0,id,metadata,values
0,N00001,"{'name': 'EGF-EGFR-RAS-ERK signaling pathway',...","[-0.006311894406848522, -0.02489179188784771, ..."
1,N00002,{'name': 'BCR-ABL fusion kinase to RAS-ERK sig...,"[-0.006306162547569985, -0.02492137430814508, ..."
2,N00003,{'name': 'Mutation-activated KIT to RAS-ERK si...,"[-0.006311894406848522, -0.02489179188784771, ..."
3,N00004,{'name': 'Duplication or mutation-activated FL...,"[-0.006311894406848522, -0.02489179188784771, ..."
4,N00005,{'name': 'Mutation-activated MET to RAS-ERK si...,"[-0.006306162547569985, -0.02492137430814508, ..."


### Data Storage

In [75]:
index_name = "kegg-medicus-database-index"

In [76]:
import pinecone

pinecone_api_key = os.getenv('PINECONE_API_KEY')

pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    environment='gcp-starter'
)

if index_name not in pinecone.list_indexes():
    # we create a new index
    pinecone.create_index(
        name=index_name,
        metric='dotproduct',
        dimension= len(result[0])  # 1536 dim of text-embedding-ada-002
    )

  from tqdm.autonotebook import tqdm


In [77]:
kegg_medicus_index = pinecone.GRPCIndex(index_name)
kegg_medicus_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

In [80]:
for df in index_data:
    kegg_medicus_index.upsert_from_dataframe(df, batch_size=100)

sending upsert requests:   0%|          | 0/11938 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/120 [00:00<?, ?it/s]

sending upsert requests:   0%|          | 0/2577 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/26 [00:00<?, ?it/s]

sending upsert requests:   0%|          | 0/2398 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/24 [00:00<?, ?it/s]

sending upsert requests:   0%|          | 0/850 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/9 [00:00<?, ?it/s]

sending upsert requests:   0%|          | 0/458 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/5 [00:00<?, ?it/s]

sending upsert requests:   0%|          | 0/1241 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/13 [00:00<?, ?it/s]

In [81]:
kegg_medicus_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.19462,
 'namespaces': {'': {'vector_count': 19462}},
 'total_vector_count': 19462}