In [1]:
import json
import pandas as pd
import re
from tqdm import tqdm

In [2]:
# !pip3 install flair
# !pip3 install scispacy==0.2.5
# !pip3 install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz
# !python3.8 -m spacy download en_core_web_sm
# !pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
# !pip3 install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz

In [3]:
from flair.models import MultiTagger
from flair.data import Sentence
from flair.tokenization import SciSpacyTokenizer, SciSpacySentenceSplitter


In [4]:
texts_path = 'test.json'
ttd_path = 'P1-06-Target_disease.txt'

In [5]:
with open(texts_path, 'r') as f:
    disease_text_data = json.load(f)
    
with open(ttd_path, 'r') as f:
    ttd = f.read()

## General plan

- Get dictionary mapping diseases to set of genes/proteins extracted from sentences
- Get dictionary mapping diseases to set of genes/proteins extracted from TTD
- Find each disease from the first dict in the second dict and find intersection of the disease genes


### 1. Extract genes from texts for each disease with flair library to get dictionaty mapping a disease to genes 

In [6]:
tagger = MultiTagger.load("hunflair-gene")
splitter = SciSpacySentenceSplitter()


2022-09-03 21:27:40,603 loading file /home/ed/.flair/models/hunflair-gene-full-v1.0.pt
2022-09-03 21:27:43,298 SequenceTagger predicts: Dictionary with 8 tags: <unk>, O, S-Gene, B-Gene, I-Gene, E-Gene, <START>, <STOP>


In [8]:
disease2genes = dict((disease, set()) for disease in disease_text_data)
    
for disease, texts in tqdm(disease_text_data.items()):
    for text in texts:
        sentences = splitter.split(text)
        tagger.predict(sentences)#, use_tokenizer=SciSpacyTokenizer())
        for sentence in sentences:
            for gene in sentence.get_spans("hunflair-gene"):
                unified_gene_name = re.sub(r'[^ \w+]', '', gene.text)
                disease2genes[disease].add(unified_gene_name)


In [9]:
disease2genes

{'Bacterial pneumonia': {'ATPase',
  'Bacterial protein A',
  'CD45',
  'DNA methyltransferases',
  'FIP3',
  'Hsp70',
  'LPS receptor',
  'NS1B',
  'NS4A',
  'P  glycoprotein',
  'Phosphatases',
  'Proteases',
  'Sulfolipase A2',
  'TMY',
  'TNF  alpha',
  'TNF family',
  'VAMP',
  'anti  phosphatase',
  'big gun  protein phosphatases',
  'enzymes',
  'phosphatidylinositol 3kinase',
  'protease',
  'proteases',
  'protein phosphatases',
  'tryptophan dehydrogenase II',
  'tyrosine kinases',
  'viral proteases'},
 'Cutaneous leishmaniasis': {'Anti  IgM antibody',
  'C5b  RFLP',
  'C5b  RFLP2',
  'CA',
  'CD19',
  'CD28',
  'Class II antigens',
  'Cytotoxic antigen',
  'EBOV1',
  'FCR1',
  'FCR2',
  'Hemoglobin',
  'IL2 and 4',
  'IgA  alpha',
  'IgG',
  'L  cART',
  'L4 gene',
  'LPL',
  'LRRK2',
  'MHC',
  'MLL',
  'Mucin',
  'NGF',
  'NTCA receptor',
  'Nk42',
  'PKA',
  'RAS',
  'T  beta1',
  'T  beta2',
  'T cell receptor',
  'TCR',
  'TGF  beta',
  'TLA',
  'TLA+cut2',
  'TLR3',
 

In [10]:
len(disease2genes)

207

### 2. Get dictionary mapping diseases to set of genes/proteins extracted from TTD

In [11]:
ttd_data = ttd.split('\n\n')[-1].split('\n\t\t\n')

In [12]:
disease_tagger = MultiTagger.load("hunflair-disease")


def process_target_ttd_info(s):
    data = list(map(lambda x: x.split('\t'), s.split('\n')))

    target_dict = {'TARGETID': data[0][-1], 
                   'TARGNAME': data[1][-1], 
                   'DISEASES': set()
                  }
    for row in data[2:]:
        sentence = Sentence(row[-1])
        disease_tagger.predict(sentence)
        disease = re.sub(r'[^ \w+]', '', sentence.text)
        target_dict['DISEASES'].add(disease)
        
    return target_dict

2022-09-03 21:28:44,081 loading file /home/ed/.flair/models/hunflair-disease-full-v1.0.pt
2022-09-03 21:28:47,346 SequenceTagger predicts: Dictionary with 8 tags: <unk>, O, B-Disease, E-Disease, I-Disease, S-Disease, <START>, <STOP>


In [14]:
disease2genes_ttd = {}
t_name2t_id = {}
t_id2t_meta = {}

for data in tqdm(ttd_data):
    target_dict = process_target_ttd_info(data)
    t_name2t_id[target_dict['TARGNAME']] = target_dict['TARGETID']
    t_id2t_meta[target_dict['TARGETID']] = [target_dict['TARGNAME'], target_dict['DISEASES']]
    for disease in target_dict['DISEASES']:
        if disease not in disease2genes_ttd:
            disease2genes_ttd[disease] = set([target_dict['TARGNAME']])
        else:
            disease2genes_ttd[disease].add(target_dict['TARGNAME'])
        

100%|██████████████████████████████████████▉| 2372/2373 [01:53<00:00, 15.15it/s]



100%|███████████████████████████████████████| 2373/2373 [01:53<00:00, 20.85it/s]


In [15]:
t_name2t_id

{'Transforming growth factor alpha (TGFA)': 'T00033',
 'CTGF messenger RNA (CTGF mRNA)': 'T00039',
 'HUMAN cyclin G-associated kinase (GAK)': 'T00043',
 'HUMAN calpain-1/calpain small subunit 1 heterodimer (CAPN1/CAPNS1)': 'T00099',
 'Ubiquitin-activating enzyme (UBA)': 'T00100',
 'Arachidonate 5-lipoxygenase (5-LOX)': 'T00140',
 'HUMAN pH-dependent viral fusion/replication (pH-DVF/R)': 'T00145',
 'Short transient receptor potential channel 5 (TRPC5)': 'T00156',
 'HUMAN glycosylation of host receptor (GHR)': 'T00158',
 'Ubiquitin-protein ligase E3 Mdm2 (MDM2)': 'T00176',
 'HUAMN alpha-1 adrenergic receptor (ADRA1)': 'T00216',
 'NFKB messenger RNA (NFKB mRNA)': 'T00238',
 'Interleukin-4 (IL4)': 'T00239',
 'V-erbA-related protein 1 (NR1D1)': 'T00254',
 'HUAMN eIF4E-eIF4G interaction (eIF4E-eIF4G)': 'T00259',
 'Fast skeletal muscle troponin complex (TNNC2)': 'T00419',
 'Human immunodeficiency virus Rev protein (HIV rev)': 'T00420',
 'Serine Racemase (SRR)': 'T00477',
 'Interleukin 23 rece

In [16]:
disease2genes_ttd

{'Chronic kidney disease  ICD11  GB61 ': {'5-HT 2A receptor (HTR2A)',
  'Adrenergic receptor alpha-2C (ADRA2C)',
  'Advanced glycosylation end product receptor (AGER)',
  'Androgen receptor (AR)',
  'Angiotensin-converting enzyme (ACE)',
  'Apoptosis signal-regulating kinase 1 (MAP3K5)',
  'B2 bradykinin receptor (BDKRB2)',
  'Beta-secretase (BACE)',
  'C-C chemokine receptor type 2 (CCR2)',
  'C-C chemokine receptor type 5 (CCR5)',
  'Cannabinoid receptor 1 (CB1)',
  'Coagulation factor XI (F11)',
  'Connective tissue growth factor (CTGF)',
  'Dopamine D2 receptor (D2R)',
  'Endothelin A receptor (EDNRA)',
  'Epiregulin (EREG)',
  'Farnesoid X-activated receptor (FXR)',
  'Galectin (LGALS)',
  'Immunoglobulin gamma Fc receptor IIA (FCGR2A)',
  'Insulin-like growth factor I receptor (IGF1R)',
  'MAP kinase p38 (MAPK12)',
  'Matrix metalloproteinase-1 (MMP-1)',
  'Membrane copper amine oxidase (AOC3)',
  'Mineralocorticoid receptor  (MR)',
  'Neutral endopeptidase (MME)',
  'Nitric-oxid

In [18]:
len(disease2genes_ttd)

748

### 3. Find mapping of a disease from test.json to TTD data

Scince the disease labels in test.json are generally shorter, the mapping rule will be the following:
if all the words are found in labels from TTD. 

Intersection of the sets of disease words is the criterion => we need to prepare disease to its words dictionary.

**Itwould be much better to use the comparison of the embedding of disease names to match them but I didn't found suitable opensource model.** 

As concerns genes match, the logic is similar.

In [19]:
disease_word_sets = dict(map(lambda x: (x, set([y.lower() for y in x.split()])), disease2genes.keys()))

In [20]:
disease_word_sets_ttd = dict(map(lambda x: (x, set([y.lower() for y in x.split()])), disease2genes_ttd.keys()))

In [21]:
disease_word_sets

{'Bacterial pneumonia': {'bacterial', 'pneumonia'},
 'Cutaneous leishmaniasis': {'cutaneous', 'leishmaniasis'},
 'Dedifferentiated liposarcoma': {'dedifferentiated', 'liposarcoma'},
 'Methylmalonic acidemia': {'acidemia', 'methylmalonic'},
 'Conjunctival melanoma': {'conjunctival', 'melanoma'},
 'Acute myocardial infarction': {'acute', 'infarction', 'myocardial'},
 'Psoriatic arthritis': {'arthritis', 'psoriatic'},
 'Ataxia-telangiectasia': {'ataxia-telangiectasia'},
 'Chronic kidney disease': {'chronic', 'disease', 'kidney'},
 'Cholangio carcinoma': {'carcinoma', 'cholangio'},
 'Werner syndrome': {'syndrome', 'werner'},
 'Osteoarthritis': {'osteoarthritis'},
 'Interstitial cystitis': {'cystitis', 'interstitial'},
 'Optic neuritis': {'neuritis', 'optic'},
 'Eosinophilic esophagitis': {'eosinophilic', 'esophagitis'},
 'Dermatomyositis': {'dermatomyositis'},
 'Hermansky-Pudlak syndrome': {'hermansky-pudlak', 'syndrome'},
 'IGA glomerulonephritis': {'glomerulonephritis', 'iga'},
 'Cirrhos

In [22]:
disease_word_sets_ttd

{'Chronic kidney disease  ICD11  GB61 ': {'chronic',
  'disease',
  'gb61',
  'icd11',
  'kidney'},
 'Fibrosis  ICD11  GA14GC01 ': {'fibrosis', 'ga14gc01', 'icd11'},
 'Rheumatoid arthritis  ICD11  FA20 ': {'arthritis',
  'fa20',
  'icd11',
  'rheumatoid'},
 'COVID19  ICD11  1D6Y ': {'1d6y', 'covid19', 'icd11'},
 'Solid tumour  cancer  ICD11  2A002F9Z ': {'2a002f9z',
  'cancer',
  'icd11',
  'solid',
  'tumour'},
 'Cardiovascular disease  ICD11  BA00BE2Z ': {'ba00be2z',
  'cardiovascular',
  'disease',
  'icd11'},
 'Lung cancer  ICD11  2C25 ': {'2c25', 'cancer', 'icd11', 'lung'},
 'Thrombocytopenia  ICD11  3B64 ': {'3b64', 'icd11', 'thrombocytopenia'},
 'Filariasis  ICD11  1F66 ': {'1f66', 'filariasis', 'icd11'},
 'Chronic obstructive pulmonary disease  ICD11  CA22 ': {'ca22',
  'chronic',
  'disease',
  'icd11',
  'obstructive',
  'pulmonary'},
 'Asthma  ICD11  CA23 ': {'asthma', 'ca23', 'icd11'},
 'Pituitary gland disorder  ICD11  5A605A61 ': {'5a605a61',
  'disorder',
  'gland',
  'i

### Functions to match diaseases and genes from test.json and TTD

In [23]:
# conventional transformation for genes from sentences and from TTD
def process_gene(gene):
    words = gene.lower().split()
    for i in range(len(words)):
        if words[i] == 's':
            words[i] = words[i][:-1]
            
    return words


# gene match rool: all first gene words are included in second set or vice versa
def check_genes_intesection(gene0, gene1):
    
    return len(set(gene0) & set(gene1)) == min(len(gene0), len(gene0))
    
    
# find matches between 2 gene sets for the same disease (from different sources)
def check_gene_sets_intersection(genes0, genes1):

    intersection_dict = {}
    
    for gene0 in genes0:
        gene0_processed = process_gene(gene0)
        for gene1 in genes1:
            gene1_processed = process_gene(gene1)
            same = check_genes_intesection(gene0_processed, gene1_processed)
            if same:
                if gene0 in intersection_dict:
                    intersection_dict[gene0].append(gene1)
                else:
                    intersection_dict[gene0] = [gene1]
    return intersection_dict


# compare 2 disease names and decide if they different version of the same disease
def check_diseases_intersection(disease0_words, disease1_words):
    n_words = min(len(disease0_words), len(disease1_words))
    intersection = len(disease_words & disease_words_ttd) == n_words
    return intersection


# for each target of the disease find target_id from TTD
def get_disease_result(disease0, disease0_words, disease1, disease1_words):
    result = []
    intersection = check_diseases_intersection(disease0_words, disease1_words)
    if intersection:
        intersection_dict = check_gene_sets_intersection(disease2genes[disease0], disease2genes_ttd[disease1])
        for gene, genes1 in intersection_dict.items():
            for  gene1 in genes1:
                result.append([disease0, gene, t_name2t_id[gene1]])
    return result



In [25]:
all_data = []

for disease, disease_words in disease_word_sets.items():
    for disease_ttd, disease_words_ttd in disease_word_sets_ttd.items():
        data = get_disease_result(disease, disease_words, disease_ttd, disease_words_ttd)
        if len(data):
            all_data.extend(data)
            
            
df = pd.DataFrame(all_data, columns=['disease', 'target', 'ttd_target_id'])

In [26]:
df.head()

Unnamed: 0,disease,target,ttd_target_id
0,Acute myocardial infarction,endothelial growth factor,T99993
1,Acute myocardial infarction,vascular endothelial growth factor,T99993
2,Psoriatic arthritis,Protein kinase,T06093
3,Psoriatic arthritis,cytokine,T06542
4,Chronic kidney disease,protein kinase,T51282


In [28]:
df.to_csv('matched_target_id.csv')

In [36]:
df['ttd_target_name'] = df['ttd_target_id'].map(lambda x: t_id2t_meta[x][0])
df['ttd_diseases_set'] = df['ttd_target_id'].map(lambda x: t_id2t_meta[x][1])

In [37]:
df.head()

Unnamed: 0,disease,target,ttd_target_id,ttd_target_name,ttd_diseases_set
0,Acute myocardial infarction,endothelial growth factor,T99993,HUMAN vascular endothelial growth factor (VEGF),"{COVID19 ICD11 1D6Y , Colorectal cancer ICD..."
1,Acute myocardial infarction,vascular endothelial growth factor,T99993,HUMAN vascular endothelial growth factor (VEGF),"{COVID19 ICD11 1D6Y , Colorectal cancer ICD..."
2,Psoriatic arthritis,Protein kinase,T06093,Rho-associated protein kinase 2 (ROCK2),"{Rheumatoid arthritis ICD11 FA20 , Chronic o..."
3,Psoriatic arthritis,cytokine,T06542,Cytokine ML-1 (IL17F),"{Psoriasis ICD11 EA90 , Psoriatic arthritis ..."
4,Chronic kidney disease,protein kinase,T51282,Rho-associated protein kinase 1 (ROCK1),"{Pulmonary hypertension ICD11 BB01 , Chronic..."


In [38]:
df.to_csv('matched_target_meta.csv')

In [40]:
len(set(df['disease']))

46

## Conclusion

* Targets are mached with TTD only for 46 diseases
* Matching rule (disease to disease and gene 2 gene) should be replaced with embedding based comparison

