In [1]:
import pandas as pd
import sys
import os
import re

In [4]:
profile = sys.argv[1]
profile = "all"
stopwords = ['abnormally','abnormal','aberrant','variant']
outdir = "../curation/data"
uphenorelease_dir = "../../../upheno-release/"

## IN
upheno_mapping_logical = os.path.join(uphenorelease_dir,"upheno_mapping_logical.csv")
upheno_species_lexical_file = os.path.join(uphenorelease_dir,"upheno_species_lexical.csv")
print(upheno_species_lexical_file)
## OUT
upheno_mapping_all = os.path.join(uphenorelease_dir,"upheno_mapping_all.csv")
upheno_mapping_lexical = os.path.join(uphenorelease_dir,"upheno_mapping_lexical.csv")
upheno_mapping_lexical_template = os.path.join(uphenorelease_dir,"upheno_mapping_lexical_template.csv")
upheno_mapping_problematic = os.path.join(uphenorelease_dir,"upheno_mapping_problematic.csv")
    
## Load lexical data
df = pd.read_csv(upheno_species_lexical_file)
print(df)
df.columns = ['iri','p','label']

## Load logical mappings
dfl1 = pd.read_csv(upheno_mapping_logical)[['p1','p2']]
dfl2 = dfl1.copy()
dfl2.columns = ['p2','p1']
dfl = pd.concat([dfl1, dfl2], ignore_index=True, sort =False)
dfl = dfl.drop_duplicates()
dfl['cat']="logical"

## Prepare dataframe for labels
df_label = df[df['p']=="http://www.w3.org/2000/01/rdf-schema#label"][['iri','label']]
df_label.columns = ['iri','label']

../../../upheno-release/upheno_species_lexical.csv
                                                    s  \
0       http://purl.obolibrary.org/obo/UPHENO_0001001   
1       http://purl.obolibrary.org/obo/UPHENO_0001001   
2       http://purl.obolibrary.org/obo/UPHENO_0001003   
3       http://purl.obolibrary.org/obo/UPHENO_0001005   
4           http://purl.obolibrary.org/obo/MP_0001417   
...                                               ...   
208112  http://purl.obolibrary.org/obo/UPHENO_0076697   
208113  http://purl.obolibrary.org/obo/UPHENO_0081568   
208114  http://purl.obolibrary.org/obo/UPHENO_0002948   
208115  http://purl.obolibrary.org/obo/UPHENO_0075774   
208116  http://purl.obolibrary.org/obo/UPHENO_0002848   

                                                        p  \
0              http://www.w3.org/2000/01/rdf-schema#label   
1              http://www.w3.org/2000/01/rdf-schema#label   
2              http://www.w3.org/2000/01/rdf-schema#label   
3              http:

In [5]:
# Preprocess labels. The most important aspect to this the stopword removal. this is done by matching a stopword
# that means 'abnormal', removing it and then adding the actual prefix 'abnormal'. For example, "cell morphology, aberrant"
# will become 'abnormal cell morphology'. Other than that, most special characters other than space and the ' tick-mark
# Are removed

def apply_stopword(x, stopword):
    if x:
        if stopword in x:
            x = "abnormal "+x.replace(stopword, '')
    return x

UPHENO_PREFIX = "http://purl.obolibrary.org/obo/UPHENO_"

def preprocess_labels(df, stopwords):
    df['label'] = df['label'].astype(str)
    df['label_pp'] = df['label'].str.replace(r"[(][A-Z]+[)]", "")
    df['label_pp'] = df['label_pp'].str.lower()
    df['label_pp'] = df['label_pp'].str.replace(r"[^0-9a-z' ]", "")

    for stopword in stopwords:
        df['label_pp'] = df['label_pp'].apply(lambda x: apply_stopword(x,stopword))

    df['label_pp'] = df['label_pp'].str.strip()
    df['label_pp'] = df['label_pp'].str.replace(r"[ ]+", " ")
    df=df[~df['iri'].astype(str).str.startswith(UPHENO_PREFIX)]
    df=df[df['label_pp']!=""]
    d=df[['iri','label_pp']]
    d.columns=['iri','label']
    d=d.drop_duplicates()
    return d

d = preprocess_labels(df,stopwords)
l = df_label[~df_label['iri'].astype(str).str.startswith(UPHENO_PREFIX)]
print(len(d))

175815


In [6]:
dd=d.groupby('label')['iri'].apply(list).to_dict()

In [7]:
dd

{"'cigarette paper scarring'": ['http://purl.obolibrary.org/obo/HP_0001073'],
 "'curvilinear profiles' ultrastructurally": ['http://purl.obolibrary.org/obo/HP_0003205'],
 "'curvilinear profiles' ultrastructurally in cells": ['http://purl.obolibrary.org/obo/HP_0003205'],
 "'de toni-fanconi-debre' syndrome": ['http://purl.obolibrary.org/obo/HP_0001994'],
 "'decreased jaw size'": ['http://purl.obolibrary.org/obo/MP_0002639'],
 "'decreased mandible length'": ['http://purl.obolibrary.org/obo/MP_0004592'],
 "'decreased maxilla length'": ['http://purl.obolibrary.org/obo/MP_0000097'],
 "'decreased maxilla size'": ['http://purl.obolibrary.org/obo/MP_0004540'],
 "'decreased tibia length'": ['http://purl.obolibrary.org/obo/MP_0002764'],
 "'decreased vertebral body length'": ['http://purl.obolibrary.org/obo/MP_0004706'],
 "'fingerprint profiles' ultrastructurally in cells": ['http://purl.obolibrary.org/obo/HP_0003208'],
 "'generalised' tonic-clonic seizure with focal onset": ['http://purl.obolibra

In [5]:
# This step is a complicated hack that tries to get rid of them of the false exact synonyms. 
# The idea is this: if there is an exact synonym between two terms within an ontology, we get rid of the link. 
# Sometimes, however, a synonym is shared between more than one term within and ontology and across: 
# These cases need to be

import re

def get_dupes(a):
    seen = {}
    dupes = []

    for x in a:
        if x not in seen:
            seen[x] = 1
        else:
            if seen[x] == 1:
                dupes.append(x)
            seen[x] += 1
    return dupes

cases = dict()
cases_internal = dict()
i = 0

exclude_synonyms = dict()

for label in dd:
    iris = dd.get(label)
    onts = [re.sub('[_][0-9]+', '', iri.replace("http://purl.obolibrary.org/obo/","")) for iri in iris]
    if len(onts)>1:
        if len(onts) != len(set(onts)):
            if len(set(onts))>1:
                cases[label] = iris
                print("-----------------------")
                print(label)
                print(iris)
                dupes = get_dupes(onts)
                for dupe in dupes:
                    for iri in iris:
                        if dupe in iri:
                            if label not in exclude_synonyms:
                                exclude_synonyms[label]=[]
                            exclude_synonyms[label].append(iri)
            else:
                cases_internal[label] = iris
                for iri in iris:
                    if label not in exclude_synonyms:
                        exclude_synonyms[label]=[]
                    exclude_synonyms[label].append(iri)


print(len(cases_internal))
print(len(cases))
print(len(dd))

-----------------------
asd
['http://purl.obolibrary.org/obo/HP_0000729', 'http://purl.obolibrary.org/obo/HP_0001631', 'http://purl.obolibrary.org/obo/MP_0010403']
6
1
174856


In [6]:
x = d

In [7]:
# Remove all those IRIs that contained duplicates determined in the previous step
d=x
print(len(d))
for label in exclude_synonyms:
    for iri in exclude_synonyms[label]:
        d = d[~((d['iri']==iri) & (d['label']==label))]
print(len(d))
d = pd.merge(d,l,on=['iri','label'],how='outer')
print(len(d))

175815
175801
275738


In [8]:
dd=d.groupby('label')['iri'].apply(list).to_dict()

In [9]:
# 
def pairwise(t):
    it = iter(t)
    return zip(it,it)

def invert_dol_nonunique(d):
    newdict = {}
    for k in d:
        for v in d[k]:
            newdict.setdefault(v, []).append(k)
    return newdict

def merge_label_equivalent_cliques(dd_rv):
    merge_labels = dict()
    for iri in dd_rv:
        labels_to_merge = dd_rv.get(iri)
        if len(labels_to_merge)>1:
            for lab in labels_to_merge:
                if lab not in merge_labels:
                    merge_labels[lab] = []
                merge_labels[lab] = list(set(merge_labels[lab]+labels_to_merge))
    return merge_labels

dd_rv = invert_dol_nonunique(dd)
merge_labels = merge_label_equivalent_cliques(dd_rv)



In [15]:
def compute_mappings(dd,l):
    data = []
    done = set()
    for label in dd:
        if label in done:
            continue
        done.add(label)
        iris = dd.get(label)
        if label in merge_labels:
            for lab in merge_labels[label]:
                iris.extend(dd.get(lab))
                done.add(lab)
        iris = list(set(iris))
        if len(iris)>1:
            #print(iris)
            pairs = pairwise(iris)
            for pair in pairs:
                data.append([pair[0], pair[1]])
                data.append([pair[1], pair[0]])
    df_mappings =  pd.DataFrame.from_records(data)
    df_mappings = df_mappings.drop_duplicates()
    df_mappings['cat'] = 'lexical'
    df_mappings.columns = ['p1','p2','cat']
    df_maps = pd.merge(df_mappings,l,  how='left', left_on=['p1'], right_on=['iri'])
    # df_maps=df_maps.drop('iri',1)
    df_maps = pd.merge(df_maps, l,  how='left', left_on=['p2'], right_on=['iri'])
    # df_maps=df_maps.drop('iri',1)
    df_maps['o1']=[re.sub('[_][0-9]+', '', iri.replace("http://purl.obolibrary.org/obo/","")) for iri in df_maps['p1'].values]
    df_maps['o2']=[re.sub('[_][0-9]+', '', iri.replace("http://purl.obolibrary.org/obo/","")) for iri in df_maps['p2'].values]
    return df_maps

df_mapping = compute_mappings(dd,l)
print(len(df_mapping))
df_mapping.head()

1424


Unnamed: 0,p1,p2,cat,iri_x,label_x,iri_y,label_y,o1,o2
0,http://purl.obolibrary.org/obo/MP_0004592,http://purl.obolibrary.org/obo/HP_0000347,lexical,http://purl.obolibrary.org/obo/MP_0004592,small mandible (MPO),http://purl.obolibrary.org/obo/HP_0000347,Micrognathia (HPO),MP,HP
1,http://purl.obolibrary.org/obo/HP_0000347,http://purl.obolibrary.org/obo/MP_0004592,lexical,http://purl.obolibrary.org/obo/HP_0000347,Micrognathia (HPO),http://purl.obolibrary.org/obo/MP_0004592,small mandible (MPO),HP,MP
2,http://purl.obolibrary.org/obo/HP_0003270,http://purl.obolibrary.org/obo/MP_0009247,lexical,http://purl.obolibrary.org/obo/HP_0003270,Abdominal distention (HPO),http://purl.obolibrary.org/obo/MP_0009247,meteorism (MPO),HP,MP
3,http://purl.obolibrary.org/obo/MP_0009247,http://purl.obolibrary.org/obo/HP_0003270,lexical,http://purl.obolibrary.org/obo/MP_0009247,meteorism (MPO),http://purl.obolibrary.org/obo/HP_0003270,Abdominal distention (HPO),MP,HP
4,http://purl.obolibrary.org/obo/HP_0003363,http://purl.obolibrary.org/obo/MP_0002766,lexical,http://purl.obolibrary.org/obo/HP_0003363,Abdominal situs inversus (HPO),http://purl.obolibrary.org/obo/MP_0002766,situs inversus (MPO),HP,MP
