In [1]:
# start with output of notebook 2 from
# https://github.com/mmayers12/semmed/tree/master/nbs
# nodes.csv and edges.csv

In [2]:
# convert the pmids columns from a set of strings to a single semicolon delimited string
# change the neg to the same prop without neg and add a neg column
# make every edge uni-directional and remove ">"
# make columns for domain pred range

In [2]:
import os
import pickle
%matplotlib inline
import pandas as pd
import re
import seaborn as sns
from tqdm import tqdm

In [4]:
def sanitize(x):
    """Some pmids have the appearance of '2015332 [3]' for some reason. This fixes that"""
    if type(x) == str:
        if ' ' in x:
            x = x.split(' ')[0]
    return x

In [5]:
x = !wc -l edges.csv
wc = int(x[0].split(" ")[0])

In [17]:
# can't load into ram. too big. It doesn't like pmid column. do in chunks
# convert the pmids columns from a set of string to a single semicolon delimited string
edges = pd.read_csv('edges.csv', converters={'pmids':eval}, chunksize=100000)
df = pd.read_csv('edges.csv', nrows=0)
df.rename(columns={':START_ID': 'START_ID', ':END_ID': 'END_ID', ':TYPE': 'TYPE'}, inplace=True)
# write out column headers
df.to_csv("edges_pmid.csv", index=0)
for df in tqdm(edges, total=wc/100000):
    df['pmids'] = df['pmids'].apply(lambda ids: set([str(sanitize(x)) for x in ids]))
    df['pmids'] = df['pmids'].apply(lambda x:";".join(x))
    df.to_csv("edges_pmid.csv", mode='a', header=False, index=0)



  0%|          | 0/103.100565 [00:00<?, ?it/s][A[A


  1%|          | 1/103.100565 [00:04<07:31,  4.42s/it][A[A

  2%|▏         | 2/103.100565 [00:07<06:08,  3.64s/it][A[A

  3%|▎         | 3/103.100565 [00:09<05:30,  3.30s/it][A[A

  4%|▍         | 4/103.100565 [00:12<05:20,  3.23s/it][A[A

  5%|▍         | 5/103.100565 [00:15<05:11,  3.18s/it][A[A

  6%|▌         | 6/103.100565 [00:18<05:04,  3.14s/it][A[A

  7%|▋         | 7/103.100565 [00:21<04:57,  3.09s/it][A[A

  8%|▊         | 8/103.100565 [00:24<04:51,  3.07s/it][A[A

  9%|▊         | 9/103.100565 [00:27<04:47,  3.06s/it][A[A

 10%|▉         | 10/103.100565 [00:30<04:46,  3.07s/it][A[A

 11%|█         | 11/103.100565 [00:33<04:44,  3.09s/it][A[A

 12%|█▏        | 12/103.100565 [00:37<04:40,  3.08s/it][A[A

 13%|█▎        | 13/103.100565 [00:40<04:37,  3.08s/it][A[A

 14%|█▎        | 14/103.100565 [00:43<04:33,  3.07s/it][A[A

 15%|█▍        | 15/103.100565 [00:45<04:28,  3.05s/it][A[A

 16%|█▌

In [4]:
# now we can load the whole thing
edges = pd.read_csv("edges_pmid.csv")
edges.head()

Unnamed: 0,START_ID,END_ID,TYPE,pmids,n_pmids
0,C1273870,C0282623,ADMINISTERED_TO_ABatAB,17170614,1
1,C0935546,C0221192,ADMINISTERED_TO_ABatCI,15856663,1
2,C1273870,C1138603,ADMINISTERED_TO_ABatCI,12943031;15842188,2
3,C1273870,C0221192,ADMINISTERED_TO_ABatCI,25503364;22250100;24417454;16723949;19337198;2...,53
4,C0556656,C0221192,ADMINISTERED_TO_ABatCI,17684606;16230038;17323301;14707659;17530066;1...,7


In [5]:
# change the neg to the same prop without neg and add a neg column
idx = edges["TYPE"].str.startswith("NEG_")
edges['NEG'] = False
edges.loc[idx, 'NEG'] = True
edges.loc[idx, 'TYPE'] = edges[idx].TYPE.str.replace("NEG_", "")
edges[edges.NEG].head()

Unnamed: 0,START_ID,END_ID,TYPE,pmids,n_pmids,NEG
12500123,C0036592,C0027361,ADMINISTERED_TO_ABnatLB,24526037,1,True
12500124,C1273870,C0043210,ADMINISTERED_TO_ABnatLB,21627742;9576289;9824792;16864466,4,True
12500125,C0441648,C0679646,ADMINISTERED_TO_ABnatLB,25314278,1,True
12500126,C1273870,C0334885,ADMINISTERED_TO_ABnatLB,24214789,1,True
12500127,C0035028,C0030705,ADMINISTERED_TO_ABnatLB,20562622,1,True


In [6]:
# make every edge uni-directional and remove ">"
edges.TYPE = edges.TYPE.str.replace(">", "")

In [7]:
# get columns for domain, pred, range
rec = re.compile(r'([a-z]+)')
split_dr = lambda x: re.split(rec, x.rsplit("_", 1)[1])
print(split_dr("ADMINISTERED_TO_ABatAB"))
der = edges.TYPE.apply(split_dr)
der_df = pd.DataFrame(der.tolist(), columns=['domain', 'pred', 'range'])
edges['DOMAIN'] = der_df.domain
edges['PRED'] = der_df.pred
edges['RANGE'] = der_df.range

['AB', 'at', 'AB']


In [10]:
edges[edges.NEG].head()

Unnamed: 0,START_ID,END_ID,TYPE,pmids,n_pmids,NEG,DOMAIN,PRED,RANGE
12500123,C0036592,C0027361,ADMINISTERED_TO_ABnatLB,24526037,1,True,AB,nat,LB
12500124,C1273870,C0043210,ADMINISTERED_TO_ABnatLB,21627742;9576289;9824792;16864466,4,True,AB,nat,LB
12500125,C0441648,C0679646,ADMINISTERED_TO_ABnatLB,25314278,1,True,AB,nat,LB
12500126,C1273870,C0334885,ADMINISTERED_TO_ABnatLB,24214789,1,True,AB,nat,LB
12500127,C0035028,C0030705,ADMINISTERED_TO_ABnatLB,20562622,1,True,AB,nat,LB


In [11]:
# fix the pred abv for negative edges
edges.loc[edges.NEG, 'PRED'] = edges.loc[edges.NEG, 'PRED'].apply(lambda x:x[1:])

In [13]:
edges[edges.NEG].head()

Unnamed: 0,START_ID,END_ID,TYPE,pmids,n_pmids,NEG,DOMAIN,PRED,RANGE
12500123,C0036592,C0027361,ADMINISTERED_TO_ABnatLB,24526037,1,True,AB,at,LB
12500124,C1273870,C0043210,ADMINISTERED_TO_ABnatLB,21627742;9576289;9824792;16864466,4,True,AB,at,LB
12500125,C0441648,C0679646,ADMINISTERED_TO_ABnatLB,25314278,1,True,AB,at,LB
12500126,C1273870,C0334885,ADMINISTERED_TO_ABnatLB,24214789,1,True,AB,at,LB
12500127,C0035028,C0030705,ADMINISTERED_TO_ABnatLB,20562622,1,True,AB,at,LB


In [16]:
# recreate the type column
edges['TYPE'] = edges.DOMAIN + edges.PRED + edges.RANGE

In [17]:
edges[edges.NEG].head()

Unnamed: 0,START_ID,END_ID,TYPE,pmids,n_pmids,NEG,DOMAIN,PRED,RANGE
12500123,C0036592,C0027361,ABatLB,24526037,1,True,AB,at,LB
12500124,C1273870,C0043210,ABatLB,21627742;9576289;9824792;16864466,4,True,AB,at,LB
12500125,C0441648,C0679646,ABatLB,25314278,1,True,AB,at,LB
12500126,C1273870,C0334885,ABatLB,24214789,1,True,AB,at,LB
12500127,C0035028,C0030705,ABatLB,20562622,1,True,AB,at,LB


In [22]:
# edges['PRED_FULL'] = edges.TYPE.map(lambda x:x.rsplit("_",1)[0])

In [18]:
# dump back out
edges.to_csv("edges_sanitized.csv", index=None)

In [4]:
# rename nodes columns
nodes = pd.read_csv('nodes.csv')
nodes = nodes.rename(columns={':ID': 'ID', ':LABEL': 'TYPE', 'name': 'label'})
nodes.to_csv("nodes_sanitized.csv", index=None)

In [26]:
# save abbreviations
type_abv = {
 'Activities & Behaviors': 'AB',
 'Anatomy': 'A',
 'Compound': 'C',
 'Chemicals & Drugs': 'CD',
 'Concepts & Ideas': 'CI',
 'Devices': 'DV',
 'Disease': 'D',
 'Disorders': 'DO',
 'Genes & Molecular Sequences': 'G',
 'Geographic Areas': 'GA',
 'Living Beings': 'LB',
 'Objects': 'OB',
 'Occupations': 'OC',
 'Organizations': 'OR',
 'Phenomena': 'PH',
 'Physiology': 'PS',
 'Procedures': 'PR'
}
abv_type = {v:k for k,v in type_abv.items()}

In [27]:
pred_abr = {
 'ADMINISTERED_TO': 'at',
 'AFFECTS': 'af',
 'ASSOCIATED_WITH': 'aw',
 'AUGMENTS': 'ag',
 'CAUSES': 'c',
 'COEXISTS_WITH': 'cw',
 'COMPLICATES': 'cp',
 'CONVERTS_TO': 'ct',
 'DIAGNOSES': 'dg',
 'DISRUPTS': 'ds',
 'INHIBITS': 'in',
 'INTERACTS_WITH': 'iw',
 'ISA': 'i',
 'LOCATION_OF': 'lo',
 'MANIFESTATION_OF': 'mfo',
 'METHOD_OF': 'mo',
 'OCCURS_IN': 'oi',
 'PART_OF': 'po',
 'PRECEDES': 'pc',
 'PREDISPOSES': 'ps',
 'PREP': 'pr',
 'PREVENTS': 'pv',
 'PROCESS_OF': 'pro',
 'PRODUCES': 'pd',
 'STIMULATES': 'st',
 'TREATS': 't',
 'USES': 'u',
 'compared_with': 'cpw',
 'different_from': 'df',
 'different_than': 'dt',
 'higher_than': 'ht',
 'lower_than': 'lt',
 'same_as': 'sa'}
abv_pred = {v:k for k,v in pred_abr.items()}

In [34]:
abv = pd.DataFrame(list(pred_abr.items()) + list(type_abv.items()), columns=['full_name', 'abv'])
abv.to_csv("abv.csv", index=None)
abv.head()

Unnamed: 0,full_name,abv
0,INHIBITS,in
1,AFFECTS,af
2,LOCATION_OF,lo
3,METHOD_OF,mo
4,PROCESS_OF,pro
