# Vocabulary Generator from SNMI

In [None]:
import pip
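# Uncomment the two lines below if rdflib / sparql-client are not installed.
# NOTE: pip.main() was removed in pip 10; on newer pip versions run
# `pip install rdflib sparql-client` from a shell instead.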
#pip.main(['install', 'rdflib'])
#pip.main(['install', 'sparql-client'])
# importing libraries
import os
import pandas as pd
import json
import sparql

## Load CSV from BioPortal

In [None]:
# Load the SNMI class table from "SNMI.csv" in the current working directory.
# The functions below read its "Class ID", "Preferred Label", "Synonyms",
# "Associated with", "Parents" and "Semantic Types" columns.
snmi = pd.read_csv("SNMI.csv")


## Separate by Semantic Type (STY)

In [None]:
def getbysty(df, sty):
    """Return the rows of *df* that carry the given semantic type(s).

    A row matches a code when its "Semantic Types" cell contains the URI
    "http://purl.bioontology.org/ontology/STY/<code>" as a literal substring.
    Rows with a missing "Semantic Types" value never match.

    Args:
        df: DataFrame with a "Semantic Types" column of strings.
        sty: A single semantic type code (e.g. "T005") or a list of codes.

    Returns:
        For a single code, the matching rows with their original index.
        For a list, the matches of each code concatenated in list order with
        a fresh 0..n-1 index; a row matching several codes appears once per
        matching code. An empty list yields an empty frame with the columns
        of *df*.
    """
    prefix = "http://purl.bioontology.org/ontology/STY/"

    def rows_for(code):
        # regex=False: the URI contains "." and the code is user supplied, so
        # both must be matched literally rather than as a regular expression.
        mask = df["Semantic Types"].str.contains(
            prefix + code, na=False, regex=False
        )
        return df[mask]

    if isinstance(sty, list):
        if not sty:
            # pd.concat raises ValueError on an empty sequence.
            return df.iloc[0:0].reset_index(drop=True)
        return pd.concat([rows_for(code) for code in sty], ignore_index=True)
    return rows_for(sty)


def prune(instr):
    """Return *instr* with every ", NOS" marker removed."""
    marker = ", NOS"
    cleaned = instr.replace(marker, "")
    return cleaned


def todict(df):
    """Convert a class table into a vocabulary dictionary.

    Args:
        df: DataFrame with the columns "Class ID", "Preferred Label",
            "Synonyms", "Associated with" and "Parents". Multi-valued cells
            are "|"-separated strings.

    Returns:
        A dict keyed by row position (0, 1, 2, ...) whose values have the form
        {"ID": <Class ID>,
         "properties": {"label": str, "synonyms": list of str,
                        "associated": str, "parents": str}}.
        Labels and synonyms have ", NOS" removed via prune(). Only the first
        "|"-separated value is kept for "associated" and "parents". A missing
        cell is represented by the string "nan" (["nan"] for synonyms).
    """
    vocab = {}
    # Keys follow row position rather than the DataFrame index label, so a
    # filtered frame still produces keys 0..n-1.
    for num, (_, row) in enumerate(df.iterrows()):
        # str() turns a missing cell (float NaN) into the literal "nan",
        # which is the missing-value marker used for every field here.
        synonyms = str(row["Synonyms"]).split("|")
        if synonyms[0] != "nan":
            synlist = [prune(s) for s in synonyms]
        else:
            synlist = ["nan"]

        vocab[num] = {
            "ID": row["Class ID"],
            "properties": {
                # str() guards against a missing label, which would otherwise
                # fail inside prune() because a float has no .replace().
                "label": prune(str(row["Preferred Label"])),
                "synonyms": synlist,
                "associated": str(row["Associated with"]).split("|")[0],
                "parents": str(row["Parents"]).split("|")[0],
            },
        }
    return vocab
    
# Build one vocabulary per semantic type from the loaded SNMI table.
# getbysty accepts the semantic type either as a single string or as a list
# of strings.
# full list of semantic types: https://gist.github.com/joelkuiper/4869d148333f279c2b2e

viruses = todict(getbysty(snmi, "T005"))
bacteria = todict(getbysty(snmi, "T007"))
diseases = todict(getbysty(snmi, "T047"))

# Hand-written vocabulary entry for SARS-CoV-2, laid out in the same
# {"ID": ..., "properties": {...}} shape that todict() produces.
coronasynonyms = [
    "corona",
    "COVID-19",
    "COVID-19 virus",
    "2019 novel coronavirus",
    "coronavirus",
    "novel coronavirus",
    "2019-nCoV",
    "SARS-CoV-2",
    "corona virus",
    "severe acute respiratory syndrome coronavirus 2",
    "2019-novel Corona virus",
    "wuhan corona virus",
    "wuhan virus",
]

corona = {
    0: {
        "ID": "http://purl.bioontology.org/ontology/SNMI/L-33502",
        "properties": {
            "label": "SARS-CoV-2",
            "synonyms": coronasynonyms,
            "associated": "nan",
            "parents": "http://purl.bioontology.org/ontology/SNMI/L-33500",
        },
    },
}


In [None]:
# Serialise each vocabulary to its own JSON file in the working directory.
with open('viruses.json', 'w') as f:
    f.write(json.dumps(viruses))

with open('bacteria.json', 'w') as f:
    f.write(json.dumps(bacteria))

with open('diseases.json', 'w') as f:
    f.write(json.dumps(diseases))

# The hand-written SARS-CoV-2 entry goes to a separate file.
with open('corona.json', 'w') as f:
    f.write(json.dumps(corona))