In [27]:
from pathlib import Path
from typing import *
from re import sub as re_sub
import sys
import json
import rltk
from collections import defaultdict

# Shared CRF tokenizer instance. This assignment already happens at module
# (cell) level, so a `global` declaration would be a no-op and is omitted.
g_tokenizer = rltk.CrfTokenizer()

class DocRecord(rltk.Record):
    ''' Record entry class for each doctor record.

    Exposes the ``DoctorURI`` and ``Doctor_Speciality`` fields of the raw
    object as cached properties.
    '''
    def __init__(self, raw_object):
        super().__init__(raw_object)
        # Initialised to an empty string; nothing in this class assigns it again.
        self.name = ''

    @rltk.cached_property
    def id(self):
        ''' Record identifier: the ``DoctorURI`` field of the raw object. '''
        return self.raw_object['DoctorURI']

    @rltk.cached_property
    def health_sp(self):
        ''' The ``Doctor_Speciality`` field of the raw object.

        NOTE(review): presumably a single specialty string -- confirm against the data.
        '''
        return self.raw_object['Doctor_Speciality']

class DiseaseRecord(rltk.Record):
    ''' Record entry class for each disease record.

    Exposes the ``DiseaseURI`` and ``HealthSpecialty`` fields of the raw
    object as cached properties.
    '''
    def __init__(self, raw_object):
        super().__init__(raw_object)
        # Initialised to an empty string; nothing in this class assigns it again.
        self.name = ''

    @rltk.cached_property
    def id(self):
        ''' Record identifier: the ``DiseaseURI`` field of the raw object. '''
        return self.raw_object['DiseaseURI']

    @rltk.cached_property
    def health_sp_list(self):
        ''' The ``HealthSpecialty`` field of the raw object.

        NOTE(review): presumably a list of specialty strings -- confirm against the data.
        '''
        return self.raw_object['HealthSpecialty']


def create_dataset(input_file: str, rcrd_class: Type[rltk.Record]) -> rltk.Dataset:
    ''' Create an rltk dataset from a given .jl file.

    Args:
        input_file: Path of the input file; its suffix must be ".jl".
        rcrd_class: The rltk.Record subclass (the class itself, not an
            instance) used to wrap each raw object read from the file.

    Returns:
        An rltk.Dataset that reads ``input_file`` through
        rltk.JsonLinesReader and stores records in an
        rltk.MemoryKeyValueAdapter.

    Raises:
        AssertionError: if ``input_file`` does not end in ".jl".
    '''
    assert Path(input_file).suffix == ".jl", f"expected a .jl file, got {input_file!r}"
    return rltk.Dataset(
        reader=rltk.JsonLinesReader(input_file),
        record_class=rcrd_class,
        adapter=rltk.MemoryKeyValueAdapter(),
    )

def remove_words(s):
    ''' Lower-case ``s``, strip a fixed set of generic terms and split on whitespace.

    Args:
        s: Input string.

    Returns:
        List of the remaining whitespace-separated tokens (possibly empty).
    '''
    cleaned = s.lower()
    # NOTE(review): removal is substring-based, not word-based, so e.g.
    # "healthcare" becomes "health" and "diseases" leaves a stray "s" token.
    for generic_term in ('medical', 'medicine', 'disease', 'care'):
        cleaned = cleaned.replace(generic_term, '')
    return cleaned.split()

def lister(s_list):
    ''' Apply ``remove_words`` to every string in ``s_list``.

    Args:
        s_list: Iterable of strings.

    Returns:
        List with one token list per input string, in input order.
    '''
    return [remove_words(entry) for entry in s_list]

#def get_ground_truth(input_file: str, ds1: rltk.Dataset, ds2: rltk.Dataset) -> rltk.GroundTruth:
#    ''' Read the ground truth from the given input file '''
#    devset_file_handle = open(input_file, "r")
#    devset_data = json.load(devset_file_handle)
#    gt = rltk.GroundTruth()
#    for item in devset_data:
#        if None != item['afi_movie']:
#            r_imdb = ds1.get_record(item['imdb_movie'])
#            r_afi  = ds2.get_record(item['afi_movie'])
#            gt.add_positive(r_imdb.raw_object['url'], r_afi.raw_object['url'])
#    return gt

def health_similarity(r_disease, r_doc, threshold=0.8):
    ''' Match a doctor's specialty against the specialties of a disease record.

    Both sides are normalised with ``remove_words`` / ``lister``. A disease
    specialty matches when any of its tokens has a Levenshtein similarity of
    at least ``threshold`` with any token of the doctor's specialty.

    Args:
        r_disease: Record exposing ``health_sp_list`` (iterable of specialty strings).
        r_doc: Record exposing ``health_sp`` (specialty string).
        threshold: Minimum ``rltk.levenshtein_similarity`` for two tokens to
            count as a match. Defaults to 0.8.

    Returns:
        List of ``(True, (doctor_specialty_lowercased, matched_specialty))``
        tuples, where ``matched_specialty`` is the normalised disease
        specialty re-joined with single spaces. Each entry of
        ``health_sp_list`` is reported at most once, in the order it is first
        matched. Empty when nothing matches or either field is missing/empty.
    '''
    specialties = r_disease.health_sp_list
    hp_sp_name = r_doc.health_sp

    # A record with a missing (None) or empty specialty field cannot match anything.
    if not specialties or not hp_sp_name:
        return []

    # Loop-invariant values, computed once.
    doc_label = hp_sp_name.lower()
    doc_tokens = remove_words(hp_sp_name)
    specialty_tokens = lister(specialties)

    res = []
    matched = set()  # indexes into specialty_tokens that were already reported
    for doc_token in doc_tokens:
        for idx, sp_tokens in enumerate(specialty_tokens):
            if idx in matched:
                # Already reported; avoids one duplicate tuple per matching token pair.
                continue
            if any(rltk.levenshtein_similarity(doc_token, tok) >= threshold
                   for tok in sp_tokens):
                matched.add(idx)
                res.append((True, (doc_label, ' '.join(sp_tokens))))
    return res
    
# Single place to point the notebook at the project data folder.
DATA_DIR = Path("/Users/pratheek/Documents/Knowledge Graphs(DSCI 558)/Project")

# Input .jl files for the disease and doctor records. Kept as plain strings,
# which is what create_dataset is annotated to receive.
r_disease_file = str(DATA_DIR / "wikidata_test.jl")
r_doc_file = str(DATA_DIR / "doctor_testtest.jl")

ds_disease = create_dataset(r_disease_file, DiseaseRecord)
ds_doc = create_dataset(r_doc_file, DocRecord)

bg = rltk.TokenBlockGenerator()

def n_gram(s, n=4):
    ''' Return every contiguous window of length ``n`` taken from ``s``.

    Args:
        s: Sliceable sequence (typically a string).
        n: Window length. Defaults to 4.

    Returns:
        List of the slices ``s[i:i + n]`` in left-to-right order; empty when
        ``s`` is shorter than ``n``.
    '''
    window_count = len(s) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(s[start:start + n])
    return grams
def n_gram_list(s, n=4):
    ''' Concatenate the strings in ``s`` and return the character n-grams of the result.

    Args:
        s: Iterable of strings; they are joined with no separator, so
            n-grams may span the boundary between two neighbouring items.
        n: Window length. Defaults to 4.

    Returns:
        List of length-``n`` substrings of the joined text in left-to-right
        order; empty when the joined text is shorter than ``n``.
    '''
    text = ''.join(s)
    # Iterate over window end positions instead of start positions.
    return [text[end - n:end] for end in range(n, len(text) + 1)]


# Blocking: index both datasets by the character 5-grams of their specialty text.
block = bg.generate(
    bg.block(ds_disease, function_=lambda r: n_gram_list(r.health_sp_list, 5)),
    bg.block(ds_doc, function_=lambda r: n_gram(r.health_sp, 5)),
)
pairs = list(set(block.pairwise(ds_disease, ds_doc)))

# Each element of `pairs` is a triple whose 2nd/3rd items are the disease and
# doctor record ids (the 1st item is not used here). The same two records can
# appear in several triples, so reduce to unique id pairs and compare each
# pair only once. Sorting makes the run order, and therefore the output file,
# reproducible.
record_pairs = sorted({(disease_id, doc_id) for _, disease_id, doc_id in pairs})

# Matched (normalised) disease specialty -> ids of doctors matched to it.
doctors_by_specialty = defaultdict(set)
for disease_id, doc_id in record_pairs:
    r_dis = ds_disease.get_record(disease_id)
    r_doc = ds_doc.get_record(doc_id)
    for _, (_, specialty) in health_similarity(r_dis, r_doc):
        doctors_by_specialty[specialty].add(r_doc.id)

# Sets are not JSON-serialisable; convert to (sorted, hence stable) lists.
pred_dic = defaultdict(list, ((k, sorted(v)) for k, v in doctors_by_specialty.items()))

# NOTE(review): the file has a .jl suffix, but json.dump with indent=2 writes
# one pretty-printed JSON object, not one JSON document per line.
with open('/Users/pratheek/Documents/Knowledge Graphs(DSCI 558)/Project/Test1.jl','w') as op_file:
    json.dump(pred_dic, op_file, indent=2)


In [24]:
pred_dic

defaultdict(list,
            {'nutrition': ['https://www.healthgrades.com/providers/angana-shah-y9q62hz',
              'https://www.healthgrades.com/providers/kelly-stapleton-xynvvnh'],
             'oncology': ['https://www.healthgrades.com/physician/dr-shahrooz-eshaghian-ysdgp',
              'https://www.healthgrades.com/physician/dr-huyen-pham-3cb8b',
              'https://www.healthgrades.com/physician/dr-david-quinn-3d6vp'],
             'gastroenterology': ['https://www.healthgrades.com/physician/dr-payman-khorrami-2lgfd',
              'https://www.healthgrades.com/physician/dr-armine-sarkisian-y387b',
              'https://www.healthgrades.com/providers/mary-pardee-xyp4pyw'],
             'psychiatry': ['https://www.healthgrades.com/physician/dr-milica-stefanovic-3b3cn',
              'https://www.healthgrades.com/physician/dr-kwang-park-2bwh5',
              'https://www.healthgrades.com/physician/dr-emily-dossett-2chfr',
              'https://www.healthgrades.com/physic