# In [None]:
from pathlib import Path
from typing import *
from re import sub as re_sub
import sys
import json
import rltk
from collections import defaultdict

# Module-level CRF tokenizer instance (not referenced by the code visible in
# this file). A `global` statement has no effect at module scope, where every
# assignment already binds a module-level name, so a plain assignment suffices.
g_tokenizer = rltk.CrfTokenizer()

class DocRecord(rltk.Record):
    ''' Record entry class for each of our doctor records '''
    def __init__(self, raw_object):
        super().__init__(raw_object)
        # Initialised to an empty string; nothing in the visible code reads or fills it.
        self.name = ''

    @rltk.cached_property
    def id(self):
        ''' Record identifier: the value stored under the 'DoctorURI' key '''
        return self.raw_object['DoctorURI']

    @rltk.cached_property
    def health_sp(self):
        ''' Value stored under the 'Doctor_Speciality' key.

        health_similarity lower-cases and splits it, so it is presumably a
        single specialty name string -- TODO confirm against the input data.
        '''
        return self.raw_object['Doctor_Speciality']

class DiseaseRecord(rltk.Record):
    ''' Record entry class for each of our disease records '''
    def __init__(self, raw_object):
        super().__init__(raw_object)
        # Initialised to an empty string; nothing in the visible code reads or fills it.
        self.name = ''

    @rltk.cached_property
    def id(self):
        ''' Record identifier: the value stored under the 'DiseaseURI' key '''
        return self.raw_object['DiseaseURI']

    @rltk.cached_property
    def health_sp_list(self):
        ''' Value stored under the 'HealthSpecialty' key.

        health_similarity iterates over it and treats each element as a
        specialty name string -- presumably a list of strings, TODO confirm
        against the input data.
        '''
        return self.raw_object['HealthSpecialty']


def create_dataset(input_file: str, rcrd_class: Type[rltk.Record]) -> rltk.Dataset:
    ''' Create an rltk dataset from a given jl (JSON lines) file.

    input_file: path of the file to read; its suffix must be ".jl".
    rcrd_class: the rltk.Record subclass (the class itself, not an instance)
        handed to rltk.Dataset as record_class.

    Returns an rltk.Dataset built with a JsonLinesReader over input_file and a
    MemoryKeyValueAdapter.

    Raises ValueError when input_file does not carry the ".jl" suffix.
    '''
    # Explicit raise instead of `assert`: assertions are stripped when Python
    # runs with -O, which would silently disable this input check.
    if Path(input_file).suffix != ".jl":
        raise ValueError("expected a .jl file, got: {!r}".format(input_file))
    return rltk.Dataset(
        reader=rltk.JsonLinesReader(input_file),
        record_class=rcrd_class,
        adapter=rltk.MemoryKeyValueAdapter(),
    )

def remove_words(s):
    ''' Lower-case `s`, drop the filler words "medical" and "care", and tokenise the rest.

    s: a specialty name string.

    Returns the list of remaining whitespace-separated, lower-cased tokens
    (an empty list when nothing is left).

    Only whole words are removed: "Healthcare" stays "healthcare" and
    "Biomedical" stays "biomedical" instead of being cut down to "health" /
    "bio" by a plain substring replacement.
    '''
    # \b anchors the match on word boundaries so the filler words are never
    # carved out of the middle of a longer word.
    cleaned = re_sub(r'\b(?:medical|care)\b', '', s.lower())
    return cleaned.split()

def lister(s_list):
    ''' Run remove_words over every entry of `s_list` and return the results in the same order. '''
    return [remove_words(entry) for entry in s_list]

#def get_ground_truth(input_file: str, ds1: rltk.Dataset, ds2: rltk.Dataset) -> rltk.GroundTruth:
#    ''' Read the ground truth from the given input file '''
#    devset_file_handle = open(input_file, "r")
#    devset_data = json.load(devset_file_handle)
#    gt = rltk.GroundTruth()
#    for item in devset_data:
#        if None != item['afi_movie']:
#            r_imdb = ds1.get_record(item['imdb_movie'])
#            r_afi  = ds2.get_record(item['afi_movie'])
#            gt.add_positive(r_imdb.raw_object['url'], r_afi.raw_object['url'])
#    return gt

def health_similarity(r_disease,r_doc):
    ''' Fuzzy-match a doctor's specialty against the specialties of a disease.

    r_disease: record exposing `health_sp_list` (iterated as specialty names).
    r_doc: record exposing `health_sp` (a single specialty name).

    Both sides are tokenised with remove_words; the tokens "medicine" and
    "disease" are ignored on either side. Every token pair whose Levenshtein
    similarity is at least 0.8 contributes one entry, so the same specialty
    can appear more than once.

    Returns a list of (True, (lower-cased doctor specialty, cleaned disease
    specialty joined by spaces)) tuples; empty when nothing matches.
    '''
    raw_specialties = r_disease.health_sp_list
    doc_specialty = r_doc.health_sp

    specialty_tokens = lister(raw_specialties)
    ignored = ('medicine', 'disease')
    doc_tokens = [tok for tok in remove_words(doc_specialty) if tok not in ignored]
    doc_label = doc_specialty.lower()

    matches = []
    for doc_tok in doc_tokens:
        for tokens in specialty_tokens:
            # The label keeps the ignored words; only the comparison skips them.
            label = ' '.join(tokens)
            for spec_tok in tokens:
                if spec_tok in ignored:
                    continue
                if rltk.levenshtein_similarity(doc_tok, spec_tok) >= 0.8:
                    matches.append((True, (doc_label, label)))
    return matches
    
# Input files in JSON-lines format: one for the disease records, one for the
# doctor records.
r_disease_file = "/Users/sharadsharma/Documents/KG/Project/OutputFiles/WikiDiseaseData.jl"
r_doc_file = "/Users/sharadsharma/Documents/KG/Project/OutputFiles/DoctorData.jl"

ds_disease = create_dataset(r_disease_file, DiseaseRecord)
ds_doc = create_dataset(r_doc_file, DocRecord)

# Compare every disease record with every doctor record and collect, per
# matched (cleaned) disease specialty, the ids of the doctors that matched it.
# A set absorbs the repeated entries health_similarity returns when several
# token pairs match the same specialty.
pred_dic = defaultdict(set)
for r_dis in ds_disease:
    for r_doc in ds_doc:
        for _matched, (_doc_specialty, specialty) in health_similarity(r_dis, r_doc):
            pred_dic[specialty].add(r_doc.id)

# Sets are not JSON-serialisable and iterate in an arbitrary order that can
# differ between runs; sorted lists make the output file reproducible.
# key=str keeps the sort total even if an id is not a string.
pred_dic = defaultdict(list, ((k, sorted(v, key=str)) for k, v in pred_dic.items()))

with open('/Users/sharadsharma/Documents/KG/Project/OutputFiles/HealthSpecialtyLinkage.json','w') as op_file:
    json.dump(pred_dic, op_file, indent=2)
