In [1]:
def n_gram(s, n=3):
    ''' Return every contiguous length-n slice of s, in order, as a list.

    Works on any sliceable sequence with a length. When s is shorter than n
    there is no full window and the result is an empty list.
    '''
    window_count = len(s) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(s[start:start + n])
    return grams


# bg = rltk.TokenBlockGenerator()

# block = bg.generate(
# bg.block(ds_wiki, function_ = lambda r:n_gram(r.name_string,3)),
# bg.block(ds_webmd, function_=lambda r: n_gram(r.genname_string, 3))

# )
    


In [3]:
from pathlib import Path
from typing import *
from re import sub as re_sub
import sys
import json
import rltk
from collections import defaultdict

# Module-level CRF tokenizer instance. A `global` declaration is a no-op at
# module scope, so the name is simply bound here.
# NOTE(review): nothing in the visible code references g_tokenizer — confirm it
# is still needed before paying for its construction.
g_tokenizer = rltk.CrfTokenizer()

class WikiRecord(rltk.Record):
    ''' Record entry class for each of our wiki medicine records.

    Wraps one raw object and exposes its 'MedicineURI' field as the record id
    and its 'Medicine' field (lower-cased) as the name used for matching.
    '''
    def __init__(self, raw_object):
        super().__init__(raw_object)
        # Initialised to an empty string; no code in this class reads or updates it.
        self.name = ''

    @rltk.cached_property
    def id(self):
        ''' Record identifier: the 'MedicineURI' field of the raw object. '''
        return self.raw_object['MedicineURI']

    @rltk.cached_property
    def name_string(self):
        ''' Medicine name from the 'Medicine' field, lower-cased. '''
        return self.raw_object['Medicine'].lower()

class WebMDRecord(rltk.Record):
    ''' Record entry class for each of our WebMD drug records.

    Wraps one raw object and exposes its 'url' field as the record id, plus
    lower-cased generic and brand names used for blocking and matching.
    '''
    def __init__(self, raw_object):
        super().__init__(raw_object)
        # Initialised to an empty string; no code in this class reads or updates it.
        self.name = ''

    @rltk.cached_property
    def id(self):
        ''' Record identifier: the 'url' field of the raw object. '''
        return self.raw_object['url']

    @rltk.cached_property
    def genname_string(self):
        ''' Generic drug name from the 'Generic_Name' field, lower-cased.

        Lower-cased like the other name accessors so that character n-grams
        built from it are comparable with the lower-cased wiki names.
        '''
        return self.raw_object['Generic_Name'].lower()

    @rltk.cached_property
    def brandname_string(self):
        ''' Brand name from the 'Brand_Name' field, lower-cased. '''
        return self.raw_object['Brand_Name'].lower()

# NOTE(review): this repeats the n_gram defined in the earlier cell; being the
# later definition, it is the one in effect for the code below.
def n_gram(s, n=3):
    ''' Return every contiguous length-n slice of s, in order, as a list.

    Works on any sliceable sequence with a length. When s is shorter than n
    there is no full window and the result is an empty list.
    '''
    window_count = len(s) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(s[start:start + n])
    return grams


def create_dataset(input_file: str, rcrd_class: rltk.Record) -> rltk.Dataset:
    ''' Build an rltk dataset from a JSON-lines (.jl) file.

    Args:
        input_file: path to the input file; its suffix must be ".jl".
        rcrd_class: the record class handed to rltk.Dataset as record_class.

    Returns:
        An rltk.Dataset fed by a JsonLinesReader over input_file and backed by
        a MemoryKeyValueAdapter.

    Raises:
        AssertionError: if input_file does not end in ".jl".
    '''
    extension = Path(input_file).suffix
    assert extension == ".jl"
    jl_reader = rltk.JsonLinesReader(input_file)
    kv_store = rltk.MemoryKeyValueAdapter()
    return rltk.Dataset(reader=jl_reader, record_class=rcrd_class, adapter=kv_store)


#def get_ground_truth(input_file: str, ds1: rltk.Dataset, ds2: rltk.Dataset) -> rltk.GroundTruth:
#    ''' Read the ground truth from the given input file '''
#    devset_file_handle = open(input_file, "r")
#    devset_data = json.load(devset_file_handle)
#    gt = rltk.GroundTruth()
#    for item in devset_data:
#        if None != item['afi_movie']:
#            r_imdb = ds1.get_record(item['imdb_movie'])
#            r_afi  = ds2.get_record(item['afi_movie'])
#            gt.add_positive(r_imdb.raw_object['url'], r_afi.raw_object['url'])
#    return gt

def med_name_similarity(r_wiki,r_webmd):
    ''' Decide whether a wiki medicine record and a WebMD record name the same drug.

    The wiki name is lower-cased and reduced to its longest component, splitting
    on the first of ' ', '-' or '/' that occurs in it (ties keep the earliest
    component). The records match when that component is a substring of the
    WebMD generic name or brand name, both lower-cased with spaces removed.

    Args:
        r_wiki: record exposing ``name_string``.
        r_webmd: record exposing ``genname_string`` and ``brandname_string``.

    Returns:
        True when the wiki component occurs in either WebMD name, else False.
        An empty component never matches.
    '''
    key_token = r_wiki.name_string.lower()
    generic_name = r_webmd.genname_string.lower().replace(" ", "")
    brand_name = r_webmd.brandname_string.lower().replace(" ", "")

    # Only the first separator present is used, in this priority order.
    for separator in (' ', '-', '/'):
        if separator in key_token:
            key_token = max(key_token.split(separator), key=len)
            break

    # The empty string is a substring of every string, so an empty or
    # separator-only wiki name would otherwise match every WebMD record.
    if not key_token:
        return False

    return key_token in generic_name or key_token in brand_name



# Input JSON-lines files for the two sources to be linked.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs where they exist.
wiki_file = "/Users/sharadsharma/Documents/KG/Project/WikiMedTest.jl"
webmd_file = "/Users/sharadsharma/Documents/KG/Project/WebMDTest.jl"

# One dataset per source, each with its own record class.
ds_wiki = create_dataset(input_file=wiki_file, rcrd_class=WikiRecord)
ds_webmd = create_dataset(input_file=webmd_file, rcrd_class=WebMDRecord)




bg = rltk.TokenBlockGenerator()


def _trigram_block(dataset, attribute):
    ''' Block `dataset` with `bg`, using the character trigrams of `attribute` as tokens. '''
    return bg.block(dataset, function_=lambda rec: n_gram(getattr(rec, attribute), 3))


# Wiki names against WebMD generic names.
block = bg.generate(
    _trigram_block(ds_wiki, 'name_string'),
    _trigram_block(ds_webmd, 'genname_string'),
)

# Wiki names against WebMD brand names.
block2 = bg.generate(
    _trigram_block(ds_wiki, 'name_string'),
    _trigram_block(ds_webmd, 'brandname_string'),
)

    
# Gather the distinct (wiki_id, webmd_id) candidates from both blockings.
# Each item from pairwise() is a triple whose 2nd and 3rd elements are the wiki
# and WebMD record ids; the 1st element (presumably the block key — confirm
# against the rltk docs) is dropped so that a record pair appearing under
# several keys, or in both blockings, is scored only once.
# Sorting makes the run reproducible (assumes record ids are strings).
candidate_pairs = sorted({
    (wiki_id, webmd_id)
    for blocking in (block, block2)
    for _, wiki_id, webmd_id in blocking.pairwise(ds_wiki, ds_webmd)
})

# wiki id -> set of WebMD ids whose names matched. Every wiki id that had at
# least one candidate gets an entry, possibly an empty set.
pred_dic = defaultdict(set)
for wiki_id, webmd_id in candidate_pairs:
    matches = pred_dic[wiki_id]
    r_wiki = ds_wiki.get_record(wiki_id)
    r_webmd = ds_webmd.get_record(webmd_id)
    if med_name_similarity(r_wiki, r_webmd):
        matches.add(webmd_id)

# One output row per wiki id: a matched WebMD id, or None when nothing matched.
# With several matches the smallest id is taken, so the choice is stable across
# runs instead of depending on set iteration order.
res = [
    {'wiki_url': wiki_id, 'webmd_url': min(matches) if matches else None}
    for wiki_id, matches in pred_dic.items()
]

# NOTE(review): the target has a .jl extension but is written as a single
# indented JSON array, not one object per line — confirm what consumers expect.
with open('/Users/sharadsharma/Documents/KG/Project/MedicineLinkageTest.jl','w') as op_file:
    json.dump(res, op_file, indent=2)
