In [1]:
import rltk
import csv
# Shared module-level tokenizer instance. The record classes in this notebook
# call tokenizer.tokenize() on artist name strings to build their name_tokens.
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

In [6]:
class ArtsyRecord(rltk.Record):
    """Record view over one raw Artsy artist row.

    Exposes the row's ``id`` and ``artist_name`` fields, plus a tokenized
    form of the name, as cached properties.
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)
        # NOTE(review): `name` is initialised to an empty string and is not
        # read anywhere in this notebook -- confirm whether rltk requires it.
        self.name = ''

    @rltk.cached_property
    def id(self):
        """Value of the ``id`` field of the raw row."""
        row = self.raw_object
        return row['id']

    @rltk.cached_property
    def name_string(self):
        """Value of the ``artist_name`` field of the raw row."""
        row = self.raw_object
        return row['artist_name']

    @rltk.cached_property
    def name_tokens(self):
        """Tokens of ``name_string`` from the module-level tokenizer, as a list."""
        tokens = tokenizer.tokenize(self.name_string)
        return list(tokens)

class WidewallsRecord(rltk.Record):
    """Record view over one raw Widewalls artist row.

    Exposes the row's ``id`` and ``artist_names`` fields, plus a tokenized
    form of the name, as cached properties.
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)
        # NOTE(review): `name` is initialised to an empty string and is not
        # read anywhere in this notebook -- confirm whether rltk requires it.
        self.name = ''

    @rltk.cached_property
    def id(self):
        """Value of the ``id`` field of the raw row."""
        row = self.raw_object
        return row['id']

    @rltk.cached_property
    def name_string(self):
        """Value of the ``artist_names`` field of the raw row."""
        row = self.raw_object
        return row['artist_names']

    @rltk.cached_property
    def name_tokens(self):
        """Tokens of ``name_string`` from the module-level tokenizer, as a list."""
        tokens = tokenizer.tokenize(self.name_string)
        return list(tokens)

In [11]:
# Locations of the two input CSV files (relative paths).
artsy_data_path = '../Scrape/Artsy/Data/ArtsyArtist.csv'
ww_data_path = '../Scrape/Widewalls/data/CleanedWidewallsData.csv'

# One rltk dataset per source; each is read through a CSVReader and uses the
# record class that knows that file's column names.
ds1 = rltk.Dataset(
    rltk.CSVReader(artsy_data_path),
    record_class=ArtsyRecord,
)
ds2 = rltk.Dataset(
    rltk.CSVReader(ww_data_path),
    record_class=WidewallsRecord,
)

In [12]:
# No blocking is needed for these datasets.
# Linking functions
def same_name(r1, r2):
    """Tell whether two records carry exactly the same name string.

    Args:
        r1: Record exposing a ``name_string`` attribute.
        r2: Record exposing a ``name_string`` attribute.

    Returns:
        True when both ``name_string`` values compare equal, False otherwise.
    """
    return r1.name_string == r2.name_string

def jaccard(r1, r2):
    """Return the Jaccard similarity of two records' name tokens.

    The score is ``|A & B| / |A | B|`` where A and B are the *sets* of tokens
    in each record's ``name_tokens``. Duplicate tokens are collapsed before
    counting, so a repeated token cannot inflate the union and drag the score
    below 1.0 for records with identical token sets.

    Args:
        r1: Record exposing an iterable ``name_tokens`` attribute.
        r2: Record exposing an iterable ``name_tokens`` attribute.

    Returns:
        A float in [0.0, 1.0]. When neither record has any tokens the
        similarity is undefined (0/0); 0.0 is returned so that two empty
        names are never treated as a match on token overlap alone.
    """
    tokens1 = set(r1.name_tokens)
    tokens2 = set(r2.name_tokens)
    union_size = len(tokens1 | tokens2)
    if union_size == 0:
        # Both token collections are empty: avoid ZeroDivisionError.
        return 0.0
    return len(tokens1 & tokens2) / union_size

In [13]:
def rule_based_method(r1, r2, threshold=0.8):
    """Decide whether two records refer to the same artist.

    Rules, applied in order:
      1. Identical name strings (``same_name``) are a match with confidence 1.
      2. Otherwise the pair is a match when the token Jaccard similarity
         (``jaccard``) is at least ``threshold``; the confidence is that
         similarity.
      3. Otherwise the pair is a non-match and the confidence is
         ``1 - similarity``.

    Args:
        r1: First record (needs ``name_string`` and ``name_tokens``).
        r2: Second record (needs ``name_string`` and ``name_tokens``).
        threshold: Minimum Jaccard similarity accepted as a match.
            Defaults to 0.8.

    Returns:
        A ``(is_match, confidence)`` tuple.
    """
    if same_name(r1, r2):
        return True, 1

    # Compute the similarity once; it is needed by both remaining outcomes.
    similarity = jaccard(r1, r2)
    if similarity >= threshold:
        return True, similarity
    return False, 1 - similarity

In [10]:
# Compare every Artsy record with every Widewalls record and write the pairs
# judged to be matches, with their confidence, to artist_predictions.csv.
# newline='' lets the csv writer control line endings itself, as the csv
# module documentation asks for file objects passed to csv.writer.
with open('artist_predictions.csv', mode='w', newline='') as file:
    candidate_pairs = rltk.get_record_pairs(ds1, ds2)
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # Three header names, matching the three values written for every row.
    writer.writerow(["Artsy", "Widewalls", "Confidence"])
    for r1, r2 in candidate_pairs:
        result, confidence = rule_based_method(r1, r2)
        if result:
            writer.writerow([r1.id, r2.id, confidence])