In [20]:
import rltk
import csv
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

In [35]:
class ArtsyRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @rltk.cached_property
    def id(self):
        return self.raw_object['id']

    @rltk.cached_property
    def name_string(self):
        return self.raw_object['artist_name']

    @rltk.cached_property
    def name_tokens(self):
        return list(tokenizer.tokenize(self.name_string))

class WidewallsRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @rltk.cached_property
    def id(self):
        return self.raw_object['id']

    @rltk.cached_property
    def name_string(self):
        return self.raw_object['artist_names']
    
    def artist_bio(self):
        return self.raw_object['full_bios']
    
    @rltk.cached_property
    def name_tokens(self):
        return list(tokenizer.tokenize(self.name_string))

In [36]:
artsy_data_path = '../Scrape/Artsy/Data/ArtsyArtist.csv'
ww_data_path = '../Scrape/Widewalls/data/CleanedWidewallsData.csv'

ds1 = rltk.Dataset(rltk.CSVReader(artsy_data_path),record_class=ArtsyRecord)
ds2 = rltk.Dataset(rltk.CSVReader(ww_data_path),record_class=WidewallsRecord)

In [59]:
dev_set_file = 'Linking_Truth.csv'
dev = []
with open(dev_set_file, encoding='utf-8', errors="replace") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if row[-1] != "label":
            row = row[:-1]+[int(row[-1])]
        row = row[1:]
        if len(row) <= 1:
            continue
        if line_count == 0:
            columns = row
            line_count += 1
        else:
            dev.append(row)
            
    print(f'Column names are: {", ".join(columns)}')
    print(f'Processed {len(dev)} lines.')
#print(dev)
gt = rltk.GroundTruth()
for row in dev:    
    r1 = ds1.get_record(row[0])
    r2  = ds2.get_record(row[1])
    if row[-1] == 1:
        gt.add_positive(r1.raw_object['id'], r2.raw_object['id'])
        #print(r1.raw_object['id'])
    else:
        gt.add_negative(r1.raw_object['id'], r2.raw_object['id'])

rltk.Trial(gt)

Column names are: Artsy, Widewalls, label
Processed 100 lines.


<rltk.evaluation.trial.Trial at 0x28ae40280>

In [60]:
#no blocking needed for these dataset
#linking function
def same_name(r1, r2):
    if r1.name_string == r2.name_string:
        return True
    return False

def jaccard(r1, r2):
    list1 = list(r1.name_tokens)
    list2 = list(r2.name_tokens)
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [61]:
def rule_based_method(r1, r2):
    if same_name(r1,r2):
        return True,1
    else:
        if jaccard(r1, r2) >= 0.8:
            return True, jaccard(r1, r2)
    return False,1-jaccard(r1, r2)

In [45]:
with open('data/artist_predictions.csv', mode='w') as file:
    candidate_pairs = rltk.get_record_pairs(ds1, ds2)
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["Artsy","Widewalls","artist_name"])
    for r1, r2 in candidate_pairs:
        result, confidence = rule_based_method(r1, r2)
        if result  == True:
            writer.writerow([r1.id,r2.id,r1.name_string])

In [62]:
trial = rltk.Trial(gt)
candidate_pairs = rltk.get_record_pairs(ds1,ds2, ground_truth=gt)
for r1, r2 in candidate_pairs:
    result, confidence = rule_based_method(r1, r2)
    print(r1.name,result)
    trial.add_result(r1, r2, result)

 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 True
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False
 False


In [63]:
trial.evaluate()
print('Trial statistics based on Ground-Truth from development set data:')
print(f'tp: {trial.true_positives:.06f} [{len(trial.true_positives_list)}]')
print(f'fp: {trial.false_positives:.06f} [{len(trial.false_positives_list)}]')
print(f'tn: {trial.true_negatives:.06f} [{len(trial.true_negatives_list)}]')
print(f'fn: {trial.false_negatives:.06f} [{len(trial.false_negatives_list)}]')

Trial statistics based on Ground-Truth from development set data:
tp: 1.000000 [50]
fp: 0.000000 [0]
tn: 1.000000 [50]
fn: 0.000000 [0]


In [64]:
trial.f_measure

1.0