## Entity Resolution - Record Linkage

Problem: After fetching nba players info, we found that some players' name do not match. For example, RJ Barret vs R.J Barret, Terry Rozier vs Terry Rozier III, etc.

### JSON to CSV

In [1]:
import pandas as pd

with open('../data_backup/hoopshype.json', encoding='utf-8') as inputfile:
    df1 = pd.read_json(inputfile)

In [2]:
df1.head()

Unnamed: 0,_id,label,_type,birthYear,salary,playsFor,position,thumbnail,hasPlayers
0,http://dbpedia.org/resource/Giannis_Antetokounmpo,Giannis Antetokounmpo,Person,1994.0,"$42,492,492",http://dbpedia.org/resource/Milwaukee_Bucks,F-G,https://cdn.hoopshype.com/i/5d/8b/62/giannis-a...,
1,http://dbpedia.org/resource/Jayson_Tatum,Jayson Tatum,Person,1998.0,"$30,351,780",http://dbpedia.org/resource/Boston_Celtics,F,https://cdn.hoopshype.com/i/f0/1c/c3/jayson-ta...,
2,http://dbpedia.org/resource/Nikola_Jokic,Nikola Jokic,Person,1995.0,"$33,047,803",http://dbpedia.org/resource/Denver_Nuggets,C,https://cdn.hoopshype.com/i/b1/40/fc/nikola-jo...,
3,http://dbpedia.org/resource/Jimmy_Butler,Jimmy Butler,Person,1989.0,"$37,653,300",http://dbpedia.org/resource/Miami_Heat,G,https://cdn.hoopshype.com/i/87/69/21/jimmy-but...,
4,http://dbpedia.org/resource/Joel_Embiid,Joel Embiid,Person,1994.0,"$33,616,770",http://dbpedia.org/resource/Philadelphia_76ers,C,https://cdn.hoopshype.com/i/a7/69/65/joel-embi...,


In [3]:
df1.to_csv('../data_backup/hoopshype.csv', encoding='utf-8', index=False)

In [4]:
import pandas as pd

with open('../data_backup/twoK.json', encoding='utf-8') as inputfile:
    df2 = pd.read_json(inputfile)

df2.head()

Unnamed: 0,_id,label,_type,hasRating,birthCountry,graduatedFrom,hasAlumni
0,http://dbpedia.org/resource/Giannis_Antetokounmpo,Giannis Antetokounmpo,Person,97.0,http://dbpedia.org/resource/Greece,,
1,http://dbpedia.org/resource/Jayson_Tatum,Jayson Tatum,Person,95.0,http://dbpedia.org/resource/United_States,http://dbpedia.org/resource/Duke_University,
2,http://dbpedia.org/resource/Kawhi_Leonard,Kawhi Leonard,Person,95.0,http://dbpedia.org/resource/United_States,http://dbpedia.org/resource/San_Diego_State_Un...,
3,http://dbpedia.org/resource/Joel_Embiid,Joel Embiid,Person,96.0,http://dbpedia.org/resource/Cameroon,http://dbpedia.org/resource/Kansas_University,
4,http://dbpedia.org/resource/Jimmy_Butler,Jimmy Butler,Person,93.0,http://dbpedia.org/resource/United_States,http://dbpedia.org/resource/Marquette_University,


In [5]:
df2.to_csv('../data_backup/twoK.csv', encoding='utf-8', index=False)

### Ground Truth

In [6]:
from random import randint

seen = set()
li = []

x, y = randint(0, 489), randint(0, 535)


for i in range(0, 100):
    seen.add((x, y))
    li.append([x, y])      
    x, y = randint(0, 490), randint(0, 536)
    
    while (x, y) in seen:
        x, y = randint(0, 490), randint(0, 536)



In [7]:
with open('dev.csv', 'w') as file:
    for l in li:
        file.write(str(l[0]) + ',' + str(l[1]) + ',0')
        file.write('\n')  


In [8]:
import rltk
import csv

# You can use this tokenizer in case you need to manipulate some data
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

In [9]:
'''
Hoopshype
    id: _id
    label: label
    type: _type
    birthYear: birthYear
    salary: salary
    position: position
    thumbnail: thumbnail
    hasPlayers: hasPlayers

2K
    id: _id
    label: label
    type: _type
    hasRating: hasRating
    birthCountry: birthCountry
    graduatedFrom: graduatedFrom
    hasAlumni: hasAlumni

'''

class HoopshypeRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @rltk.cached_property
    def id(self):
        return self.raw_object['_id']

    @rltk.cached_property
    def name_string(self):
        return self.raw_object['label']

    @rltk.cached_property
    def type(self):
        return self.raw_object['_type']

    @rltk.cached_property
    def birthYear(self):
        return self.raw_object['birthYear']
    
    @rltk.cached_property
    def salary(self):
        return self.raw_object['salary']

    @rltk.cached_property
    def position(self):
        return self.raw_object['position']
    
    @rltk.cached_property
    def thumbnail(self):
        return self.raw_object['thumbnail']   

    @rltk.cached_property
    def hasPlayers(self):
        return self.raw_object['hasPlayers']   

    @rltk.cached_property
    def name_tokens(self):
        return set(tokenizer.tokenize(self.name_string))

class TwoKRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @rltk.cached_property
    def id(self):
        return self.raw_object['_id']

    @rltk.cached_property
    def name_string(self):
        return self.raw_object['label']
    
    @rltk.cached_property
    def type(self):
        return self.raw_object['_type']

    @rltk.cached_property
    def rating(self):
        return self.raw_object['hasRating']
    
    @rltk.cached_property
    def birthCountry(self):
        return self.raw_object['birthCountry']
    
    @rltk.cached_property
    def graduatedFrom(self):
        return self.raw_object['graduatedFrom']

    @rltk.cached_property
    def hasAlumni(self):
        return self.raw_object['hasAlumni']
    

In [10]:
dir_ = '../data_backup/'
hoopshype_file = dir_ + 'hoopshype.csv'
twoK_file = dir_ + 'twoK.csv'

ds1 = rltk.Dataset(rltk.CSVReader(hoopshype_file),record_class=HoopshypeRecord)
ds2 = rltk.Dataset(rltk.CSVReader(twoK_file),record_class=TwoKRecord)

### Ground Truth

In [11]:
dev_set_file = dir_ + 'dev_.csv'
dev = []
with open(dev_set_file, encoding='utf-8', errors="replace") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if len(row) <= 1:
            continue
        if line_count == 0:
            columns = row
            line_count += 1
        else:
            dev.append(row)
    print(f'Column names are: {", ".join(columns)}')
    print(f'Processed {len(dev)} lines.')

Column names are: hoopshype_id, twoK_id, label
Processed 154 lines.


In [12]:
print(ds1.get_record("http://dbpedia.org/resource/Kelly_Oubre_Jr."))

None


In [13]:
i = 0

gt = rltk.GroundTruth()
for row in dev:    
    r1 = ds1.get_record(row[0])
    r2  = ds2.get_record(row[1])
    if r1 and r2:
        if row[-1] == '1':
            gt.add_positive(r1.raw_object['_id'], r2.raw_object['_id'])
        else:
            gt.add_negative(r1.raw_object['_id'], r2.raw_object['_id'])
    else:
        print(i)
        i += 1

rltk.Trial(gt)

0
1
2
3


<rltk.evaluation.trial.Trial at 0x7fb9832bf0a0>

### Blocking

In [14]:
def reduction_ratio(ds1, ds2, block):
    """
    Calculate reduction ratio based on two original datasets and the final blocks
    """
    block_pairs = len(list(rltk.get_record_pairs(ds1, ds2, block=block)))

    ds1_size = len(ds1.generate_dataframe())
    ds2_size = len(ds2.generate_dataframe())

    ratio = float((block_pairs) / (ds1_size * ds2_size))

    print('Total pairs before blocking: %s', ds1_size * ds2_size)
    print('Pairs after blocking: %s', block_pairs)
    print('Reduction Ratio: %s', ratio)
    return ratio

In [15]:
print('--- block on type ---')
bg = rltk.HashBlockGenerator()
block = bg.generate(bg.block(ds1, property_='type'),
                    bg.block(ds2, property_='type'))

print('----------------------')

reduction_ratio(ds1, ds2, block)

--- block on type ---
----------------------
Total pairs before blocking: %s 285180
Pairs after blocking: %s 201480
Reduction Ratio: %s 0.7065011571638965


0.7065011571638965

### Entity Linking

In [16]:
# threshold value to determine if we are confident the record match
MY_TRESH = 0.95 # this number is just an example, you need to change it

def title_similarity_1(r1, r2):
    s1 = r1.name_string
    s2 = r2.name_string
    
    return rltk.jaro_winkler_similarity(s1, s2)
    
def title_similarity_2(r1, r2):
    s1 = r1.name_string
    s2 = r2.name_string
    
    if s1 == s2:
        return 1
    
    return 0


def type_similarity_2(r1, r2):
    s1 = r1.type
    s2 = r2.type
    
    if s1 == s2:
        return 1
    
    return 0


# entity linkage scoring function
def rule_based_method(r1, r2, A, B, D):
    title_similar = title_similarity_1(r1, r2)
    title_exact = title_similarity_2(r1, r2)
    type_exact = type_similarity_2(r1, r2)
    
    total = A * title_similar + B * title_exact + D * type_exact
    
    # return two values: boolean if they match or not, float to determine confidence
    return total > MY_TRESH, total

In [17]:
count = 0

groundtruth = {}
true_matches_compared = 0
matches_compared = 0

for id1, id2, label in gt:
    if label == 1:
        groundtruth[id1] = id2
total_true_matches = len(groundtruth)

with open(dir_ + 'entity_linking.csv', mode='w') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["hoopshype.ID", "twoK.ID"])

    block_pairs = rltk.get_record_pairs(ds1, ds2)
    for r1, r2 in block_pairs:
        matches_compared += 1
        result, confidence = rule_based_method(r1, r2, A=0.77, B=0.13, D=0.1)

        if r1.id in groundtruth and groundtruth[r1.id] == r2.id:
            true_matches_compared += 1

        if result==1:
            count += 1
            #writer.writerow((r1.id, r2.id, confidence))
            writer.writerow((r1.id, r2.id))


print("Total: " + str(matches_compared))
print("Predicted Valid pair: " + str(count))


Total: 285180
Predicted Valid pair: 391


### Evaluation

In [18]:
def pairs_completeness_and_quality(ds1, ds2, block, gt):
    """
    Calculate pairs completeness and quality using the block & groundtruth provided

    Returns (completeness, quality)
    """

    groundtruth = {}
    true_matches_compared = 0
    matches_compared = 0

    for id1, id2, label in gt:
        if label == 1:
            groundtruth[id1] = id2
    total_true_matches = len(groundtruth)

    for key, id1, id2 in block.pairwise(ds1, ds2):
        matches_compared += 1
        if id1 in groundtruth and groundtruth[id1] == id2:
            true_matches_compared += 1

    # Recall
    completeness = float(true_matches_compared) / total_true_matches
    print('Pairs Completeness = %s / %s  = %s' %(true_matches_compared, total_true_matches, completeness))

    # Precision
    quality = float(true_matches_compared) / matches_compared
    print('Pairs quality = %s / %s = %s' %(true_matches_compared, matches_compared, quality))

    return (completeness, quality)

In [19]:
(_,_) = pairs_completeness_and_quality(ds1, ds2, block, gt)

Pairs Completeness = 50 / 50  = 1.0
Pairs quality = 50 / 201480 = 0.00024816358943815766


In [20]:
count = 0

trial = rltk.Trial(gt)
candidate_pairs = rltk.get_record_pairs(ds1, ds2, ground_truth=gt)
for r1, r2 in candidate_pairs:
    count += 1
    result, confidence = rule_based_method(r1, r2, A=0.77, B=0.13, D=0.1)
    trial.add_result(r1, r2, result, confidence)

print('---------------')
print('Total pair compared: %s' % count)
trial.evaluate()
print('Trial statistics based on Ground-Truth from development set data:')
print(f'tp: {trial.true_positives:.06f} [{len(trial.true_positives_list)}]')
print(f'fp: {trial.false_positives:.06f} [{len(trial.false_positives_list)}]')
print(f'tn: {trial.true_negatives:.06f} [{len(trial.true_negatives_list)}]')
print(f'fn: {trial.false_negatives:.06f} [{len(trial.false_negatives_list)}]')

print('---------------')
print('F-score: %s' % trial.f_measure)

---------------
Total pair compared: 150
Trial statistics based on Ground-Truth from development set data:
tp: 0.880000 [44]
fp: 0.000000 [0]
tn: 1.000000 [100]
fn: 0.120000 [6]
---------------
F-score: 0.9361702127659575
