### Loading Libraries

In [1]:
%pip install Levenshtein
import numpy as np
import pandas as pd
import Levenshtein as lev
from itertools import product

Note: you may need to restart the kernel to use updated packages.


### Loading Data

In [2]:
# Read the three raw company tables: the 7.1M-companies extract, the Forbes
# Top 2000 (2022) list and the pre-processed DBPedia export.
sevMil, ft, db = (
    pd.read_csv(csv_path)
    for csv_path in (
        "companies_sorted-1.csv",
        "Forbes_top 2000_2022.csv",
        "mapping/dbpedia/data/dbpedia_preprocessed_final.csv",
    )
)

### Creating ID Column for Each Dataset

In [3]:
# Keep only the company name, lower-cased so that later string comparisons are
# case-insensitive. Non-string cells (e.g. NaN for a missing name) are passed
# through unchanged.
# Series.map replaces DataFrame.applymap, which is deprecated in recent pandas.
sevMil = sevMil[['name']].copy()
sevMil['name'] = sevMil['name'].map(lambda s: s.lower() if isinstance(s, str) else s)
# Synthetic identifier: "7.1m_" followed by the 0-based row position.
sevMil['id'] = ["7.1m_" + str(pos) for pos in range(len(sevMil))]

In [4]:
# Keep only the organisation name, lower-cased so that later string comparisons
# are case-insensitive. Non-string cells (e.g. NaN) are passed through unchanged.
# Series.map replaces DataFrame.applymap, which is deprecated in recent pandas.
ft = ft[['organizationName']].copy()
ft['organizationName'] = ft['organizationName'].map(lambda s: s.lower() if isinstance(s, str) else s)
# Synthetic identifier: "ft_id_" followed by the 0-based row position.
ft['id'] = ["ft_id_" + str(pos) for pos in range(len(ft))]

In [5]:
# Work on a copy of the name column so the loaded `db` frame stays untouched.
# Names are lower-cased for case-insensitive comparison; non-string cells
# (e.g. NaN) are passed through unchanged.
# Series.map replaces DataFrame.applymap, which is deprecated in recent pandas.
dbpedia = db[['name']].copy()
dbpedia['name'] = dbpedia['name'].map(lambda s: s.lower() if isinstance(s, str) else s)
# Synthetic identifier: "dbpedia_" followed by the 0-based row position.
dbpedia['id'] = ["dbpedia_" + str(pos) for pos in range(len(dbpedia))]

### Calculating Levenshtein Distance for Gold Standard

#### Between FT and DBPedia Datasets

In [None]:
# Build the FT <-> DBPedia gold standard from pairwise Levenshtein ratios.

# Distinct names only: de-duplicating before the cross product yields the same
# pairs (in the same order) as de-duplicating the product afterwards, with far
# fewer rows materialised. Non-string entries (e.g. NaN for a missing name) are
# left out because they cannot form a meaningful pair.
# Duplicate names map to several ids; the plan is to deal with that in Java.
ft_names = [s for s in ft['organizationName'].drop_duplicates() if isinstance(s, str)]
db_names = [s for s in dbpedia['name'].drop_duplicates() if isinstance(s, str)]

# Calculate the distance (ratio) for every FT x DBPedia name pair.
ft_db_lev = pd.DataFrame(product(ft_names, db_names), columns=["ft", "dbpedia"])
# Iterating the two columns directly avoids the slow row-wise apply and the
# deprecated positional x[0] / x[1] lookups on a string-labelled row.
ft_db_lev["LevScore"] = [
    lev.ratio(ft_name, db_name)
    for ft_name, db_name in zip(ft_db_lev["ft"], ft_db_lev["dbpedia"])
]

# Filter similar pairs (ratio > 0.8) and clearly different pairs (ratio < 0.3)
ft_db_lev_filtered = ft_db_lev[ft_db_lev['LevScore'] > 0.8]
ft_db_lev_false = ft_db_lev[ft_db_lev['LevScore'] < 0.3]

# Identify matching and similar-but-unmatching pairs
confirmed_true_ftdb = ft_db_lev_filtered[ft_db_lev_filtered['LevScore'] == 1]
edge_cases_ftdb = ft_db_lev_filtered[ft_db_lev_filtered['LevScore'] != 1]
false_cases_ftdb = ft_db_lev_false

# Sample 100 matching, 150 corner and 250 non-matching cases. Each size is
# capped at the number of available rows so that a small bucket does not make
# DataFrame.sample raise.
true_sample = confirmed_true_ftdb.sample(min(100, len(confirmed_true_ftdb)), random_state=1)
edge_sample = edge_cases_ftdb.sample(min(150, len(edge_cases_ftdb)), random_state=1)
false_sample = false_cases_ftdb.sample(min(250, len(false_cases_ftdb)), random_state=1)

# Goldstandard output: attach the ids of both sides, label exact matches as
# True and keep one row per name pair (the first id when a name is duplicated).
gold_standard_ft_db = (
    pd.concat([true_sample, edge_sample, false_sample])
    .merge(ft.rename(columns={'organizationName': 'ft', 'id': 'ft_id'}), how='left', on='ft')
    .merge(dbpedia.rename(columns={'name': 'dbpedia', 'id': 'db_id'}), how='left', on='dbpedia')
    .loc[:, ['ft_id', 'db_id', 'LevScore', 'ft', 'dbpedia']]
    .assign(label=lambda pairs: pairs['LevScore'] == 1)
    .drop_duplicates(subset=['ft', 'dbpedia'], keep='first')
)
gold_standard_ft_db

In [None]:
# Summary statistics of the FT/DBPedia gold standard; include='all' also covers the non-numeric columns.
gold_standard_ft_db.describe(include = 'all')

#### Between FT and 7Mil datasets

In [None]:
# Calculate the distance (ratio)
ft_sevM_lev = pd.DataFrame(product(ft['organizationName'], sevMil['name']), columns=["ft","sevMil"])
# Since there are duplicate name, plan to deal with it in Java
ft_sevM_lev = ft_sevM_lev.drop_duplicates()
ft_sevM_lev["LevScore"] = ft_sevM_lev.apply(lambda x: lev.ratio(x[0],x[1]), axis=1)

# Filter similiar pairs 
ft_sevM_lev_filtered = ft_sevM_lev[ft_sevM_lev['LevScore'] > 0.8]
ft_sevM_lev_false = ft_sevM_lev[ft_sevM_lev['LevScore'] < 0.3]

# Identify matching and similiar-but-unmatching pairs
confirmed_true_ftsevM = ft_sevM_lev_filtered[ft_sevM_lev_filtered['LevScore'] == 1]
edge_cases_ftsevM = ft_sevM_lev_filtered[ft_sevM_lev_filtered['LevScore'] != 1]
false_cases_ftsevM = ft_sevM_lev_false

# Sample 100 for matching cases, 150 for corner cases, 250 for non-matching cases
true_sample = confirmed_true_ftsevM.sample(100, random_state=1)
edge_sample = edge_cases_ftsevM.sample(150, random_state=1)
false_sample = false_cases_ftsevM.sample(250, random_state=1)

# Goldstandard output
gold_standard_ftsevM = pd.concat([true_sample, edge_sample, false_sample])
gold_standard_ftsevM = gold_standard_ftsevM.merge(ft, how='left', left_on='ft', right_on='organizationName')
gold_standard_ftsevM.rename(columns={'id':'ft_id'}, inplace = True)
gold_standard_ftsevM = gold_standard_ftsevM.merge(sevMil, how = 'left', left_on = 'sevMil', right_on = 'name')
gold_standard_ftsevM.rename(columns={'id':'sevM_id'}, inplace = True)
gold_standard_ftsevM = gold_standard_ftsevM[['ft_id', 'sevM_id', 'LevScore', 'ft', 'sevMil']]
gold_standard_ftsevM['label'] = gold_standard_ftsevM.apply(lambda x: True if x['LevScore'] == 1 else False, axis = 1)
gold_standard_ftsevM = gold_standard_ftsevM.drop_duplicates(subset=['ft', 'sevMil'], keep='first')
gold_standard_ftsevM

In [None]:
# Summary statistics of the FT/7Mil gold standard; include='all' also covers the non-numeric columns.
gold_standard_ftsevM.describe(include = 'all')

Writing the sampled pairs to CSV for manual checking

In [None]:
# Dump both FT gold standards, highest Levenshtein ratio first, so the
# candidate pairs can be reviewed by hand.
gold_standard_ft_db.sort_values(by='LevScore', ascending=False).to_csv(
    'ft_db_check.csv', index=False
)
gold_standard_ftsevM.sort_values(by='LevScore', ascending=False).to_csv(
    'ft_sevM_check.csv', index=False
)

Printing Output

#### Between DBPedia and 7Mil datasets

In [8]:
# Build the DBPedia <-> 7Mil gold standard from pairwise Levenshtein ratios.

# Distinct names only: de-duplicating before the cross product yields the same
# pairs (in the same order) as de-duplicating the product afterwards, with far
# fewer rows materialised. Non-string entries (e.g. NaN for a missing name) are
# left out because they cannot form a meaningful pair.
# Duplicate names map to several ids; the plan is to deal with that in Java.
db_names = [s for s in dbpedia['name'].drop_duplicates() if isinstance(s, str)]
sevM_names = [s for s in sevMil['name'].drop_duplicates() if isinstance(s, str)]

# Calculate the distance (ratio) for every DBPedia x 7Mil name pair.
db_sevm_lev = pd.DataFrame(product(db_names, sevM_names), columns=["db", "sevMil"])
# Iterating the two columns directly avoids the slow row-wise apply and the
# deprecated positional x[0] / x[1] lookups on a string-labelled row.
db_sevm_lev["LevScore"] = [
    lev.ratio(db_name, sevM_name)
    for db_name, sevM_name in zip(db_sevm_lev["db"], db_sevm_lev["sevMil"])
]

# Filter similar pairs (ratio > 0.8) and clearly different pairs (ratio < 0.3)
db_sevm_lev_filtered = db_sevm_lev[db_sevm_lev['LevScore'] > 0.8]
db_sevm_lev_false = db_sevm_lev[db_sevm_lev['LevScore'] < 0.3]

# Identify matching and similar-but-unmatching pairs
confirmed_true_dbsevm = db_sevm_lev_filtered[db_sevm_lev_filtered['LevScore'] == 1]
edge_cases_dbsevm = db_sevm_lev_filtered[db_sevm_lev_filtered['LevScore'] != 1]
false_cases_dbsevm = db_sevm_lev_false

# Sample 100 matching, 150 corner and 250 non-matching cases. Each size is
# capped at the number of available rows so that a small bucket does not make
# DataFrame.sample raise.
true_sample = confirmed_true_dbsevm.sample(min(100, len(confirmed_true_dbsevm)), random_state=1)
edge_sample = edge_cases_dbsevm.sample(min(150, len(edge_cases_dbsevm)), random_state=1)
false_sample = false_cases_dbsevm.sample(min(250, len(false_cases_dbsevm)), random_state=1)

# Goldstandard output: attach the ids of both sides, label exact matches as
# True and keep one row per name pair (the first id when a name is duplicated).
gold_standard_dbsevm = (
    pd.concat([true_sample, edge_sample, false_sample])
    .merge(dbpedia.rename(columns={'name': 'db', 'id': 'db_id'}), how='left', on='db')
    .merge(sevMil.rename(columns={'name': 'sevMil', 'id': 'sevM_id'}), how='left', on='sevMil')
    .loc[:, ['db_id', 'sevM_id', 'LevScore', 'db', 'sevMil']]
    .assign(label=lambda pairs: pairs['LevScore'] == 1)
    .drop_duplicates(subset=['db', 'sevMil'], keep='first')
)
gold_standard_dbsevm

Unnamed: 0,db_id,sevM_id,LevScore,db,sevMil,label
0,dbpedia_1017,7.1m_151,1.000000,the coca-cola company,the coca-cola company,True
1,dbpedia_2240,7.1m_1732,1.000000,celestica,celestica,True
2,dbpedia_4796,7.1m_1507,1.000000,american electric power,american electric power,True
3,dbpedia_3646,7.1m_1257,1.000000,family dollar,family dollar,True
4,dbpedia_3281,7.1m_1736,1.000000,safran aircraft engines,safran aircraft engines,True
...,...,...,...,...,...,...
558,dbpedia_1410,7.1m_1541,0.216216,abu dhabi national oil company,chili's,False
559,dbpedia_6758,7.1m_1377,0.256410,tide (transportation company),lego group,False
560,dbpedia_5584,7.1m_1620,0.133333,o boticário,sars,False
561,dbpedia_751,7.1m_455,0.100000,siderperu,tata motors,False


In [10]:
# Summary statistics of the DBPedia/7Mil gold standard; include='all' also covers the non-numeric columns.
gold_standard_dbsevm.describe(include = 'all')

Unnamed: 0,db_id,sevM_id,LevScore,db,sevMil,label
count,500,500,500.0,500,500,500
unique,465,383,,465,383,2
top,dbpedia_6049,7.1m_1699,,lsc communications,sabre corporation,False
freq,3,9,,3,9,400
mean,,,0.550459,,,
std,,,0.360218,,,
min,,,0.0,,,
25%,,,0.213346,,,
50%,,,0.552403,,,
75%,,,0.858766,,,


In [11]:
# Dump the DBPedia/7Mil gold standard, highest Levenshtein ratio first, so the
# candidate pairs can be reviewed by hand.
gold_standard_dbsevm.sort_values(by='LevScore', ascending=False).to_csv(
    'db_sevM_check.csv', index=False
)

In [None]:
# Strip the review-only columns so that only the two ids and the label remain.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
gold_standard_ftsevM = gold_standard_ftsevM.drop(columns=['LevScore', 'ft', 'sevMil'], errors='ignore')
gold_standard_ft_db = gold_standard_ft_db.drop(columns=['LevScore', 'ft', 'dbpedia'], errors='ignore')

In [None]:
# Persist the final FT gold standards (without the DataFrame index).
# to_csv does not create missing directories, so data/goldstandard/ must exist.
# NOTE(review): the DBPedia/7Mil gold standard is not exported in the visible cells - confirm whether that is intended.
gold_standard_ft_db.to_csv(
    'data/goldstandard/GS_ft_db.csv', index=False
)
gold_standard_ftsevM.to_csv(
    'data/goldstandard/GS_ft_sevM.csv', index=False
)