### Loading Libraries

In [None]:
# %pip install Levenshtein
import numpy as np
import pandas as pd
import Levenshtein as lev
from itertools import product

### Loading Data

In [43]:
sevMil = pd.read_csv("companies_sorted-1.csv")
ft = pd.read_csv("Forbes_top 2000_2022.csv")
db = pd.read_csv("sparql_2022-10-02_13-24-07Z.csv")

### Creating ID Column for Each Dataset

In [68]:
sevMil = sevMil[['name']]
sevMil = sevMil.applymap(lambda s: s.lower() if type(s) == str else s)
sevMil['id'] =  range(len(sevMil))
sevMil['id'] = "7.1M_" + sevMil['id'].astype(str)

In [69]:
ft = ft[['organizationName']]
ft = ft.applymap(lambda s: s.lower() if type(s) == str else s)
ft['id'] =  range(len(ft))
ft['id'] = "ft_" + ft['id'].astype(str)

In [65]:
dbpedia = db[['name']]
dbpedia = dbpedia.applymap(lambda s: s.lower() if type(s) == str else s)
dbpedia['id'] =  range(len(db))
dbpedia['id'] = "db_" + dbpedia['id'].astype(str)

### Calculating Levenshtein Distance for Gold Standard

#### Between FT and DBPedia Datasets

In [70]:
# Calculate the distance (ratio)
ft_db_lev = pd.DataFrame(product(ft['organizationName'], dbpedia['name']), columns=["ft","dbpedia"])
ft_db_lev["LevScore"] = ft_db_lev.apply(lambda x: lev.ratio(x[0],x[1]), axis=1)

# Filter similiar pairs 
ft_db_lev_filtered = ft_db_lev[ft_db_lev['LevScore'] > 0.8]

# Identify matching and similiar-but-unmatching pairs 
confirmed_true_ftdb = ft_db_lev_filtered[ft_db_lev_filtered['LevScore'] == 1]
edge_cases_ftdb = ft_db_lev_filtered[ft_db_lev_filtered['LevScore'] != 1]

# Sample 200 for output
true_sample = confirmed_true_ftdb.sample(200)
false_sample = edge_cases_ftdb.sample(200)

# Goldstandard output
gold_standard_ft_db = pd.concat([true_sample,false_sample])
gold_standard_ft_db = gold_standard_ft_db.merge(ft, how='left', left_on='ft', right_on='organizationName')
gold_standard_ft_db.rename(columns={'id':'ft_id'}, inplace = True)
gold_standard_ft_db = gold_standard_ft_db.merge(dbpedia, how = 'left', left_on = 'dbpedia', right_on = 'name')
gold_standard_ft_db.rename(columns={'id':'db_id'}, inplace = True)
gold_standard_ft_db = gold_standard_ft_db[['ft_id', 'db_id', 'LevScore', 'ft', 'dbpedia']]
gold_standard_ft_db['label'] = gold_standard_ft_db.apply(lambda x: True if x['LevScore'] == 1 else False, axis = 1)
gold_standard_ft_db

Unnamed: 0,ft_id,db_id,LevScore,ft,dbpedia,label
0,ft_140,db_8386,1.000000,itochu,itochu,True
1,ft_1515,db_4891,1.000000,boston properties,boston properties,True
2,ft_331,db_8167,1.000000,lg chem,lg chem,True
3,ft_861,db_3803,1.000000,roper technologies,roper technologies,True
4,ft_1127,db_7820,1.000000,lululemon athletica,lululemon athletica,True
...,...,...,...,...,...,...
402,ft_663,db_7299,0.808511,mitsubishi chemical,mitsubishi chemical holdings,False
403,ft_574,db_9922,0.823529,lumen technologies,fmc technologies,False
404,ft_1877,db_8560,0.810811,wesco international,olam international,False
405,ft_596,db_9405,0.809524,rogers communications,fisher communications,False


#### Between FT and 7Mil datasets

In [71]:
# Calculate the distance (ratio)
ft_sevM_lev = pd.DataFrame(product(ft['organizationName'], sevMil['name']), columns=["ft","sevMil"])
ft_sevM_lev["LevScore"] = ft_sevM_lev.apply(lambda x: lev.ratio(x[0],x[1]), axis=1)

# Filter similiar pairs 
ft_sevM_lev_filtered = ft_sevM_lev[ft_sevM_lev['LevScore'] > 0.8]

# Identify matching and similiar-but-unmatching pairs
confirmed_true_ftsevM = ft_sevM_lev_filtered[ft_sevM_lev_filtered['LevScore'] == 1]
edge_cases_ftsevM = ft_sevM_lev_filtered[ft_sevM_lev_filtered['LevScore'] != 1]

# Sample 200 for output
# true_sample = confirmed_true_ftsevM.sample(200)
true_sample = confirmed_true_ftsevM
false_sample = edge_cases_ftsevM.sample(200)

# Goldstandard output
gold_standard_ftsevM = pd.concat([true_sample,false_sample])
gold_standard_ftsevM = gold_standard_ftsevM.merge(ft, how='left', left_on='ft', right_on='organizationName')
gold_standard_ftsevM.rename(columns={'id':'ft_id'}, inplace = True)
gold_standard_ftsevM = gold_standard_ftsevM.merge(sevMil, how = 'left', left_on = 'sevMil', right_on = 'name')
gold_standard_ftsevM.rename(columns={'id':'sevM_id'}, inplace = True)
gold_standard_ftsevM = gold_standard_ftsevM[['ft_id', 'sevM_id', 'LevScore', 'ft', 'sevMil']]
gold_standard_ftsevM['label'] = gold_standard_ftsevM.apply(lambda x: True if x['LevScore'] == 1 else False, axis = 1)
gold_standard_ftsevM

Unnamed: 0,ft_id,sevM_id,LevScore,ft,sevMil,label
0,ft_5,7.1M_19,1.000000,amazon,amazon,True
1,ft_6,7.1M_20,1.000000,apple,apple,True
2,ft_8,7.1M_16,1.000000,bank of america,bank of america,True
3,ft_11,7.1M_8,1.000000,microsoft,microsoft,True
4,ft_12,7.1M_1280,1.000000,bank of china,bank of china,True
...,...,...,...,...,...,...
536,ft_1281,7.1M_1794,0.903226,banco de sabadell,banco sabadell,False
537,ft_98,7.1M_86,0.923077,sanofi,sanofi,False
538,ft_1524,7.1M_1280,0.846154,bank of india,bank of china,False
539,ft_1075,7.1M_1859,0.810811,agilent technologies,tata technologies,False
