### Loading Libraries

In [1]:
%pip install Levenshtein
import numpy as np
import pandas as pd
import Levenshtein as lev
from itertools import product

Collecting Levenshtein
  Downloading Levenshtein-0.20.8-cp310-cp310-win_amd64.whl (100 kB)
     -------------------------------------- 100.5/100.5 kB 5.6 MB/s eta 0:00:00
Collecting rapidfuzz<3.0.0,>=2.3.0
  Downloading rapidfuzz-2.13.0-cp310-cp310-win_amd64.whl (1.0 MB)
     ---------------------------------------- 1.0/1.0 MB 16.5 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.20.8 rapidfuzz-2.13.0
Note: you may need to restart the kernel to use updated packages.


### Loading Data

In [10]:
sevMil = pd.read_csv("companies_sorted-1.csv")
ft = pd.read_csv("Forbes_top 2000_2022.csv")
db = pd.read_csv("mapping/dbpedia/data/dbpedia_preprocessed_final.csv")

### Creating ID Column for Each Dataset

In [12]:
sevMil = sevMil[['name']]
sevMil = sevMil.applymap(lambda s: s.lower() if type(s) == str else s)
sevMil['id'] =  range(len(sevMil))
sevMil['id'] = "7.1M_" + sevMil['id'].astype(str)

In [13]:
ft = ft[['organizationName']]
ft = ft.applymap(lambda s: s.lower() if type(s) == str else s)
ft['id'] =  range(len(ft))
ft['id'] = "ft_" + ft['id'].astype(str)

In [14]:
dbpedia = db[['name']]
dbpedia = dbpedia.applymap(lambda s: s.lower() if type(s) == str else s)
dbpedia['id'] =  range(len(db))
dbpedia['id'] = "db_" + dbpedia['id'].astype(str)

### Calculating Levenshtein Distance for Gold Standard

#### Between FT and DBPedia Datasets

In [15]:
# Calculate the distance (ratio)
ft_db_lev = pd.DataFrame(product(ft['organizationName'], dbpedia['name']), columns=["ft","dbpedia"])
ft_db_lev["LevScore"] = ft_db_lev.apply(lambda x: lev.ratio(x[0],x[1]), axis=1)

# Filter similiar pairs 
ft_db_lev_filtered = ft_db_lev[ft_db_lev['LevScore'] > 0.8]

# Identify matching and similiar-but-unmatching pairs 
confirmed_true_ftdb = ft_db_lev_filtered[ft_db_lev_filtered['LevScore'] == 1]
edge_cases_ftdb = ft_db_lev_filtered[ft_db_lev_filtered['LevScore'] != 1]

# Sample 200 for output
true_sample = confirmed_true_ftdb.sample(200, random_state=1)
false_sample = edge_cases_ftdb.sample(200, random_state=1)

# Goldstandard output
gold_standard_ft_db = pd.concat([true_sample,false_sample])
gold_standard_ft_db = gold_standard_ft_db.merge(ft, how='left', left_on='ft', right_on='organizationName')
gold_standard_ft_db.rename(columns={'id':'ft_id'}, inplace = True)
gold_standard_ft_db = gold_standard_ft_db.merge(dbpedia, how = 'left', left_on = 'dbpedia', right_on = 'name')
gold_standard_ft_db.rename(columns={'id':'db_id'}, inplace = True)
gold_standard_ft_db = gold_standard_ft_db[['ft_id', 'db_id', 'LevScore', 'ft', 'dbpedia']]
gold_standard_ft_db['label'] = gold_standard_ft_db.apply(lambda x: True if x['LevScore'] == 1 else False, axis = 1) 
gold_standard_ft_db

Unnamed: 0,ft_id,db_id,LevScore,ft,dbpedia,label
0,ft_224,db_1194,1.000000,société générale,société générale,True
1,ft_1710,db_2189,1.000000,aramark,aramark,True
2,ft_1050,db_3027,1.000000,bankinter,bankinter,True
3,ft_1194,db_811,1.000000,suning.com,suning.com,True
4,ft_1194,db_3352,1.000000,suning.com,suning.com,True
...,...,...,...,...,...,...
481,ft_360,db_6408,0.820513,wilmar international,tower international,False
482,ft_47,db_1274,0.857143,axa group,maxima group,False
483,ft_767,db_1218,0.844444,china reinsurance group,vienna insurance group,False
484,ft_1163,db_3412,0.896552,cj corporation,ncr corporation,False


In [16]:
gold_standard_ft_db[gold_standard_ft_db["LevScore"] > 0.9]

Unnamed: 0,ft_id,db_id,LevScore,ft,dbpedia,label
0,ft_224,db_1194,1.000000,société générale,société générale,True
1,ft_1710,db_2189,1.000000,aramark,aramark,True
2,ft_1050,db_3027,1.000000,bankinter,bankinter,True
3,ft_1194,db_811,1.000000,suning.com,suning.com,True
4,ft_1194,db_3352,1.000000,suning.com,suning.com,True
...,...,...,...,...,...,...
388,ft_1052,db_299,0.962963,c.h. robinson,c. h. robinson,False
409,ft_1436,db_1076,0.903226,thor industries,toray industries,False
410,ft_1436,db_4846,0.903226,thor industries,toray industries,False
423,ft_1980,db_5134,0.918919,wt microelectronics,stmicroelectronics,False


#### Between FT and 7Mil datasets

In [17]:
# Calculate the distance (ratio)
ft_sevM_lev = pd.DataFrame(product(ft['organizationName'], sevMil['name']), columns=["ft","sevMil"])
ft_sevM_lev["LevScore"] = ft_sevM_lev.apply(lambda x: lev.ratio(x[0],x[1]), axis=1)

# Filter similiar pairs 
ft_sevM_lev_filtered = ft_sevM_lev[ft_sevM_lev['LevScore'] > 0.8]

# Identify matching and similiar-but-unmatching pairs
confirmed_true_ftsevM = ft_sevM_lev_filtered[ft_sevM_lev_filtered['LevScore'] == 1]
edge_cases_ftsevM = ft_sevM_lev_filtered[ft_sevM_lev_filtered['LevScore'] != 1]

# Sample 200 for output
# true_sample = confirmed_true_ftsevM.sample(200)
true_sample = confirmed_true_ftsevM.sample(200, random_state=1)
false_sample = edge_cases_ftsevM.sample(200, random_state=1)

# Goldstandard output
gold_standard_ftsevM = pd.concat([true_sample,false_sample])
gold_standard_ftsevM = gold_standard_ftsevM.merge(ft, how='left', left_on='ft', right_on='organizationName')
gold_standard_ftsevM.rename(columns={'id':'ft_id'}, inplace = True)
gold_standard_ftsevM = gold_standard_ftsevM.merge(sevMil, how = 'left', left_on = 'sevMil', right_on = 'name')
gold_standard_ftsevM.rename(columns={'id':'sevM_id'}, inplace = True)
gold_standard_ftsevM = gold_standard_ftsevM[['ft_id', 'sevM_id', 'LevScore', 'ft', 'sevMil']]
gold_standard_ftsevM['label'] = gold_standard_ftsevM.apply(lambda x: True if x['LevScore'] == 1 else False, axis = 1)
gold_standard_ftsevM

Unnamed: 0,ft_id,sevM_id,LevScore,ft,sevMil,label
0,ft_190,7.1M_732,1.000000,micron technology,micron technology,True
1,ft_127,7.1M_508,1.000000,conocophillips,conocophillips,True
2,ft_998,7.1M_1623,1.000000,molina healthcare,molina healthcare,True
3,ft_1221,7.1M_1627,1.000000,chipotle mexican grill,chipotle mexican grill,True
4,ft_547,7.1M_1242,1.000000,analog devices,analog devices,True
...,...,...,...,...,...,...
399,ft_552,7.1M_111,0.857143,aon,avon,False
400,ft_360,7.1M_130,0.809524,wilmar international,marriott international,False
401,ft_1163,7.1M_330,0.812500,cj corporation,cerner corporation,False
402,ft_1344,7.1M_240,0.812500,compal electronics,lg electronics,False


In [18]:
gold_standard_ftsevM[gold_standard_ftsevM["LevScore"] > 0.9]

Unnamed: 0,ft_id,sevM_id,LevScore,ft,sevMil,label
0,ft_190,7.1M_732,1.000000,micron technology,micron technology,True
1,ft_127,7.1M_508,1.000000,conocophillips,conocophillips,True
2,ft_998,7.1M_1623,1.000000,molina healthcare,molina healthcare,True
3,ft_1221,7.1M_1627,1.000000,chipotle mexican grill,chipotle mexican grill,True
4,ft_547,7.1M_1242,1.000000,analog devices,analog devices,True
...,...,...,...,...,...,...
377,ft_359,7.1M_762,0.904762,kraft heinz company,the kraft heinz company,False
379,ft_1051,7.1M_506,0.974359,teva pharmaceutical,teva pharmaceuticals,False
384,ft_858,7.1M_1633,0.960000,t rowe price,t. rowe price,False
393,ft_501,7.1M_431,0.937500,sherwin-williams,sherwin williams,False


Printing Output

In [None]:
gold_standard_ftsevM.drop(['LevScore','ft','sevMil'], axis=1, inplace=True) 
gold_standard_ft_db.drop(['LevScore','ft','dbpedia'], axis=1, inplace=True)

In [21]:
gold_standard_ft_db.to_csv('data/goldstandard/GS_ft_db.csv', index=False) 
gold_standard_ftsevM.to_csv('data/goldstandard/GS_ft_sevM.csv', index=False)