# Deep Matcher - Record Linkage

In [1]:
!pip install py_entitymatching #--quiet
!pip install deepmatcher #--quiet





In [2]:
import sys
import py_entitymatching as em
import deepmatcher as dm
import os

In [3]:
# Load the two tables.
A = em.read_csv_metadata("alignedSchemas/companiesAligned.csv", key='id')
B = em.read_csv_metadata("alignedSchemas/companiesAligned.csv", key='id')



In [4]:
# Basic information about the tables.
print('Number of tuples in A: ' + str(len(A)))
print('Number of tuples in B: ' + str(len(B)))
print('Number of tuples in A X B (i.e the cartesian product): ' + str(len(A)*len(B)))

Number of tuples in A: 64608
Number of tuples in B: 64608
Number of tuples in A X B (i.e the cartesian product): 4174193664


In [5]:
A.head()

Unnamed: 0.1,Unnamed: 0,name,headquarter,employees,industry,website,ticker,ceo,revenue_M,marketcap_M,shareprice,id
0,0,GROUPON,"600 W CHICAGO AVE SUITE 400 CHICAGO, IL 60616","1,001 TO 5,000",INFORMATION TECHNOLOGY,,,,,,,1
1,1,E-TECHNOLOGIES,AUCKLAND,2 TO 10,INFORMATION TECHNOLOGY,,,,,,,2
2,2,YAKSHNA SOLUTIONS,"HERNDON, VA",11 TO 50,,,,,,,,3
3,3,THRU TUBING SOLUTIONS,OKLAHOMA CITY,51 TO 200,"ENERGY, MINING & UTILITIES",,,,,,,4
4,4,CLEVELAND CLIFFS INC,"CLEVELAND, OH","5,001 TO 10,000",PERSONAL CONSUMER SERVICES,,,,,,,5


In [6]:
B.head()

Unnamed: 0.1,Unnamed: 0,name,headquarter,employees,industry,website,ticker,ceo,revenue_M,marketcap_M,shareprice,id
0,0,GROUPON,"600 W CHICAGO AVE SUITE 400 CHICAGO, IL 60616","1,001 TO 5,000",INFORMATION TECHNOLOGY,,,,,,,1
1,1,E-TECHNOLOGIES,AUCKLAND,2 TO 10,INFORMATION TECHNOLOGY,,,,,,,2
2,2,YAKSHNA SOLUTIONS,"HERNDON, VA",11 TO 50,,,,,,,,3
3,3,THRU TUBING SOLUTIONS,OKLAHOMA CITY,51 TO 200,"ENERGY, MINING & UTILITIES",,,,,,,4
4,4,CLEVELAND CLIFFS INC,"CLEVELAND, OH","5,001 TO 10,000",PERSONAL CONSUMER SERVICES,,,,,,,5


In [7]:
# Create an overlap blocker in Magellan and apply it to A and B to get the candidate set K1 which is in the format of 
# a dataframe. The "l_out_attrs" and "r_out_attrs" parameters indicate the columns that will be included in K1 from A
# and B respectively.
ob = em.OverlapBlocker()
K1 = ob.block_tables(A, B, 'name', 'name', 
                     l_output_attrs=['name', 'headquarter', 'employees', 'industry', 'website', 'ticker', 'ceo', 'revenue_M', 'marketcap_M', 'shareprice'], 
                     r_output_attrs=['name', 'headquarter', 'employees', 'industry', 'website', 'ticker', 'ceo', 'revenue_M', 'marketcap_M', 'shareprice'], 
                     overlap_size=2)


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:24


In [8]:
K1.head()

Unnamed: 0,_id,ltable_id,rtable_id,ltable_name,ltable_headquarter,ltable_employees,ltable_industry,ltable_website,ltable_ticker,ltable_ceo,...,rtable_name,rtable_headquarter,rtable_employees,rtable_industry,rtable_website,rtable_ticker,rtable_ceo,rtable_revenue_M,rtable_marketcap_M,rtable_shareprice
0,0,3,3,YAKSHNA SOLUTIONS,"HERNDON, VA",11 TO 50,,,,,...,YAKSHNA SOLUTIONS,"HERNDON, VA",11 TO 50,,,,,,,
1,1,4,4,THRU TUBING SOLUTIONS,OKLAHOMA CITY,51 TO 200,"ENERGY, MINING & UTILITIES",,,,...,THRU TUBING SOLUTIONS,OKLAHOMA CITY,51 TO 200,"ENERGY, MINING & UTILITIES",,,,,,
2,2,5,5,CLEVELAND CLIFFS INC,"CLEVELAND, OH","5,001 TO 10,000",PERSONAL CONSUMER SERVICES,,,,...,CLEVELAND CLIFFS INC,"CLEVELAND, OH","5,001 TO 10,000",PERSONAL CONSUMER SERVICES,,,,,,
3,3,8,8,MATHIS BROTHERS FURNITURE,"OKLAHOMA CITY, OKLAHOMA","1,001 TO 5,000",RETAIL & WHOLESALE,,,,...,MATHIS BROTHERS FURNITURE,"OKLAHOMA CITY, OKLAHOMA","1,001 TO 5,000",RETAIL & WHOLESALE,,,,,,
4,4,9,9,IVYTECH SOLUTIONS INC,,,,,,,...,IVYTECH SOLUTIONS INC,,,,,,,,,


In [9]:
len(K1)

3376732

In [10]:
# Create a new overlap blocker to remove pairs from K1 that have no common word in "Artist_Name".
K1 = ob.block_candset(K1, 'employees', 'employees', overlap_size=1)
len(K1)

#compare.string('name', 'name', label="name")
#compare.exact('employees', 'employees', label='employees')
#compare.string('website', 'website', method='jarowinkler', label="website")
#compare.string('ticker', 'ticker', method='jarowinkler', label="ticker")
#compare.string('ceo', 'ceo', method='jarowinkler', label="ceo")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table[overlap_attr] = values
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:13


258433

In [11]:
K1.head()

Unnamed: 0,_id,ltable_id,rtable_id,ltable_name,ltable_headquarter,ltable_employees,ltable_industry,ltable_website,ltable_ticker,ltable_ceo,...,rtable_name,rtable_headquarter,rtable_employees,rtable_industry,rtable_website,rtable_ticker,rtable_ceo,rtable_revenue_M,rtable_marketcap_M,rtable_shareprice
0,0,3,3,YAKSHNA SOLUTIONS,"HERNDON, VA",11 TO 50,,,,,...,YAKSHNA SOLUTIONS,"HERNDON, VA",11 TO 50,,,,,,,
1,1,4,4,THRU TUBING SOLUTIONS,OKLAHOMA CITY,51 TO 200,"ENERGY, MINING & UTILITIES",,,,...,THRU TUBING SOLUTIONS,OKLAHOMA CITY,51 TO 200,"ENERGY, MINING & UTILITIES",,,,,,
2,2,5,5,CLEVELAND CLIFFS INC,"CLEVELAND, OH","5,001 TO 10,000",PERSONAL CONSUMER SERVICES,,,,...,CLEVELAND CLIFFS INC,"CLEVELAND, OH","5,001 TO 10,000",PERSONAL CONSUMER SERVICES,,,,,,
3,3,8,8,MATHIS BROTHERS FURNITURE,"OKLAHOMA CITY, OKLAHOMA","1,001 TO 5,000",RETAIL & WHOLESALE,,,,...,MATHIS BROTHERS FURNITURE,"OKLAHOMA CITY, OKLAHOMA","1,001 TO 5,000",RETAIL & WHOLESALE,,,,,,
152,152,17,17,CLEVELAND STATE UNIVERSITY,CLEVELAND,"501 TO 1,000",EDUCATION,,,,...,CLEVELAND STATE UNIVERSITY,CLEVELAND,"501 TO 1,000",EDUCATION,,,,,,


In [17]:
# Take a sample of 500 pairs from the candidate set.
sample = em.sample_table(K1, 20)

In [19]:
# Label the sample S in a GUI. Enter 1 for match and 0 for non-match.
gold = em.label_table(sample, 'gold')



AttributeError: 'DataFrame' object has no attribute 'set_value'

In [None]:
result.to_csv("alignedSchemas/gold.csv")