## Record Linkage Companies

In [1]:
!pip install recordlinkage --quiet
!pip install sklearn --quiet

In [2]:
import numpy as np 
import pandas as pd
import sklearn
import recordlinkage
import warnings
from recordlinkage.index import Full
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [3]:
companies = pd.read_csv("alignedSchemas/companiesAligned.csv")
companies = companies.drop('Unnamed: 0', 1)
companies.head(10)

Unnamed: 0,name,headquarter,employees,industry,website,ticker,ceo,revenue_M,marketcap_M,shareprice,id
0,GROUPON,"600 W CHICAGO AVE SUITE 400 CHICAGO, IL 60616","1,001 TO 5,000",INFORMATION TECHNOLOGY,,,,,,,1
1,E-TECHNOLOGIES,AUCKLAND,2 TO 10,INFORMATION TECHNOLOGY,,,,,,,2
2,YAKSHNA SOLUTIONS,"HERNDON, VA",11 TO 50,,,,,,,,3
3,THRU TUBING SOLUTIONS,OKLAHOMA CITY,51 TO 200,"ENERGY, MINING & UTILITIES",,,,,,,4
4,CLEVELAND CLIFFS INC,"CLEVELAND, OH","5,001 TO 10,000",PERSONAL CONSUMER SERVICES,,,,,,,5
5,SISU,LOS ANGELES,11 TO 50,RESTAURANTS & CAFES,,,,,,,6
6,TSMC,"SAN JOSE, CA",51 TO 200,MANUFACTURING,,,,,,,7
7,MATHIS BROTHERS FURNITURE,"OKLAHOMA CITY, OKLAHOMA","1,001 TO 5,000",RETAIL & WHOLESALE,,,,,,,8
8,IVYTECH SOLUTIONS INC,,,,,,,,,,9
9,VERTIV,"1050 DEARBORN DRIVE, COLUMBUS, OH 43085, UNITE...","10,000+",TELECOMMUNICATIONS,,,,,,,10


In [4]:
first_indexer = recordlinkage.Index()
first_indexer.block('name')
second_indexer = recordlinkage.Index()
second_indexer.block('headquarter')
third_indexer = recordlinkage.Index()
third_indexer.block('ceo')
fourth_indexer = recordlinkage.Index()
fourth_indexer.block('ticker')

first_pairs = first_indexer.index(companies, companies)
second_pairs = second_indexer.index(companies, companies)
third_pairs = third_indexer.index(companies, companies)
fourth_pairs = fourth_indexer.index(companies, companies)

first_frame = pd.MultiIndex.to_frame(first_pairs)
second_frame = pd.MultiIndex.to_frame(second_pairs)
third_frame = pd.MultiIndex.to_frame(third_pairs)
fourth_frame = pd.MultiIndex.to_frame(fourth_pairs)

pairs = pd.concat([first_frame, second_frame, third_frame, fourth_frame])#, third_frame, fourth_frame])
pairs = pd.MultiIndex.from_frame(pairs)

In [5]:
print(pairs)

MultiIndex([(    0,     0),
            (    0, 20812),
            (    0, 44061),
            (    0, 62588),
            (20812,     0),
            (20812, 20812),
            (20812, 44061),
            (20812, 62588),
            (44061,     0),
            (44061, 20812),
            ...
            ( 6459,  6459),
            ( 6460,  6460),
            ( 6461,  6461),
            ( 6462,  6462),
            ( 6463,  6463),
            ( 6464,  6464),
            ( 6465,  6465),
            ( 6466,  6466),
            ( 6467,  6467),
            ( 6468,  6468)],
           names=[0, 1], length=287378731)


In [6]:
print(pairs[523])

(37562, 6480)


In [7]:
train_pairs = pairs[:85000]
test_pairs = pairs[85000:]

In [8]:
companies.loc[0]

name                                                 GROUPON
headquarter    600 W CHICAGO AVE SUITE 400 CHICAGO, IL 60616
employees                                     1,001 TO 5,000
industry                              INFORMATION TECHNOLOGY
website                                                  NaN
ticker                                                   NaN
ceo                                                      NaN
revenue_M                                                NaN
marketcap_M                                              NaN
shareprice                                               NaN
id                                                         1
Name: 0, dtype: object

In [9]:
companies.loc[0]

name                                                 GROUPON
headquarter    600 W CHICAGO AVE SUITE 400 CHICAGO, IL 60616
employees                                     1,001 TO 5,000
industry                              INFORMATION TECHNOLOGY
website                                                  NaN
ticker                                                   NaN
ceo                                                      NaN
revenue_M                                                NaN
marketcap_M                                              NaN
shareprice                                               NaN
id                                                         1
Name: 0, dtype: object

In [10]:
result = companies['name'] == "GROUPON"

In [11]:
(companies.name == 'MICROSOFT').sum()

3

In [12]:
unique = companies["name"].unique()
np.size(unique)

35751

## Training 

In [13]:
compare = recordlinkage.Compare()

compare.string('name', 'name', label="name", threshold=0.80)
compare.exact('employees', 'employees', label='employees')
compare.string('website', 'website', method='jarowinkler', label="website", threshold=0.85)
compare.string('ticker', 'ticker', method='jarowinkler', label="ticker", threshold=0.95)
compare.string('ceo', 'ceo', method='jarowinkler', label="ceo", threshold=0.90)

training_features = compare.compute(train_pairs, companies, companies)
training_features['score'] = training_features.loc[:, 'name':'ceo'].sum(axis=1)

In [14]:
training_matches = training_features[training_features.sum(axis=1) > 2].reset_index()
training_matches['score'] = training_matches.loc[:, 'name':'ceo'].sum(axis=1)
toDrop = ['name', 'employees', 'website', 'ticker', 'ceo', 'score']
training_matches = training_matches.drop(toDrop, axis=1)
training_matches.head()

Unnamed: 0,0,1
0,0,0
1,44061,44061
2,1,1
3,2,2
4,3,3


In [15]:
training_matches = pd.MultiIndex.from_frame(training_matches) #matches conversion

In [16]:
training_features.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,name,employees,website,ticker,ceo,score
0,1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,1.0,1,0.0,0.0,0.0,2.0
0,20812,1.0,0,0.0,0.0,0.0,1.0
0,44061,1.0,0,0.0,0.0,0.0,1.0
0,62588,1.0,0,0.0,0.0,0.0,1.0
20812,0,1.0,0,0.0,0.0,0.0,1.0


In [17]:
len(training_features)

85000

In [18]:
len(training_matches)

48324

#### Classifier

In [19]:
classifier = recordlinkage.NaiveBayesClassifier()
classifier.fit(training_features, training_matches)

## Testing And Evaluation

In [None]:
compare = recordlinkage.Compare()

compare.string('name', 'name', label="name", threshold=0.80)
compare.exact('employees', 'employees', label='employees')
compare.string('website', 'website', method='jarowinkler', label="website", threshold=0.85)
compare.string('ticker', 'ticker', method='jarowinkler', label="ticker", threshold=0.95)
compare.string('ceo', 'ceo', method='jarowinkler', label="ceo", threshold=0.90)

test_features = compare.compute(test_pairs, companies, companies)
test_features['score'] = test_features.loc[:, 'name':'ceo'].sum(axis=1)

In [None]:
test_matches = test_features[test_features.sum(axis=1) > 2].reset_index()
test_matches['score'] = test_matches.loc[:, 'name':'ceo'].sum(axis=1)
toDrop = ['name', 'employees', 'website', 'ticker', 'ceo', 'score']
test_matches = test_matches.drop(toDrop, axis=1)
test_matches.head(10)

In [None]:
len(test_features)

In [None]:
len(test_matches)

In [None]:
test_matches

In [None]:
test_matches = pd.MultiIndex.from_frame(test_matches) #matches conversion

In [None]:
predictions = classifier.predict(test_features)

In [None]:
# return the confusion matrix
confusion_matrix = recordlinkage.confusion_matrix(test_matches, predictions, len(test_features))
print('confusion matrix')
print(confusion_matrix)

# compute the F-score for this classification
fscore = recordlinkage.fscore(confusion_matrix)
print('\n\nfscore', fscore)
recall = recordlinkage.recall(test_matches, predictions)
print('recall', recall)
precision = recordlinkage.precision(test_matches, predictions)
print('precision', precision)
accuracy = recordlinkage.accuracy(test_matches, predictions, len(test_features))
print('accuracy', accuracy)

In [None]:
print(len(predictions))

In [None]:
false_negatives = test_matches.difference(predictions)
false_negatives

In [None]:
try:
    fn_from_dfA = false_negatives[0][0]
    fn_from_dfB = false_negatives[0][1]

    display(companies[companies.index == fn_from_dfA])
    display(companies[companies.index == fn_from_dfB])
except:
    print("No False Negatives Present")

In [None]:
try: 
    fn_from_dfA = false_negatives[1][0]
    fn_from_dfB = false_negatives[1][1]

    display(companies[companies.index == fn_from_dfA])
    display(companies[companies.index == fn_from_dfB])
except: 
    print("No False Negatives Present")