## Record Linkage Companies

In [1]:
!pip install recordlinkage --quiet
!pip install sklearn --quiet

In [2]:
import numpy as np 
import pandas as pd
import sklearn
import recordlinkage
import warnings
from recordlinkage.index import Full
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [3]:
companies = pd.read_csv("alignedSchemas/companiesAligned.csv")
companies = companies.drop('Unnamed: 0', 1)
companies.head(10)

Unnamed: 0,name,headquarter,employees,industry,website,ticker,ceo,revenue_M,marketcap_M,shareprice,id
0,GROUPON,"600 W CHICAGO AVE SUITE 400 CHICAGO, IL 60616","1,001 TO 5,000",INFORMATION TECHNOLOGY,,,,,,,1
1,E-TECHNOLOGIES,AUCKLAND,2 TO 10,INFORMATION TECHNOLOGY,,,,,,,2
2,YAKSHNA SOLUTIONS,"HERNDON, VA",11 TO 50,,,,,,,,3
3,THRU TUBING SOLUTIONS,OKLAHOMA CITY,51 TO 200,"ENERGY, MINING & UTILITIES",,,,,,,4
4,CLEVELAND CLIFFS INC,"CLEVELAND, OH","5,001 TO 10,000",PERSONAL CONSUMER SERVICES,,,,,,,5
5,SISU,LOS ANGELES,11 TO 50,RESTAURANTS & CAFES,,,,,,,6
6,TSMC,"SAN JOSE, CA",51 TO 200,MANUFACTURING,,,,,,,7
7,MATHIS BROTHERS FURNITURE,"OKLAHOMA CITY, OKLAHOMA","1,001 TO 5,000",RETAIL & WHOLESALE,,,,,,,8
8,IVYTECH SOLUTIONS INC,,,,,,,,,,9
9,VERTIV,"1050 DEARBORN DRIVE, COLUMBUS, OH 43085, UNITE...","10,000+",TELECOMMUNICATIONS,,,,,,,10


In [4]:
indexer = recordlinkage.Index()
indexer.block('name')
pairs = indexer.index(companies, companies)

In [5]:
print(len(pairs))

178227


In [6]:
train_pairs = pairs[:1000]
test_pairs = pairs[1000:]

## Training 

In [7]:
compare = recordlinkage.Compare()

compare.string('name', 'name', label="name", threshold=0.80)
compare.exact('employees', 'employees', label='employees')
compare.string('website', 'website', method='jarowinkler', label="website", threshold=0.85)
compare.string('ticker', 'ticker', method='jarowinkler', label="ticker", threshold=0.95)
compare.string('ceo', 'ceo', method='jarowinkler', label="ceo", threshold=0.90)

training_features = compare.compute(train_pairs, companies, companies)
training_features['score'] = training_features.loc[:, 'name':'ceo'].sum(axis=1)

In [8]:
training_matches = training_features[training_features.sum(axis=1) > 2].reset_index()
training_matches['score'] = training_matches.loc[:, 'name':'ceo'].sum(axis=1)
toDrop = ['name', 'employees', 'website', 'ticker', 'ceo', 'score']
training_matches = training_matches.drop(toDrop, axis=1)
training_matches.head()

Unnamed: 0,level_0,level_1
0,0,0
1,44061,44061
2,1,1
3,2,2
4,3,3


In [9]:
training_matches = pd.MultiIndex.from_frame(training_matches) #matches conversion

In [10]:
training_features.head()

Unnamed: 0,Unnamed: 1,name,employees,website,ticker,ceo,score
0,0,1.0,1,0.0,0.0,0.0,2.0
0,20812,1.0,0,0.0,0.0,0.0,1.0
0,44061,1.0,0,0.0,0.0,0.0,1.0
0,62588,1.0,0,0.0,0.0,0.0,1.0
20812,0,1.0,0,0.0,0.0,0.0,1.0


In [11]:
len(training_features)

1000

In [12]:
len(training_matches)

414

#### Classifier

In [13]:
classifier = recordlinkage.NaiveBayesClassifier()
classifier.fit(training_features, training_matches)

## Testing And Evaluation

In [14]:
compare = recordlinkage.Compare()

compare.string('name', 'name', label="name", threshold=0.80)
compare.exact('employees', 'employees', label='employees')
compare.string('website', 'website', method='jarowinkler', label="website", threshold=0.85)
compare.string('ticker', 'ticker', method='jarowinkler', label="ticker", threshold=0.95)
compare.string('ceo', 'ceo', method='jarowinkler', label="ceo", threshold=0.90)

test_features = compare.compute(test_pairs, companies, companies)
test_features['score'] = test_features.loc[:, 'name':'ceo'].sum(axis=1)

In [15]:
test_matches = test_features[test_features.sum(axis=1) > 2].reset_index()
test_matches['score'] = test_matches.loc[:, 'name':'ceo'].sum(axis=1)
toDrop = ['name', 'employees', 'website', 'ticker', 'ceo', 'score']
test_matches = test_matches.drop(toDrop, axis=1)
test_matches.head(10)

Unnamed: 0,level_0,level_1
0,46776,46776
1,159,159
2,160,160
3,161,161
4,162,162
5,7500,7500
6,7500,22354
7,7500,38611
8,22354,7500
9,22354,22354


In [16]:
len(test_features)

177227

In [17]:
len(test_matches)

81624

In [18]:
test_matches

Unnamed: 0,level_0,level_1
0,46776,46776
1,159,159
2,160,160
3,161,161
4,162,162
...,...,...
81619,64598,64598
81620,64601,64601
81621,64602,64602
81622,64606,64606


In [19]:
test_matches = pd.MultiIndex.from_frame(test_matches) #matches conversion

In [20]:
predictions = classifier.predict(test_features)

In [21]:
# return the confusion matrix
confusion_matrix = recordlinkage.confusion_matrix(test_matches, predictions, len(test_features))
print('confusion matrix')
print(confusion_matrix)

# compute the F-score for this classification
fscore = recordlinkage.fscore(confusion_matrix)
print('\n\nfscore', fscore)
recall = recordlinkage.recall(test_matches, predictions)
print('recall', recall)
precision = recordlinkage.precision(test_matches, predictions)
print('precision', precision)
accuracy = recordlinkage.accuracy(test_matches, predictions, len(test_features))
print('accuracy', accuracy)

confusion matrix
[[81624     0]
 [    0 95603]]


fscore 1.0
recall 1.0
precision 1.0
accuracy 1.0


In [22]:
print(len(predictions))

81624


In [23]:
false_negatives = test_matches.difference(predictions)
false_negatives

MultiIndex([], )

In [24]:
try:
    fn_from_dfA = false_negatives[0][0]
    fn_from_dfB = false_negatives[0][1]

    display(companies[companies.index == fn_from_dfA])
    display(companies[companies.index == fn_from_dfB])
except:
    print("No False Negatives Present")

No False Negatives Present


In [25]:
try: 
    fn_from_dfA = false_negatives[1][0]
    fn_from_dfB = false_negatives[1][1]

    display(companies[companies.index == fn_from_dfA])
    display(companies[companies.index == fn_from_dfB])
except: 
    print("No False Negatives Present")

No False Negatives Present
