In [1]:
!pip install recordlinkage --quiet
!pip install sklearn --quiet

In [2]:
import numpy as np 
import pandas as pd
import sklearn
import recordlinkage
import warnings
from recordlinkage.index import Full
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [3]:
companies = pd.read_csv("alignedSchemas/companiesAligned.csv")
companies = companies.drop('Unnamed: 0', 1)
companies.head(10)

Unnamed: 0,name,headquarter,employees,industry,website,ticker,ceo,revenue_M,marketcap_M,shareprice,id
0,GROUPON,"600 W CHICAGO AVE SUITE 400 CHICAGO, IL 60616","1,001 TO 5,000",INFORMATION TECHNOLOGY,,,,,,,1
1,E-TECHNOLOGIES,AUCKLAND,2 TO 10,INFORMATION TECHNOLOGY,,,,,,,2
2,YAKSHNA SOLUTIONS,"HERNDON, VA",11 TO 50,,,,,,,,3
3,THRU TUBING SOLUTIONS,OKLAHOMA CITY,51 TO 200,"ENERGY, MINING & UTILITIES",,,,,,,4
4,CLEVELAND CLIFFS INC,"CLEVELAND, OH","5,001 TO 10,000",PERSONAL CONSUMER SERVICES,,,,,,,5
5,SISU,LOS ANGELES,11 TO 50,RESTAURANTS & CAFES,,,,,,,6
6,TSMC,"SAN JOSE, CA",51 TO 200,MANUFACTURING,,,,,,,7
7,MATHIS BROTHERS FURNITURE,"OKLAHOMA CITY, OKLAHOMA","1,001 TO 5,000",RETAIL & WHOLESALE,,,,,,,8
8,IVYTECH SOLUTIONS INC,,,,,,,,,,9
9,VERTIV,"1050 DEARBORN DRIVE, COLUMBUS, OH 43085, UNITE...","10,000+",TELECOMMUNICATIONS,,,,,,,10


In [4]:
indexer = recordlinkage.Index()
indexer.block('name')
pairs = indexer.index(companies, companies)

In [5]:
print(len(pairs))

178227


In [6]:
train_pairs = pairs[:1000]
test_pairs = pairs[1000:]

## Training 

In [7]:
compare = recordlinkage.Compare()

compare.string('name', 'name', label="name", threshold=0.80)
compare.exact('employees', 'employees', label='employees')
compare.string('website', 'website', method='jarowinkler', label="website", threshold=0.85)
compare.string('ticker', 'ticker', method='jarowinkler', label="ticker", threshold=0.95)
compare.string('ceo', 'ceo', method='jarowinkler', label="ceo", threshold=0.90)

training_features = compare.compute(train_pairs, companies, companies)

In [8]:
training_matches = training_features[training_features.sum(axis=1) > 2].reset_index()
training_matches['score'] = training_matches.loc[:, 'name':'ceo'].sum(axis=1)
toDrop = ['name', 'employees', 'website', 'ticker', 'ceo', 'score']
training_matches = training_matches.drop(toDrop, axis=1)
training_matches.head()

Unnamed: 0,level_0,level_1
0,44061,44061
1,38308,38308
2,6580,6580
3,21433,21433
4,21433,36906


In [9]:
training_matches = pd.MultiIndex.from_frame(training_matches) #matches conversion

In [10]:
training_features.head()

Unnamed: 0,Unnamed: 1,name,employees,website,ticker,ceo
0,0,1.0,1,0.0,0.0,0.0
0,20812,1.0,0,0.0,0.0,0.0
0,44061,1.0,0,0.0,0.0,0.0
0,62588,1.0,0,0.0,0.0,0.0
20812,0,1.0,0,0.0,0.0,0.0


In [11]:
len(training_features)

1000

In [12]:
len(training_matches)

63

#### Classifier

In [13]:
classifier = recordlinkage.NaiveBayesClassifier()
classifier.fit(training_features, training_matches)

## Testing And Evaluation

In [14]:
compare = recordlinkage.Compare()

compare.string('name', 'name', label="name", threshold=0.80)
compare.exact('employees', 'employees', label='employees')
compare.string('website', 'website', method='jarowinkler', label="website", threshold=0.85)
compare.string('ticker', 'ticker', method='jarowinkler', label="ticker", threshold=0.95)
compare.string('ceo', 'ceo', method='jarowinkler', label="ceo", threshold=0.90)

test_features = compare.compute(test_pairs, companies, companies)

In [15]:
test_matches = test_features[test_features.sum(axis=1) > 2].reset_index()
test_matches['score'] = test_matches.loc[:, 'name':'ceo'].sum(axis=1)
toDrop = ['name', 'employees', 'website', 'ticker', 'ceo', 'score']
test_matches = test_matches.drop(toDrop, axis=1)
test_matches.head(10)

Unnamed: 0,level_0,level_1
0,38611,38611
1,38295,38295
2,38958,38958
3,42115,42115
4,38335,38335
5,38273,38273
6,6986,6986
7,21839,21839
8,21839,36761
9,21839,38068


In [16]:
len(test_features)

177227

In [17]:
len(test_matches)

18871

In [18]:
test_matches = pd.MultiIndex.from_frame(test_matches) #matches conversion

In [19]:
predictions = classifier.predict(test_features)

In [20]:
# return the confusion matrix
confusion_matrix = recordlinkage.confusion_matrix(test_matches, predictions, len(test_features))
print('confusion matrix')
print(confusion_matrix)

# compute the F-score for this classification
fscore = recordlinkage.fscore(confusion_matrix)
print('\n\nfscore', fscore)
recall = recordlinkage.recall(test_matches, predictions)
print('recall', recall)
precision = recordlinkage.precision(test_matches, predictions)
print('precision', precision)
accuracy = recordlinkage.accuracy(test_matches, predictions, len(test_features))
print('accuracy', accuracy)

confusion matrix
[[ 11725   7146]
 [     0 158356]]


fscore 0.7664400575238594
recall 0.62132372423295
precision 1.0
accuracy 0.9596788299751167


In [21]:
print(len(predictions))

11725


In [22]:
false_negatives = test_matches.difference(predictions)
false_negatives

MultiIndex([( 2105, 40161),
            ( 2108, 42922),
            ( 2109,  2603),
            ( 2109,  4068),
            ( 2113,  3825),
            ( 2116,  2713),
            ( 2117,  2288),
            ( 2117,  3639),
            ( 2117,  3819),
            ( 2117,  4029),
            ...
            (64577, 64577),
            (64580, 64580),
            (64583, 64583),
            (64584, 64584),
            (64585, 64585),
            (64593, 64593),
            (64597, 64597),
            (64600, 64600),
            (64601, 64601),
            (64605, 64605)],
           length=7146)

In [23]:
fn_from_dfA = false_negatives[0][0]
fn_from_dfB = false_negatives[0][1]

display(companies[companies.index == fn_from_dfA])
display(companies[companies.index == fn_from_dfB])

Unnamed: 0,name,headquarter,employees,industry,website,ticker,ceo,revenue_M,marketcap_M,shareprice,id
2105,HUNTSMAN CORPORATION,"10003 WOODLOCH FOREST DRIVE THE WOODLANDS, TX ...","5,001 TO 10,000",CHEMICALS,HTTP://WWW.HUNTSMAN.COM,HUN,,,,,2106


Unnamed: 0,name,headquarter,employees,industry,website,ticker,ceo,revenue_M,marketcap_M,shareprice,id
40161,HUNTSMAN CORPORATION,USA,"5,001 TO 10,000",BASIC MATERIALS CHEMICALS ...,HTTPS://WWW.HUNTSMAN.COM,,,6.018,5.866,,40162


In [24]:
fn_from_dfA = false_negatives[1][0]
fn_from_dfB = false_negatives[1][1]

display(companies[companies.index == fn_from_dfA])
display(companies[companies.index == fn_from_dfB])

Unnamed: 0,name,headquarter,employees,industry,website,ticker,ceo,revenue_M,marketcap_M,shareprice,id
2108,REDFIN CORPORATION,"1099 STEWART STREET SUITE 600 SEATTLE, WA 9810...","1,001 TO 5,000",REAL ESTATE SERVICES,HTTP://WWW.REDFIN.COM,RDFN,,,,,2109


Unnamed: 0,name,headquarter,employees,industry,website,ticker,ceo,revenue_M,marketcap_M,shareprice,id
42922,REDFIN CORPORATION,USA,"1,001 TO 5,000",REAL ESTATE FINANCIAL SERVICES,HTTPS://WWW.REDFIN.COM/,,,886.0,5.143,,42923
