## Unsupervised Record Linkage: Professionisti

In [1]:
!pip install recordlinkage --quiet
!pip install sklearn --quiet

In [2]:
import numpy as np 
import pandas as pd
import sklearn
import recordlinkage
import warnings
from recordlinkage.index import Full
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [3]:
# Load the schema-aligned table of professionals.
professionisti = pd.read_csv("alignedSchemas/professionistiAligned.csv")
# Remove the 'Unnamed: 0' column (presumably a row index that was saved into the CSV).
# The axis is given through the `columns=` keyword: passing it positionally
# (`drop(label, 1)`) is no longer accepted by pandas 2.x.
professionisti = professionisti.drop(columns='Unnamed: 0')
professionisti.head(10)

Unnamed: 0,name,specialization,spokenLanguages,address,rating,yearsOfExperience,website,gender,price,telephone,email,id
0,SUSAN ELLER,FAMILY NURSE PRACTITIONER,ENGLISH,"8469 E MCDONALD DR SCOTTSDALE, AZ 85250",,,,,,,,1
1,DR. ELLE ROSS,"INTERNIST, PRIMARY CARE DOCTOR",ENGLISH,"5421 W THUNDERBIRD RD GLENDALE, AZ 85306",,,,,,,,2
2,JESSICA RASSAS,NURSE PRACTITIONER,ENGLISH,"6730 EAST MCDOWELL ROAD SCOTTSDALE, AZ 85257",,,,,,,,3
3,DR. RAUL MEDINA,"FAMILY PHYSICIAN, PRIMARY CARE DOCTOR",ENGLISH,"18435 N 19TH AVE PHOENIX, AZ 85023",,,,,,,,4
4,KRISTINA MATTSON,"NURSE PRACTITIONER, FAMILY NURSE PRACTITIONER",ENGLISH,"3530 S VAL VISTA DR GILBERT, AZ 85297",,,,,,,,5
5,DR. PARWIN SADAAT,"FAMILY PHYSICIAN, PRIMARY CARE DOCTOR",ENGLISH,"8469 E MCDONALD DR SCOTTSDALE, AZ 85250",,,,,,,,6
6,DR. ANTHONY AGHENTA,"INTERNIST, PRIMARY CARE DOCTOR",ENGLISH,"1434 W ELLIOT RD GILBERT, AZ 85233",,,,,,,,7
7,DEANNA WRIGHT,"NURSE PRACTITIONER, FAMILY NURSE PRACTITIONER",ENGLISH,"10210 NORTH 92ND ST SCOTTSDALE, AZ 85258",,,,,,,,8
8,ALTA LANGDON,"NURSE PRACTITIONER, ADULT PSYCHIATRIC & MENTAL...",ENGLISH,"14301 N 87TH ST SCOTTSDALE, AZ 85260",,,,,,,,9
9,DR. BRUCE HERMAN,"INTERNIST, PRIMARY CARE DOCTOR",ENGLISH,"5620 W THUNDERBIRD RD GLENDALE, AZ 85306",,,,,,,,10


In [4]:
# Candidate generation: only records sharing the same `name` value are paired.
indexer = recordlinkage.Index()
indexer.block(left_on='name')

# The same frame is supplied as both sides of the link, so the candidate set
# contains every record paired with itself as well as both orderings (a, b) / (b, a).
pairs = indexer.index(professionisti, professionisti)

In [5]:
# Number of candidate pairs produced by blocking.
n_pairs = len(pairs)
print(n_pairs)

235803


In [6]:
# Positional (non-random) split: the first 12,000 candidate pairs are used for
# training, everything after them is held out for testing.
n_train_pairs = 12000
train_pairs, test_pairs = pairs[:n_train_pairs], pairs[n_train_pairs:]

## Unsupervised Training

In [7]:
# Comparison rules as (column, string-similarity method, threshold).
# A method of None means an exact-equality comparison.
# The order of this list fixes the column order of the resulting feature table.
feature_specs = [
    ('name', 'jarowinkler', 0.80),
    ('specialization', 'levenshtein', 0.80),
    ('id', None, None),
    ('address', 'levenshtein', 0.85),
    ('website', 'levenshtein', 0.95),
    ('telephone', 'levenshtein', 0.90),
    ('email', 'levenshtein', 0.90),
]

compare = recordlinkage.Compare()
for column, method, threshold in feature_specs:
    if method is None:
        compare.exact(column, column, label=column)
    else:
        # With a threshold the string similarity is reported as a 0/1 agreement flag.
        compare.string(column, column, method=method, label=column, threshold=threshold)

# One row per training pair, one agreement column per rule above.
training_features = compare.compute(train_pairs, professionisti, professionisti)

In [8]:
# Heuristic labels: a training pair counts as a match when more than 3 of the
# comparison features agree (row sum of the feature table > 3).
# Only the pair identifiers are kept: reset_index() exposes the two index levels
# as columns and every feature column is then dropped. (The previous version also
# built a `score` column that was discarded straight away and listed 'address'
# twice in the drop list.)
training_matches = (
    training_features[training_features.sum(axis=1) > 3]
    .reset_index()
    .drop(columns=training_features.columns)
)
training_matches.head()

Unnamed: 0,level_0,level_1
0,0,0
1,37,37
2,1,1
3,2,2
4,3,3


In [9]:
# Convert the frame of matched pair identifiers into a MultiIndex.
# The type guard keeps this cell re-runnable: after the first execution the name
# already refers to a MultiIndex, which MultiIndex.from_frame() would reject.
if isinstance(training_matches, pd.DataFrame):
    training_matches = pd.MultiIndex.from_frame(training_matches)

In [10]:
# Preview the first five rows of the training feature table.
training_features.head(n=5)

Unnamed: 0,Unnamed: 1,name,specialization,id,address,website,telephone,email
0,0,1.0,1.0,1,1.0,0.0,0.0,0.0
0,37,1.0,1.0,0,1.0,0.0,0.0,0.0
37,0,1.0,1.0,0,1.0,0.0,0.0,0.0
37,37,1.0,1.0,1,1.0,0.0,0.0,0.0
1,1,1.0,1.0,1,1.0,0.0,0.0,0.0


In [11]:
# Number of training pairs (rows of the feature table).
training_features.shape[0]

12000

In [12]:
# Number of training pairs labelled as matches by the score heuristic.
training_matches.shape[0]

5985

#### Classifier

In [13]:
# Fit a K-means based record-pair classifier on the training feature table.
# NOTE(review): K-means is an unsupervised method - confirm whether the
# match_index passed here has any influence on the fitted model.
classifier = recordlinkage.KMeansClassifier()
classifier.fit(comparison_vectors=training_features, match_index=training_matches)

## Testing and Evaluation

In [None]:
# Same comparison rules as used for training, as (column, method, threshold).
# A method of None means an exact-equality comparison.
# The order of this list fixes the column order of the resulting feature table.
feature_specs = [
    ('name', 'jarowinkler', 0.80),
    ('specialization', 'levenshtein', 0.80),
    ('id', None, None),
    ('address', 'levenshtein', 0.85),
    ('website', 'levenshtein', 0.95),
    ('telephone', 'levenshtein', 0.90),
    ('email', 'levenshtein', 0.90),
]

compare = recordlinkage.Compare()
for column, method, threshold in feature_specs:
    if method is None:
        compare.exact(column, column, label=column)
    else:
        # With a threshold the string similarity is reported as a 0/1 agreement flag.
        compare.string(column, column, method=method, label=column, threshold=threshold)

# One row per held-out pair, one agreement column per rule above.
test_features = compare.compute(test_pairs, professionisti, professionisti)

In [None]:
# Heuristic labels for the held-out pairs: a pair counts as a match when more
# than 3 of the comparison features agree (row sum of the feature table > 3).
# Only the pair identifiers are kept: reset_index() exposes the two index levels
# as columns and every feature column is then dropped. (The previous version also
# built a `score` column that was discarded straight away and listed 'address'
# twice in the drop list.)
test_matches = (
    test_features[test_features.sum(axis=1) > 3]
    .reset_index()
    .drop(columns=test_features.columns)
)
test_matches.head()

In [None]:
# Convert the frame of matched pair identifiers into a MultiIndex.
# The type guard keeps this cell re-runnable: after the first execution the name
# already refers to a MultiIndex, which MultiIndex.from_frame() would reject.
if isinstance(test_matches, pd.DataFrame):
    test_matches = pd.MultiIndex.from_frame(test_matches)

In [None]:
# Classify the held-out pairs with the model fitted on the training pairs.
predictions = classifier.predict(comparison_vectors=test_features)

In [None]:
# Number of held-out candidate pairs.
n_test_pairs = len(test_features)
print(n_test_pairs)

In [None]:
# Number of held-out pairs labelled as matches by the score heuristic.
print(test_matches.shape[0])

In [None]:
# Compare the classifier's predicted links with the heuristic reference links.
# Note: `test_matches` is derived from the very same feature table (row sum > 3),
# so these figures measure agreement with that heuristic, not with independent labels.
confusion_matrix = recordlinkage.confusion_matrix(
    test_matches, predictions, total=len(test_features)
)
print('confusion matrix', confusion_matrix, sep='\n')

# Summary metrics: F-score from the confusion matrix, the rest from the link sets.
fscore = recordlinkage.fscore(confusion_matrix)
recall = recordlinkage.recall(test_matches, predictions)
precision = recordlinkage.precision(test_matches, predictions)
accuracy = recordlinkage.accuracy(test_matches, predictions, total=len(test_features))

# The leading blank lines before the first metric are part of the original output.
metric_report = (
    ('\n\nfscore', fscore),
    ('recall', recall),
    ('precision', precision),
    ('accuracy', accuracy),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)