# Complete WorkFlow for Entity Matching

This Jupyter notebook loads two tables with around 3000 tuples. Each tuple describes a movie. The goal is to find matching tuples between the two tables.

In [1]:
# Import py_entitymatching package and other required packages
import math
import os

import numpy as np
import pandas as pd
import py_entitymatching as em

# Raise pandas' display limits: up to 30 columns and 1000 rows are shown
pd.options.display.max_columns = 30
pd.options.display.max_rows = 1000

In [2]:
# Directory that holds the cleaned input tables
datasets_dir = 'CleanData'

# CSV files for the two input tables (IMDB and Allmovie)
path_A = os.path.join(datasets_dir, 'Table_IMDB_with_ID.csv')
path_B = os.path.join(datasets_dir, 'Table_Allmovie_with_ID.csv')

In [3]:
# Load both CSV files with 'MID' registered as the key attribute of each table
A, B = [em.read_csv_metadata(csv_path, key='MID') for csv_path in (path_A, path_B)]

No handlers could be found for logger "py_entitymatching.io.parsers"


In [4]:
# Debug switch read by later cells: when True, the em.debug_blocker cells run and
# write their CSV dumps, and blocking rule 2 is applied directly to A and B
# instead of to the candidate set D1.
DEBUG = False

## First step in the entity matching process is blocking. We apply some rules to perform blocking, while ensuring that positive tuples do not get blocked.


We have used two blocking rules sequentially to perform blocking.

Rule 1: 3 gram Jaccard measure on movie titles

Rule 2: 2 gram Overlapping measure on directors' names

### Applying Blocking Rule 1

In [5]:
# Feature table for rule-based blocking, generated from A and B
# (validate_inferred_attr_types is turned off).
block_f = em.get_features_for_blocking(
    A, B,
    validate_inferred_attr_types=False,
)

In [6]:
# Blocking rule 1: block a tuple pair when the 3-gram Jaccard score of the
# two titles is below 0.3.
title_rule = ['Title_Title_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.3']

rb = em.RuleBasedBlocker()
rb.add_rule(title_rule, block_f)

'_rule_0'

In [7]:
# Apply the first blocking rule to A and B. The same attributes are carried
# over from both tables, prefixed with 'l_' and 'r_' respectively.
rule1_output_attrs = [
    'Title', 'Certificate', 'Genre', 'Rating', 'Running Time', 'Directors',
    'Stars Cast', 'Country', 'Release Date', 'Production Company',
    'Release Year', 'Release Month',
]
D1 = rb.block_tables(
    A, B,
    l_output_attrs=list(rule1_output_attrs),
    r_output_attrs=list(rule1_output_attrs),
    l_output_prefix='l_',
    r_output_prefix='r_',
)

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:03


Finding pairs with missing value...


In [8]:
# Number of rows in D1, the candidate set produced by blocking rule 1
len(D1)

5210

In [9]:
# Preview the first rows of the rule-1 candidate set
D1.head()

Unnamed: 0,_id,l_MID,r_MID,l_Title,l_Certificate,l_Genre,l_Rating,l_Running Time,l_Directors,l_Stars Cast,l_Country,l_Release Date,l_Production Company,l_Release Year,l_Release Month,r_Title,r_Certificate,r_Genre,r_Rating,r_Running Time,r_Directors,r_Stars Cast,r_Country,r_Release Date,r_Production Company,r_Release Year,r_Release Month
0,0,2,1,9,PG13,Animation|Action|Adventure,7.1,79 min,Shane Acker,Elijah Wood|Jennifer Connelly|Crispin Glover,USA,9-Sep-09,Focus Features|Relativity Media|Arc Productions,2009.0,September,9,PG13,Action|Science Fiction,8,79 min,Shane Acker,Christopher Plummer|Martin Landau|John C. Reilly|Crispin Glover,USA,9-Sep-09,Focus Features,2009.0,September
1,1,3,2,10,R,Comedy|Romance,6.0,122 min,Blake Edwards,Dudley Moore|Bo Derek|Julie Andrews,USA,5-Oct-79,Geoffrey Productions|Orion Pictures,1979.0,October,10,R,Comedy,8,121 min,Blake Edwards,Dudley Moore|Julie Andrews|Bo Derek|Robert Webber,USA,5-Oct-79,Orion|Warner Brothers,1979.0,October
2,2,20,2,10x10,,Thriller,5.3,88 min,Suzi Ewing,Luke Evans|Kelly Reilly|Noel Clarke,UK,13-Apr-18,Unstoppable Entertainment|Head Gear Films|Metrol Technology,2018.0,April,10,R,Comedy,8,121 min,Blake Edwards,Dudley Moore|Julie Andrews|Bo Derek|Robert Webber,USA,5-Oct-79,Orion|Warner Brothers,1979.0,October
3,3,7,3,1408,PG13,Drama|Horror,6.8,104 min,Mikael Hfstrm,John Cusack|Samuel L. Jackson|Mary McCormack,USA,22-Jun-07,Dimension Films|The Weinstein Company|Di Bonaventura Pictures,2007.0,June,1408,PG13,Horror,7,94 min,Mikael Hfstrm,John Cusack|Samuel L. Jackson|Mary McCormack|Jasmine Jessica Anthony,USA,22-Jun-07,Dimension Films|Lorenzo di Bonaventura|Senator Entertainment,2007.0,June
4,4,12,6,(500) Days of Summer,PG13,Comedy|Drama|Romance,7.7,95 min,Marc Webb,Zooey Deschanel|Joseph Gordon-Levitt|Geoffrey Arend,USA,7-Aug-09,Fox Searchlight Pictures|Watermark|Dune Entertainment III,2009.0,August,(500) Days of Summer,PG13,Comedy,8,95 min,Marc Webb,Zooey Deschanel|Joseph Gordon-Levitt|Clark Gregg|Minka Kelly,USA,17-Jul-09,Watermark Productions,2009.0,July


In [10]:
if DEBUG:
    # Run the blocker debugger on D1 (output_size=100) and save its result as CSV
    d1_debug_path = os.path.join(datasets_dir, 'D1_block_dbg.csv')
    dbg = em.debug_blocker(D1, A, B, output_size=100)
    dbg.to_csv(d1_debug_path)

### Applying Blocking Rule 2

In [11]:
# Instantiate the overlap blocker object used for blocking rule 2
ob = em.OverlapBlocker()

In [12]:
# Blocking rule 2: overlap blocker on 'Directors' using 2-gram tokens
# (word_level=False, q_val=2) with overlap_size=2.
if not DEBUG:
    # Normal run: filter the candidate set D1 left by blocking rule 1.
    D2 = ob.block_candset(
        D1, 'Directors', 'Directors',
        word_level=False, q_val=2, overlap_size=2,
        show_progress=False)
else:
    # Debugging only: apply the overlap rule directly to A and B instead of D1.
    rule2_output_attrs = [
        'Title', 'Certificate', 'Genre', 'Rating', 'Running Time', 'Directors',
        'Stars Cast', 'Country', 'Release Date', 'Production Company',
        'Release Year', 'Release Month',
    ]
    D2 = ob.block_tables(
        A, B, 'Directors', 'Directors',
        word_level=False, q_val=2, overlap_size=2,
        l_output_attrs=list(rule2_output_attrs),
        r_output_attrs=list(rule2_output_attrs),
        show_progress=False)

In [13]:
# Number of rows in D2, the candidate set after blocking rule 2
len(D2)

2028

In [14]:
# Preview the first rows of the rule-2 candidate set
D2.head()

Unnamed: 0,_id,l_MID,r_MID,l_Title,l_Certificate,l_Genre,l_Rating,l_Running Time,l_Directors,l_Stars Cast,l_Country,l_Release Date,l_Production Company,l_Release Year,l_Release Month,r_Title,r_Certificate,r_Genre,r_Rating,r_Running Time,r_Directors,r_Stars Cast,r_Country,r_Release Date,r_Production Company,r_Release Year,r_Release Month
0,0,2,1,9,PG13,Animation|Action|Adventure,7.1,79 min,Shane Acker,Elijah Wood|Jennifer Connelly|Crispin Glover,USA,9-Sep-09,Focus Features|Relativity Media|Arc Productions,2009.0,September,9,PG13,Action|Science Fiction,8,79 min,Shane Acker,Christopher Plummer|Martin Landau|John C. Reilly|Crispin Glover,USA,9-Sep-09,Focus Features,2009.0,September
1,1,3,2,10,R,Comedy|Romance,6.0,122 min,Blake Edwards,Dudley Moore|Bo Derek|Julie Andrews,USA,5-Oct-79,Geoffrey Productions|Orion Pictures,1979.0,October,10,R,Comedy,8,121 min,Blake Edwards,Dudley Moore|Julie Andrews|Bo Derek|Robert Webber,USA,5-Oct-79,Orion|Warner Brothers,1979.0,October
3,3,7,3,1408,PG13,Drama|Horror,6.8,104 min,Mikael Hfstrm,John Cusack|Samuel L. Jackson|Mary McCormack,USA,22-Jun-07,Dimension Films|The Weinstein Company|Di Bonaventura Pictures,2007.0,June,1408,PG13,Horror,7,94 min,Mikael Hfstrm,John Cusack|Samuel L. Jackson|Mary McCormack|Jasmine Jessica Anthony,USA,22-Jun-07,Dimension Films|Lorenzo di Bonaventura|Senator Entertainment,2007.0,June
4,4,12,6,(500) Days of Summer,PG13,Comedy|Drama|Romance,7.7,95 min,Marc Webb,Zooey Deschanel|Joseph Gordon-Levitt|Geoffrey Arend,USA,7-Aug-09,Fox Searchlight Pictures|Watermark|Dune Entertainment III,2009.0,August,(500) Days of Summer,PG13,Comedy,8,95 min,Marc Webb,Zooey Deschanel|Joseph Gordon-Levitt|Clark Gregg|Minka Kelly,USA,17-Jul-09,Watermark Productions,2009.0,July
6,6,16,12,10 Cloverfield Lane,PG13,Drama|Horror|Mystery,7.2,103 min,Dan Trachtenberg,John Goodman|Mary Elizabeth Winstead|John Gallagher Jr.,USA,11-Mar-16,Paramount Pictures|Bad Robot|Spectrum Effects,2016.0,March,10 Cloverfield Lane,PG13,Science Fiction|Thriller,8,103 min,Dan Trachtenberg,"Mary Elizabeth Winstead|John Goodman|John Gallagher, Jr.|Maya Erskine",USA,11-Mar-16,Bad Robot,2016.0,March


In [15]:
if DEBUG:
    # Run the blocker debugger on D2 (output_size=1000, 4 jobs) and save its result as CSV
    d2_debug_path = os.path.join(datasets_dir, 'D2_block_dbg.csv')
    dbg = em.debug_blocker(D2, A, B, output_size=1000, n_jobs=4)
    dbg.to_csv(d2_debug_path)

In [16]:
# Persist the final candidate set so the later steps can reload it from disk
blocked_output_path = os.path.join(datasets_dir, 'Blocked_output.csv')
D2.to_csv(blocked_output_path)

## After the blocking step produced the candidate tuple pairs, we sampled 500 of them, labelled them, and divided them into sets I and J for development and testing, respectively.

In [17]:
# Reload the blocking output as candidate set C, re-attaching its metadata:
# '_id' is the key and l_MID / r_MID are foreign keys into A and B.
path_C = os.path.join(datasets_dir, 'Blocked_output.csv')
candset_metadata = dict(key='_id',
                        ltable=A, rtable=B,
                        fk_ltable='l_MID', fk_rtable='r_MID')
C = em.read_csv_metadata(path_C, **candset_metadata)

In [18]:
# Number of candidate pairs reloaded from Blocked_output.csv
len(C)

2028

In [19]:
# Fix NumPy's global random seed for reproducibility, then sample 500 pairs from C.
# `pd.np` was deprecated in pandas 1.0 and removed in pandas 2.0, so the seed is
# set through numpy directly.
np.random.seed(100)
S = em.sample_table(C, 500)

In [20]:
# Label the sampled set S
# 'gold_label' is the name of the label column in the returned table G
# NOTE(review): em.label_table is presumably interactive (opens a labelling window) — confirm
G = em.label_table(S, 'gold_label')

  table.set_value(idxv[i], cols[j], val)


In [21]:
# Dump G to CSV so that the pairs can be labelled by hand
labelled_data_path = os.path.join(datasets_dir, 'Labelled_data.csv')
G.to_csv(labelled_data_path)

## We now manually label all 500 samples and save the result under a new file name (`Labelled_data_final.csv`) so that re-running the notebook does not overwrite the labelled data.

In [22]:
# Reload the hand-labelled pairs as G, with the same candidate-set metadata
# ('_id' key, l_MID / r_MID foreign keys into A and B), and show the row count.
path_C = os.path.join(datasets_dir, 'Labelled_data_final.csv')
labelled_metadata = dict(key='_id',
                         ltable=A, rtable=B,
                         fk_ltable='l_MID', fk_rtable='r_MID')
G = em.read_csv_metadata(path_C, **labelled_metadata)
len(G)

500

### 500 samples are now divided into set I and set J (70 and 30 % respectively). Set I is used to decide the best matcher using cross validation and set J is used to find the accuracy of best matcher once the matcher has been decided and fixed.

In [23]:
# Split the labelled set G into I (70%, 'train') and J (30%, 'test')
IJ = em.split_train_test(G, train_proportion=0.7, random_state=500)
I, J = IJ['train'], IJ['test']

In [24]:
# Save the development set I and the evaluation set J as CSV files
i_data_path = os.path.join(datasets_dir, 'I_data.csv')
j_data_path = os.path.join(datasets_dir, 'J_data.csv')
I.to_csv(i_data_path)
J.to_csv(j_data_path)

In [25]:
# Attribute types of table A and table B, used later when generating features
atypes1, atypes2 = em.get_attr_types(A), em.get_attr_types(B)

In [26]:
# Attribute correspondences between A and B; the 'corres' entry is inspected
# and then narrowed down in the following cells
block_c = em.get_attr_corres(A, B)

In [27]:
# Show the attribute pairs returned by em.get_attr_corres
block_c['corres']

[('MID', 'MID'),
 ('Title', 'Title'),
 ('Certificate', 'Certificate'),
 ('Genre', 'Genre'),
 ('Rating', 'Rating'),
 ('Running Time', 'Running Time'),
 ('Directors', 'Directors'),
 ('Stars Cast', 'Stars Cast'),
 ('Country', 'Country'),
 ('Release Date', 'Release Date'),
 ('Production Company', 'Production Company'),
 ('Release Year', 'Release Year'),
 ('Release Month', 'Release Month')]

In [28]:
# Keep only the attributes that should contribute features; each one is
# paired with the attribute of the same name in the other table.
matching_attrs = ['Title', 'Running Time', 'Directors',
                  'Stars Cast', 'Release Year', 'Genre']
block_c['corres'] = list(zip(matching_attrs, matching_attrs))

In [29]:
# Tokenizers for matching; passed to em.get_features below
tok = em.get_tokenizers_for_matching() 

In [30]:
# Similarity functions for matching; passed to em.get_features below
sim = em.get_sim_funs_for_matching()

In [31]:
# Show the available similarity functions (name -> function)
sim

{'abs_norm': <function py_entitymatching.feature.simfunctions.abs_norm>,
 'affine': <function py_entitymatching.feature.simfunctions.affine>,
 'cosine': <function py_entitymatching.feature.simfunctions.cosine>,
 'dice': <function py_entitymatching.feature.simfunctions.dice>,
 'exact_match': <function py_entitymatching.feature.simfunctions.exact_match>,
 'hamming_dist': <function py_entitymatching.feature.simfunctions.hamming_dist>,
 'hamming_sim': <function py_entitymatching.feature.simfunctions.hamming_sim>,
 'jaccard': <function py_entitymatching.feature.simfunctions.jaccard>,
 'jaro': <function py_entitymatching.feature.simfunctions.jaro>,
 'jaro_winkler': <function py_entitymatching.feature.simfunctions.jaro_winkler>,
 'lev_dist': <function py_entitymatching.feature.simfunctions.lev_dist>,
 'lev_sim': <function py_entitymatching.feature.simfunctions.lev_sim>,
 'monge_elkan': <function py_entitymatching.feature.simfunctions.monge_elkan>,
 'needleman_wunsch': <function py_entitymatch

In [32]:
# Build the feature table F for the selected attribute correspondences (block_c),
# using the attribute types of A and B plus the matching tokenizers and
# similarity functions obtained above
F = em.get_features(A, B, atypes1, atypes2, block_c, tok, sim)

In [33]:
# Display the generated feature table
F

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,Title_Title_jac_qgm_3_qgm_3,Title,Title,qgm_3,qgm_3,jaccard,<function Title_Title_jac_qgm_3_qgm_3 at 0x116073aa0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
1,Title_Title_cos_dlm_dc0_dlm_dc0,Title,Title,dlm_dc0,dlm_dc0,cosine,<function Title_Title_cos_dlm_dc0_dlm_dc0 at 0x116073578>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
2,Title_Title_jac_dlm_dc0_dlm_dc0,Title,Title,dlm_dc0,dlm_dc0,jaccard,<function Title_Title_jac_dlm_dc0_dlm_dc0 at 0x116073140>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
3,Title_Title_mel,Title,Title,,,monge_elkan,<function Title_Title_mel at 0x116073b18>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
4,Title_Title_lev_dist,Title,Title,,,lev_dist,<function Title_Title_lev_dist at 0x116073c80>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
5,Title_Title_lev_sim,Title,Title,,,lev_sim,<function Title_Title_lev_sim at 0x116073488>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
6,Title_Title_nmw,Title,Title,,,needleman_wunsch,<function Title_Title_nmw at 0x116073f50>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
7,Title_Title_sw,Title,Title,,,smith_waterman,<function Title_Title_sw at 0x116073668>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
8,Running_Time_Running_Time_jac_qgm_3_qgm_3,Running Time,Running Time,qgm_3,qgm_3,jaccard,<function Running_Time_Running_Time_jac_qgm_3_qgm_3 at 0x1160739b0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
9,Running_Time_Running_Time_cos_dlm_dc0_dlm_dc0,Running Time,Running Time,dlm_dc0,dlm_dc0,cosine,<function Running_Time_Running_Time_cos_dlm_dc0_dlm_dc0 at 0x1160738c0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True


In [34]:
# Candidate ML matchers.
# These four are created with a fixed random_state:
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
# These two are created without a random_state argument:
lr = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

In [35]:
# Turn the development set I into feature vectors using feature table F,
# keeping 'gold_label' as a trailing column.
H = em.extract_feature_vecs(
    I, feature_table=F, attrs_after='gold_label', show_progress=False)

In [36]:
# Preview the first feature vectors extracted from I
H.head()

Unnamed: 0,_id,l_MID,r_MID,Title_Title_jac_qgm_3_qgm_3,Title_Title_cos_dlm_dc0_dlm_dc0,Title_Title_jac_dlm_dc0_dlm_dc0,Title_Title_mel,Title_Title_lev_dist,Title_Title_lev_sim,Title_Title_nmw,Title_Title_sw,Running_Time_Running_Time_jac_qgm_3_qgm_3,Running_Time_Running_Time_cos_dlm_dc0_dlm_dc0,Running_Time_Running_Time_jac_dlm_dc0_dlm_dc0,Running_Time_Running_Time_mel,...,Running_Time_Running_Time_nmw,Running_Time_Running_Time_sw,Directors_Directors_jac_qgm_3_qgm_3,Directors_Directors_cos_dlm_dc0_dlm_dc0,Directors_Directors_jac_dlm_dc0_dlm_dc0,Directors_Directors_mel,Directors_Directors_lev_dist,Directors_Directors_lev_sim,Directors_Directors_nmw,Directors_Directors_sw,Release_Year_Release_Year_exm,Release_Year_Release_Year_anm,Release_Year_Release_Year_lev_dist,Release_Year_Release_Year_lev_sim,gold_label
186,1604,1738,1856,0.384615,0.666667,0.5,0.893961,9.0,0.653846,15.0,16.0,0.454545,0.5,0.333333,0.9,...,5.0,5.0,1.0,1.0,1.0,1.0,0.0,1.0,18.0,18.0,0.0,0.999006,1.0,0.833333,0
418,4304,2529,2812,0.47619,0.666667,0.5,0.926374,4.0,0.714286,9.0,10.0,0.307692,0.5,0.333333,0.849206,...,3.0,4.0,0.277778,0.5,0.333333,0.853333,4.0,0.6,5.0,6.0,,,,,0
472,4159,2650,2779,0.3,0.5,0.333333,0.828095,6.0,0.5,4.0,4.0,0.384615,0.5,0.333333,0.828571,...,5.0,5.0,0.0,0.0,0.0,0.432372,14.0,0.125,-1.0,2.0,0.0,0.991538,4.0,0.333333,0
182,1577,1710,1836,1.0,1.0,1.0,1.0,0.0,1.0,11.0,11.0,0.5,0.5,0.333333,0.92381,...,6.0,6.0,1.0,1.0,1.0,1.0,0.0,1.0,15.0,15.0,1.0,1.0,0.0,1.0,1
268,2285,2154,2316,0.333333,0.5,0.333333,0.853968,4.0,0.555556,3.0,5.0,0.307692,0.5,0.333333,0.746032,...,3.0,4.0,0.0,0.0,0.0,0.513558,16.0,0.111111,0.0,2.0,0.0,0.99256,2.0,0.666667,0


In [37]:
# Check if the feature vectors contain missing values.
# A return value of True means that at least one cell of H is missing.
# (The previous `any(pd.notnull(H))` iterated over the column labels of the
# boolean frame, so it was True regardless of the data, and it tested
# "not null" rather than "null".)
H.isnull().values.any()

True

In [38]:
# Fill missing feature values with the column mean; the id columns and the
# label column are excluded from imputation.
non_feature_cols = ['_id', 'l_MID', 'r_MID', 'gold_label']
H = em.impute_table(H, exclude_attrs=non_feature_cols, strategy='mean')

In [39]:
# 5-fold cross-validation over all candidate matchers on H, selecting by F1;
# the per-matcher CV statistics are displayed as the cell output.
candidate_matchers = [dt, rf, svm, lg, lr, nb]
result = em.select_matcher(
    candidate_matchers,
    table=H,
    exclude_attrs=['_id', 'l_MID', 'r_MID', 'gold_label'],
    k=5,
    target_attr='gold_label',
    metric_to_select_matcher='f1',
    random_state=0,
)
result['cv_stats']



Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.979144,0.961538,0.968012
1,RF,0.990909,0.961538,0.974072
2,SVM,0.962609,0.871179,0.911405
3,LogReg,0.960386,0.961538,0.957868
4,LinReg,0.956727,0.976923,0.966217
5,NaiveBayes,0.982576,0.961538,0.969817


In [40]:
# result['drill_down_cv_stats']['precision']

In [41]:
# result['drill_down_cv_stats']['recall']

In [42]:
# result['drill_down_cv_stats']['f1']

In [43]:
# this part of the code was used to debug the selected matcher
# # Split H into P and Q
# PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
# P = PQ['train']
# Q = PQ['test']

In [44]:
# # Debug RF matcher using GUI
# em.vis_debug_rf(rf, P, Q, 
#         exclude_attrs=['_id', 'l_MID', 'r_MID', 'gold_label'],
#         target_attr='gold_label')

## Once the matcher has been selected based on its cross-validation F1 score, we train the selected matcher on set I and test it on set J.

In [45]:
# Columns that are identifiers or the label, i.e. not features
rf_excluded_cols = ['_id', 'l_MID', 'r_MID', 'gold_label']

# Fit the random forest on H (the feature vectors of I)
rf.fit(table=H, exclude_attrs=list(rf_excluded_cols), target_attr='gold_label')

# Feature vectors for the evaluation set J, built from the same feature table F
L = em.extract_feature_vecs(
    J, feature_table=F, attrs_after='gold_label', show_progress=False)

# Fill missing feature values in L with the column mean
L = em.impute_table(L, exclude_attrs=list(rf_excluded_cols), strategy='mean')

# Predict on L; the 'predicted' and 'proba' columns are appended to a copy of L
predictions = rf.predict(
    table=L,
    exclude_attrs=list(rf_excluded_cols),
    append=True,
    target_attr='predicted',
    inplace=False,
    return_probs=True,
    probs_attr='proba')

In [46]:
# Evaluate the random-forest predictions on J: compare the 'predicted' column
# against 'gold_label' and print the evaluation summary
eval_result = em.eval_matches(predictions, 'gold_label', 'predicted')
em.print_eval_summary(eval_result)

Precision : 97.67% (42/43)
Recall : 95.45% (42/44)
F1 : 96.55%
False positives : 1 (out of 43 positive predictions)
False negatives : 2 (out of 107 negative predictions)


In [47]:
# Testing set J on all remaining matchers.

# Columns that are identifiers or the label, i.e. not features
id_and_label_cols = ['_id', 'l_MID', 'r_MID', 'gold_label']

# Fit every remaining matcher on H (the feature vectors of I)
for matcher in (dt, svm, lg, lr, nb):
    matcher.fit(table=H, exclude_attrs=list(id_and_label_cols),
                target_attr='gold_label')

# Feature vectors for J built from F, with missing values mean-imputed
L = em.extract_feature_vecs(
    J, feature_table=F, attrs_after='gold_label', show_progress=False)
L = em.impute_table(L, exclude_attrs=list(id_and_label_cols), strategy='mean')

# Every predict call appends 'predicted' to a copy of L. All matchers except
# the SVM additionally request probabilities in a 'proba' column.
with_probs = dict(return_probs=True, probs_attr='proba')

predictionsDT = dt.predict(
    table=L, exclude_attrs=list(id_and_label_cols),
    append=True, target_attr='predicted', inplace=False, **with_probs)

predictionsSVM = svm.predict(
    table=L, exclude_attrs=list(id_and_label_cols),
    append=True, target_attr='predicted', inplace=False)

predictionsLG = lg.predict(
    table=L, exclude_attrs=list(id_and_label_cols),
    append=True, target_attr='predicted', inplace=False, **with_probs)

predictionsLR = lr.predict(
    table=L, exclude_attrs=list(id_and_label_cols),
    append=True, target_attr='predicted', inplace=False, **with_probs)

predictionsNB = nb.predict(
    table=L, exclude_attrs=list(id_and_label_cols),
    append=True, target_attr='predicted', inplace=False, **with_probs)



In [48]:
# Evaluate the predictions of every remaining matcher on J.
# Each summary is preceded by the matcher's name so the five blocks of output
# can be told apart. The order matches the original cell:
# DecisionTree, SVM, LogReg, LinReg, NaiveBayes.
labelled_predictions = [
    ('DecisionTree', predictionsDT),
    ('SVM', predictionsSVM),
    ('LogReg', predictionsLG),
    ('LinReg', predictionsLR),
    ('NaiveBayes', predictionsNB),
]
for matcher_name, matcher_predictions in labelled_predictions:
    print('--- %s ---' % matcher_name)
    # Compare the 'predicted' column against 'gold_label' and print the summary
    eval_result = em.eval_matches(matcher_predictions, 'gold_label', 'predicted')
    em.print_eval_summary(eval_result)


Precision : 91.3% (42/46)
Recall : 95.45% (42/44)
F1 : 93.33%
False positives : 4 (out of 46 positive predictions)
False negatives : 2 (out of 104 negative predictions)
Precision : 90.7% (39/43)
Recall : 88.64% (39/44)
F1 : 89.66%
False positives : 4 (out of 43 positive predictions)
False negatives : 5 (out of 107 negative predictions)
Precision : 91.67% (44/48)
Recall : 100.0% (44/44)
F1 : 95.65%
False positives : 4 (out of 48 positive predictions)
False negatives : 0 (out of 102 negative predictions)
Precision : 93.62% (44/47)
Recall : 100.0% (44/44)
F1 : 96.7%
False positives : 3 (out of 47 positive predictions)
False negatives : 0 (out of 103 negative predictions)
Precision : 89.36% (42/47)
Recall : 95.45% (42/44)
F1 : 92.31%
False positives : 5 (out of 47 positive predictions)
False negatives : 2 (out of 103 negative predictions)
