In [1]:
import py_entitymatching as em
import pandas as pd
import os, sys
import numpy as np

In [2]:
# Plain pandas copies of the two source tables (no py_entitymatching metadata).
ltable, rtable = (
    pd.read_csv(csv_path, encoding='latin-1')
    for csv_path in ('ltable.csv', 'rtable.csv')
)

In [3]:
# Load both source tables through py_entitymatching; these are the frames the
# later cells assign keys to and link the train/test pair tables against.
ltableData, rtableData = (
    em.read_csv_metadata(csv_path, encoding='latin-1')
    for csv_path in ("ltable.csv", "rtable.csv")
)

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


In [4]:
# em.set_key reports an unusable key column by returning False (it only logs a
# warning), so check both results instead of discarding the first one.
ltable_key_set = em.set_key(ltableData, 'ltable_id')
rtable_key_set = em.set_key(rtableData, 'rtable_id')
if not (ltable_key_set and rtable_key_set):
    raise ValueError(
        "Could not set table keys: "
        f"ltable_id set={ltable_key_set}, rtable_id set={rtable_key_set}"
    )
# Keep the status of the last call as the cell's displayed value.
rtable_key_set

True

In [5]:
# Labeled pair table: each row is keyed by 'id' and points into the two source
# tables through the 'ltable_id' / 'rtable_id' foreign keys.
train = em.read_csv_metadata(
    "train.csv",
    key='id',
    ltable=ltableData,
    rtable=rtableData,
    fk_ltable='ltable_id',
    fk_rtable='rtable_id',
)

Metadata file is not present in the given path; proceeding to read the csv file.


In [6]:
# Feature table generated from the two source tables; F is reused for both the
# train and the test feature extraction below.
F = em.get_features_for_matching(
    ltableData,
    rtableData,
    validate_inferred_attr_types=False,
)

In [7]:
# Turn every labeled pair into a feature vector, carrying the 'label' column
# along so it can serve as the training target.
train_table = em.extract_feature_vecs(
    train,
    feature_table=F,
    attrs_after='label',
    show_progress=False,
)

In [8]:
import random

# Seed handed to every matcher constructor below that is given a random_state.
random_state = 0

# Matchers constructed with the shared seed.
dt = em.DTMatcher(name='DecisionTree', random_state=random_state)
rf = em.RFMatcher(name='RF', random_state=random_state)
svm = em.SVMMatcher(name='SVM', random_state=random_state)
lg = em.LogRegMatcher(name='LogReg', random_state=random_state)

# Matchers constructed without an explicit seed.
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')
xg = em.XGBoostMatcher(name='XGBoost')

In [9]:
# Fill missing feature values with the per-column mean; the identifier columns
# and the target column are left untouched.
train_table = em.impute_table(
    train_table,
    exclude_attrs=['id', 'ltable_id', 'rtable_id', 'label'],
    strategy='mean',
)



In [10]:
# Initial results: 5-fold cross-validation of every candidate matcher on the
# training feature vectors, selecting by F1.
result = em.select_matcher(
    [dt, rf, svm, ln, lg, nb, xg],
    table=train_table,
    exclude_attrs=['id', 'ltable_id', 'rtable_id', 'label'],
    k=5,
    target_attr='label',
    metric_to_select_matcher='f1',
    # Reuse the notebook-wide seed instead of a second hard-coded 0, so the
    # matchers and the cross-validation split cannot drift apart.
    random_state=random_state,
)
result['cv_stats']

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.586133,0.561569,0.572245
1,RF,0.803232,0.487055,0.605659
2,SVM,0.904368,0.335341,0.48921
3,LinReg,0.972177,0.438797,0.60351
4,LogReg,0.943515,0.452186,0.609692
5,NaiveBayes,0.478641,0.54463,0.507736
6,XGBoost,0.948509,0.465426,0.622736


In [11]:
# Pair table to predict on: keyed by 'id' and linked to the same two source
# tables through the 'ltable_id' / 'rtable_id' foreign keys.
test = em.read_csv_metadata(
    "test.csv",
    key='id',
    ltable=ltableData,
    rtable=rtableData,
    fk_ltable='ltable_id',
    fk_rtable='rtable_id',
)

Metadata file is not present in the given path; proceeding to read the csv file.


In [12]:
# Feature vectors for the test pairs, built from the same feature table F as
# the training vectors; no extra attribute is carried along here.
test_table = em.extract_feature_vecs(
    test,
    feature_table=F,
    show_progress=False,
)

In [13]:
# Fill missing test feature values with the per-column mean, skipping the
# identifier columns.
# NOTE(review): this imputes from test_table itself, so the fill values may
# differ from the ones used for train_table -- confirm that is intended.
test_table = em.impute_table(
    test_table,
    exclude_attrs=['id', 'ltable_id', 'rtable_id'],
    strategy='mean',
)



In [14]:
# Fit every matcher on the full training table and predict the test pairs.
# res[k] holds the ('id', 'label') prediction frame of classifiers[k]; later
# cells index into res by that position.
classifiers = np.array([dt, rf, svm, ln, lg, nb, xg])
res = []
for c in classifiers:
    # The target column must be excluded from the features when fitting.
    c.fit(table=train_table,
          exclude_attrs=['id', 'ltable_id', 'rtable_id', 'label'],
          target_attr='label')

    # inplace=False returns a new frame, so test_table itself stays free of a
    # 'label' column and can be reused unchanged for the next matcher.
    predictions = c.predict(table=test_table,
                            exclude_attrs=['id', 'ltable_id', 'rtable_id'],
                            append=True, target_attr='label', inplace=False)

    res.append(predictions[['id', 'label']])



In [15]:
# Results using only the XGBoost matcher (the best result obtained so far).
# res is filled in the same order as `classifiers`, so look the matcher's
# position up instead of hard-coding index 6; the export then keeps pointing at
# XGBoost even if the classifier list is reordered or extended.
xg_position = list(classifiers).index(xg)
res[xg_position].to_csv('xg_res.csv', float_format='%.f', index=False)

In [16]:
# Empty frame that the following cells fill with the ensemble's ids and labels.
emsembled_res = pd.DataFrame(columns=['id', 'label'])

In [17]:
# Take the pair ids from the first matcher's predictions and start every vote
# total at zero.
emsembled_res = emsembled_res.assign(id=res[0]['id'], label=0)

In [18]:
# Voting weights, one per entry of res (i.e. per matcher, in the order the
# predictions were collected); a weight of 0 leaves that matcher out.
w = [0,0,0,1,1,0,1]
if len(w) != len(res):
    raise ValueError(
        f"expected one weight per prediction frame, got {len(w)} weights "
        f"for {len(res)} frames"
    )

# Recompute the weighted vote total from scratch instead of accumulating with
# `+=`, so re-running this cell cannot double-count votes.
emsembled_res['label'] = sum(
    predicted['label'] * weight for predicted, weight in zip(res, w)
)

In [19]:
# Turn the weighted vote total into a rounded average vote per pair.
emsembled_res['label'] = emsembled_res['label'].div(sum(w)).round()

In [20]:
# Export the labels obtained by voting across the weighted matchers; '%.f'
# writes float labels without a decimal part.
emsembled_res.to_csv(
    'ensemble_res.csv',
    float_format='%.f',
    index=False,
)