## Data Preprocessing for Amazon-Google datasets

In [1]:
import pandas as pd
import numpy as np
import hashlib
import re
import random as rd
import datetime
from datasketch import MinHash, MinHashLSH, MinHashLSHForest
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein

import sys
sys.path.append('../')

import EmbDI.data_preprocessing as dp

from collections import Counter
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/spoutnik23/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Loading the Amazon dataset

In [2]:
# Read the first source table (tableA) and show its dimensions plus a preview.
f1 = '../pipeline/experiments/amazon-google/exp_data/tableA.csv'
df1 = pd.read_csv(filepath_or_buffer=f1, encoding='utf-8')
print(df1.shape)
df1.head()

(1363, 4)


Unnamed: 0,id,title,manufacturer,price
0,0,clickart 950 000 premier image pack ( dvd-rom ),broderbund,
1,1,ca international arcserve lap/desktop oem 30pk,computer associates,
2,2,noah 's ark activity center ( jewel case ages ...,victory multimedia,
3,3,peachtree by sage premium accounting for nonpr...,sage software,599.99
4,4,singing coach unlimited,carry-a-tune technologies,99.99


#### Loading the Google dataset

In [3]:
# Read the second source table (tableB) and show its dimensions plus a preview.
f2 = '../pipeline/experiments/amazon-google/exp_data/tableB.csv'
df2 = pd.read_csv(filepath_or_buffer=f2, encoding='utf-8')
print(df2.shape)
df2.head()

(3226, 4)


Unnamed: 0,id,title,manufacturer,price
0,0,learning quickbooks 2007,intuit,38.99
1,1,superstart ! fun with reading & writing !,,8.49
2,2,qb pos 6.0 basic software,intuit,637.99
3,3,math missions : the amazing arcade adventure (...,,12.95
4,4,production prem cs3 mac upgrad,adobe software,805.99


#### Setting the Data Preprocessing parameters for the basic case

In [4]:
# Preprocessing options for the basic case: 'outer' concatenation of the two
# tables, price rounded with round_number=0, shared-token tokenization disabled.
parameters = dict(
    output_file='amazon-google',
    concatenate='outer',
    missing_value='nan,ukn,none,unknown,',
    missing_value_strategy='',
    round_number=0,
    round_columns='price',
    auto_merge=False,
    # Comma-separated names of every column of the first table.
    expand_columns=','.join(set(df1.columns)),
    tokenize_shared=False,
)

# Preprocess both tables together, then discard the 'id' column before saving.
df_c = dp.data_preprocessing([df1, df2], parameters).drop(columns=['id'])

df_c.to_csv('../pipeline/datasets/amazon_google/amazon_google-master.csv', index=False)

In [6]:
# Write the dataset info file from the two source tables and their CSV paths.
source_tables = [df1, df2]
source_paths = [f1, f2]
dp.write_info_file(source_tables, 'info-amazon_google', source_paths)

#### Generating the schema matching dataset in the basic case. 

In [16]:
# Same options as the basic case, except that the tables are concatenated with
# the 'horizon' strategy to build the schema-matching dataset.
parameters = dict(
    output_file='amazon-google',
    concatenate='horizon',
    missing_value='nan,ukn,none,unknown,',
    missing_value_strategy='',
    round_number=0,
    round_columns='price',
    auto_merge=False,
    # Comma-separated names of every column of the first table.
    expand_columns=','.join(set(df1.columns)),
    tokenize_shared=False,
)

# The 'id' column is removed from each input table before preprocessing.
tables_without_id = [table.drop(columns=['id']) for table in (df1, df2)]
df_c = dp.data_preprocessing(tables_without_id, parameters)

df_c.to_csv('../pipeline/datasets/amazon_google/amazon_google-master-sm.csv', index=False)

In [6]:
# Write the dataset info file from the two source tables and their CSV paths
# (same arguments as the write_info_file call after the basic-case cell).
source_tables = [df1, df2]
source_paths = [f1, f2]
dp.write_info_file(source_tables, 'info-amazon_google', source_paths)

#### Setting the Data Preprocessing parameters for the heuristic case

In [7]:
# Preprocessing options for the heuristic case: 'outer' concatenation with
# shared-token tokenization enabled and no explicit expand_columns entry.
parameters = dict(
    output_file='amazon-google',
    concatenate='outer',
    missing_value='nan,ukn,none,unknown,',
    missing_value_strategy='',
    round_number=0,
    round_columns='price',
    auto_merge=False,
    tokenize_shared=True,
)

# Preprocess both tables together, then discard the 'id' column before saving.
df_c = dp.data_preprocessing([df1, df2], parameters).drop(columns=['id'])
df_c.to_csv('../pipeline/datasets/amazon_google/amazon_google-heuristic.csv', index=False)

#### Generating the schema matching dataset in the heuristic case. 

In [14]:
# Heuristic-case options with the 'horizon' concatenation strategy, used to
# build the schema-matching dataset.
parameters = dict(
    output_file='amazon-google',
    concatenate='horizon',
    missing_value='nan,ukn,none,unknown,',
    missing_value_strategy='',
    round_number=0,
    round_columns='price',
    auto_merge=False,
    tokenize_shared=True,
)

# 'id' is stripped from the inputs, so no drop is needed on the result.
tables_without_id = [table.drop(columns=['id']) for table in (df1, df2)]
df_c = dp.data_preprocessing(tables_without_id, parameters)
df_c.to_csv('../pipeline/datasets/amazon_google/amazon_google-heuristic-sm.csv', index=False)

#### Prepare the ER match file

In [12]:
import os
import os.path

# Build the entity-resolution match file: every labelled pair whose flag is '1'
# in the files of exp_data (other than the two source tables) is written as
# 'idx_<left>,idx_<right + len(df1)>'.
tot_m = 0
dir_path = '../pipeline/experiments/amazon-google/exp_data/'
# Names of the two source tables, computed once instead of on every iteration.
table_names = {os.path.basename(path) for path in (f1, f2)}
# Right-hand ids are shifted by the number of rows of the first table
# (presumably because df1 comes first in the concatenated dataset; confirm).
right_offset = len(df1)
with open('../pipeline/matches/matches-amazon_google.txt', 'w') as fo:
    # sorted() makes the processing (and output) order independent of the
    # arbitrary order returned by os.listdir.
    for file in sorted(os.listdir(dir_path)):
        label_path = os.path.join(dir_path, file)
        # Skip the source tables and anything that is not a regular file
        # (e.g. a '.ipynb_checkpoints' directory), which open() cannot read.
        if file in table_names or not os.path.isfile(label_path):
            continue
        print(file)
        m = 0
        with open(label_path, 'r') as fp:
            for line in fp:
                line = line.rstrip()
                if not line:
                    # A blank line cannot be unpacked into three fields.
                    continue
                m1, m2, flag = line.split(',')
                # Lines whose flag is not '1' (non-matches, or a header row if
                # one is present) are ignored.
                if flag == '1':
                    fo.write('idx_{0},idx_{1}\n'.format(m1, str(int(m2) + right_offset)))
                    m += 1
        print('File {}: {} matches.'.format(file, m))
        tot_m += m
print('Total matches: {}'.format(tot_m))

test.csv
File test.csv: 234 matches.
train.csv
File train.csv: 699 matches.
valid.csv
File valid.csv: 234 matches.
Total matches: 1167


In [18]:
# Display the column names of the most recently built frame (df_c).
df_c.columns

Index(['0_manufacturer', '0_price', '0_title', '1_manufacturer', '1_price',
       '1_title'],
      dtype='object')

In [None]:
# NOTE(review): the original cell stopped at a bare `with open(...)` header,
# which is a SyntaxError and opened the file in read mode. The completion below
# is an assumption to be confirmed: it pairs every '0_<name>' column of df_c
# with the '1_<name>' column of the same name and writes one
# '<left>,<right>' pair per line. Verify the expected line format against the
# schema-matching evaluation code before relying on this file.
left_columns = {str(col)[2:]: str(col) for col in df_c.columns if str(col).startswith('0_')}
right_columns = {str(col)[2:]: str(col) for col in df_c.columns if str(col).startswith('1_')}
with open('../pipeline/matches/sm_matches-amazon_google.txt', 'w') as fo:
    for base_name in sorted(left_columns):
        # Only columns present on both sides form a match.
        if base_name in right_columns:
            fo.write('{},{}\n'.format(left_columns[base_name], right_columns[base_name]))

### Prepare basic config file

In [16]:
def parse_config_lines(lines):
    """Turn 'key:value' lines into a dict of strings.

    Each line is split on its FIRST ':' only, so a value that itself contains
    a colon is kept whole. Blank lines are skipped. A line without any ':'
    maps its whole text to an empty string.

    Parameters
    ----------
    lines : iterable of str
        Configuration lines without trailing newlines.

    Returns
    -------
    dict
        Mapping of key to value, in the order the lines were given.
    """
    parsed = {}
    for raw_line in lines:
        if not raw_line.strip():
            continue
        key, _sep, value = raw_line.partition(':')
        parsed[key] = value
    return parsed


# Template of the configuration file, one 'key:value' entry per line. Entries
# with an empty value are left blank in this template.
pars = '''smoothing_method:no
window_size:3
n_dimensions:300
sentence_length:60
walks_strategy:basic
ntop:10
ncand:1
max_rank:3
learning_method:skipgram
training_algorithm:word2vec
n_sentences:default
experiment_type:ER
task:train-test
with_cid:all
with_rid:first
numeric:no
backtrack:True
match_file:
write_walks:True
output_file:
input_file:
dataset_info:
test_dir:
flatten:false
embeddings_file:
intersection:true'''.split('\n')

parameters = parse_config_lines(pars)

In [17]:
# Fill in the dataset, match-file and info-file locations of the configuration.
parameters.update({
    'input_file': 'pipeline/datasets/amazon_google/{}'.format('amazon_google-heuristic.csv'),
    'match_file': 'pipeline/matches/matches-{}'.format('amazon_google.txt'),
    'dataset_info': 'pipeline/info/info-{}'.format('amazon_google'),
})

In [18]:
# Serialize the configuration as one 'key:value' line per entry.
config_path = '../pipeline/config_files/amazon_google/amazon_google-ER-noflatten-int'
with open(config_path, 'w') as config_out:
    config_out.writelines(
        '{}:{}\n'.format(key, value) for key, value in parameters.items()
    )