## Data Preprocessing for Fodors-Zagats datasets

In [22]:
import pandas as pd
import numpy as np
import hashlib
import re
import random as rd
import datetime
from datasketch import MinHash, MinHashLSH, MinHashLSHForest
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein

import sys
sys.path.append('../')

import EmbDI.data_preprocessing as dp

from collections import Counter
import matplotlib.pyplot as plt

#### Loading the Fodors dataset

In [23]:
# Table A is the Fodors table. The path is kept in f1 because later cells
# reuse it (info file, match-file filtering).
f1 = '../pipeline/experiments/fodors-zagats/exp_data/tableA.csv'
df1 = pd.read_csv(f1, encoding='utf-8')
print(df1.shape)
# Preview the first five rows (head() defaults to n=5).
df1.head()

(533, 7)


Unnamed: 0,id,name,addr,city,phone,type,class
0,0,` arnie morton \ 's of chicago ',' 435 s. la cienega blv . ',` los angeles ',310/246 -1501,american,0
1,1,` art \ 's delicatessen ',' 12224 ventura blvd. ',` studio city ',818/762 -1221,american,1
2,2,` hotel bel-air ',' 701 stone canyon rd. ',` bel air ',310/472 -1211,californian,2
3,3,` cafe bizou ',' 14016 ventura blvd. ',` sherman oaks ',818/788 -3536,french,3
4,4,campanile,' 624 s. la brea ave. ',` los angeles ',213/938 -1447,american,4


#### Loading the Zagats dataset

In [24]:
# Table B is the Zagats table. The path is kept in f2 because later cells
# reuse it (info file, match-file filtering).
f2 = '../pipeline/experiments/fodors-zagats/exp_data/tableB.csv'
df2 = pd.read_csv(f2, encoding='utf-8')
print(df2.shape)
# Preview the first five rows (head() defaults to n=5).
df2.head()

(331, 7)


Unnamed: 0,id,name,addr,city,phone,type,class
0,0,` apple pan the ',' 10801 w. pico blvd. ',` west la ',310-475-3585,american,534
1,1,` asahi ramen ',' 2027 sawtelle blvd. ',` west la ',310-479-2231,` noodle shops ',535
2,2,` baja fresh ',' 3345 kimber dr. ',` westlake village ',805-498-4049,mexican,536
3,3,` belvedere the ',' 9882 little santa monica blvd. ',` beverly hills ',310-788-2306,` pacific new wave ',537
4,4,` benita \ 's frites ',' 1433 third st. promenade ',` santa monica ',310-458-2889,` fast food ',538


#### Setting the Data Preprocessing parameters for the basic case

In [29]:
# Settings for the basic case; this dict is passed to dp.data_preprocessing.
parameters = {
    'output_file': 'fodors-zagats',
    'concatenate': 'outer',
    'missing_value': 'nan,ukn,none,unknown,',
    'missing_value_strategy': '',
    'round_number': 0,
    # NOTE(review): the previewed tables show no 'price' column - confirm
    # that this setting is intentional for this dataset.
    'round_columns': 'price',
    # Comma-separated distinct column names of df1. A set has no defined
    # order, so the order of names in this string is not guaranteed.
    'expand_columns': ','.join(set(df1.columns)),
    'auto_merge': False,
    'tokenize_shared': False,
}
print(parameters['expand_columns'])

id,city,name,type,class,phone,addr


In [30]:
# Run the preprocessing on both tables with the current parameters, then
# remove the 'id' column from the resulting frame.
df_c = dp.data_preprocessing([df1, df2], parameters).drop(columns='id')

#### Write the master dataset to file

In [31]:
# Save df_c as the master dataset; index=False keeps the DataFrame index
# out of the CSV.
df_c.to_csv(
    '../pipeline/datasets/fodors_zagats/fodors_zagats-master.csv',
    index=False,
)

# Hand the two source frames, the info-file name and the source CSV paths
# to dp.write_info_file.
dp.write_info_file([df1, df2], 'info-fodors_zagats', [f1, f2])

#### Setting the Data Preprocessing parameters for the heuristic

In [5]:
# Settings for the heuristic case. Relative to the basic-case settings
# earlier in this notebook, 'expand_columns' is not set and
# 'tokenize_shared' is switched on.
parameters = dict(
    output_file='fodors-zagats',
    concatenate='outer',
    missing_value='nan,ukn,none,unknown,',
    missing_value_strategy='',
    round_number=0,
    round_columns='price',
    auto_merge=False,
    tokenize_shared=True,
)

# Preprocess both tables with these settings and remove the 'id' column.
df_c = dp.data_preprocessing([df1, df2], parameters).drop(columns='id')

#### Write the heuristic dataset to file

In [21]:
# Save df_c as the heuristic dataset; index=False keeps the DataFrame index
# out of the CSV.
df_c.to_csv(
    '../pipeline/datasets/fodors_zagats/fodors_zagats-heuristic.csv',
    index=False,
)

#### Prepare the match file

In [8]:
import os
import os.path

# Build the match file from the labelled-pair files stored next to the two
# tables. Every regular file in dir_path other than the table files is read
# as comma-separated "<left id>,<right id>,<flag>" rows; rows whose flag is
# '1' are written out as "idx_<left>,idx_<right + len(df1)>".
tot_m = 0
dir_path = '../pipeline/experiments/fodors-zagats/exp_data/'

# Computed once instead of on every loop iteration.
table_files = {os.path.basename(p) for p in (f1, f2)}
# Right-hand ids are shifted by the number of rows in df1 - presumably
# because df2 rows follow df1 rows in the concatenated dataset (confirm).
right_offset = len(df1)

with open('../pipeline/matches/matches-fodors_zagats.txt', 'w', encoding='utf-8') as fo:
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # generated match file reproducible across runs and machines.
    for file in sorted(os.listdir(dir_path)):
        path = os.path.join(dir_path, file)
        # Skip the tables themselves and anything that is not a regular
        # file (e.g. a .ipynb_checkpoints directory), which open() rejects.
        if file in table_files or not os.path.isfile(path):
            continue
        print(file)
        m = 0
        with open(path, 'r', encoding='utf-8') as fp:
            for line in fp:
                line = line.rstrip()
                if not line:
                    # A blank (e.g. trailing) line would break the
                    # three-way unpack below.
                    continue
                m1, m2, flag = line.split(',')
                # Only rows flagged '1' are matches; any other value
                # (including a header row) is ignored.
                if flag == '1':
                    fo.write('idx_{0},idx_{1}\n'.format(m1, int(m2) + right_offset))
                    m += 1
        print('File {}: {} matches.'.format(file, m))
        tot_m += m
print('Total matches: {}'.format(tot_m))

test.csv
File test.csv: 22 matches.
train.csv
File train.csv: 66 matches.
valid.csv
File valid.csv: 22 matches.
Total matches: 110


In [9]:
# Independent (deep) copy of df_c, so changes to df_p do not affect df_c.
df_p = df_c.copy(deep=True)

### Prepare basic config file

In [15]:
# Default configuration, one "key:value" entry per line. Entries with nothing
# after the colon are placeholders that map to the empty string.
pars = '''smoothing_method:no
window_size:3
n_dimensions:300
sentence_length:60
walks_strategy:basic
ntop:10
ncand:1
max_rank:3
learning_method:skipgram
training_algorithm:word2vec
n_sentences:default
experiment_type:ER
task:train-test
with_cid:all
with_rid:first
numeric:no
backtrack:True
match_file:
write_walks:True
output_file:
input_file:
dataset_info:
test_dir:
flatten:false
embeddings_file:
intersection:true'''.splitlines()

# Turn the entries into a dict, keeping the order above; every value stays a
# string.
parameters = {}
for entry in pars:
    key, _sep, value = entry.partition(':')
    parameters[key] = value

In [16]:
# Fill in the dataset-specific entries of the configuration.
# NOTE(review): these paths carry no '../' prefix, unlike the paths this
# notebook writes to - presumably the consumer resolves them from the
# repository root; confirm.
parameters.update(
    input_file='pipeline/datasets/fodors_zagats/{}'.format('fodors_zagats-heuristic.csv'),
    match_file='pipeline/matches/matches-{}'.format('fodors_zagats.txt'),
    dataset_info='pipeline/info/info-{}'.format('fodors_zagats'),
)

In [17]:
# Write the configuration as one "key:value" line per entry, in dict order.
with open('../pipeline/config_files/fodors_zagats/fodors_zagats-ER-noflatten-int', 'w') as fp:
    fp.writelines(
        '{}:{}\n'.format(key, value) for key, value in parameters.items()
    )