## Imports

In [1]:
import pandas as pd
from pyjavaproperties import Properties

## Settings to enable real-time output from a shell command

In [2]:
from subprocess import Popen, PIPE, STDOUT
from IPython.core.magic import register_line_magic

@register_line_magic
def runrealcmd(command):
    """Run a shell command and echo its output line by line as it is produced.

    Registered as the IPython line magic ``%runrealcmd``. The command's
    stderr is merged into its stdout, so both streams are printed in the
    order they arrive.

    Parameters
    ----------
    command : str
        Command line handed to the shell (``shell=True``), so it is parsed
        with shell syntax. Only pass trusted text.

    Returns
    -------
    None
        Output is printed; a non-zero exit status is reported as a final
        printed line rather than raised.
    """
    # The pipe is read as bytes. bufsize=1 (line buffering) is only valid for
    # text-mode pipes and triggers a RuntimeWarning on Python 3.8+, so the
    # default buffering is used; readline() still returns as soon as a full
    # line is available.
    with Popen(command, stdout=PIPE, stderr=STDOUT, shell=True, close_fds=True) as process:
        for line in iter(process.stdout.readline, b''):
            # errors='replace' keeps streaming if the command emits bytes
            # that are not valid UTF-8 instead of aborting mid-output.
            print(line.rstrip().decode('utf-8', errors='replace'))
    # Leaving the with-block closes the pipe and waits for the process, even
    # if the loop above was interrupted, so returncode is set here.
    if process.returncode != 0:
        print('Command exited with status {}'.format(process.returncode))

# 1. Self Join

## 1.a Preview the input file

In [3]:
# The input has no header row: column 1 is the paper id, column 2 the
# comma-separated title tokens; columns are separated by ';'.
df = pd.read_csv(
    '../data/dblp_papers_test.csv',
    sep=';',
    header=None,
    names=['Paper_ID', 'Title'],
)
df.head()

Unnamed: 0,Paper_ID,Title
0,SIGMOD2014,"pivotal,prefix,based,filtering,algorithm,strin..."
1,ICDE2013,"top-k,string,similarity,search,edit,distance,c..."
2,VLDB2016a,"efficient,partition,based,method,exact,set,sim..."
3,SIGMOD2015,"efficient,similarity,join,search,multi,attribu..."
4,VLDB2014,"string,similarity,joins,experimental,evaluation"


## 1.b Load and optionally edit the config file

In [4]:
# Working config written by this notebook, and the template it is derived from
config_file = '../config.properties'
config_file_example = '../config.properties.example'

# Data set to join, and the files named as output/stats targets in the config
input_file = '../data/dblp_papers_test.csv'
stats_file = '../data/output/stats.csv'
output_file = '../data/output/dblp_papers_selfjoin_out.csv'

In [5]:
params = Properties()
# Read the template inside a with-block so the file handle is closed as soon
# as the properties have been parsed, instead of being left open.
with open(config_file_example) as config_template:
    params.load(config_template)
params.list()

-- listing properties --
mode=standard
operation=search
query_file=data/test_input_R.csv
input_file=data/test_input_S.csv
query_id=0
max_lines=-1
fuzzyset_column=1
set_column=2
tokens_column=3
column_delimiter=;
token_delimiter=,
header=false
return_counts=true
output_file=out.txt
stats_file=stats.txt
sim_threshold=0.8
k=3


In [6]:
# Overrides applied on top of the template for the self-join run.
# Property values are given as strings; the assignments are applied in the
# order listed.
selfjoin_overrides = (
    ('operation', 'self-join'),
    ('sim_threshold', '0.33'),
    ('input_file', input_file),
    ('output_file', output_file),
    ('stats_file', stats_file),
    ('set_column', '1'),
    ('tokens_column', '2'),
    ('return_counts', 'false'),
)
for prop_name, prop_value in selfjoin_overrides:
    params[prop_name] = prop_value

In [7]:
# Print every property currently held in params to check the edited values
params.list()

-- listing properties --
mode=standard
operation=self-join
query_file=data/test_input_R.csv
input_file=../data/dblp_papers_test.csv
query_id=0
max_lines=-1
fuzzyset_column=1
set_column=1
tokens_column=2
column_delimiter=;
token_delimiter=,
header=false
return_counts=false
output_file=../data/output/dblp_papers_selfjoin_out.csv
stats_file=../data/output/stats.csv
sim_threshold=0.33
k=3


In [8]:
# Write the edited properties to the working config file. The with-block
# guarantees the file is flushed and closed before the external Java tool
# reads it, rather than relying on garbage collection of the handle.
with open(config_file, 'w') as config_out:
    params.store(config_out)

## 1.c Execute the join operation

In [9]:
# Launch the simjoin jar via the %runrealcmd magic defined above; IPython
# expands $config_file to the value of the Python variable before the magic
# receives the command line.
%runrealcmd java -jar ../target/simjoin-0.0.1-SNAPSHOT-jar-with-dependencies.jar $config_file

Finished reading file. Lines read: 18. Lines skipped due to errors: 0. Num of sets: 18. Elements per set: 6.222222222222222
Transform time: 0.030311578 sec.
Join time: 0.017877667 sec.
Total Matches: 11


## 1.d Load the results

In [10]:
out1 = pd.read_csv(output_file, header=None, names=['Paper_ID_1', 'Paper_ID_2', 'Similarity'])
# Build the Paper_ID -> Title lookup once and map each id column through it.
# map() keeps the result aligned with out1's own index, so no positional
# realignment is needed; an id missing from df yields NaN instead of raising,
# and a duplicated Paper_ID raises instead of silently misaligning titles.
titles = df.set_index('Paper_ID')['Title']
out1['Title_1'] = out1['Paper_ID_1'].map(titles)
out1['Title_2'] = out1['Paper_ID_2'].map(titles)
out1.sort_values('Similarity', ascending=False).head(11)

Unnamed: 0,Paper_ID_1,Paper_ID_2,Similarity,Title_1,Title_2
10,VLDB2016a,VLDB2011,0.555556,"efficient,partition,based,method,exact,set,sim...","pass,join,partition,based,method,similarity,join"
1,VLDB2012b,VLDB2012a,0.5,"seal,spatio,textual,similarity,search","spatio,textual,similarity,join"
2,VLDB2014,SIGMOD2013,0.428571,"string,similarity,joins,experimental,evaluation","string,similarity,measures,joins,synonyms"
4,VLDB2017,VLDB2016a,0.4,"leveraging,set,relations,exact,set,similarity,...","efficient,partition,based,method,exact,set,sim..."
5,VLDB2011,SIGMOD2012a,0.375,"pass,join,partition,based,method,similarity,join","exploiting,mapreduce,based,similarity,join"
8,VLDB2018,SIGMOD2012a,0.375,"set,similarity,join,mapreduce,experimental,survey","exploiting,mapreduce,based,similarity,join"
9,SIGMOD2010a,ICDE2009,0.375,"efficient,parallel,set,similarity,joins,using,...","top-k,set,similarity,joins"
0,VLDB2012a,SIGMOD2010b,0.333333,"spatio,textual,similarity,join","probabilistic,string,similarity,join"
3,VLDB2017,VLDB2016b,0.333333,"leveraging,set,relations,exact,set,similarity,...","empirical,evaluation,set,similarity,join,techn..."
6,VLDB2018,VLDB2017,0.333333,"set,similarity,join,mapreduce,experimental,survey","leveraging,set,relations,exact,set,similarity,..."


# 2. Self Closest Pairs

## 2.a Load and optionally edit the config file

In [11]:
# From here on output_file points at the closest-pairs result file
output_file = '../data/output/dblp_papers_selfclosestpairs_out.csv'

# Overrides for the self-closest-pairs run, applied in the order listed.
# Properties not named here keep whatever value params currently holds.
closest_pairs_overrides = (
    ('operation', 'self-closest-pairs'),
    ('k', '5'),
    ('input_file', input_file),
    ('output_file', output_file),
    ('stats_file', stats_file),
    ('set_column', '1'),
    ('tokens_column', '2'),
    ('return_counts', 'false'),
)
for prop_name, prop_value in closest_pairs_overrides:
    params[prop_name] = prop_value

In [12]:
# Overwrite the working config file with the current properties. The
# with-block guarantees the file is flushed and closed before the external
# Java tool reads it, rather than relying on garbage collection of the handle.
with open(config_file, 'w') as config_out:
    params.store(config_out)

## 2.b Execute the join operation

In [13]:
# Launch the simjoin jar again with the config file written in the previous
# cell; IPython expands $config_file to the value of the Python variable
# before the magic receives the command line.
%runrealcmd java -jar ../target/simjoin-0.0.1-SNAPSHOT-jar-with-dependencies.jar $config_file

Finished reading file. Lines read: 18. Lines skipped due to errors: 0. Num of sets: 18. Elements per set: 6.222222222222222
Transform time: 0.020710181 sec.
Candidates: 20 Verified pairs: 14 Final threshold: 0.375
Join time: 0.022263484 sec.
Total Matches: 5


## 2.c Load the results

In [14]:
out2 = pd.read_csv(output_file, header=None, names=['Paper_ID_1', 'Paper_ID_2', 'Similarity'])
# Build the Paper_ID -> Title lookup once and map each id column through it.
# map() keeps the result aligned with out2's own index, so no positional
# realignment is needed; an id missing from df yields NaN instead of raising,
# and a duplicated Paper_ID raises instead of silently misaligning titles.
titles = df.set_index('Paper_ID')['Title']
out2['Title_1'] = out2['Paper_ID_1'].map(titles)
out2['Title_2'] = out2['Paper_ID_2'].map(titles)
out2.sort_values('Similarity', ascending=False).head()

Unnamed: 0,Paper_ID_1,Paper_ID_2,Similarity,Title_1,Title_2
3,VLDB2016a,VLDB2011,0.555556,"efficient,partition,based,method,exact,set,sim...","pass,join,partition,based,method,similarity,join"
0,VLDB2012b,VLDB2012a,0.5,"seal,spatio,textual,similarity,search","spatio,textual,similarity,join"
1,VLDB2014,SIGMOD2013,0.428571,"string,similarity,joins,experimental,evaluation","string,similarity,measures,joins,synonyms"
4,VLDB2016a,VLDB2017,0.4,"efficient,partition,based,method,exact,set,sim...","leveraging,set,relations,exact,set,similarity,..."
2,VLDB2018,SIGMOD2012a,0.375,"set,similarity,join,mapreduce,experimental,survey","exploiting,mapreduce,based,similarity,join"


In [15]:
# Show the self-join results (out1) again, highest similarity first, limited
# to the top 10 rows
out1.sort_values("Similarity", ascending=False).head(10)

Unnamed: 0,Paper_ID_1,Paper_ID_2,Similarity,Title_1,Title_2
10,VLDB2016a,VLDB2011,0.555556,"efficient,partition,based,method,exact,set,sim...","pass,join,partition,based,method,similarity,join"
1,VLDB2012b,VLDB2012a,0.5,"seal,spatio,textual,similarity,search","spatio,textual,similarity,join"
2,VLDB2014,SIGMOD2013,0.428571,"string,similarity,joins,experimental,evaluation","string,similarity,measures,joins,synonyms"
4,VLDB2017,VLDB2016a,0.4,"leveraging,set,relations,exact,set,similarity,...","efficient,partition,based,method,exact,set,sim..."
5,VLDB2011,SIGMOD2012a,0.375,"pass,join,partition,based,method,similarity,join","exploiting,mapreduce,based,similarity,join"
8,VLDB2018,SIGMOD2012a,0.375,"set,similarity,join,mapreduce,experimental,survey","exploiting,mapreduce,based,similarity,join"
9,SIGMOD2010a,ICDE2009,0.375,"efficient,parallel,set,similarity,joins,using,...","top-k,set,similarity,joins"
0,VLDB2012a,SIGMOD2010b,0.333333,"spatio,textual,similarity,join","probabilistic,string,similarity,join"
3,VLDB2017,VLDB2016b,0.333333,"leveraging,set,relations,exact,set,similarity,...","empirical,evaluation,set,similarity,join,techn..."
6,VLDB2018,VLDB2017,0.333333,"set,similarity,join,mapreduce,experimental,survey","leveraging,set,relations,exact,set,similarity,..."
