## Imports

In [1]:
import pandas as pd
from pyjavaproperties import Properties

## Settings to enable real-time output from a shell command

In [2]:
from subprocess import Popen, PIPE, STDOUT
from IPython.core.magic import register_line_magic

@register_line_magic
def runrealcmd(command):
    """Run a shell command and echo its output line by line as it is produced.

    Registered as the ``%runrealcmd`` line magic. stderr is merged into
    stdout, so both streams appear interleaved in the cell output.

    Parameters
    ----------
    command : str
        Command line passed unchanged to the shell (``shell=True``); only use
        it with trusted text.

    Returns
    -------
    None
        Output is printed; a non-zero exit status is reported with an extra
        printed line.
    """
    # bufsize=1 (line buffering) is only honoured in text mode; on a binary
    # pipe it is ignored and newer Python versions emit a RuntimeWarning, so
    # the default buffering is used. readline() still returns as soon as a
    # complete line is available.
    with Popen(command, stdout=PIPE, stderr=STDOUT, shell=True, close_fds=True) as process:
        for raw_line in iter(process.stdout.readline, b''):
            # errors='replace' keeps streaming even if the command emits
            # bytes that are not valid UTF-8.
            print(raw_line.rstrip().decode('utf-8', errors='replace'))
    # Leaving the with-block closes the pipe and waits for the process, even
    # when printing or decoding raised.
    if process.returncode != 0:
        print('Command exited with non-zero status {}'.format(process.returncode))

# 1. Self Join

## 1.a Preview the input file

In [3]:
# The sample file is ';'-separated and has no header row, so column names are
# supplied explicitly.
df = pd.read_csv(
    '../data/dblp_authors_test.csv',
    sep=';',
    header=None,
    names=['Paper_ID', 'Authors'],
)
# Show the first rows as the cell output.
df.head()

Unnamed: 0,Paper_ID,Authors
0,SIGMOD2014,"Dong+Deng,Guoliang+Li,Jianhua+Feng"
1,ICDE2013,"Dong+Deng,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li"
2,VLDB2016a,"Dong+Deng,Guoliang+Li,He+Wen,Jianhua+Feng"
3,SIGMOD2015,"Guoliang+Li,Jian+He,Dong+Deng,Jian+Li"
4,VLDB2014,"Yu+Jiang,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li"


## 1.b Load and optionally edit the config file

In [4]:
# Data read by the join and the files it writes.
input_file = '../data/dblp_authors_test.csv'
output_file = '../data/output/dblp_authors_selfjoin_out.csv'
stats_file = '../data/output/stats.csv'

# Example configuration used as the template, and the working copy that is
# written out and passed to the jar.
config_file_example = '../config.properties.example'
config_file = '../config.properties'

In [5]:
# Load the example configuration as the starting point for this notebook.
params = Properties()
# Open the file in a with-block so the handle is closed as soon as the
# properties have been read, instead of being left to the garbage collector.
with open(config_file_example) as example_stream:
    params.load(example_stream)
# Print the loaded key=value pairs.
params.list()

-- listing properties --
mode=standard
operation=search
query_file=data/test_input_R.csv
input_file=data/test_input_S.csv
query_id=0
max_lines=-1
fuzzyset_column=1
set_column=2
tokens_column=3
column_delimiter=;
token_delimiter=,
header=false
return_counts=true
output_file=out.txt
stats_file=stats.txt
sim_threshold=0.8
k=3


In [6]:
# Settings that differ from the example config for the self-join run.
# The sample file has two columns (paper ID, author list), hence the
# set/tokens column numbers 1 and 2.
selfjoin_overrides = {
    'operation': 'self-join',
    'sim_threshold': '0.33',
    'input_file': input_file,
    'output_file': output_file,
    'stats_file': stats_file,
    'set_column': '1',
    'tokens_column': '2',
    'return_counts': 'false',
}
# Apply the overrides one by one, in the order listed above.
for option_name, option_value in selfjoin_overrides.items():
    params[option_name] = option_value

In [7]:
# Print the current key=value pairs to confirm the edited settings.
params.list()

-- listing properties --
mode=standard
operation=self-join
query_file=data/test_input_R.csv
input_file=../data/dblp_authors_test.csv
query_id=0
max_lines=-1
fuzzyset_column=1
set_column=1
tokens_column=2
column_delimiter=;
token_delimiter=,
header=false
return_counts=false
output_file=../data/output/dblp_authors_selfjoin_out.csv
stats_file=../data/output/stats.csv
sim_threshold=0.33
k=3


In [8]:
# Write the edited settings to the working config file. The with-block
# guarantees the handle is flushed and closed before the external Java process
# reads the file, whether or not store() closes the stream itself.
with open(config_file, 'w') as config_stream:
    params.store(config_stream)

## 1.c Execute the join operation

In [9]:
# Run the jar with the working config file as its only argument; IPython
# substitutes $config_file with the Python variable's value before the magic runs.
%runrealcmd java -jar ../target/simjoin-0.0.1-SNAPSHOT-jar-with-dependencies.jar $config_file

Finished reading file. Lines read: 18. Lines skipped due to errors: 0. Num of sets: 18. Elements per set: 3.9444444444444446
Transform time: 0.017376123 sec.
Join time: 0.011270184 sec.
Total Matches: 20


## 1.d Load the results

In [10]:
# Read the (paper, paper, similarity) rows written by the self-join.
out1 = pd.read_csv(output_file, header=None, names=['Paper_ID_1', 'Paper_ID_2', 'Similarity'])
# Build the Paper_ID -> Authors lookup once and map each ID through it.
# map() keeps out1's own index, so the author strings stay on the matching
# rows without a positional reset_index, and a duplicated Paper_ID in the
# input cannot silently shift them onto the wrong rows.
authors_by_paper = df.set_index('Paper_ID')['Authors']
out1['Authors_1'] = out1['Paper_ID_1'].map(authors_by_paper)
out1['Authors_2'] = out1['Paper_ID_2'].map(authors_by_paper)
# Show the most similar pairs first.
out1.sort_values('Similarity', ascending=False).head(20)

Unnamed: 0,Paper_ID_1,Paper_ID_2,Similarity,Authors_1,Authors_2
2,SIGMOD2014,ICDE2013,0.75,"Dong+Deng,Guoliang+Li,Jianhua+Feng","Dong+Deng,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li"
3,VLDB2016a,SIGMOD2014,0.75,"Dong+Deng,Guoliang+Li,He+Wen,Jianhua+Feng","Dong+Deng,Guoliang+Li,Jianhua+Feng"
17,VLDB2011,SIGMOD2012b,0.75,"Guoliang+Li,Dong+Deng,Jiannan+Wang,Jianhua+Feng","Jiannan+Wang,Guoliang+Li,Jianhua+Feng"
16,VLDB2011,SIGMOD2014,0.75,"Guoliang+Li,Dong+Deng,Jiannan+Wang,Jianhua+Feng","Dong+Deng,Guoliang+Li,Jianhua+Feng"
18,VLDB2011,ICDE2013,0.6,"Guoliang+Li,Dong+Deng,Jiannan+Wang,Jianhua+Feng","Dong+Deng,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li"
7,VLDB2016a,VLDB2011,0.6,"Dong+Deng,Guoliang+Li,He+Wen,Jianhua+Feng","Guoliang+Li,Dong+Deng,Jiannan+Wang,Jianhua+Feng"
8,VLDB2016a,ICDE2013,0.6,"Dong+Deng,Guoliang+Li,He+Wen,Jianhua+Feng","Dong+Deng,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li"
14,VLDB2014,ICDE2013,0.6,"Yu+Jiang,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li","Dong+Deng,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li"
1,SIGMOD2014,SIGMOD2012b,0.5,"Dong+Deng,Guoliang+Li,Jianhua+Feng","Jiannan+Wang,Guoliang+Li,Jianhua+Feng"
0,SIGMOD2012b,ICDE2013,0.4,"Jiannan+Wang,Guoliang+Li,Jianhua+Feng","Dong+Deng,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li"


# 2. Self Closest Pairs


## 2.a Load and optionally edit the config file

In [11]:
# NOTE: this rebinds output_file, so cells that read the self-join output must
# not be re-run after this point without resetting it.
output_file = '../data/output/dblp_authors_selfclosestpairs_out.csv'

# Settings for the self-closest-pairs run with k = 5; the remaining entries
# repeat the values used for the self-join, with the new output path.
closestpairs_overrides = {
    'operation': 'self-closest-pairs',
    'k': '5',
    'input_file': input_file,
    'output_file': output_file,
    'stats_file': stats_file,
    'set_column': '1',
    'tokens_column': '2',
    'return_counts': 'false',
}
# Apply the overrides one by one, in the order listed above.
for option_name, option_value in closestpairs_overrides.items():
    params[option_name] = option_value

In [12]:
# Overwrite the working config file with the closest-pairs settings. The
# with-block guarantees the handle is flushed and closed before the external
# Java process reads the file, whether or not store() closes the stream itself.
with open(config_file, 'w') as config_stream:
    params.store(config_stream)

## 2.b Execute the join operation

In [13]:
# Run the jar again with the rewritten config file; IPython substitutes
# $config_file with the Python variable's value before the magic runs.
%runrealcmd java -jar ../target/simjoin-0.0.1-SNAPSHOT-jar-with-dependencies.jar $config_file

Finished reading file. Lines read: 18. Lines skipped due to errors: 0. Num of sets: 18. Elements per set: 3.9444444444444446
Transform time: 0.016934491 sec.
Candidates: 8 Verified pairs: 8 Final threshold: 0.6
Join time: 0.013758427 sec.
Total Matches: 5


## 2.c Load the results

In [14]:
# Read the (paper, paper, similarity) rows written by the closest-pairs run.
out2 = pd.read_csv(output_file, header=None, names=['Paper_ID_1', 'Paper_ID_2', 'Similarity'])
# Build the Paper_ID -> Authors lookup once and map each ID through it.
# map() keeps out2's own index, so the author strings stay on the matching
# rows without a positional reset_index, and a duplicated Paper_ID in the
# input cannot silently shift them onto the wrong rows.
authors_by_paper = df.set_index('Paper_ID')['Authors']
out2['Authors_1'] = out2['Paper_ID_1'].map(authors_by_paper)
out2['Authors_2'] = out2['Paper_ID_2'].map(authors_by_paper)
# Show the closest pairs, most similar first.
out2.sort_values('Similarity', ascending=False).head(5)

Unnamed: 0,Paper_ID_1,Paper_ID_2,Similarity,Authors_1,Authors_2
0,VLDB2016a,SIGMOD2014,0.75,"Dong+Deng,Guoliang+Li,He+Wen,Jianhua+Feng","Dong+Deng,Guoliang+Li,Jianhua+Feng"
1,VLDB2011,SIGMOD2012b,0.75,"Guoliang+Li,Dong+Deng,Jiannan+Wang,Jianhua+Feng","Jiannan+Wang,Guoliang+Li,Jianhua+Feng"
2,VLDB2011,SIGMOD2014,0.75,"Guoliang+Li,Dong+Deng,Jiannan+Wang,Jianhua+Feng","Dong+Deng,Guoliang+Li,Jianhua+Feng"
3,ICDE2013,SIGMOD2014,0.75,"Dong+Deng,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li","Dong+Deng,Guoliang+Li,Jianhua+Feng"
4,ICDE2013,VLDB2014,0.6,"Dong+Deng,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li","Yu+Jiang,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li"


In [15]:
# Re-display the self-join results for side-by-side comparison with the closest pairs above.
out1.sort_values('Similarity', ascending=False).head(20)

Unnamed: 0,Paper_ID_1,Paper_ID_2,Similarity,Authors_1,Authors_2
2,SIGMOD2014,ICDE2013,0.75,"Dong+Deng,Guoliang+Li,Jianhua+Feng","Dong+Deng,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li"
3,VLDB2016a,SIGMOD2014,0.75,"Dong+Deng,Guoliang+Li,He+Wen,Jianhua+Feng","Dong+Deng,Guoliang+Li,Jianhua+Feng"
17,VLDB2011,SIGMOD2012b,0.75,"Guoliang+Li,Dong+Deng,Jiannan+Wang,Jianhua+Feng","Jiannan+Wang,Guoliang+Li,Jianhua+Feng"
16,VLDB2011,SIGMOD2014,0.75,"Guoliang+Li,Dong+Deng,Jiannan+Wang,Jianhua+Feng","Dong+Deng,Guoliang+Li,Jianhua+Feng"
18,VLDB2011,ICDE2013,0.6,"Guoliang+Li,Dong+Deng,Jiannan+Wang,Jianhua+Feng","Dong+Deng,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li"
7,VLDB2016a,VLDB2011,0.6,"Dong+Deng,Guoliang+Li,He+Wen,Jianhua+Feng","Guoliang+Li,Dong+Deng,Jiannan+Wang,Jianhua+Feng"
8,VLDB2016a,ICDE2013,0.6,"Dong+Deng,Guoliang+Li,He+Wen,Jianhua+Feng","Dong+Deng,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li"
14,VLDB2014,ICDE2013,0.6,"Yu+Jiang,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li","Dong+Deng,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li"
1,SIGMOD2014,SIGMOD2012b,0.5,"Dong+Deng,Guoliang+Li,Jianhua+Feng","Jiannan+Wang,Guoliang+Li,Jianhua+Feng"
0,SIGMOD2012b,ICDE2013,0.4,"Jiannan+Wang,Guoliang+Li,Jianhua+Feng","Dong+Deng,Guoliang+Li,Jianhua+Feng,Wen+Syan+Li"
