## Imports

In [1]:
import pandas as pd
import json

## Settings to enable real-time output from a shell command

In [2]:
from subprocess import Popen, PIPE, STDOUT
from IPython.core.magic import register_line_magic

@register_line_magic
def runrealcmd(command):
    """Run ``command`` in a shell and print its output line by line as it arrives.

    Registered as the ``%runrealcmd`` line magic. stderr is merged into stdout,
    so both streams show up interleaved in the cell output.

    Args:
        command: The shell command line to execute (run with ``shell=True``).
    """
    # bufsize=1 is intentionally not passed: line buffering is only supported in
    # text mode, and on a binary pipe Python warns and uses the default buffer.
    with Popen(command, stdout=PIPE, shell=True, stderr=STDOUT, close_fds=True) as process:
        for line in iter(process.stdout.readline, b''):
            # errors='replace' keeps the stream going if the command emits
            # bytes that are not valid UTF-8.
            print(line.rstrip().decode('utf-8', errors='replace'))
    # Leaving the with-block closes the pipe and waits for the process to exit,
    # even when the loop above raises.
    if process.returncode != 0:
        print('Command exited with status {}'.format(process.returncode))

# 1. Self Join

## 1.a Preview the input file

In [3]:
# The input has no header row: each line is "<paper id>;<title tokens>".
df = pd.read_csv(
    '../data/dblp_papers_test.csv',
    sep=';',
    header=None,
    names=['Paper_ID', 'Title'],
)
df.head()

Unnamed: 0,Paper_ID,Title
0,SIGMOD2014,"pivotal,prefix,based,filtering,algorithm,strin..."
1,ICDE2013,"top-k,string,similarity,search,edit,distance,c..."
2,VLDB2016a,"efficient,partition,based,method,exact,set,sim..."
3,SIGMOD2015,"efficient,similarity,join,search,multi,attribu..."
4,VLDB2014,"string,similarity,joins,experimental,evaluation"


## 1.b Load and optionally edit the config file

In [4]:
# Data locations (all paths are relative, so they resolve against the
# directory the notebook is run from).
input_file = '../data/dblp_papers_test.csv'
output_file = '../data/output/dblp_papers_selfjoin_out.csv'
log_file = '../data/output/log.txt'

# The example config serves as a template; the edited copy is written to config_file.
config_file_example = '../config.json.example'
config_file = '../config.json'

In [5]:
# Use the example config as the starting template. The context manager closes
# the file handle, which a bare open() passed to json.load() would leave open.
with open(config_file_example) as config_fh:
    params = json.load(config_fh)
params

{'query_file': 'data/test_input_R.csv',
 'input_file': 'data/test_input_S.csv',
 'max_lines': '-1',
 'set_column': '1',
 'elements_column': '2',
 'tokens_column': '3',
 'column_delimiter': ';',
 'token_delimiter': ',',
 'header': 'false',
 'output_file': 'out.txt',
 'mode': 'standard',
 'join_type': 'threshold',
 'threshold': '0.5',
 'k': '3'}

In [6]:
# This run takes no separate query file (presumably that is what makes the tool
# perform a self-join -- confirm against the tool's documentation).
# pop() with a default, unlike del, does not raise KeyError when the cell is re-run.
params.pop('query_file', None)

# Threshold join over the DBLP sample: column 1 is the set id, column 2 the tokens.
params.update({
    'join_type': 'threshold',
    'threshold': '0.33',
    'input_file': input_file,
    'output_file': output_file,
    'log_file': log_file,
    'set_column': '1',
    'tokens_column': '2',
})

In [7]:
# Display the edited settings for a visual check before they are saved.
params

{'input_file': '../data/dblp_papers_test.csv',
 'max_lines': '-1',
 'set_column': '1',
 'elements_column': '2',
 'tokens_column': '2',
 'column_delimiter': ';',
 'token_delimiter': ',',
 'header': 'false',
 'output_file': '../data/output/dblp_papers_selfjoin_out.csv',
 'mode': 'standard',
 'join_type': 'threshold',
 'threshold': '0.33',
 'k': '3',
 'log_file': '../data/output/log.txt'}

In [8]:
# Save the edited settings to the config file passed to the join tool. Closing
# the file through the context manager guarantees the JSON is fully flushed to
# disk before an external process reads it.
with open(config_file, 'w') as config_fh:
    json.dump(params, config_fh, indent=4)

## 1.c Execute the join operation

In [9]:
# Run the simjoin jar with the config file as its only argument. IPython expands
# $config_file to the value of the Python variable before calling the magic.
%runrealcmd java -jar ../target/simjoin-0.0.1-SNAPSHOT-jar-with-dependencies.jar $config_file

Finished reading file. Lines read: 18. Lines skipped due to errors: 0. Num of sets: 18. Elements per set: 6.222222222222222
Read time: 0.002008185 sec.
Transform time: 0.011628722 sec.
Collection size: 18
Join time: 0.017142176 sec.
Number of matches: 11


## 1.d Load the results

In [10]:
# Title lookup keyed by paper id, built once and used for both sides of each pair.
titles_by_id = df.set_index('Paper_ID')['Title']

# Each output row is one matched pair: the two paper ids and their similarity.
out1 = pd.read_csv(
    output_file,
    header=None,
    names=['Paper_ID_1', 'Paper_ID_2', 'Similarity'],
)

# The lookup result is indexed by paper id; resetting the index lets it line up
# with out1's default row numbering when assigned.
out1['Title_1'] = titles_by_id.loc[out1['Paper_ID_1']].reset_index(drop=True)
out1['Title_2'] = titles_by_id.loc[out1['Paper_ID_2']].reset_index(drop=True)
out1.sort_values(by='Similarity', ascending=False).head(11)

Unnamed: 0,Paper_ID_1,Paper_ID_2,Similarity,Title_1,Title_2
10,VLDB2016a,VLDB2011,0.555556,"efficient,partition,based,method,exact,set,sim...","pass,join,partition,based,method,similarity,join"
1,VLDB2012b,VLDB2012a,0.5,"seal,spatio,textual,similarity,search","spatio,textual,similarity,join"
2,VLDB2014,SIGMOD2013,0.428571,"string,similarity,joins,experimental,evaluation","string,similarity,measures,joins,synonyms"
9,VLDB2016a,VLDB2017,0.4,"efficient,partition,based,method,exact,set,sim...","leveraging,set,relations,exact,set,similarity,..."
4,VLDB2011,SIGMOD2012a,0.375,"pass,join,partition,based,method,similarity,join","exploiting,mapreduce,based,similarity,join"
5,VLDB2018,SIGMOD2012a,0.375,"set,similarity,join,mapreduce,experimental,survey","exploiting,mapreduce,based,similarity,join"
8,SIGMOD2010a,ICDE2009,0.375,"efficient,parallel,set,similarity,joins,using,...","top-k,set,similarity,joins"
0,VLDB2012a,SIGMOD2010b,0.333333,"spatio,textual,similarity,join","probabilistic,string,similarity,join"
3,VLDB2017,VLDB2016b,0.333333,"leveraging,set,relations,exact,set,similarity,...","empirical,evaluation,set,similarity,join,techn..."
6,VLDB2018,VLDB2017,0.333333,"set,similarity,join,mapreduce,experimental,survey","leveraging,set,relations,exact,set,similarity,..."


# 2. Self Top K

## 2.a Load and optionally edit the config file

In [11]:
# The top-k run writes to its own file, so the threshold-join output is not overwritten.
output_file = '../data/output/dblp_papers_selfclosestpairs_out.csv'

# Switch to top-k mode with k = 5. The remaining entries restate the input,
# output and column settings so the cell carries its full configuration.
params.update({
    'join_type': 'topk',
    'k': '5',
    'input_file': input_file,
    'output_file': output_file,
    'log_file': log_file,
    'set_column': '1',
    'tokens_column': '2',
})

In [12]:
# Save the top-k settings to the config file passed to the join tool. The context
# manager closes the file, so the JSON is fully flushed to disk before an
# external process reads it.
with open(config_file, 'w') as config_fh:
    json.dump(params, config_fh, indent=4)

## 2.b Execute the join operation

In [13]:
# Run the simjoin jar again, now with the top-k configuration. IPython expands
# $config_file to the value of the Python variable before calling the magic.
%runrealcmd java -jar ../target/simjoin-0.0.1-SNAPSHOT-jar-with-dependencies.jar $config_file

Finished reading file. Lines read: 18. Lines skipped due to errors: 0. Num of sets: 18. Elements per set: 6.222222222222222
Read time: 0.003324423 sec.
Transform time: 0.016882793 sec.
Collection size: 18
Join time: 0.020287217 sec.
Number of matches: 5


## 2.c Load the results

In [14]:
# Title lookup keyed by paper id, built once and used for both sides of each pair.
titles_by_id = df.set_index('Paper_ID')['Title']

# Each output row is one matched pair: the two paper ids and their similarity.
out2 = pd.read_csv(
    output_file,
    header=None,
    names=['Paper_ID_1', 'Paper_ID_2', 'Similarity'],
)

# The lookup result is indexed by paper id; resetting the index lets it line up
# with out2's default row numbering when assigned.
out2['Title_1'] = titles_by_id.loc[out2['Paper_ID_1']].reset_index(drop=True)
out2['Title_2'] = titles_by_id.loc[out2['Paper_ID_2']].reset_index(drop=True)
out2.sort_values(by='Similarity', ascending=False).head()

Unnamed: 0,Paper_ID_1,Paper_ID_2,Similarity,Title_1,Title_2
0,VLDB2016a,VLDB2011,0.555556,"efficient,partition,based,method,exact,set,sim...","pass,join,partition,based,method,similarity,join"
1,VLDB2012b,VLDB2012a,0.5,"seal,spatio,textual,similarity,search","spatio,textual,similarity,join"
2,VLDB2014,SIGMOD2013,0.428571,"string,similarity,joins,experimental,evaluation","string,similarity,measures,joins,synonyms"
3,VLDB2016a,VLDB2017,0.4,"efficient,partition,based,method,exact,set,sim...","leveraging,set,relations,exact,set,similarity,..."
4,SIGMOD2010a,ICDE2009,0.375,"efficient,parallel,set,similarity,joins,using,...","top-k,set,similarity,joins"


In [15]:
# For comparison with the top-k result: the ten highest-similarity pairs from
# the threshold join loaded into out1.
out1.sort_values(by='Similarity', ascending=False).head(10)

Unnamed: 0,Paper_ID_1,Paper_ID_2,Similarity,Title_1,Title_2
10,VLDB2016a,VLDB2011,0.555556,"efficient,partition,based,method,exact,set,sim...","pass,join,partition,based,method,similarity,join"
1,VLDB2012b,VLDB2012a,0.5,"seal,spatio,textual,similarity,search","spatio,textual,similarity,join"
2,VLDB2014,SIGMOD2013,0.428571,"string,similarity,joins,experimental,evaluation","string,similarity,measures,joins,synonyms"
9,VLDB2016a,VLDB2017,0.4,"efficient,partition,based,method,exact,set,sim...","leveraging,set,relations,exact,set,similarity,..."
4,VLDB2011,SIGMOD2012a,0.375,"pass,join,partition,based,method,similarity,join","exploiting,mapreduce,based,similarity,join"
5,VLDB2018,SIGMOD2012a,0.375,"set,similarity,join,mapreduce,experimental,survey","exploiting,mapreduce,based,similarity,join"
8,SIGMOD2010a,ICDE2009,0.375,"efficient,parallel,set,similarity,joins,using,...","top-k,set,similarity,joins"
0,VLDB2012a,SIGMOD2010b,0.333333,"spatio,textual,similarity,join","probabilistic,string,similarity,join"
3,VLDB2017,VLDB2016b,0.333333,"leveraging,set,relations,exact,set,similarity,...","empirical,evaluation,set,similarity,join,techn..."
6,VLDB2018,VLDB2017,0.333333,"set,similarity,join,mapreduce,experimental,survey","leveraging,set,relations,exact,set,similarity,..."
