## Imports

In [1]:
import pandas as pd
from pyjavaproperties import Properties

## Settings to enable real-time output from a shell command

In [2]:
from subprocess import Popen, PIPE, STDOUT
from IPython.core.magic import register_line_magic

@register_line_magic
def runrealcmd(command):
    """Run a shell command and echo its output line by line as it is produced.

    Registered as the IPython line magic ``%runrealcmd``. The child's stderr
    is merged into its stdout, so both streams show up interleaved in the
    cell output.

    Parameters
    ----------
    command : str
        The command line; it is handed to the shell unchanged (``shell=True``).
    """
    # ``bufsize=1`` (line buffering) is only usable for text-mode pipes, so it
    # is omitted for this binary pipe; readline() still returns each line as
    # soon as the child has written it.
    # The context manager closes the pipe and reaps the child even when the
    # loop is interrupted or raises.
    with Popen(command, stdout=PIPE, shell=True, stderr=STDOUT, close_fds=True) as process:
        for line in iter(process.stdout.readline, b''):
            # errors='replace' keeps streaming if the child emits bytes that
            # are not valid UTF-8 instead of aborting with UnicodeDecodeError.
            print(line.rstrip().decode('utf-8', errors='replace'))
    # Surface failures that would otherwise go unnoticed in the cell output.
    if process.returncode != 0:
        print('Command exited with status {}'.format(process.returncode))

# 1. Self Join

## 1.a Preview the input file

In [3]:
# Read the semicolon-delimited, header-less input file and label its columns.
df = pd.read_csv(
    '../data/dblp_authors_papers_test.csv',
    sep=';',
    header=None,
    names=['Authors', 'Paper_ID', 'Title'],
)
# Show the first 100 rows, ordered by author.
df.sort_values(by='Authors').head(100)

Unnamed: 0,Authors,Paper_ID,Title
33,Chen+Li,SIGMOD2010a,"efficient,parallel,set,similarity,joins,using,..."
39,Chen+Li,SIGMOD2013,"string,similarity,measures,joins,synonyms"
41,Chuan+Xiao,ICDE2009,"top-k,set,similarity,joins"
65,Chuitian+Rong,ICDE2017,"fast,scalable,distributed,set,similarity,join,..."
66,Chunbin+Lin,ICDE2017,"fast,scalable,distributed,set,similarity,join,..."
37,Chunbin+Lin,SIGMOD2013,"string,similarity,measures,joins,synonyms"
0,Dong+Deng,SIGMOD2014,"pivotal,prefix,based,filtering,algorithm,strin..."
3,Dong+Deng,ICDE2013,"top-k,string,similarity,search,edit,distance,c..."
7,Dong+Deng,VLDB2016a,"efficient,partition,based,method,exact,set,sim..."
25,Dong+Deng,VLDB2011,"pass,join,partition,based,method,similarity,join"


## 1.b Load and optionally edit the config file

In [4]:
# Data locations used by the self-join run (relative to the notebook directory).
input_file = '../data/dblp_authors_papers_test.csv'
output_file = '../data/output/dblp_authors_papers_selfjoin_out.csv'
stats_file = '../data/output/stats.csv'

# Properties files: the example shipped with the project serves as the
# template, and the edited copy is written to config_file.
config_file_example = '../config.properties.example'
config_file = '../config.properties'

In [5]:
# Load the example properties as the starting point for this run's configuration.
params = Properties()
# Use a context manager so the file handle is closed as soon as loading ends
# (the original left the handle returned by open() unclosed).
with open(config_file_example) as example_stream:
    params.load(example_stream)
# Print every loaded key/value pair for inspection.
params.list()

-- listing properties --
mode=standard
operation=search
query_file=data/test_input_R.csv
input_file=data/test_input_S.csv
query_id=0
max_lines=-1
fuzzyset_column=1
set_column=2
tokens_column=3
column_delimiter=;
token_delimiter=,
header=false
return_counts=true
output_file=out.txt
stats_file=stats.txt
sim_threshold=0.8
k=3


In [6]:
# Settings for the fuzzy self-join; each entry replaces the value loaded from
# the example properties file. Values are given as strings.
overrides = {
    'mode': 'fuzzy',
    'operation': 'self-join',
    'sim_threshold': '0.33',
    'input_file': input_file,
    'output_file': output_file,
    'stats_file': stats_file,
    'fuzzyset_column': '1',
    'set_column': '2',
    'tokens_column': '3',
    'return_counts': 'false',
}
for key, value in overrides.items():
    params[key] = value

In [7]:
# Write the edited properties to config_file. The context manager guarantees
# the file is flushed and closed before the next cell launches the Java
# process that reads it (the original relied on the anonymous handle being
# closed implicitly).
with open(config_file, 'w') as config_stream:
    params.store(config_stream)

## 1.c Execute the join operation

In [8]:
# Launch the simjoin jar through the %runrealcmd line magic so its console
# output is printed as it is produced. IPython expands $config_file to the
# value of the Python variable of that name before the command is run.
%runrealcmd java -jar ../target/simjoin-0.0.1-SNAPSHOT-jar-with-dependencies.jar $config_file

Finished reading file. Lines read: 71. Lines skipped due to errors: 0. Num of sets: 45. Elements per set: 1.5777777777777777
Transformation Time: 0.002355695 sec.
Indexing Time: 0.009736258 sec.

Total Join Time: 0.179284877 sec.
Signature Generation Time: 0.011779396 sec.
Check Filter Time: 0.009757591 sec.
NN Filter Time: 0.002340015 sec.
Verification Time: 0.14507989 sec.

Check Filter Candidates: 971
NN Filter Candidates: 356

Total Matches: 219


## 1.d Load the results

In [9]:
# Read the header-less join output and label its three columns.
out1 = pd.read_csv(
    output_file,
    header=None,
    names=['Author_ID_1', 'Author_ID_2', 'Similarity'],
)
# Show up to 150 result rows, highest similarity first.
out1.sort_values(by=['Similarity'], ascending=False).head(150)

Unnamed: 0,Author_ID_1,Author_ID_2,Similarity
0,Wei+Lu,Jianguo+Wang,1.000000
20,Ulf+Leser,Johann+Christoph+Freytag,1.000000
27,Ke+Yi,Jeffrey+Jestes,1.000000
29,Jianguo+Wang,Chuitian+Rong,1.000000
30,Jian+Li,Jian+He,1.000000
32,Shen+Ge,Nikos+Mamoulis,1.000000
33,Lizhu+Zhou,Ju+Fan,1.000000
39,Wei+Lu,Chuitian+Rong,1.000000
41,Rares+Vernica,Michael+J+Carey,1.000000
47,Xiaoyong+Du,Jianguo+Wang,1.000000
