## Imports

In [1]:
import pandas as pd
import json

## Settings to enable real-time output from a shell command

In [2]:
from subprocess import Popen, PIPE, STDOUT
from IPython.core.magic import register_line_magic

@register_line_magic
def runrealcmd(command):
    """Run ``command`` in a shell and print its output line by line as it arrives.

    Registered as the ``%runrealcmd`` line magic. The child's stderr is merged
    into stdout, so both streams show up interleaved in the cell output.

    Args:
        command: the shell command line (the text following ``%runrealcmd``).
    """
    # bufsize=1 is intentionally not passed: line buffering only exists in text
    # mode, and on a binary pipe Python ignores it (with a RuntimeWarning on
    # 3.8+). The context manager closes the pipe and waits for the child even
    # when printing raises or the kernel is interrupted.
    with Popen(command, stdout=PIPE, shell=True, stderr=STDOUT, close_fds=True) as process:
        for line in iter(process.stdout.readline, b''):
            # errors='replace' keeps streaming if the child emits non-UTF-8 bytes.
            print(line.rstrip().decode('utf-8', errors='replace'))
    # Make a failed command visible instead of silently discarding its status.
    if process.returncode != 0:
        print('Command exited with status {}'.format(process.returncode))

# 1. Self Join

## 1.a Preview the input file

In [3]:
# The input has no header row and uses ';' between columns, so the column
# names are supplied here.
df = pd.read_csv(
    '../data/dblp_authors_papers_test.csv',
    sep=';',
    header=None,
    names=['Authors', 'Paper_ID', 'Title'],
)
# Show the first 100 rows in author order.
df.sort_values(by='Authors').head(100)

Unnamed: 0,Authors,Paper_ID,Title
33,Chen+Li,SIGMOD2010a,"efficient,parallel,set,similarity,joins,using,..."
39,Chen+Li,SIGMOD2013,"string,similarity,measures,joins,synonyms"
41,Chuan+Xiao,ICDE2009,"top-k,set,similarity,joins"
65,Chuitian+Rong,ICDE2017,"fast,scalable,distributed,set,similarity,join,..."
66,Chunbin+Lin,ICDE2017,"fast,scalable,distributed,set,similarity,join,..."
37,Chunbin+Lin,SIGMOD2013,"string,similarity,measures,joins,synonyms"
0,Dong+Deng,SIGMOD2014,"pivotal,prefix,based,filtering,algorithm,strin..."
3,Dong+Deng,ICDE2013,"top-k,string,similarity,search,edit,distance,c..."
7,Dong+Deng,VLDB2016a,"efficient,partition,based,method,exact,set,sim..."
25,Dong+Deng,VLDB2011,"pass,join,partition,based,method,similarity,join"


## 1.b Load and optionally edit the config file

In [4]:
# Relative paths used by the self-join cells below.
# Template configuration that is loaded, and the edited copy that is written out.
config_file_example, config_file = '../config.json.example', '../config.json'

# Data set to join, plus where the result and the log are placed.
input_file = '../data/dblp_authors_papers_test.csv'
output_file, log_file = (
    '../data/output/dblp_authors_papers_selfjoin_out.csv',
    '../data/output/log.txt',
)

In [5]:
# Load the example configuration as a dict. The context manager closes the
# file handle deterministically instead of leaving it to garbage collection.
with open(config_file_example) as config_fh:
    params = json.load(config_fh)
params

{'query_file': 'data/test_input_R.csv',
 'input_file': 'data/test_input_S.csv',
 'max_lines': '-1',
 'set_column': '1',
 'elements_column': '2',
 'tokens_column': '3',
 'column_delimiter': ';',
 'token_delimiter': ',',
 'header': 'false',
 'output_file': 'out.txt',
 'mode': 'standard',
 'join_type': 'threshold',
 'threshold': '0.5',
 'k': '3'}

In [6]:
# Adapt the example configuration for this run. All values stay strings,
# matching the format of the example config.
params['mode'] = 'fuzzy'
params['join_type'] = 'threshold'
params['threshold'] = '0.33'
# Drop the query file entry (presumably unused for a self join -- confirm
# against the tool's documentation). pop() with a default, unlike del, does not
# raise KeyError when this cell is re-run after the key is already gone.
params.pop('query_file', None)
params['input_file'] = input_file
params['output_file'] = output_file
params['log_file'] = log_file
params['set_column'] = '1'
params['elements_column'] = '2'
params['tokens_column'] = '3'

In [7]:
# Write the edited configuration. Closing the file via the context manager
# guarantees the JSON is flushed to disk before the external command reads it.
with open(config_file, 'w') as config_out:
    json.dump(params, config_out, indent=4)

## 1.c Execute the join operation

In [8]:
# Launch the jar through the %runrealcmd magic defined above so its output is
# printed as it is produced. IPython expands $config_file to the value of the
# Python variable before the command line is handed to the shell.
%runrealcmd java -jar ../target/simjoin-0.0.1-SNAPSHOT-jar-with-dependencies.jar $config_file

Finished reading file. Lines read: 71. Lines skipped due to errors: 0. Num of sets: 45. Elements per set: 1.5777777777777777. Tokens per Element: 6.23943661971831
Read time: 0.008667764 sec.
Transform time: 0.010144087 sec.
Collection size: 45
Indexing time: 0.007271427 sec.
Join time: 0.212302917 sec. 0m 0s
Number of matches: 87


## 1.d Load the results

In [9]:
# The result file has no header row; name its three columns on load.
out1 = pd.read_csv(
    output_file,
    header=None,
    names=['Author_ID_1', 'Author_ID_2', 'Similarity'],
)
# Highest-similarity pairs first, limited to the top 150 rows.
out1.sort_values(by=['Similarity'], ascending=False).head(150)

Unnamed: 0,Author_ID_1,Author_ID_2,Similarity
0,Jian+He,Jian+Li,1.000000
32,Jeffrey+Jestes,Feifei+Li,1.000000
30,Jeffrey+Jestes,Ke+Yi,1.000000
33,Feifei+Li,Ke+Yi,1.000000
34,Feifei+Li,Zhepeng+Yan,1.000000
27,Chuan+Xiao,Haichuan+Shang,1.000000
35,Zhepeng+Yan,Ke+Yi,1.000000
41,Shen+Ge,Nikos+Mamoulis,1.000000
47,Ju+Fan,Jun+Hu,1.000000
48,Ju+Fan,Shanshan+Chen,1.000000
