# Multi-Attribute Similarity Search for Interactive Data Exploration with the SimSearch REST API

## Connection to an instance of the SimSearch service

In [None]:
# Standard library
import json
import math

# Third-party libraries
import requests
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# display() and HTML belong to the public IPython.display API;
# importing them from IPython.core.display is deprecated
from IPython.display import display, HTML

# Various custom helper functions
from functions import results_pairwise, flatten, changeDataType, map_points, filterNaN, filter_dict_median, frequency, barchart, plot_wordcloud, generate_color

# Use 5 decimal digits for floating numerical values
pd.options.display.float_format = '{:,.5f}'.format

In [None]:
# Base URL of the SimSearch web service; request-specific paths are appended to it.
# This value assumes that the service has been deployed locally at port 8090.
serviceURL = "http://localhost:8090/simsearch/api/"

### __*Mount request*__: Define data sources available for similarity search

#####  __IMPORTANT!__ This step needs to be performed __once__ for each data source. 
##### Once data is successfully ingested, it can be queried as long as the SimSearch service is up-and-running.

In [None]:
# Endpoint of the mount request, which indexes the data sources given in the parameters
mount = '{}index'.format(serviceURL)

# JSON specification of the request:
#  - 'sources': where the data resides (here, a CSV dataset available at a remote HTTP server)
#  - 'search': one entry per similarity operation supported on an attribute of that dataset
params = {
    'sources': [
        {
            'name': 'remotePath1',
            'type': 'csv',
            'url': 'http://download.smartdatalake.eu/datasets/gdelt/',
        },
    ],
    'search': [
        {
            'operation': 'spatial_knn',
            'source': 'remotePath1',
            'dataset': 'sample.csv',
            'header': 'true',
            'separator': ',',
            'key_column': 'article_id',
            'search_column': ['longitude', 'latitude'],
        },
        {
            'operation': 'categorical_topk',
            'source': 'remotePath1',
            'dataset': 'sample.csv',
            'separator': ',',
            'token_delimiter': ';',
            'header': 'true',
            'key_column': 'article_id',
            'search_column': 'persons',
        },
        {
            'operation': 'numerical_topk',
            'source': 'remotePath1',
            'dataset': 'sample.csv',
            'separator': ',',
            'header': 'true',
            'key_column': 'article_id',
            'search_column': 'timestamp',
        },
    ],
}

# IMPORTANT! No API key is sent with a mount request;
# a new API key is generated once this request completes successfully
headers = {'Content-Type': 'application/json'}

# Submit the request with the above parameters
response = requests.post(url=mount, json=params, headers=headers)

# Show the returned message (it carries the API key to be used in subsequent requests)
print(response.json())

#### __IMPORTANT!__ Remember your API key

In [None]:
# Paste here the API key returned by the mount request above;
# it must accompany every other request against this instance of the service
API_KEY = ""

### __*Catalog request*__: List the queryable attributes

In [None]:
# Endpoint of the catalog request
catalog = '{}catalog'.format(serviceURL)

# An empty JSON specification lists all available attributes.
# To list only the attributes of a particular type of similarity operation, use instead:
#   params = {'operation': 'numerical_topk'}
params = dict()

# The API key must be sent along with such requests
headers = {
    'api_key': API_KEY,
    'Content-Type': 'application/json',
}

# Submit the request to the SimSearch service; the response is given in a JSON array
# (to inspect the raw JSON, run: print(response.json()))
response = requests.post(url=catalog, json=params, headers=headers)

Report the queryable attributes and their supported similarity operations. Note that the spatial operation makes use of two attributes (longitude, latitude) available in the original dataset:

In [None]:
# Tabulate the JSON array returned by the catalog request
attrs = pd.DataFrame(data=response.json())
attrs

### __*Search request*__: submit a top-*k* similarity search query

In [None]:
# Endpoint of the search request
search = '{}search'.format(serviceURL)

# Number of top-k results to be returned
k = 30

# Rank aggregation method to be used; possible values:
# 'threshold' (default), 'no_random_access', 'partial_random_access'
rankMethod = 'threshold'

# Query values for similarity search, one per queried attribute
valKeywords = ['donald trump', 'joe biden', 'vladimir putin']
valLocation = 'POINT(-74.94 42.15)'
valTimestamp = 20191104084500

# All query parameters.
# Each attribute lists two weights, i.e., two combinations of weights are specified,
# so two lists of top-k results will be computed in this example.
params = {
    'algorithm': rankMethod,
    'k': k,
    'queries': [
        {
            'column': 'persons',
            'value': valKeywords,
            'weights': ['1.0', '0.8'],
        },
        {
            'column': 'timestamp',
            'value': valTimestamp,
            'weights': ['1.0', '0.4'],
        },
        {
            'column': ['longitude', 'latitude'],
            'value': valLocation,
            'weights': ['1.0', '0.7'],
        },
    ],
}

# A valid API key must be sent along with such requests
headers = {
    'api_key': API_KEY,
    'Content-Type': 'application/json',
}

# Submit the request to the SimSearch service; the response is given in a JSON array
# (to inspect the raw JSON, run: print(response.json()))
response = requests.post(url=search, json=params, headers=headers)

Report final ranked results: An array of top-k results is returned for each specified combination of weights.
For each combination, a similarity matrix is also returned, measuring the similarity between all pairs of the top-k results.

In [None]:
# Tabulate the JSON array returned by the search request
df = pd.DataFrame(data=response.json())
df

Print a given combination of weights:

In [None]:
# Column holding the combination of weights of each batch of results
weights = df['weights']

# E.g., show the 2nd combination of weights for the attributes (position 1)
second_combination = weights.iloc[1]
print(second_combination)

### Top-*k* results for each combination of weights
Also flatten attribute values and scores contained in the nested JSON array returned as response:

In [None]:
# One flattened table of ranked results per combination of weights.
# As noted for this notebook, the flattening returns geodataframes,
# i.e., one column holds geometries (point locations).
results = [
    flatten(df['rankedResults'].iloc[position])
    for position in range(len(weights))
]

#### Listing of results for a given batch

In [None]:
# Render the results for the 1st combination of weights as an HTML table with clickable URLs
results_table = results[0].to_html(render_links=True, escape=False)
display(HTML(results_table))

# Alternatively, show the plain dataframe by evaluating: results[0]

### Intra-Correlation: Similarity of the results for a given combination of weights

In [None]:
# One subplot per combination of weights
fig, ax = plt.subplots(1, len(weights), figsize=(10, 10))

# Pivoted similarity matrix of each combination, in the order of the combinations
simMatrix = []
for index, pairs in enumerate(df['similarityMatrix']):
    plt.subplot(1, len(weights), index + 1)
    # Turn the list of (left, right, score) records into a left-by-right table of scores
    matrix = pd.DataFrame(pairs).pivot(index='left', columns='right', values='score')
    simMatrix.append(matrix)
    plt.imshow(matrix, interpolation='none')
    plt.title('W ' + str(weights[index]))


### Inter-Correlation: Statistics regarding pairwise correlation of results

##### First, create lists of rankings for two batches of results (i.e., from two combinations of weights)

In [None]:
import scipy.stats

# Build the rankings to be compared from two batches of SimSearch results:
# here A corresponds to the 1st batch (results[0]) and B to the 2nd batch (results[1])
first_batch, second_batch = results[0], results[1]
A, B = results_pairwise(first_batch, second_batch)

##### Pearson's:

In [None]:
# Pearson correlation between the first entries of A.values and B.values
pearson_a, pearson_b = A.values[0], B.values[0]
scipy.stats.pearsonr(pearson_a, pearson_b)

##### Spearman's rho:

In [None]:
# Spearman rank correlation between the first entries of A.values and B.values
spearman_a, spearman_b = A.values[0], B.values[0]
scipy.stats.spearmanr(spearman_a, spearman_b)

##### Kendall's tau:

In [None]:
# Kendall's tau computed on A and B as a whole
# NOTE(review): the two cells above use only A.values[0] / B.values[0] -- confirm that passing
# the full objects here is intended
scipy.stats.kendalltau(x=A, y=B)

## Map visualizations

#### Map plots from each batch based on the spatial locations

In [None]:
# One map per batch (1st and 2nd combination of weights), each requested with show_bbox=True
m0, m1 = [map_points(results[position], show_bbox=True) for position in (0, 1)]

##### Display maps of clustered points side-by-side

In [None]:
# Markup of one inline frame; placeholders: embedded document, float side, width, height
iframe_template = ('<iframe srcdoc="{}" style="float:{}; width: {}px; height: {}px; '
                   'display:inline-block; width: 48.5%; margin: 0 auto; border: 2px solid black"></iframe>')

# Rendered HTML of each map, with double quotes escaped so that it can be embedded in the srcdoc attribute
left_doc = m0.get_root().render().replace('"', '&quot;')
right_doc = m1.get_root().render().replace('"', '&quot;')

# Place the first map on the left and the second one on the right
htmlmap1 = HTML(iframe_template.format(left_doc, 'left', 400, 400)
                + iframe_template.format(right_doc, 'right', 400, 400))

display(htmlmap1)

## Keyword visualizations

#### Top-10 keywords per batch of results

In [None]:
# Specify the attribute containing keywords in the response
col_keywords = 'persons_value'

# One barchart per batch of results (i.e., per combination of weights)
for index, batch in enumerate(results):
    # Use only those keywords above the median frequency for each batch
    kf = filter_dict_median(frequency(batch, col_keywords))
    # Label the chart with the combination of weights as a whole; joining the characters
    # of its string form with spaces would garble the title (e.g. "[ 1 . 0 , ...")
    chart_title = 'keywords for W ' + str(weights.iloc[index])
    # Create barchart
    barchart(kf, plot_width=4, plot_height=3, orientation='Horizontal', plot_title=chart_title, x_axis_label='Frequency', top_k=10)


### A word cloud per batch of results

In [None]:
# Draw a word cloud of the keywords for the 1st and then for the 2nd batch of results
for position in (0, 1):
    plot_wordcloud(results[position], col_keywords)

## Visualizations for numerical attributes

#### Distribution of numerical values for a given attribute using histograms

In [None]:
# Specify the attribute containing the numerical values of interest in the response
col_Numerical = 'timestamp_value'

dfNumerical = [None] * len(weights)   # numeric values per combination of weights
dfBins = [None] * len(weights)        # histogram bin edges per combination of weights
numBins = 20  # fixed number of bins

# Create as many plots as the weight combinations.
# squeeze=False always returns an array of axes, so indexing also works for a single combination.
fig, ax = plt.subplots(1, len(weights), squeeze=False)
ax = ax.ravel()

# Figure size per histogram
fig.set_figheight(3)  # optional setting the height of the image
fig.set_figwidth(16)  # optional setting the width of the image

# Create histogram from numerical data values for each combination of weights
for index in range(len(weights)):
    # Values that cannot be parsed as numbers become NaN
    dfNumerical[index] = pd.to_numeric(results[index][col_Numerical], errors='coerce')
    # Series.min()/max() skip NaN values, unlike the Python built-ins
    lowest = math.floor(dfNumerical[index].min())
    highest = math.ceil(dfNumerical[index].max())
    # numBins bins require numBins + 1 edges; rounding outwards keeps the extreme values inside the range.
    # Store the edges in dfBins so that they are the ones actually passed to hist() below.
    dfBins[index] = np.linspace(lowest, highest, numBins + 1)
    label = str(weights.iloc[index])
    ax[index].hist(dfNumerical[index].dropna(), bins=dfBins[index], alpha=0.8, color=generate_color(weights.iloc[index]))
    ax[index].set(title='W ' + label, ylabel='Frequency')

plt.show()

#### Boxplots to show the median value and the distribution of values per batch

In [None]:
fig, ax = plt.subplots()

# One box per batch of results (i.e., per combination of weights), with NaN values filtered out;
# covers every batch in results instead of only the first two
box_plot_data = [filterNaN(batch[col_Numerical]) for batch in results]
ax.boxplot(box_plot_data)

# Custom ticks: boxes are drawn at positions 1..n and labelled W1..Wn
tick_positions = list(range(1, len(box_plot_data) + 1))
tick_labels = ['W' + str(position) for position in tick_positions]
plt.xticks(tick_positions, tick_labels)

plt.gca().set(title='Distribution per Weight combination', ylabel=col_Numerical)
# Logarithmic scale on the value axis
ax.set_yscale('log')

plt.show()