# Multi-Attribute Similarity Search for Interactive Data Exploration with the SimSearch REST API

## Connection to an instance of the SimSearch service

In [None]:
import json
import math

import numpy as np
import pandas as pd
import requests
import scipy.stats
from matplotlib import pyplot as plt

# display/HTML are part of the public IPython.display module;
# importing them from IPython.core.display is deprecated
from IPython.display import display, HTML

# Various custom helper functions
from functions import results_pairwise, flatten, changeDataType, map_points, weighted_heatmap, filterNaN, filter_dict_median, frequency, barchart, plot_wordcloud, generate_color

# Render floating-point values with a thousands separator and 5 decimal digits
pd.set_option('display.float_format', '{:,.5f}'.format)

In [None]:
# Base URL of the SimSearch REST API; every request below appends its endpoint name to it.
# This value assumes that the service has been deployed locally at port 8090:
serviceURL = "http://localhost:8090/simsearch/api/"

### __*Mount request*__: Define data sources available for similarity search

#####  __IMPORTANT!__ This step needs to be performed __once__ for each data source. 
##### Once data is successfully ingested, it can be queried as long as the SimSearch service is up-and-running.

In [None]:
# Endpoint of the mount request, which indexes the data sources listed in the payload
mount = serviceURL + 'index'

# JSON payload: the data sources and the similarity operations supported on their attributes.
# Here the CSV dataset resides on a REMOTE HTTP server; data may also reside in the server's file system.
# The spatial operation reads two attributes (longitude, latitude) of the original dataset
# and exposes them under a single alias ('position').
params = {
    'sources': [
        {
            'name': 'remotePath1',
            'type': 'csv',
            'url': 'http://download.smartdatalake.eu/datasets/gdelt/',
        },
    ],
    'search': [
        {
            'operation': 'spatial_knn',
            'source': 'remotePath1',
            'dataset': 'sample.csv',
            'header': 'true',
            'separator': ',',
            'key_column': 'article_id',
            'search_column': ['longitude', 'latitude'],
            'alias_column': 'position',
        },
        {
            'operation': 'categorical_topk',
            'source': 'remotePath1',
            'dataset': 'sample.csv',
            'separator': ',',
            'token_delimiter': ';',
            'header': 'true',
            'key_column': 'article_id',
            'search_column': 'persons',
        },
        {
            'operation': 'temporal_topk',
            'source': 'remotePath1',
            'dataset': 'sample.csv',
            'separator': ',',
            'header': 'true',
            'key_column': 'article_id',
            'search_column': 'timestamp',
        },
    ],
}

# IMPORTANT! A mount request needs no API key;
# a new API key is generated once the request completes successfully
headers = {'Content-Type': 'application/json'}

# Submit the mount request
resMount = requests.post(mount, json=params, headers=headers)

# Show the returned message (it carries the API key to use in subsequent requests)
mountMessage = resMount.json()
print(mountMessage)

#### __IMPORTANT!__ Remember your API key for subsequent requests to this SimSearch instance

##### Create a dictionary from the response ...

In [None]:
# Parse the raw JSON text of the mount response into a Python dictionary
mountBody = resMount.text
dictMount = json.loads(mountBody)

##### ... and extract the API key necessary for connecting to the SimSearch instance:

In [None]:
# The mount response holds the API key under 'apiKey' (None if that entry is absent).
# Keep it: every other request against this SimSearch instance must supply it.
API_KEY = dictMount.get('apiKey', None)
API_KEY

### __*Append request*__: Include extra attributes to this SimSearch instance

#### Specify the dataset, the attributes and the respective similarity operations:

In [None]:
# Endpoint of the append request, which indexes extra attributes into the existing SimSearch instance
append = serviceURL + 'append'

# JSON payload: the data source(s) and the similarity operations supported on their attributes.
# In this example, the CSV dataset must be available in the local file system of the server.
# NOTE: the 'directory' value was missing its closing quote, which made this cell a SyntaxError.
params = {
    'sources': [
        {
            'name': 'localPath1',
            'type': 'csv',
            'directory': '/data/gdelt/',
        },
    ],
    'search': [
        {
            'operation': 'numerical_topk',
            'source': 'localPath1',
            'dataset': 'sample.csv',
            'separator': ',',
            'header': 'true',
            'key_column': 'article_id',
            'search_column': 'positive_sentiment',
        },
        {
            'operation': 'numerical_topk',
            'source': 'localPath1',
            'dataset': 'sample.csv',
            'separator': ',',
            'header': 'true',
            'key_column': 'article_id',
            'search_column': 'negative_sentiment',
        },
    ],
}

# IMPORTANT! The API key obtained from the mount request is required for append requests
headers = {'api_key': API_KEY, 'Content-Type': 'application/json'}

# Submit the append request
resAppend = requests.post(append, json=params, headers=headers)

# Show the message returned by the service for this append request
print(resAppend.json())


### __*Catalog request*__: List all queryable attributes

In [None]:
# Endpoint of the catalog request
catalog = serviceURL + 'catalog'

# An empty JSON payload lists all available attributes ...
params = dict()

# ... alternatively, restrict the listing to one type of similarity operation:
#params= {'operation': 'numerical_topk'}

# The API key is required for catalog requests
headers = {
    'api_key': API_KEY,
    'Content-Type': 'application/json',
}

# Submit the catalog request; the service answers with a JSON array
response = requests.post(catalog, json=params, headers=headers)
#print(response.json())

Report the queryable attributes, their data types, and their supported similarity operations:

In [None]:
# Tabulate the JSON array returned by the catalog request
catalogEntries = response.json()
attrs = pd.DataFrame(catalogEntries)
attrs

### __*Search request*__: submit a top-*k* similarity search query

#### Top-k value

In [None]:
# How many top-ranked results the search request should return
k = 30

#### Query values per attribute involved in this search request:

In [None]:
# One query value per attribute; each must conform to the data type of its attribute
valKeywords = [
    'donald trump',
    'joe biden',
    'vladimir putin',
]
valTimestamp = "2019-07-14 12:45:00"
valPosSentiment = "2.5"
valPosition = "POINT(2.35 48.85)"

#### Weight specifications

In [None]:
# Each attribute gets a list of weights: the i-th entries of all lists form the i-th weight combination.
# Two combinations are given here, so two lists of top-k results will be returned.
weightKeywords = ["1.0", "0.8"]
weightTimestamp = ["1.0", "0.9"]
weightPosSentiment = ["1.0", "0.3"]
weightPosition = ["1.0", "0.6"]

#### Rank method to apply for similarity search

In [None]:
# Ranking algorithm to apply; possible values:
# 'threshold' (default); 'no_random_access'; 'partial_random_access'; 'pivot_based'.
rankMethod = "threshold"

#### Compose & submit this search request

In [None]:
# Endpoint of the search request
search = serviceURL + 'search'

# Query parameters: ranking algorithm, k, and one query (value + weights) per attribute.
# 'extra_columns' names attributes that take no part in the similarity conditions
# but should be included in the list of query results.
params = {
    'algorithm': rankMethod,
    'output': {'extra_columns': ['negative_sentiment', 'name']},
    'k': k,
    'queries': [
        {'column': 'persons', 'value': valKeywords, 'weights': weightKeywords},
        {'column': 'positive_sentiment', 'value': valPosSentiment, 'weights': weightPosSentiment},
        {'column': 'timestamp', 'value': valTimestamp, 'weights': weightTimestamp},
        {'column': 'position', 'value': valPosition, 'weights': weightPosition},
    ],
}

# A valid API key is required for search requests
headers = {
    'api_key': API_KEY,
    'Content-Type': 'application/json',
}

# Submit the search request; the service answers with a JSON array
resSearch = requests.post(search, json=params, headers=headers)
#print(resSearch.json())

Report final ranked results: An array of top-k results is returned for each specified combination of weights.
For each combination, a similarity matrix is also returned, measuring the similarity between all pairs of the top-k results.

In [None]:
# Tabulate the JSON array returned by the search request
searchResponse = resSearch.json()
df = pd.DataFrame(searchResponse)
df

Print a given combination of weights:

In [None]:
# One entry per weight combination, taken from the 'weights' column of the response
weights = df['weights']
# E.g., show the ***2nd*** combination of weights for the attributes (position 1)
secondCombination = weights.iloc[1]
print(secondCombination)

### Top-*k* results for each combination of weights
Also flatten attribute values and scores contained in the nested JSON array returned as response:

In [None]:
# One flattened table of ranked results per combination of weights.
# According to the original notes, this flattening returns geodataframes,
# i.e., one column holds geometries (point locations).
results = [
    flatten(df['rankedResults'].iloc[position])
    for position in range(len(weights))
]

#### Listing of results for a given batch

In [None]:
# Render the 2nd batch of results (position 1) as an HTML table with clickable URLs
htmlTable = results[1].to_html(render_links=True, escape=False)
display(HTML(htmlTable))

# Alternative: plain listing of the results for the 2nd combination of weights
#results[1]

### Intra-Correlation: Similarity of the results for a given combination of weights

In [None]:
# One subplot per weight combination, laid out in a single row
numCombinations = len(weights)
fig, ax = plt.subplots(1, numCombinations, figsize=(10,10))

simMatrix = [None] * numCombinations
# For each weight combination: pivot its pairwise scores into a matrix
# (rows keyed by 'left', columns by 'right') and draw the matrix as an image
for index in range(numCombinations):
    plt.subplot(1, numCombinations, index+1)
    pairScores = pd.DataFrame(df['similarityMatrix'].iloc[index])
    simMatrix[index] = pairScores.pivot(index='left', columns='right', values='score')
    plt.imshow(simMatrix[index], interpolation='none')
    plt.title('W' + str(index+1))
### Inter-Correlation: Statistics regarding pairwise correlation of results

##### First, create lists of rankings for two batches of results (i.e., from two combinations of weights)

In [None]:
# E.g., A is derived from the first and B from the second batch of SimSearch results
firstBatch, secondBatch = results[0], results[1]
A, B = results_pairwise(firstBatch, secondBatch)

##### Pearson's:

In [None]:
# Pearson correlation between the first rows of A and B
# (presumably the two rankings produced by results_pairwise -- confirm against that helper)
rankingA, rankingB = A.values[0], B.values[0]
scipy.stats.pearsonr(rankingA, rankingB)

##### Spearman's rho:

In [None]:
# Spearman rank correlation between the first rows of A and B
# (presumably the two rankings produced by results_pairwise -- confirm against that helper)
rankingA, rankingB = A.values[0], B.values[0]
scipy.stats.spearmanr(rankingA, rankingB)

##### Kendall's tau:

In [None]:
# Kendall rank correlation between A and B.
# NOTE(review): unlike the Pearson and Spearman cells, this call receives A and B as a whole
# rather than A.values[0] and B.values[0] -- confirm that this difference is intended.
scipy.stats.kendalltau(x=A, y=B)

## Map visualizations

#### Map plots from each batch based on the spatial locations

In [None]:
listMapPoints = list()   # clustered points with a BBOX showing the overall spatial extent
listHeatmaps = list()    # heatmaps illustrating the spatial density

# Build both kinds of map plot for every batch of results (one batch per weight combination)
for index in range(len(weights)):
    batch = results[index]
    listMapPoints.append(map_points(batch, show_bbox=True))
    listHeatmaps.append(weighted_heatmap(batch, radius=20))

### Plot clustered points for each batch of results

In [None]:
contents = ''

# Entries that are None are skipped, so only the remaining maps share the available width
availableMaps = [m for m in listMapPoints if m is not None]
numPlots = len(availableMaps)

if numPlots == 0:
    # Without this guard the width computation below raises ZeroDivisionError
    print('No map of clustered points is available for display.')
else:
    percent = (100.0/numPlots) - 0.5

    # Construct an HTML for displaying maps side-by-side: one iframe per map, embedding the
    # rendered map document in 'srcdoc' (double quotes are escaped to keep the attribute intact).
    # The percentage width comes last in the style, so it takes precedence over the pixel width.
    iframeTemplate = '<iframe srcdoc="{}" style="float:left; width: {}px; height: {}px; display:inline-block; width: {}%; margin: 0 auto; border: 2px solid black"></iframe>'
    contents = ''.join(
        iframeTemplate.format(m.get_root().render().replace('"', '&quot;'), 400, 400, percent)
        for m in availableMaps
    )

    display(HTML(contents))

### Plot heatmaps for each batch of results

In [None]:
contents = ''

# Entries that are None are skipped, so only the remaining heatmaps share the available width
availableHeatmaps = [m for m in listHeatmaps if m is not None]
numPlots = len(availableHeatmaps)

if numPlots == 0:
    # Without this guard the width computation below raises ZeroDivisionError
    print('No heatmap is available for display.')
else:
    percent = (100.0/numPlots) - 0.5

    # Construct an HTML for displaying heatmaps side-by-side: one iframe per heatmap, embedding the
    # rendered map document in 'srcdoc' (double quotes are escaped to keep the attribute intact).
    # The percentage width comes last in the style, so it takes precedence over the pixel width.
    iframeTemplate = '<iframe srcdoc="{}" style="float:left; width: {}px; height: {}px; display:inline-block; width: {}%; margin: 0 auto; border: 2px solid black"></iframe>'
    contents = ''.join(
        iframeTemplate.format(m.get_root().render().replace('"', '&quot;'), 400, 400, percent)
        for m in availableHeatmaps
    )

    display(HTML(contents))

##### Display maps of clustered points side-by-side

## Keyword visualizations

##### **IMPORTANT!** First, specify the attribute that contains _keywords_, required for creating word clouds:

In [None]:
# Name of the column in the flattened results that holds the keywords
attrKeywords = "persons_value"

#### Top-10 keywords per batch of results

In [None]:
# One horizontal barchart of the 10 most frequent keywords per batch of results
for index in range(len(weights)):
    # Keyword frequencies of this batch, restricted to those above the median frequency
    keywordCounts = frequency(results[index], attrKeywords)
    frequentKeywords = filter_dict_median(keywordCounts)
    barchart(
        frequentKeywords,
        plot_width=4,
        plot_height=3,
        orientation='Horizontal',
        plot_title='keywords for W'+str(index+1),
        x_axis_label='Frequency',
        top_k=10,
    )

### A word cloud per batch of results

In [None]:
# Draw one word cloud per batch of results (i.e., per weight combination),
# instead of hard-coding the first two batches
for index in range(len(weights)):
    plot_wordcloud(results[index], attrKeywords)

## Visualizations for numerical attributes

### Histograms to display distribution of values for numerical attributes

##### **IMPORTANT!** First, specify the attribute that contains _numerical_ values

In [None]:
# Specify the attribute containing the numerical values of interest in the response
attrNumeric = 'positive_sentiment_value'

#### Frequency histograms

In [None]:
dfNumerical = [None] * len(weights)
dfBins = [None] * len(weights)
numBins = 20  # fixed number of bin edges generated per histogram

# Create as many plots as the weight combinations.
# squeeze=False always yields a 2-D grid of axes; its single row is kept as `ax`,
# so that ax[index] also works when there is only one weight combination.
fig, axesGrid = plt.subplots(1, len(weights), squeeze=False)
ax = axesGrid[0]

# Figure size per histogram
fig.set_figheight(3) # optional setting the height of the image
fig.set_figwidth(16) # optional setting the width of the image

# Create histogram from numerical data values for each combination of weights
for index, item in enumerate(weights):
    dfNumerical[index] = results[index][attrNumeric]

    # Plot only proper numbers: values that cannot be parsed become NaN and are dropped
    values = pd.to_numeric(dfNumerical[index], errors='coerce').dropna()

    if len(values) > 0 and values.min() < values.max():
        # Equally spaced bin edges over the full range of the data, so that no value is left out
        # (rounding the bounds inwards to integers would discard the values beyond them)
        dfBins[index] = np.linspace(values.min(), values.max(), numBins)
    else:
        # Degenerate range (no values, or all identical): let matplotlib choose the edges
        dfBins[index] = numBins

    # Previously the computed edges were never stored, so dfBins[index] was still None here
    ax[index].hist(values, bins=dfBins[index], alpha = 0.8) #, color = generate_color(weights[index]))
    ax[index].set(title='W'+str(index+1), ylabel='Frequency', xlabel=attrNumeric)

plt.show()

#### Boxplots to show the median value and the distribution of values per batch

In [None]:
fig, ax = plt.subplots()

# One box per weight combination (instead of hard-coding the first two batches);
# NaN values are filtered out of each batch before plotting
box_plot_data = [filterNaN(results[index][attrNumeric]) for index in range(len(weights))]
ax.boxplot(box_plot_data)

# Custom ticks: boxes are placed at positions 1..n and labelled W1..Wn
tickPositions = list(range(1, len(box_plot_data) + 1))
tickLabels = ['W' + str(position) for position in tickPositions]
plt.xticks(tickPositions, tickLabels)

plt.gca().set(title='Distribution per Weight combination', ylabel=attrNumeric)
# Logarithmic y-axis; NOTE(review): zero or negative values cannot be drawn on it
ax.set_yscale('log')

plt.show()

### Plot distribution on date/time attribute

##### **IMPORTANT!** First, specify the date/time attribute of interest:

In [None]:
# Name of the column in the flattened results that holds the date/time values
attrTemporal = "timestamp_value"

#### Frequency histograms on MONTH (extracted from timestamp values)

In [None]:
dfTemporal = [None] * len(weights)

# Create as many plots as the weight combinations.
# squeeze=False always yields a 2-D grid of axes; its single row is kept as `ax`,
# so that ax[index] also works when there is only one weight combination.
fig, axesGrid = plt.subplots(1, len(weights), squeeze=False)
ax = axesGrid[0]

# Figure size per histogram
fig.set_figheight(3) # optional setting the height of the image
fig.set_figwidth(16) # optional setting the width of the image

# Plot the number of results per MONTH for each combination of weights
for index, item in enumerate(weights):
    dfTemporal[index] = results[index][attrTemporal]
    # The .dt accessor requires datetime-like values
    # (assumes the flattened results already hold them -- TODO confirm)
    monthOfResult = dfTemporal[index].dt.month
    dfTemporal[index].groupby(monthOfResult).count().plot.bar(ax=ax[index])
    ax[index].set(title='W'+str(index+1), ylabel='Frequency', xlabel='Month')