# Interacting with the SimSearch REST API

## Connection to an instance of the SimSearch service

In [None]:
import requests
import json
import pandas as pd  

# Show floating-point values with five decimal places (and thousands separators) in rendered data frames
pd.set_option('display.float_format', '{:,.5f}'.format)

In [None]:
# Base endpoint of the SimSearch REST API; every request below appends its path to this URL.
# This value targets a SimSearch service deployed locally at port 8090 -- adjust it for your own instance.
serviceURL = "http://localhost:8090/simsearch/api/"

### Mount request: Define data sources available for similarity search

In [None]:
# Endpoint of the mount request: the service indexes the data sources described in the payload
mount = f'{serviceURL}index'

# Payload with two parts:
#   'sources' -> where the data resides (here, a CSV directory on a remote HTTP server)
#   'search'  -> one entry per attribute to be indexed, with the similarity operation it supports
params = {
    'sources': [
        {
            'name': 'remotePath1',
            'type': 'csv',
            'url': 'http://download.smartdatalake.eu/datasets/gdelt/',
        },
    ],
    'search': [
        {
            'operation': 'spatial_knn',
            'source': 'remotePath1',
            'dataset': 'sample.csv',
            'header': 'true',
            'separator': ',',
            'key_column': 'article_id',
            'search_column': ['longitude', 'latitude'],
        },
        {
            'operation': 'categorical_topk',
            'source': 'remotePath1',
            'dataset': 'sample.csv',
            'separator': ',',
            'token_delimiter': ';',
            'header': 'true',
            'key_column': 'article_id',
            'search_column': 'persons',
        },
        {
            'operation': 'numerical_topk',
            'source': 'remotePath1',
            'dataset': 'sample.csv',
            'separator': ',',
            'header': 'true',
            'key_column': 'article_id',
            'search_column': 'timestamp',
        },
    ],
}

# IMPORTANT! Mount requests do not need an API key;
# a new API key is generated once this request completes successfully
headers = {'Content-Type': 'application/json'}

# Submit the mount request
response = requests.post(url=mount, headers=headers, json=params)

# Show the resulting message (it carries the API key to be used in subsequent requests)
print(response.json())

### Remember your API key

In [None]:
# Paste here the API key returned by the mount request above;
# it must accompany every other request issued against this instance
API_KEY = ""

### Catalog request: List the queryable attributes

In [None]:
# Endpoint of the catalog request
catalog = f'{serviceURL}catalog'

# An empty JSON specification lists all available attributes ...
params = dict()

# ... alternatively, restrict the listing to one type of similarity operation
#params= {'operation': 'numerical_topk'}

# Catalog requests must carry the API key
headers = {
    'api_key': API_KEY,
    'Content-Type': 'application/json',
}

# Submit the catalog request; the service answers with a JSON array
response = requests.post(url=catalog, headers=headers, json=params)
#print(response.json())

Report the queryable attributes and their supported similarity operations. Note that the spatial operation makes use of two attributes (longitude, latitude) available in the original dataset:

In [None]:
# Turn the JSON array returned by the catalog request into a data frame and display it
df = pd.DataFrame(data=response.json())
df

### Search request: top-k similarity search query

In [None]:
# Endpoint of the search request
search = serviceURL + 'search'

# Count of top-k results to be returned
k = 20

# Rank aggregation method to be used.
# Possible values for the ranking algorithm: 'threshold' (default); 'no_random_access'; 'partial_random_access'.
rankMethod = 'threshold'

# Query values for similarity search
valKeywords = ['joe biden','donald trump']
valLocation = 'POINT(-74.94 42.15)'
valTimeStamp = '20191104084500'

# Specify all query parameters
# Note that multiple combinations of weights are specified per attribute -> In this example, two lists of top-k results will be computed
# (the i-th weight of every attribute belongs to the i-th combination)
params = {
    'algorithm': rankMethod,
    'k': k,
    'queries': [
        {'column': 'persons', 'value': valKeywords, 'weights': ['1.0','0.8']},
        # The timestamp query must reference valTimeStamp exactly as defined above;
        # a differently-cased name raises NameError on a fresh kernel
        {'column': 'timestamp', 'value': valTimeStamp, 'weights': ['1.0','0.4']},
        {'column': ['longitude','latitude'], 'value': valLocation, 'weights': ['1.0','0.7']},
    ],
}

# Valid API key is required for such requests
headers = { 'api_key' : API_KEY, 'Content-Type' : 'application/json'}

# Post this request with these parameters to the SimSearch service; response is given in a JSON array
response = requests.post(search, json=params, headers=headers)
#print(response.json())

Report final ranked results: An array of top-k results is returned for each specified combination of weights.

For each combination, a similarity matrix is also returned, measuring the similarity between all pairs of the top-k results.

In [None]:
# One row per combination of weights, built from the JSON array returned by the search request
df = pd.DataFrame(data=response.json())
df

Get the top-k results for a given combination of weights:

In [None]:
# Pick the 2nd combination of weights (position 1) and show the weights applied to the attributes
weights = df['weights'].iat[1]
print(weights)

# Top-k results computed for that same 2nd combination of weights
results = df['rankedResults'].iat[1]

# Expand the result records into tabular form
dfResults = pd.json_normalize(data=results)
dfResults

Flatten attribute values and scores contained in the nested array:

In [None]:
# Isolate the properties contained in the nested 'attributes' array of each result
# (attribute name, its value for each result, and its similarity score);
# the 'id' of the enclosing result is repeated on every row
dfAttr = pd.json_normalize(results, 'attributes', ['id'])

# Also, replace the name of the spatial attribute in case that it consists of two original attributes (e.g., longitude, latitude).
# Assign the result back: an in-place replace on the column selection works on a temporary object
# under pandas copy-on-write and may leave dfAttr unchanged.
dfAttr['name'] = dfAttr['name'].replace('[longitude, latitude]', 'location')

# Pivot to one row per result id, with one column per (property, attribute name) pair
dfAttrFlat = dfAttr.set_index(['id','name']).unstack()

# Collapse the two-level column labels into '<attribute name>_<property>'
dfAttrFlat.columns = [f'{j}_{i}' for i, j in dfAttrFlat.columns]

# Turn 'id' back into a regular column; reset_index() returns a new frame, so keep the result
dfAttrFlat = dfAttrFlat.reset_index()

# Rest of the properties in the results
dfRest = dfResults.loc[:, ['id','score','rank','exact']]

# Merge with the flattened attribute properties
# This provides all information about this batch of top-k results
dfFinal = pd.merge(dfRest, dfAttrFlat, on='id')

# Top-k results in a flattened data frame available for post-processing (e.g., map visualization, statistics, etc.)
dfFinal