# Purpose of this notebook



In [1]:
# prompt: using the subprocess module, run pip install -r on the file at https://raw.githubusercontent.com/rdhyee/isamples-examples/exploratory/requirements.in

import subprocess

def in_colab():
    """Report whether the active IPython shell looks like a Google Colab shell.

    Returns:
        bool: True when the string form of the current IPython shell object
        contains 'google.colab'; False otherwise, including when IPython
        cannot be imported.
    """
    try:
        from IPython.core.getipython import get_ipython as current_shell
        shell_description = str(current_shell())
    except ImportError:
        # IPython is not importable, so this cannot be a Colab notebook.
        return False
    return 'google.colab' in shell_description


import sys

if in_colab():
    # Invoke pip through the interpreter that is running this kernel so the
    # packages are installed into the environment the notebook actually uses.
    # check=True raises CalledProcessError if the install fails, instead of
    # letting later import cells fail with a less obvious error.
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install', '-r',
         'https://raw.githubusercontent.com/rdhyee/isamples-examples/exploratory/requirements.in'],
        check=True,
    )

In [None]:
# create pybash macro
# https://stackoverflow.com/a/67029719/7782
from IPython import get_ipython
from IPython.core.magic import register_cell_magic

ipython = get_ipython()

@register_cell_magic
def pybash(line, cell):
    """Cell magic: fill `{name}` placeholders from globals, then run the cell as bash.

    Args:
        line: Text that follows ``%%pybash`` on the magic line (not used).
        cell: Cell body; it is passed through ``str.format`` with this
            module's globals before being handed to the ``bash`` cell magic.
    """
    # str.format treats every brace pair as a placeholder, so literal braces
    # in the bash script have to be doubled ({{ and }}).
    namespace = globals()
    rendered_script = cell.format(**namespace)
    ipython.run_cell_magic('bash', '', rendered_script)

In [None]:
import json
import logging
import httpx
import xarray
import pysolr
import multidict
from datetime import datetime


from urllib.parse import quote

import pandas as pd
from pandas import DataFrame, Series
import numpy as np

import matplotlib.pyplot as plt

from collections import Counter
from isbclient import IsbClient, MAJOR_FIELDS, FL_DEFAULT, FACET_FIELDS_DEFAULT, FACET_RANGE_FIELDS_DEFAULT, ISAMPLES_SOURCES

# creating a subclass of IsbClient because we're still working out the best ways to interact with the API
from isbclient import IsbClient2

from isbclient import format_date_for_solr, create_date_range_query, filter_null_values
from isbclient import monkey_patch_select, SWITCH_TO_POST


from itertools import islice

# Set the root logger's threshold to INFO.
logging.getLogger().setLevel(logging.INFO)

# monkeypatch pysolr?
monkey_patch_select(active=True)
# NOTE(review): this rebinds only the notebook-level name SWITCH_TO_POST (imported
# above with `from isbclient import ...`); it does not assign to the attribute
# inside the isbclient module. If isbclient reads its own SWITCH_TO_POST, this
# line has no effect there -- TODO confirm, and set isbclient.SWITCH_TO_POST if so.
SWITCH_TO_POST = 10000


# The overall iSamples API

* https://central.isample.xyz/isamples_central/docs is the swagger UI
* https://central.isample.xyz/isamples_central/openapi.json is the OpenAPI spec file for the iSamples API.

There are Python libraries that let developers interact with an API specified by an OpenAPI spec, but my current thought is that they don't make life any easier than working with pieces of the API by hand.


In [None]:
# https://central.isample.xyz/isamples_central/openapi.json is an OPENAPI 3.x spec

OPENAPI_URL = 'https://central.isample.xyz/isamples_central/openapi.json'
r = httpx.get(OPENAPI_URL)
r.json()['paths'].keys()

# /thing/select: Solr-based select interface

In [None]:
# focus on /thing/select endpoint
r = httpx.get(OPENAPI_URL)
r.json()['paths']['/thing/select']['get']

# documentation about Solr query language

[The Standard Query Parser | Apache Solr Reference Guide 8.11](https://solr.apache.org/guide/8_11/the-standard-query-parser.html#standard-query-parser-parameters):

> Solr’s default Query Parser is also known as the “lucene” parser.   
> [....]   
> q: Defines a query using standard query syntax. This parameter is mandatory

Note [Differences between Lucene’s Classic Query Parser and Solr’s Standard Query Parser](https://solr.apache.org/guide/8_11/the-standard-query-parser.html#differences-between-lucenes-classic-query-parser-and-solrs-standard-query-parser)

there are "existence searches" [The Standard Query Parser | Apache Solr Reference Guide 8.11](https://solr.apache.org/guide/8_11/the-standard-query-parser.html#existence-searches):

> An existence search for a field matches all documents where a value exists for that field. To query for a field existing, simply use a wildcard instead of a term in the search.
>
> field:*
>
> A field will be considered to "exist" if it has any value, even values which are often considered "not existent". (e.g., NaN, "", etc.)

Good tutorial on the query syntax of Solr (apart from the official documentation): [Solr Query Syntax and Examples](https://yonik.com/solr/query-syntax/)


## why the action is on fq and not q

We set `q=*:*` and vary `fq`. Solr caches the result of each filter query separately from the main query, so a filter that has been used before can be answered from the cache. Changing `fq` also doesn't change the relevance score. (A better explanation should be put here, because the distinction between `q` and `fq` is not obvious to people new to Solr; see [Difference between q and fq in Solr - Stack Overflow](https://stackoverflow.com/questions/20988516/difference-between-q-and-fq-in-solr).)

Translate the UI from https://central.isample.xyz/isamples_central/ui into widgetized forms to formulate query

* display number of hits
* display facets

map
dataframe


# IsbClient2

In [None]:
cli = IsbClient2()

# get OpenContext sourced records
fq = cli._fq_from_kwargs(source=('OPENCONTEXT',), collection_date_end=str(datetime.now().year))

params = cli.default_search_params(fq=fq, fl=FL_DEFAULT, rows=100, **FACET_RANGE_FIELDS_DEFAULT)

In [None]:
params

In [None]:
cli.facets(params=params)

In [None]:
# use the /thing/select endpoint directly; the result is indexed like the raw
# Solr JSON (response -> numFound / docs)
response = cli.search(params=params, thingselect=True)
# print number of hits
print(response['response']['numFound'])

# Build the table from the matching documents rather than from the top-level
# response envelope, so that each row is one record and each column one field.
df = DataFrame(response['response']['docs'])
df.head()

In [None]:
df

In [None]:
# how many OpenContext records are NOT geocoded?
# Each field name below carries a leading "-", which in Solr negates the clause,
# so the filter selects records with no latitude and no longitude value
# (assuming _fq_from_kwargs passes the `_multi` keys through unchanged -- TODO confirm).

import multidict 

# get OpenContext sourced records
# fq=-lat:[* TO *] AND -long:[* TO *]&rows=0
# fq = cli._fq_from_kwargs(source=('OPENCONTEXT',), collection_date_end=str(datetime.now().year))
geodict = multidict.MultiDict({
  '-producedBy_samplingSite_location_latitude':'[* TO *]', 
  '-producedBy_samplingSite_location_longitude': '[* TO *]'
})

fq = cli._fq_from_kwargs(source=('OPENCONTEXT',), collection_date_end=str(datetime.now().year), 
        _multi=geodict )

params = cli.default_search_params(fq=fq, fl=FL_DEFAULT, rows=100, **FACET_RANGE_FIELDS_DEFAULT)

# use the /thing/select endpoint directly
response = cli.search(params=params, thingselect=True)
# print number of hits
print(response['response']['numFound'])
# Take at most 300 of the returned documents. Iterating `response` itself would
# only walk the top-level keys of the result mapping, not the records.
results = islice(response['response']['docs'], 300)

In [None]:
cli._fq_from_kwargs(source=('OPENCONTEXT',), collection_date_end=str(datetime.now().year), 
        producedBy_samplingSite_location_latitude='[* TO *]', producedBy_samplingSite_location_longitude='[* TO *]' )

In [None]:
logging.getLogger().setLevel(logging.CRITICAL)

# monkeypatch pysolr?
monkey_patch_select(active=True)
SWITCH_TO_POST = 100000

cli = IsbClient2()
# build fq: OpenContext source and search for bone
fq = cli._fq_from_kwargs(source=('OPENCONTEXT',), searchText="bone")
params = cli.default_search_params(fq=fq, fl=FL_DEFAULT, rows=100, **FACET_RANGE_FIELDS_DEFAULT)

# use pysolr to get the results
response = cli.search(params=params)
# print number of hits
print (len(response))
results = islice(response, 1000)

df = DataFrame(results)
df.head(2)

In [None]:
cli.record_count(params=params)

In [None]:
# goal: figure out how to get facet counts and pivoting

cli = IsbClient2()
# build fq: OpenContext source and search for bone
fq = cli._fq_from_kwargs(source=('OPENCONTEXT',), searchText="bone")
params = cli.default_search_params(fq=fq, fl=FL_DEFAULT, rows=100, **FACET_RANGE_FIELDS_DEFAULT)

In [None]:
resp0 = cli.search(params=params, thingselect=True)
resp0.get("facet_counts",{}).get("facet_fields",{}).keys() #.get(field, [])

In [None]:
# Inspect the type returned by the thingselect=True search in the previous cell.
# (`query` is not assigned until a later cell, so referring to it here raises
# NameError when the notebook is run top to bottom on a fresh kernel.)
type(resp0)

In [None]:
# let's look at the data coming back and see how to make sense of them.
# expect the columns in the DataFrame to be a proper subset of FL_DEFAULT


def set_diff(a, b):
    """Return the two one-sided differences between two collections.

    Args:
        a: Iterable of hashable items.
        b: Iterable of hashable items.

    Returns:
        tuple[set, set]: ``(items only in a, items only in b)``.
    """
    # Materialise each input exactly once so that one-shot iterables
    # (generators, iterators) are not already exhausted when the second
    # difference is computed.
    items_a, items_b = set(a), set(b)
    return items_a - items_b, items_b - items_a
    


assert set(df.columns) - set(FL_DEFAULT) == set()


len(df)

In [None]:
# can I get type information from the API?
# it doesn't seem like /thing/select will return type information

# save a copy of df to df0
df0 = df.copy()

# df.infer_objects().dtypes
df = df.convert_dtypes()

# some of the columns are datetimes
for k in ['sourceUpdatedTime', 'producedBy_resultTime', 'producedBy_resultTimeRange']:
    df[k] = pd.to_datetime(df[k], errors='coerce').dt.tz_localize(None)

# spit out to Excel to look at the data in spreadsheet form
df.to_excel('bone.xlsx')


In [None]:
df['sourceUpdatedTime'].describe()

In [None]:
import matplotlib.pyplot as plt

# Derive the calendar dates into a separate Series instead of overwriting the
# DataFrame column. The column keeps its datetime dtype, so this cell can be
# re-run (the .dt accessor fails on a column already converted to date objects)
# and later cells still see the original timestamps.
source_updated_dates = df['sourceUpdatedTime'].dt.date

# Plot a histogram
plt.figure(figsize=(10,6))
source_updated_dates.hist(rwidth=0.9, bins=30)
plt.title('Distribution of sourceUpdatedTime')
plt.xlabel('Date')
plt.ylabel('Frequency')
plt.show()

In [None]:
import ipydatagrid as ipg
ipg.__version__

In [None]:
len(df)

In [None]:
# load the df into ipydatagrid
from ipydatagrid import DataGrid

dg = DataGrid(df, editable=True)
dg

In [None]:
df.shape

In [None]:
# what type of events are supported by ipydatagrid
# selection events, what rows are shown? what columns?
dg.selection_mode, dg.selected_cells

In [None]:
def analyze_selection_from_dg(selection, total_rows, total_columns):
    """Print which rows of a grid selection are selected across every column.

    Args:
        selection: Iterable of cell mappings; each must provide its row index
            under the key 'r'.
        total_rows: Number of rows in the grid (accepted but not used).
        total_columns: Number of selected cells a row needs in order to count
            as fully selected.
    """
    # Tally the selected cells per row; rows keep first-seen order.
    cells_per_row = Counter(cell['r'] for cell in selection)

    # A row is complete when its tally equals the column count.
    complete_rows = [
        row for row, hits in cells_per_row.items() if hits == total_columns
    ]

    if not complete_rows:
        print("No full rows selected.")
        return
    print(f"Full rows selected: Rows {complete_rows}")

# Assuming you have a DataFrame 'df' and a DataGrid 'dg' with selections as described
total_rows, total_columns = df.shape  # As per your DataFrame's shape

# Example selection (assuming this comes from dg.selected_cells)
selected_cells =  dg.selected_cells
# Analyze the selection
analyze_selection_from_dg(selected_cells, total_rows, total_columns)

In [None]:
# use Jupyter widgets to allow for change in searchText and display the number of results in a output widget

import ipywidgets as widgets
from IPython.display import display

# Raise the root logger's threshold to CRITICAL for this cell onward.
logging.getLogger().setLevel(logging.CRITICAL)

# monkeypatch pysolr?
monkey_patch_select(active=True)
SWITCH_TO_POST = 100000

cli = IsbClient2()

# build fq: full-text search for "bone" (no source filter is passed here)
fq = cli._fq_from_kwargs(searchText="bone")
params = cli.default_search_params(fq=fq, fl=FL_DEFAULT, rows=10, **FACET_RANGE_FIELDS_DEFAULT)
query = cli.search(params=params)
# `query` and `params` are left in the notebook namespace and are read by later cells
num_hits = len(query)

# Create a text input widget
search_text = widgets.Text(
    value='',
    placeholder='Type something',
    description='Search:',
)

# add a date range widget

producedby_range_slider = widgets.IntRangeSlider(
    value=[1800, 2024],
    min=1800,
    max=2024,
    step=1,
    description='ProducedBy ResultTime:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d',
)

# Create an output widget
output = widgets.Output()

# Define a function to handle changes to the text input
def on_text_change(change):
    """Re-run the search with the current widget values and show the hit count.

    Registered as the observer for both the search box and the year-range
    slider, so it reads the current value of each widget rather than `change`.

    Args:
        change: Change notification passed by the observed widget (not used).
    """
    # Get the current search text and range values
    new_search_text = search_text.value
    new_range = producedby_range_slider.value

    # Do all of the work inside the Output context: anything printed, and any
    # traceback from a failing search, is then rendered in the output widget
    # instead of being lost outside the notebook's visible output.
    with output:
        # wait=True keeps the previous result on screen until the new one arrives
        output.clear_output(wait=True)

        fq = cli._fq_from_kwargs(searchText=new_search_text, collection_date_start=new_range[0], collection_date_end=new_range[1])
        params = cli.default_search_params(fq=fq, fl=FL_DEFAULT, rows=10, **FACET_RANGE_FIELDS_DEFAULT)
        query = cli.search(params=params)
        num_hits = len(query)

        print(f"Number of hits: {num_hits}")  # Display the hit count for the new query

# Attach the event handler to the text input and range slider
search_text.observe(on_text_change, names='value')
producedby_range_slider.observe(on_text_change, names='value')

# Display the widgets
# align the widgets vertically
display(widgets.VBox([producedby_range_slider, search_text, output]))


In [None]:
params

In [None]:
# write out the call to iSamples using httpx to compare get vs post

import httpx
# No trailing slash: request paths are appended as f'{ISB_SERVER}/...', and a
# trailing slash here would put '//' into the request URL.
ISB_SERVER = "https://central.isample.xyz/isamples_central"

r = httpx.request('GET', f'{ISB_SERVER}/thing/select', params=params)
r.json()['response']['numFound']

In [None]:
# make a post request version

from urllib.parse import urlencode

headers = {
    "Content-type": "application/x-www-form-urlencoded; charset=utf-8"
}

# doseq=True emits one key=value pair per element of any list-valued parameter
# (e.g. several fq filters); without it the list's repr is sent as one value.
params_encoded = urlencode(params, doseq=True)
# The body is already form-encoded text, so send it as raw content.
# rstrip('/') avoids a '//' in the URL if ISB_SERVER ends with a slash.
r = httpx.post(f"{ISB_SERVER.rstrip('/')}/thing/select", content=params_encoded, headers=headers)
r

In [None]:
r.json()

In [None]:
assert set(query.raw_response.keys()) == set(['responseHeader', 'response', 'nextCursorMark', 'facet_counts'])

In [None]:
# dict_keys(['facet_queries', 'facet_fields', 'facet_ranges', 'facet_intervals', 'facet_heatmaps'])
query.raw_response['facet_counts'].keys()

query.raw_response['facet_counts']['facet_fields'].keys()

In [None]:
query.raw_response['facet_counts']['facet_fields']['source']

In [None]:
from ipytree import Tree, Node
from ipyleaflet import Map, Marker
from ipywidgets import HBox, link, Layout

m = Map(center=[47.51, 4.04], zoom=4, layout=Layout(height='400px'))
tree = Tree()
tree.layout.width = '40%'
box = HBox([tree, m])

markers_node = Node('Markers')
tree.add_node(markers_node)

layers_node = Node('Layers', icon='map')
tree.add_node(layers_node)

cities = [
    {'name': 'London', 'location': [51.5074, 0.1278]},
    {'name': 'Paris', 'location': [48.8566, 2.3522]},
    {'name': 'Barcelona', 'location': [41.31, 2.109]}
]

for city in cities:
    marker = Marker(location=city.get('location'))
    node = Node(city.get('name'), icon='map-marker')

    link((marker, 'visible'), (node, 'selected'))

    m.add_layer(marker)
    markers_node.add_node(node)

box

In [None]:
# query.raw_response.keys() --> dict_keys(['responseHeader', 'response', 'nextCursorMark', 'facet_counts'])
query.raw_response['facet_counts']['facet_ranges'].keys()

In [None]:
# keys: dict_keys(['facet_queries', 'facet_fields', 'facet_ranges', 'facet_intervals', 'facet_heatmaps'])
query.raw_response['facet_counts']['facet_ranges'].keys()

In [None]:
# 'responseHeader', 'index', 'schema', 'info'
r = cli._request("thing/select/info")
r.keys()

In [None]:
r['schema']['fields'].keys()

In [None]:
# timeout internal server error -- skip trying to query thing/types right now. https://github.com/isamplesorg/isamples_inabox/issues/351
if False:
    r = cli._request("thing/types")

In [None]:
# types and classnames for all the fields on the system
Counter([(x['type'], r['schema']['types'][x['type']]['className']) for x in r['schema']['fields'].values()])

In [None]:
# e.g, I for Indexed, T for Tokenized, S for Stored, etc.
r['info']['key']

# ['fields', 'dynamicFields', 'uniqueKeyField', 'similarity', 'types']
r['schema'].keys()

# get the fields -- 78 of them
print ("number of fields", len(r['schema']['fields'].keys()))

field_names = cli.field_names()
print("number of field names (another way to access)", len(field_names))

print ("types for the major fields")
[(k,v['type'], r['schema']['types'][v['type']]['className'] ) for (k,v) in r['schema']['fields'].items() if k in MAJOR_FIELDS.values()]

In [None]:
from urllib.parse import urlparse, parse_qs

url = 'https://central.isample.xyz/isamples_central/thing/select?q=*:*&fl=searchText%20authorizedBy%20producedBy_resultTimeRange%20hasContextCategory%20curation_accessContraints%20curation_description_text%20curation_label%20curation_location%20curation_responsibility%20description_text%20id%20informalClassification%20keywords%20label%20hasMaterialCategory%20producedBy_description_text%20producedBy_hasFeatureOfInterest%20producedBy_label%20producedBy_responsibility%20producedBy_resultTime%20producedBy_samplingSite_description_text%20producedBy_samplingSite_label%20producedBy_samplingSite_location_elevationInMeters%20producedBy_samplingSite_location_latitude%20producedBy_samplingSite_location_longitude%20producedBy_samplingSite_placeName%20registrant%20samplingPurpose%20source%20sourceUpdatedTime%20producedBy_samplingSite_location_rpt%20hasSpecimenCategory&fq=producedBy_resultTimeRange%3A%5B1800%20TO%202023%5D&fq=source%3A(%22OPENCONTEXT%22%20OR%20%22SESAR%22)&fq=-relation_target%3A*&facet.field=authorizedBy&facet.field=hasContextCategory&facet.field=hasMaterialCategory&facet.field=registrant&facet.field=source&facet.field=hasSpecimenCategory&facet.range=producedBy_resultTimeRange&facet.range.gap=%2B1YEARS&facet.range.start=1800-01-01T00:00:00Z&facet.range.end=2023-01-01T00:00:00Z&f.registrant.facet.sort=count&f.source.facet.sort=index&rows=20&facet.limit=-1&facet.sort=index&&start=0&facet=on&wt=json'

parsed_url = urlparse(url)
query_params = parse_qs(parsed_url.query)

# The result is a dictionary where each key is associated with a list of values.
# You can iterate over this dictionary to process your parameters as needed.
for key, values in query_params.items():
    print(f"{key}: {values}")

# If you need each key to have a single value (taking the first value if multiple are present),
# you can do the following:
single_value_params = {key: values[0] for key, values in query_params.items()}
print(single_value_params)


In [None]:
# simplest query -- default

# Solr recognises boolean operators only when written in upper case; a
# lower-case "or" is parsed as an ordinary search term.
cli._request("thing/select", params={'q': '*:*', 'start':0, 'rows': 10, 
        'fq': ['producedBy_resultTimeRange:[1800 TO 2023]', 'source:(OPENCONTEXT OR SESAR)', '-relation_target:*'],
        'facet.field': ['authorizedBy', 'hasContextCategory', 'hasMaterialCategory', 'registrant', 'source', 'hasSpecimenCategory'],
        'facet': 'on',
        })

I *think* I had ChatGPT parse the parameters and give me the following interpretation:

Let's break down these parameters, which are used for querying a Solr search engine. Solr is an open-source search platform that provides a wide range of capabilities for text search and faceted search, among other features.

q: This parameter specifies the query. Here, *:* is a wildcard query, meaning it matches all documents in the Solr index.


[fl](https://solr.apache.org/guide/8_11/common-query-parameters.html#fl-field-list-parameter): This stands for "field list". It specifies the fields to return in the result. In your query, a long list of fields like searchText, authorizedBy, producedBy_resultTimeRange, etc., are included. Only these fields will be returned for each document in the search results.

fq: This is the "filter query". It filters the results returned by the main query (q) without influencing the score. Here, there are three filters applied:

> * `producedBy_resultTimeRange:[1800 TO 2023]` filters documents to those produced between the years 1800 and 2023.
> * `source:(OPENCONTEXT)` filters documents where the source field matches "OPENCONTEXT".
> * `-relation_target:*` excludes documents where the relation_target field exists.

facet.field: Faceting is used to aggregate data based on a field. This parameter specifies the fields for which you want to see facet counts. Facets on fields like authorizedBy, hasContextCategory, etc., are requested.


facet.range, facet.range.gap, facet.range.start, and facet.range.end: These parameters are used for range faceting. You are faceting on the producedBy_resultTimeRange field, starting from "1800-01-01T00:00:00Z" to "2023-01-01T00:00:00Z", with a gap of "+1YEARS". This means it will provide counts for each year in this range.

f.registrant.facet.sort and f.source.facet.sort: These are sorting instructions for the facets. The registrant facet is sorted by count, and the source facet is sorted by index.

rows: This specifies the number of documents to return. In your query, it's set to 20.

facet.limit: This limits the number of facet values returned for each facet field. -1 means no limit.

facet.sort: It dictates how to sort the facet fields. Here, it's sorted by index.

start: This is the offset in the complete result set for pagination. It tells Solr where to start in the list of results (useful for paging through results).

facet: When set to 'on', it enables faceting.

wt: This stands for "writer type" and specifies the output format. Here, 'json' indicates that the response should be in JSON format.

In [None]:
import httpx


url = "https://central.isample.xyz/isamples_central/thing/select"
params = {
    'q': '*:*',
    'fl': 'searchText authorizedBy producedBy_resultTimeRange hasContextCategory curation_accessContraints curation_description_text curation_label curation_location curation_responsibility description_text id informalClassification keywords label hasMaterialCategory producedBy_description_text producedBy_hasFeatureOfInterest producedBy_label producedBy_responsibility producedBy_resultTime producedBy_samplingSite_description_text producedBy_samplingSite_label producedBy_samplingSite_location_elevationInMeters producedBy_samplingSite_location_latitude producedBy_samplingSite_location_longitude producedBy_samplingSite_placeName registrant samplingPurpose source sourceUpdatedTime producedBy_samplingSite_location_rpt hasSpecimenCategory',
    'fq': ['producedBy_resultTimeRange:[1800 TO 2023]', 'source:(OPENCONTEXT)', '-relation_target:*'],
    'facet.field': ['authorizedBy', 'hasContextCategory', 'hasMaterialCategory', 'registrant', 'source', 'hasSpecimenCategory'],
    'facet.range': 'producedBy_resultTimeRange',
    'facet.range.gap': '+1YEARS',
    'facet.range.start': '1800-01-01T00:00:00Z',
    'facet.range.end': '2023-01-01T00:00:00Z',
    'f.registrant.facet.sort': 'count',
    'f.source.facet.sort': 'index',
    'rows': '20',
    'facet.limit': '-1',
    'facet.sort': 'index',
    'start': '20',
    'facet': 'on',
    'wt': 'json'
}
headers = {
    'Accept': 'application/json',
    'User-Agent': 'raymondyee.net'
}

# keys in response: 'responseHeader', 'response', 'facet_counts'
response = httpx.get(url, params=params, headers=headers)


In [None]:
# get back parameters that went into the query and some basic metadata
response.json()['responseHeader']

In [None]:
# 'numFound', 'start', 'numFoundExact', 'docs'
response.json()['response'].keys()

(response.json()['response']['numFound'], response.json()['response']['numFoundExact'])


In [None]:
response.json()['response']['docs'][0].keys()

# plotting the collection dates

In [None]:
import httpx

cli = IsbClient2()
fq = cli._fq_from_kwargs(source=('OPENCONTEXT',))
params = cli.default_search_params(fq=fq, fl=FL_DEFAULT, rows=100, **FACET_RANGE_FIELDS_DEFAULT)


# Endpoint actually queried below. Other URLs tried while exploring (kept for reference):
#   https://central.isample.xyz/isamples_central/thing/select/info
#   https://central.isample.xyz/isamples_central/thing/select?q=*:*&facet=true&facet.range=producedBy_resultTimeRange&facet.range.start=NOW/YEAR-200YEARS&facet.range.end=NOW/YEAR&facet.range.gap=YEAR
url = 'https://central.isample.xyz/isamples_central/thing/select'


headers = {
    'accept': 'application/json'
}

response = httpx.get(url, headers=headers, params=params)
# Fail here with a clear HTTP error rather than later with a KeyError on the JSON
response.raise_for_status()

# Show only the hit count; the full JSON (100 docs plus facets) floods the output.
# The following cells read the facet section from `response`.
print(response.json()['response']['numFound'])


In [None]:
response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']

In [None]:

k = response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']
dict(zip(k[::2], k[1::2]))



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# The facet counts come back as a flat list that alternates
# bucket label, count, bucket label, count, ...
k = response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']
data = dict(zip(k[::2], k[1::2]))


# Convert the dictionary to a DataFrame: one row per facet bucket
df = pd.DataFrame(list(data.items()), columns=['Date', 'Count'])

# Convert the 'Date' column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Extract the year from the date
df['Year'] = df['Date'].dt.year

# Total the facet counts per year. Using value_counts() on 'Year' would count
# how many buckets fall in each year, not how many records they contain.
year_counts = df.groupby('Year')['Count'].sum()

# Plot the counts vs year
year_counts.plot(kind='line')
plt.xlabel('Year')
plt.ylabel('Count')
plt.title('Count vs Year')
plt.show()


In [None]:
k = response.json()['facet_counts']['facet_ranges']['producedBy_resultTimeRange']['counts']
data = dict(zip(k[::2], k[1::2]))

df = pd.DataFrame(list(data.items()), columns=['Date', 'Count'])
df.plot()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Assuming df is your DataFrame
df['Date'] = pd.to_datetime(df['Date'])

# deal with log scale
df = df.loc[df['Count'] != 0]

# df['Count'] = df['Count'].replace(0, np.nan)
# df['Count'] = df['Count'].fillna(0.1)

plt.figure(figsize=(10,6))
plt.scatter(df['Date'], df['Count'], color='green', alpha=0.5, s=10)
plt.yscale('log')

plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Count over Date')
plt.show()

In [None]:
%%bash

curl -X 'GET' \
  'https://central.isample.xyz/isamples_central/thing/select?facet=true&facet.mincount=0&facet.field=source' \
  -H 'accept: application/json'

In [None]:
len(field_names)

In [None]:
# get OpenContext sourced records
# fq = cli._fq_from_kwargs(source=('OPENCONTEXT', 'SESAR'), collection_date_end=str(datetime.now().year))
fq = cli._fq_from_kwargs(collection_date_end=str(datetime.now().year))

params = cli.default_search_params(fq=fq, fl=FL_DEFAULT, rows=100, **FACET_RANGE_FIELDS_DEFAULT)


In [None]:
# Get counts of values grouping by three dimensions: source, hasMaterialCategory, and hasContextCategory
dimensions = ["source", "hasMaterialCategory", "hasContextCategory"]
xd = cli.pivot(params, dimensions)
print(xd.loc["geome", "organic material", "bacteria"].sum())

In [None]:
# Sum by axis 2 (hasContextCategory) and print
df = xd.sum(axis=2).to_pandas()
# display transposed
display(df.T)


In [None]:
print(xd.loc["sesar", "rock"].sum())

In [None]:
# Field names in solr
for name in cli.field_names():
    print(name)

# Bulk Download


[isamples\_inabox/docs/export\_service.md at develop · isamplesorg/isamples\_inabox](https://github.com/isamplesorg/isamples_inabox/blob/develop/docs/export_service.md)



In [None]:
import os

ISAMPLES_TOKEN = os.environ.get("ISAMPLES_TOKEN")


In [None]:
%%pybash

# Report only whether a token is configured. Echoing the token itself would
# store the secret in the notebook's saved output.
# An unset variable is substituted as the text "None" by the pybash magic.
if [ -n "{ISAMPLES_TOKEN}" ] && [ "{ISAMPLES_TOKEN}" != "None" ]; then
    echo "ISAMPLES_TOKEN is set"
else
    echo "ISAMPLES_TOKEN is not set"
fi

In [None]:
%%pybash

curl  -H "Authorization: Bearer {ISAMPLES_TOKEN}" "https://central.isample.xyz/isamples_central/export/create?q=source:SMITHSONIAN&export_format=jsonl"


In [None]:
%%pybash

curl  "https://central.isample.xyz/isamples_central/export/status?uuid=3a352569-cd03-488f-880b-e1a1252f2b18"


In [None]:
%%pybash

curl -o /tmp/3a352569-cd03-488f-880b-e1a1252f2b18.jsonl "https://central.isample.xyz/isamples_central/export/download?uuid=3a352569-cd03-488f-880b-e1a1252f2b18"


In [None]:
%%pybash

ls -lt /tmp/3a352569-cd03-488f-880b-e1a1252f2b18.jsonl

In [None]:
fname = "/tmp/3a352569-cd03-488f-880b-e1a1252f2b18.jsonl"

import pandas as pd
import numpy as np



# Read the downloaded export (one JSON record per line) from the path defined
# above, so the file name is written in one place only.
df_bulk = pd.read_json(fname, lines=True)
df_bulk

In [None]:
import requests
import pandas as pd
from isbclient import ISamplesBulkHandler


In [None]:

query = "source:SMITHSONIAN"

ish = ISamplesBulkHandler(token=ISAMPLES_TOKEN)
uuid = ish.create_download(query)


In [None]:
ish.get_status(uuid)

In [None]:
ish.download_file(uuid, f"/tmp/{uuid}.jsonl")

In [None]:
df_bulk = ish.load_dataset_to_dataframe(f"/tmp/{uuid}.jsonl")
df_bulk

# Geoparquet

In [None]:
import os

import duckdb

# Connect to an in-memory DuckDB instance
con = duckdb.connect()

# Location of the GeoParquet export. Set the ISAMPLES_GEOPARQUET environment
# variable to point at your own copy; the fallback is the path used when this
# notebook was written and exists only on that machine.
geo_parquet_file = os.environ.get(
    'ISAMPLES_GEOPARQUET',
    '/Users/raymondyee/Data/iSample/2024_06_07_07_40_00/isamples_export_2024_06_07_07_40_00_geo.parquet',
)

# Query the GeoParquet file
query = f"SELECT * FROM read_parquet('{geo_parquet_file}')"
df = con.execute(query).df()

# Display the first few rows of the dataframe
print(df.head())

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt

# NOTE(review): a geometry column read with read_parquet on a connection that
# has not loaded DuckDB's spatial extension is expected to arrive as raw WKB
# bytes -- TODO confirm against the export. Such values are decoded into
# geometry objects here; any other column content is passed through unchanged.
geometry = df['geometry']
first_values = geometry.dropna()
if len(first_values) and isinstance(first_values.iloc[0], (bytes, bytearray, memoryview)):
    geometry = gpd.GeoSeries.from_wkb(
        geometry.map(lambda wkb: None if wkb is None else bytes(wkb))
    )

# Convert the DuckDB dataframe to a GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry=geometry)

# Plot the geospatial data
gdf.plot()
plt.show()

In [None]:
import datashader as ds
import datashader.transfer_functions as tf
from datashader.utils import lnglat_to_meters

# Convert longitude and latitude to meters for better visualization
gdf['x'], gdf['y'] = lnglat_to_meters(gdf.geometry.x, gdf.geometry.y)

# Create a canvas for the plot
canvas = ds.Canvas(plot_width=800, plot_height=600)

# Aggregate the data
agg = canvas.points(gdf, 'x', 'y')

# Create an image from the aggregated data
img = tf.shade(agg, cmap='viridis')

# Display the image
img.to_pil().show()