In [1]:
# Simple example notebook showing how to access the database of downloaded waveforms
# and illustrating some pitfalls

In [2]:
# DATABASE PATH
import os
# Use the database path in 'jupyter folder' (test database with only 2 segments)
# change the db path to your needs .If using postgres, you do not need
# the two lines below:
dbpath = os.path.abspath(os.path.join(os.getcwd(), 'jupyter.example.db'))
assert os.path.isfile(dbpath)
dbpath = 'sqlite:///' + dbpath
# print ('Database path is: %s' % dbpath)

# Create the Database session (SQLALchemy session):
from stream2segment.process.db import get_session
session=get_session(dbpath)
# TEST: query all segments and count them (uncomment only if you want to check):
# print("%d segments on the database" % session.query(Segment).count())

In [3]:
# Select segments to process
from stream2segment.process.db import Segment
from stream2segment.io.db.sqlevalexpr import exprquery

segment_select = {
  'has_data': 'true',
  'maxgap_numsamples': '[-0.5, 0.5]',
  'event_distance_deg': '[20, 90]'
  # missing_data_sec: '<120'
  # missing_data_ratio: '<0.5'
  # id: '<300'
  # event.time: "(2014-01-01T00:00:00, 2014-12-31T23:59:59)"
  # event.latitude: "[24, 70]"
  # event.longitude: "[-11, 24]"
}

# Create a query Object (an iterable of Segment objects, see notebook cell at the end)
query = exprquery(session.query(Segment), segment_select)
# TEST: query all segments and count them (uncomment only if you want to check):
# print("%d segments on the database" % query.count())

In [4]:
# Example 1: Iterate over all segments
for segment in query:
    # custom process here (Simple example: print Segment string representation:
    # look at the attributes ('columns') and related Objects ('relationships')
    print(str(segment) + "\n")
# close the database session:
session.close()

Segment
 columns (17 of 17 loaded):
  id: 1
  event_id: 1
  channel_id: 2
  datacenter_id: 1
  data_seed_id: GE.RUE..BHZ
  event_distance_deg: 88.40960422432707
  data: b'808468D RUE    BHZGE\x07\xe1\x00\xfb\x05\x00\x00\x00\x13V ... (showing first 30 characters only)'
  download_code: 200
  start_time: 2017-09-08T05:00:00.495000
  arrival_time: 2017-09-08T05:02:05.252870
  end_time: 2017-09-08T05:04:12.245000
  sample_rate: 20.0
  maxgap_numsamples: 0.0
  download_id: 2
  request_start: 2017-09-08T05:00:05
  request_end: 2017-09-08T05:04:05
  queryauth: False
 relationships (0 of 6 loaded):
  event
  channel
  station
  classes
  datacenter
  download

Segment
 columns (17 of 17 loaded):
  id: 2
  event_id: 1
  channel_id: 1
  datacenter_id: 1
  data_seed_id: GE.MTE..BHZ
  event_distance_deg: 77.53363626270473
  data: b'609801D MTE    BHZGE\x07\xe1\x00\xfb\x04:1\x00\x03\xb6 ... (showing first 30 characters only)'
  download_code: 200
  start_time: 2017-09-08T04:58:49.095000
  arrival_t

In [5]:
# Example 2: Iterate over all segments with lazy load
# For large datasets, loading all Segment attributes into memory might be an overhead,
# especially if you need only some of them. The drawback is that the access to any of
# the Segment attributes will force a further query to the database. This might be time consuming
# (especially if the database is located remotely on some machine using PostrgresSQL)

from sqlalchemy.orm import load_only

for segment in query.options(load_only(Segment.id)):
    # custom process here (Simple example: print Segment string representation:
    # look at the attributes ('columns') and see that only 'id' shows up
    print(str(segment) + "\n")
# close the datbase session:
session.close()

Segment
 columns (1 of 17 loaded):
  id: 1
  event_id
  channel_id
  datacenter_id
  data_seed_id
  event_distance_deg
  data
  download_code
  start_time
  arrival_time
  end_time
  sample_rate
  maxgap_numsamples
  download_id
  request_start
  request_end
  queryauth
 relationships (0 of 6 loaded):
  event
  channel
  station
  classes
  datacenter
  download

Segment
 columns (1 of 17 loaded):
  id: 2
  event_id
  channel_id
  datacenter_id
  data_seed_id
  event_distance_deg
  data
  download_code
  start_time
  arrival_time
  end_time
  sample_rate
  maxgap_numsamples
  download_id
  request_start
  request_end
  queryauth
 relationships (0 of 6 loaded):
  event
  channel
  station
  classes
  datacenter
  download



In [6]:
# Example 3: Related Objects are "lazy loaded":
for segment in query:
    # custom process here (Simple example: print Segment string representation:
    # look at the related Objects ('relationships'): 0 of 6 loaded
    print(str(segment) + "\n")
    # Now access the segment's station, either via `segment.station` or by using
    # segment.stream
    segment.station
    # and look now at the related objects (1 of 6 loaded):
    print(str(segment) + "\n")
    # exit: just show the example for the first segment
    break
session.close()

Segment
 columns (17 of 17 loaded):
  id: 1
  event_id: 1
  channel_id: 2
  datacenter_id: 1
  data_seed_id: GE.RUE..BHZ
  event_distance_deg: 88.40960422432707
  data: b'808468D RUE    BHZGE\x07\xe1\x00\xfb\x05\x00\x00\x00\x13V ... (showing first 30 characters only)'
  download_code: 200
  start_time: 2017-09-08T05:00:00.495000
  arrival_time: 2017-09-08T05:02:05.252870
  end_time: 2017-09-08T05:04:12.245000
  sample_rate: 20.0
  maxgap_numsamples: 0.0
  download_id: 2
  request_start: 2017-09-08T05:00:05
  request_end: 2017-09-08T05:04:05
  queryauth: False
 relationships (0 of 6 loaded):
  event
  channel
  station
  classes
  datacenter
  download

Segment
 columns (17 of 17 loaded):
  id: 1
  event_id: 1
  channel_id: 2
  datacenter_id: 1
  data_seed_id: GE.RUE..BHZ
  event_distance_deg: 88.40960422432707
  data: b'808468D RUE    BHZGE\x07\xe1\x00\xfb\x05\x00\x00\x00\x13V ... (showing first 30 characters only)'
  download_code: 200
  start_time: 2017-09-08T05:00:00.495000
  arriva

In [7]:
# Example 4: Accessing the segment's Obspy Objects
for segment in query:
    # 1) GET Obspy Stream Object (representing the waveform data)
    stream = segment.stream()
    print('Stream data (first 3 points):' + str(stream[0].data[:3]))
    # 2) TO REMOVE THE RESPONSE:
    # Get the segment inventory (this will access `segment.station`, see example above)
    inventory = segment.inventory()
    # remove the response IN PLACE:
    stream_remresp = stream.remove_response(inventory)

    # NOTE: segment.stream() NOW returns stream_remresp (response removed)
    # This is not due to Stream2segment but to a specific design choice of Obspy developers.
    # Check this:
    stream = segment.stream()
    print('Stream data PERMANENTLY MODIFIED (first 3 points):' + str(stream[0].data[:3]))
    # If you want to preserve segment.stream() use `copy()`:
    stream_remresp = stream.copy().remove_response(inventory)
    print('Stream data NOT MODIFIED (first 3 points):' + str(stream[0].data[:3]))
    # just show the first segment (no need to print stuff twice):
    break
session.close()

Stream data (first 3 points):[196 211  94]
Stream data PERMANENTLY MODIFIED (first 3 points):[  5.82973528e-07   5.26137453e-07   5.78937133e-07]
Stream data NOT MODIFIED (first 3 points):[  5.82973528e-07   5.26137453e-07   5.78937133e-07]


In [8]:
# Segment attributes (db columns) which are accessible during processing or can be used in 'segment_select':

# NOTE ABOVE: The parameter 'segment_select' defines what segments to be processed or
# visualized. If this argument is missing, all segments will be processed or
# (from within the GUI) visualized. The selection is made via the list-like argument:
#
# segment_select:
#   <att>: "<expression>"
#   <att>: "<expression>"
#   ...
#
# where each <att> is a segment attribute and <expression> is a simplified SQL-select string
# expression. Example:
#
# 1. To select and work on segments with downloaded data (at least one byte of data):
# segment_select:
#   has_data: "true"
#
# 2. To select and work on segments of stations activated in 2017 only:
# segment_select:
#   station.start_time: "[2017-01-01, 2018-01-01T00:00:00)"
# (brackets denote intervals. Square brackets include end-points, round brackets exclude endpoints)
#
# 3. To select segments from specified ids, e.g. 1, 4, 342, 67 (e.g., ids which raised errors during
# a previous run and whose id where logged might need inspection in the GUI):
# segment_select:
#   id: "1 4 342 67"
#
# 4. To select segments whose event magnitude is greater than 4.2:
# segment_select:
#   event.magnitude: ">4.2"
# (the same way work the operators: =, >=, <=, <, !=)
#
# 5. To select segments with a particular channel sensor description:
# segment_select:
#   channel.sensor_description: "'GURALP CMG-40T-30S'"
# (note: for attributes with str values and spaces, we need to quote twice, as otherwise
# "GURALP CMG-40T-30S" would match 'GURALP' and 'CMG-40T-30S', but not the whole string.
# See attribute types below)
#
# The list of segment attribute names and types is:
#
# ============================= ================================================
# attribute                     python type and description (if any)
# ============================= ================================================
# id                            int: segment (unique) db id
# event_distance_deg            float: distance between the segment's station and
#                               the event, in degrees
# event_distance_km             float: distance between the segment's station and
#                               the event, in km, assuming a perfectly spherical earth
#                               with a radius of 6371 km
# start_time                    datetime.datetime: the waveform data start time
# arrival_time                  datetime.datetime
# end_time                      datetime.datetime: the waveform data end time
# request_start                 datetime.datetime: the requested start time of the data
# request_end                   datetime.datetime: the requested end time of the data
# duration_sec                  float: the waveform data duration, in seconds
# missing_data_sec              float: the number of seconds of missing data, with respect
#                               to the request time window. E.g. if we requested 5
#                               minutes of data and we got 4 minutes, then
#                               missing_data_sec=60; if we got 6 minutes, then
#                               missing_data_sec=-60. This attribute is particularly
#                               useful in the config to select only well formed data and
#                               speed up the processing, e.g.: missing_data_sec: '< 120'
# missing_data_ratio            float: the portion of missing data, with respect
#                               to the request time window. E.g. if we requested 5
#                               minutes of data and we got 4 minutes, then
#                               missing_data_ratio=0.2 (20%); if we got 6 minutes, then
#                               missing_data_ratio=-0.2. This attribute is particularly
#                               useful in the config to select only well formed data and
#                               speed up the processing, e.g.: missing_data_ratio: '< 0.5'
# sample_rate                   float: the waveform data sample rate.
#                               It might differ from the segment channel's sample_rate
# has_data                      boolean: tells if the segment has data saved (at least
#                               one byte of data). This attribute useful in the config to
#                               select only well formed data and speed up the processing,
#                               e.g. has_data: 'true'.
# download_code                 int: the download code (for experienced users). As for
#                               any HTTP status code,
#                               values between 200 and 399 denote a successful download
#                               (this does not tell anything about the segment's data,
#                               which might be empty anyway. See 'segment.has_data'.
#                               Conversely, a download error assures no data has been
#                               saved), whereas
#                               values >=400 and < 500 denote client errors and
#                               values >=500 server errors.
#                               Moreover,
#                               -1 indicates a general download error - e.g. no Internet
#                               connection,
#                               -2 a successful download with corrupted waveform data,
#                               -200 a successful download where some waveform data chunks
#                               (miniSeed records) have been discarded because completely
#                               outside the requested time span,
#                               -204 a successful download where no data has been saved
#                               because all chunks were completely outside the requested
#                               time span, and finally:
#                               None denotes a successful download where no data has been
#                               saved because the given segment wasn't found in the
#                               server response (note: this latter case is NOT the case
#                               when the server returns no data with an appropriate
#                               'No Content' message with download_code=204)
# maxgap_numsamples             float: the maximum gap found in the waveform data, in
#                               number of points. This attribute is particularly useful
#                               in the config to select only well formed data and speed
#                               up the processing.
#                               If this attribute is zero, the segment has no
#                               gaps/overlaps, if >=1 the segment has gaps, if <=-1,
#                               the segment has overlaps.
#                               Values in (-1, 1) are difficult to interpret: as this
#                               number is the ratio between
#                               the waveform data's max gap/overlap and its sampling
#                               period (both in seconds), a rule of thumb is to
#                               consider a segment with gaps/overlaps when this
#                               attribute's absolute value exceeds 0.5, e.g. you can
#                               discard segments with gaps overlaps by inputting in the
#                               config "maxgap_numsamples:  '[-0.5, 0.5]'" and, if you
#                               absolutely want no segment with gaps/overlaps,
#                               perform a further check in the processing via
#                               `len(segment.stream())` (zero if no gaps/overlaps) or
#                               `segment.stream().get_gaps()` (see obspy doc)
# data_seed_id                  str: the seed identifier in the typical format
#                               [Network.Station.Location.Channel] stored in the
#                               segment's data. It might be null if the data is empty
#                               or null (e.g., because of a download error).
#                               See also 'segment.seed_id'
# seed_id                       str: the seed identifier in the typical format
#                               [Network.Station.Location.Channel]: it is the same as
#                               'segment.data_seed_id' if the latter is not null,
#                               otherwise it is fetched from the segment's metadata
#                               (in this case, the operation might more time consuming)
# has_class                     boolean: tells if the segment has (at least one) class
#                               assigned
# data                          bytes: the waveform (raw) data. You don't generally need
#                               to access this attribute which is also time-consuming
#                               to fetch. Used by `segment.stream()`
# ----------------------------- ------------------------------------------------
# event                         object (attributes below)
# event.id                      int
# event.event_id                str: the id returned by the web service
# event.time                    datetime.datetime
# event.latitude                float
# event.longitude               float
# event.depth_km                float
# event.author                  str
# event.catalog                 str
# event.contributor             str
# event.contributor_id          str
# event.mag_type                str
# event.magnitude               float
# event.mag_author              str
# event.event_location_name     str
# ----------------------------- ------------------------------------------------
# channel                       object (attributes below)
# channel.id                    int
# channel.location              str
# channel.channel               str
# channel.depth                 float
# channel.azimuth               float
# channel.dip                   float
# channel.sensor_description    str
# channel.scale                 float
# channel.scale_freq            float
# channel.scale_units           str
# channel.sample_rate           float
# channel.band_code             str: the first letter of channel.channel
# channel.instrument_code       str: the second letter of channel.channel
# channel.orientation_code      str: the third letter of channel.channel
# channel.station               object: same as segment.station (see below)
# ----------------------------- ------------------------------------------------
# station                       object (attributes below)
# station.id                    int
# station.network               str
# station.station               str
# station.latitude              float
# station.longitude             float
# station.elevation             float
# station.site_name             str
# station.start_time            datetime.datetime
# station.end_time              datetime.datetime
# station.inventory_xml         bytes. The station inventory (raw) data. You don't
#                               generally need to access this attribute which is also
#                               time-consuming to fetch. Used by `segment.inventory()`
# station.has_inventory         boolean: tells if the segment's station inventory has
#                               data saved (at least one byte of data).
#                               This attribute useful in the config to select only
#                               segments with inventory downloaded and speed up the
#                               processing,
#                               e.g. has_inventory: 'true'.
# station.datacenter            object (same as segment.datacenter, see below)
# ----------------------------- ------------------------------------------------
# datacenter                    object (attributes below)
# datacenter.id                 int
# datacenter.station_url        str
# datacenter.dataselect_url     str
# datacenter.organization_name  str
# ----------------------------- ------------------------------------------------
# download                      object (attributes below): the download execution
# download.id                   int
# download.run_time             datetime.datetime
# download.log                  str: The log text of the segment's download execution.
#                               You don't generally need to access this
#                               attribute which is also time-consuming to fetch.
#                               Useful for advanced debugging / inspection
# download.warnings             int
# download.errors               int
# download.config               str
# download.program_version      str
# ----------------------------- ------------------------------------------------
# classes.id                    int: the id(s) of the classes assigned to the segment
# classes.label                 int: the label(s) of the classes assigned to the segment
# classes.description           int: the description(s) of the classes assigned to the
#                               segment
# ============================= ================================================
#
