# Setup

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell execution, so edits to local packages are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# fix working dir
import pathlib
import os

# The dataset is later opened through the relative path "sample_datasets/...",
# which presumably sits in the parent of the notebook folder (confirm against
# the repository layout). `path` is always computed so the name stays available.
path = os.path.join(pathlib.Path().absolute(), "../")

# Only move up while that folder is not reachable yet: an unconditional chdir
# climbs one directory higher every time this cell is re-run, which breaks
# every relative path used further down.
if not pathlib.Path("sample_datasets").is_dir():
    os.chdir(path)

In [3]:
# imports
from pyspark.sql import SparkSession
import pydeequ
import json
import datetime
import os

Please set env variable SPARK_VERSION


In [4]:
# spark context
# Session settings: UTC session time zone, the Deequ package coordinate exposed
# by pydeequ, and the f2j coordinate pydeequ asks to exclude.
spark_settings = {
    "spark.sql.session.timeZone": "UTC",
    "spark.jars.packages": pydeequ.deequ_maven_coord,
    "spark.jars.excludes": pydeequ.f2j_maven_coord,
}

spark_builder = SparkSession.builder
for setting_name, setting_value in spark_settings.items():
    spark_builder = spark_builder.config(setting_name, setting_value)

spark = spark_builder.appName("thoth").getOrCreate()

22/10/17 10:57:50 WARN Utils: Your hostname, rleinio-pc resolves to a loopback address: 127.0.1.1; using 192.168.1.132 instead (on interface enp8s0)
22/10/17 10:57:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/rleinio/.ivy2/cache
The jars for the packages stored in: /home/rleinio/.ivy2/jars
:: loading settings :: url = jar:file:/home/rleinio/.pyenv/versions/3.9.13/envs/thoth-3.9.13/lib/python3.9/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
com.amazon.deequ#deequ added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-87774447-23f0-45ab-bc80-fa5dc0f85bb4;1.0
	confs: [default]
	found com.amazon.deequ#deequ;1.2.2-spark-3.0 in central
	found org.scalanlp#breeze_2.12;0.13.2 in central
	found org.scalanlp#breeze-macros_2.12;0.13.2 in central
	found org.scala-lang#scala-reflect;2.12.1 in central
	found com.github.fommil.netlib#core;1.1.2 in central
	found net.sf

In [5]:
# metrics repository connection
# Keep a DATABASE_URL that is already exported; otherwise fall back to the
# local development database below.
os.environ.setdefault(
    "DATABASE_URL",
    "postgresql+pg8000://postgres:postgres@localhost:5432/metrics_repository",
)

# Load Dataset

In [6]:
# Read the raw JSON records first, then parse every "ts" string into a datetime.
with open("sample_datasets/temperatures_extended.json") as dataset_file:
    raw_records = json.load(dataset_file)

json_data = []
for raw_record in raw_records:
    parsed_record = dict(raw_record)
    parsed_record["ts"] = datetime.datetime.fromisoformat(raw_record.get("ts"))
    json_data.append(parsed_record)

# Quick sanity overview of what was loaded.
print("Dataset head: ", json_data[:5], "\n")
print("Dataset tail: ", json_data[-5:], "\n")
print("Dataset number of records: ", len(json_data), "\n")

# One partition per distinct calendar date found in "ts".
daily_partitions = {record.get("ts").date() for record in json_data}
print("Dataset number of ts daily partitions: ", len(daily_partitions))

Dataset head:  [{'ts': datetime.datetime(1981, 1, 1, 7, 23, 33, tzinfo=datetime.timezone.utc), 'value': 22.1467670458884, 'sensor': 'Sensor E'}, {'ts': datetime.datetime(1981, 1, 1, 21, 57, 57, tzinfo=datetime.timezone.utc), 'value': 22.8849008762327, 'sensor': 'Sensor C'}, {'ts': datetime.datetime(1981, 1, 1, 12, 11, 56, tzinfo=datetime.timezone.utc), 'value': 22.618233805151977, 'sensor': 'Sensor B'}, {'ts': datetime.datetime(1981, 1, 1, 20, 5, 54, tzinfo=datetime.timezone.utc), 'value': 25.770158591638953, 'sensor': 'Sensor E'}, {'ts': datetime.datetime(1981, 1, 1, 10, 45, 2, tzinfo=datetime.timezone.utc), 'value': 23.005804204490918, 'sensor': 'Sensor B'}] 

Dataset tail:  [{'ts': datetime.datetime(1981, 12, 31, 13, 22, 46, tzinfo=datetime.timezone.utc), 'value': 23.434759603073424, 'sensor': 'Sensor D'}, {'ts': datetime.datetime(1981, 12, 31, 12, 55, 17, tzinfo=datetime.timezone.utc), 'value': 25.313363047160777, 'sensor': 'Sensor B'}, {'ts': datetime.datetime(1981, 12, 31, 3, 4, 

## Splitting dataset into history, new scoring batches, and an artificial anomaly batch

In [7]:
# Every batch shares the same schema, so it is declared once.
BATCH_SCHEMA = "ts timestamp, value float, sensor string"

# Last day considered part of the trusted history.
HISTORY_LAST_DAY = datetime.date(1981, 12, 25)
# Day used for the artificial anomalies.
ANOMALY_DAY = datetime.date(1981, 12, 30)


def _records_on(day):
    """Return the `json_data` records whose `ts` falls on the calendar date `day`."""
    return [record for record in json_data if record.get("ts").date() == day]


def _batch_df(records):
    """Create a Spark DataFrame from `records` using the shared batch schema."""
    return spark.createDataFrame(data=records, schema=BATCH_SCHEMA)


def _as_fahrenheit(record):
    """Return a copy of `record` (ts, value, sensor) with `value` converted from Celsius to Fahrenheit.

    A missing value stays None. The check is `is not None` rather than a
    truthiness test so that a valid reading of 0.0 is converted to 32.0
    instead of being turned into a null.
    """
    value = record.get("value")
    return {
        "ts": record.get("ts"),
        "value": (value * 9 / 5) + 32 if value is not None else None,
        "sensor": record.get("sensor"),
    }


def _with_silent_sensor(record, sensor="Sensor B"):
    """Return a copy of `record` (ts, value, sensor) whose `value` is None when it comes from `sensor`."""
    return {
        "ts": record.get("ts"),
        "value": None if record.get("sensor") == sensor else record.get("value"),
        "sensor": record.get("sensor"),
    }


# historical data with fair confidence of good quality
history_df = _batch_df(
    [
        record
        for record in json_data
        if record.get("ts").date() <= HISTORY_LAST_DAY
    ]
)

# new batches of data that need quality validation (normal), one per day
new_batch_1981_12_26_df = _batch_df(_records_on(datetime.date(1981, 12, 26)))
new_batch_1981_12_27_df = _batch_df(_records_on(datetime.date(1981, 12, 27)))
new_batch_1981_12_28_df = _batch_df(_records_on(datetime.date(1981, 12, 28)))
new_batch_1981_12_29_df = _batch_df(_records_on(datetime.date(1981, 12, 29)))
new_batch_1981_12_30_df = _batch_df(_records_on(ANOMALY_DAY))

# Artificial anomaly: temperatures in fahrenheit instead of celsius
new_batch_1981_12_30_anomaly_df = _batch_df(
    [_as_fahrenheit(record) for record in _records_on(ANOMALY_DAY)]
)

# Artificial anomaly: one sensor starts to output only null values
new_batch_1981_12_30_anomaly2_df = _batch_df(
    [_with_silent_sensor(record) for record in _records_on(ANOMALY_DAY)]
)

# new batch of data that needs quality validation (normal)
new_batch_1981_12_31_df = _batch_df(_records_on(datetime.date(1981, 12, 31)))

# Creating the Dataset on the Metrics Repository

In [8]:
import thoth as th

In [11]:
# Setup connection and init the Metrics Repository db
from sqlmodel import Session

# Build the engine first, then bind the session that later cells pass to thoth.
metrics_engine = th.build_engine()
session = Session(metrics_engine)

# NOTE(review): clear=True presumably wipes previously stored metrics — confirm
# before pointing DATABASE_URL at a shared database.
th.init_db(clear=True)

## 3 Steps: Profile the history data, create dataset and optimize models for each metric

In [12]:
# Single pipeline call over the trusted history: profile it, create the
# "temperatures" dataset and optimize the models for each metric.
history_pipeline_kwargs = dict(
    df=history_df,
    dataset_uri="temperatures",
    ts_column="ts",
    profiling_builder=th.profiler.SimpleProfilingBuilder(),
    optimize_last_n=100,
    optimize_target_confidence=0.99,
    session=session,
    spark=spark,
)
profiling, optimization = th.profile_create_optimize(**history_pipeline_kwargs)

2022-10-17 11:03:32.280 | INFO     | thoth.service_layer:profile_create_optimize:403 - Pipeline started 👤 📈 ...
2022-10-17 11:03:32.280 | INFO     | thoth.profiler:profile:319 - 👤 Profiling started ...
2022-10-17 11:03:33.250 | INFO     | thoth.profiler:profile:331 - Processing 359 timestamps from 1981-01-01T00:00:00 to 1981-12-25T00:00:00, with DAY granularity.
2022-10-17 11:03:33.361 | INFO     | thoth.profiler:_build_report:274 - Finished profiling report for ts=1981-01-01T00:00:00.
2022-10-17 11:03:33.460 | INFO     | thoth.profiler:_build_report:274 - Finished profiling report for ts=1981-01-02T00:00:00.
2022-10-17 11:03:33.563 | INFO     | thoth.profiler:_build_report:274 - Finished profiling report for ts=1981-01-03T00:00:00.
2022-10-17 11:03:33.669 | INFO     | thoth.profiler:_build_report:274 - Finished profiling report for ts=1981-01-04T00:00:00.
2022-10-17 11:03:33.795 | INFO     | thoth.profiler:_build_report:274 - Finished profiling report for ts=1981-01-05T00:00:00.
2022-

## Assessing subsequent new (normal) batches of data

In [13]:
# Assess the (normal) batch for 1981-12-26.
# The timestamp is timezone-aware UTC, matching the dataset's UTC `ts` values and
# the assessment cells for 1981-12-30/31, instead of a naive datetime.
# The call is kept as the last expression so the cell displays its result.
th.assess_new_ts(
    df=new_batch_1981_12_26_df,
    ts=datetime.datetime(1981, 12, 26, tzinfo=datetime.timezone.utc),
    dataset_uri="temperatures",
    profiling_builder=th.profiler.SimpleProfilingBuilder(),
    session=session,
)

2022-10-17 11:05:33.938 | INFO     | thoth.service_layer:assess_new_ts:444 - Pipeline started 👤 💯 🔍️ ...
2022-10-17 11:05:33.938 | INFO     | thoth.service_layer:assess_new_ts:445 - 1981-12-26 00:00:00
2022-10-17 11:05:33.995 | INFO     | thoth.profiler:profile:319 - 👤 Profiling started ...
2022-10-17 11:05:34.905 | INFO     | thoth.profiler:profile:331 - Processing 1 timestamps from 1981-12-26T00:00:00 to 1981-12-26T00:00:00, with DAY granularity.
2022-10-17 11:05:35.012 | INFO     | thoth.profiler:_build_report:274 - Finished profiling report for ts=1981-12-26T00:00:00.
2022-10-17 11:05:35.013 | INFO     | thoth.profiler:profile:349 - 👤 Profiling done!
2022-10-17 11:05:35.032 | INFO     | thoth.service_layer:profile:195 - 🧐 You can now check the profiling metrics in the dashboard: http://localhost:8501/?dataset_uri=temperatures&view=%F0%9F%91%A4+Profiling
2022-10-17 11:05:35.036 | INFO     | thoth.anomaly.scoring:score:78 - 💯 Scoring started...
2022-10-17 11:05:35.781 | INFO     | th

True

In [14]:
# Assess the (normal) batch for 1981-12-27.
# The timestamp is timezone-aware UTC, matching the dataset's UTC `ts` values and
# the assessment cells for 1981-12-30/31, instead of a naive datetime.
# The call is kept as the last expression so the cell displays its result.
th.assess_new_ts(
    df=new_batch_1981_12_27_df,
    ts=datetime.datetime(1981, 12, 27, tzinfo=datetime.timezone.utc),
    dataset_uri="temperatures",
    profiling_builder=th.profiler.SimpleProfilingBuilder(),
    session=session,
)

2022-10-17 11:05:35.978 | INFO     | thoth.service_layer:assess_new_ts:444 - Pipeline started 👤 💯 🔍️ ...
2022-10-17 11:05:35.978 | INFO     | thoth.service_layer:assess_new_ts:445 - 1981-12-27 00:00:00
2022-10-17 11:05:36.033 | INFO     | thoth.profiler:profile:319 - 👤 Profiling started ...
2022-10-17 11:05:36.937 | INFO     | thoth.profiler:profile:331 - Processing 1 timestamps from 1981-12-27T00:00:00 to 1981-12-27T00:00:00, with DAY granularity.
2022-10-17 11:05:37.040 | INFO     | thoth.profiler:_build_report:274 - Finished profiling report for ts=1981-12-27T00:00:00.
2022-10-17 11:05:37.041 | INFO     | thoth.profiler:profile:349 - 👤 Profiling done!
2022-10-17 11:05:37.058 | INFO     | thoth.service_layer:profile:195 - 🧐 You can now check the profiling metrics in the dashboard: http://localhost:8501/?dataset_uri=temperatures&view=%F0%9F%91%A4+Profiling
2022-10-17 11:05:37.061 | INFO     | thoth.anomaly.scoring:score:78 - 💯 Scoring started...
2022-10-17 11:05:37.992 | INFO     | th

True

In [15]:
# Assess the (normal) batch for 1981-12-28.
# The timestamp is timezone-aware UTC, matching the dataset's UTC `ts` values and
# the assessment cells for 1981-12-30/31, instead of a naive datetime.
# The call is kept as the last expression so the cell displays its result.
th.assess_new_ts(
    df=new_batch_1981_12_28_df,
    ts=datetime.datetime(1981, 12, 28, tzinfo=datetime.timezone.utc),
    dataset_uri="temperatures",
    profiling_builder=th.profiler.SimpleProfilingBuilder(),
    session=session,
)

2022-10-17 11:05:38.182 | INFO     | thoth.service_layer:assess_new_ts:444 - Pipeline started 👤 💯 🔍️ ...
2022-10-17 11:05:38.183 | INFO     | thoth.service_layer:assess_new_ts:445 - 1981-12-28 00:00:00
2022-10-17 11:05:38.241 | INFO     | thoth.profiler:profile:319 - 👤 Profiling started ...
2022-10-17 11:05:39.137 | INFO     | thoth.profiler:profile:331 - Processing 1 timestamps from 1981-12-28T00:00:00 to 1981-12-28T00:00:00, with DAY granularity.
2022-10-17 11:05:39.235 | INFO     | thoth.profiler:_build_report:274 - Finished profiling report for ts=1981-12-28T00:00:00.
2022-10-17 11:05:39.235 | INFO     | thoth.profiler:profile:349 - 👤 Profiling done!
2022-10-17 11:05:39.252 | INFO     | thoth.service_layer:profile:195 - 🧐 You can now check the profiling metrics in the dashboard: http://localhost:8501/?dataset_uri=temperatures&view=%F0%9F%91%A4+Profiling
2022-10-17 11:05:39.255 | INFO     | thoth.anomaly.scoring:score:78 - 💯 Scoring started...
2022-10-17 11:05:40.023 | INFO     | th

True

In [16]:
# Assess the (normal) batch for 1981-12-29.
# The timestamp is timezone-aware UTC, matching the dataset's UTC `ts` values and
# the assessment cells for 1981-12-30/31, instead of a naive datetime.
# The call is kept as the last expression so the cell displays its result.
th.assess_new_ts(
    df=new_batch_1981_12_29_df,
    ts=datetime.datetime(1981, 12, 29, tzinfo=datetime.timezone.utc),
    dataset_uri="temperatures",
    profiling_builder=th.profiler.SimpleProfilingBuilder(),
    session=session,
)

2022-10-17 11:05:40.224 | INFO     | thoth.service_layer:assess_new_ts:444 - Pipeline started 👤 💯 🔍️ ...
2022-10-17 11:05:40.225 | INFO     | thoth.service_layer:assess_new_ts:445 - 1981-12-29 00:00:00
2022-10-17 11:05:40.281 | INFO     | thoth.profiler:profile:319 - 👤 Profiling started ...
2022-10-17 11:05:41.197 | INFO     | thoth.profiler:profile:331 - Processing 1 timestamps from 1981-12-29T00:00:00 to 1981-12-29T00:00:00, with DAY granularity.
2022-10-17 11:05:41.302 | INFO     | thoth.profiler:_build_report:274 - Finished profiling report for ts=1981-12-29T00:00:00.
2022-10-17 11:05:41.303 | INFO     | thoth.profiler:profile:349 - 👤 Profiling done!
2022-10-17 11:05:41.320 | INFO     | thoth.service_layer:profile:195 - 🧐 You can now check the profiling metrics in the dashboard: http://localhost:8501/?dataset_uri=temperatures&view=%F0%9F%91%A4+Profiling
2022-10-17 11:05:41.323 | INFO     | thoth.anomaly.scoring:score:78 - 💯 Scoring started...
2022-10-17 11:05:42.070 | INFO     | th

True

## Assessing anomalous batches of data

In [17]:
# First artificial anomaly: the 1981-12-30 batch expressed in Fahrenheit.
# The call stays the last expression so the cell displays its result.
fahrenheit_batch_ts = datetime.datetime(1981, 12, 30, tzinfo=datetime.timezone.utc)
th.assess_new_ts(
    dataset_uri="temperatures",
    ts=fahrenheit_batch_ts,
    df=new_batch_1981_12_30_anomaly_df,
    profiling_builder=th.profiler.SimpleProfilingBuilder(),
    session=session,
)

2022-10-17 11:05:42.258 | INFO     | thoth.service_layer:assess_new_ts:444 - Pipeline started 👤 💯 🔍️ ...
2022-10-17 11:05:42.259 | INFO     | thoth.service_layer:assess_new_ts:445 - 1981-12-30 00:00:00+00:00
2022-10-17 11:05:42.314 | INFO     | thoth.profiler:profile:319 - 👤 Profiling started ...
2022-10-17 11:05:43.200 | INFO     | thoth.profiler:profile:331 - Processing 1 timestamps from 1981-12-30T00:00:00 to 1981-12-30T00:00:00, with DAY granularity.
2022-10-17 11:05:43.300 | INFO     | thoth.profiler:_build_report:274 - Finished profiling report for ts=1981-12-30T00:00:00.
2022-10-17 11:05:43.301 | INFO     | thoth.profiler:profile:349 - 👤 Profiling done!
2022-10-17 11:05:43.316 | INFO     | thoth.service_layer:profile:195 - 🧐 You can now check the profiling metrics in the dashboard: http://localhost:8501/?dataset_uri=temperatures&view=%F0%9F%91%A4+Profiling
2022-10-17 11:05:43.319 | INFO     | thoth.anomaly.scoring:score:78 - 💯 Scoring started...
2022-10-17 11:05:44.064 | INFO   

False

In [18]:
# Second artificial anomaly: the 1981-12-30 batch where one sensor reports only nulls.
# The call stays the last expression so the cell displays its result.
null_sensor_batch_ts = datetime.datetime(1981, 12, 30, tzinfo=datetime.timezone.utc)
th.assess_new_ts(
    dataset_uri="temperatures",
    ts=null_sensor_batch_ts,
    df=new_batch_1981_12_30_anomaly2_df,
    profiling_builder=th.profiler.SimpleProfilingBuilder(),
    session=session,
)

2022-10-17 11:05:44.259 | INFO     | thoth.service_layer:assess_new_ts:444 - Pipeline started 👤 💯 🔍️ ...
2022-10-17 11:05:44.260 | INFO     | thoth.service_layer:assess_new_ts:445 - 1981-12-30 00:00:00+00:00
2022-10-17 11:05:44.316 | INFO     | thoth.profiler:profile:319 - 👤 Profiling started ...
2022-10-17 11:05:45.220 | INFO     | thoth.profiler:profile:331 - Processing 1 timestamps from 1981-12-30T00:00:00 to 1981-12-30T00:00:00, with DAY granularity.
2022-10-17 11:05:45.319 | INFO     | thoth.profiler:_build_report:274 - Finished profiling report for ts=1981-12-30T00:00:00.
2022-10-17 11:05:45.320 | INFO     | thoth.profiler:profile:349 - 👤 Profiling done!
2022-10-17 11:05:45.336 | INFO     | thoth.service_layer:profile:195 - 🧐 You can now check the profiling metrics in the dashboard: http://localhost:8501/?dataset_uri=temperatures&view=%F0%9F%91%A4+Profiling
2022-10-17 11:05:45.337 | INFO     | thoth.anomaly.scoring:score:78 - 💯 Scoring started...
2022-10-17 11:05:46.106 | INFO   

False

## After "fixing/cleaning" the new batch, continue subsequent assessment of new batches as they arrive at the data platform

In [19]:
# Re-assess 1981-12-30 with the unmodified batch for that day.
# The call stays the last expression so the cell displays its result.
clean_batch_ts = datetime.datetime(1981, 12, 30, tzinfo=datetime.timezone.utc)
th.assess_new_ts(
    dataset_uri="temperatures",
    ts=clean_batch_ts,
    df=new_batch_1981_12_30_df,
    profiling_builder=th.profiler.SimpleProfilingBuilder(),
    session=session,
)

2022-10-17 11:05:46.307 | INFO     | thoth.service_layer:assess_new_ts:444 - Pipeline started 👤 💯 🔍️ ...
2022-10-17 11:05:46.308 | INFO     | thoth.service_layer:assess_new_ts:445 - 1981-12-30 00:00:00+00:00
2022-10-17 11:05:46.365 | INFO     | thoth.profiler:profile:319 - 👤 Profiling started ...
2022-10-17 11:05:47.272 | INFO     | thoth.profiler:profile:331 - Processing 1 timestamps from 1981-12-30T00:00:00 to 1981-12-30T00:00:00, with DAY granularity.
2022-10-17 11:05:47.370 | INFO     | thoth.profiler:_build_report:274 - Finished profiling report for ts=1981-12-30T00:00:00.
2022-10-17 11:05:47.371 | INFO     | thoth.profiler:profile:349 - 👤 Profiling done!
2022-10-17 11:05:47.387 | INFO     | thoth.service_layer:profile:195 - 🧐 You can now check the profiling metrics in the dashboard: http://localhost:8501/?dataset_uri=temperatures&view=%F0%9F%91%A4+Profiling
2022-10-17 11:05:47.387 | INFO     | thoth.anomaly.scoring:score:78 - 💯 Scoring started...
2022-10-17 11:05:48.152 | INFO   

True

In [20]:
# Assess the batch for 1981-12-31, the last day in the dataset.
# The call stays the last expression so the cell displays its result.
last_batch_ts = datetime.datetime(1981, 12, 31, tzinfo=datetime.timezone.utc)
th.assess_new_ts(
    dataset_uri="temperatures",
    ts=last_batch_ts,
    df=new_batch_1981_12_31_df,
    profiling_builder=th.profiler.SimpleProfilingBuilder(),
    session=session,
)

2022-10-17 11:05:48.346 | INFO     | thoth.service_layer:assess_new_ts:444 - Pipeline started 👤 💯 🔍️ ...
2022-10-17 11:05:48.347 | INFO     | thoth.service_layer:assess_new_ts:445 - 1981-12-31 00:00:00+00:00
2022-10-17 11:05:48.401 | INFO     | thoth.profiler:profile:319 - 👤 Profiling started ...
2022-10-17 11:05:49.288 | INFO     | thoth.profiler:profile:331 - Processing 1 timestamps from 1981-12-31T00:00:00 to 1981-12-31T00:00:00, with DAY granularity.
2022-10-17 11:05:49.391 | INFO     | thoth.profiler:_build_report:274 - Finished profiling report for ts=1981-12-31T00:00:00.
2022-10-17 11:05:49.392 | INFO     | thoth.profiler:profile:349 - 👤 Profiling done!
2022-10-17 11:05:49.408 | INFO     | thoth.service_layer:profile:195 - 🧐 You can now check the profiling metrics in the dashboard: http://localhost:8501/?dataset_uri=temperatures&view=%F0%9F%91%A4+Profiling
2022-10-17 11:05:49.411 | INFO     | thoth.anomaly.scoring:score:78 - 💯 Scoring started...
2022-10-17 11:05:50.187 | INFO   

True