This notebook is a very simple example of the components implemented in the AMOS project.
We simply pass a dataframe `df` through all components in order to show their basic functionality and to simulate a pipeline.

In order to run this notebook, you need to create a conda environment as described in the repo's setup guide, and then build and install the SDK:
```bash
RTDIP_SDK_NEXT_VER=<version> python -m  build
pip install dist/rtdip_sdk-<version>-py3-none-any.whl
```

In [38]:
# Setup for the notebook: create a local Spark session and load the demo data.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Demo")
    .getOrCreate()
)

# Read the CSV (first row is the header), then cast `Value` to double and
# `EventTime` to timestamp, since every column is read as a string here.
df = (
    spark.read
    .option("header", "true")
    .csv("test_data.csv")
    .withColumn("Value", col("Value").cast("double"))
    .withColumn("EventTime", col("EventTime").cast("timestamp"))
)

# Data manipulation

In [2]:
from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization import NormalizationZScore, NormalizationMinMax, NormalizationMean, Denormalization

# Z-score: normalise `Value` in place through filter(), then revert it with the
# standalone Denormalization component, which is handed the normaliser instance.
zscore_normalizer = NormalizationZScore(df, ['Value'], in_place=True)
df = zscore_normalizer.filter()
df = Denormalization(df, zscore_normalizer).filter()

# Min-max and mean: the same round trip, this time through the normaliser's own
# normalize() / denormalize() methods.
for normalizer_cls in (NormalizationMinMax, NormalizationMean):
    normalizer = normalizer_cls(df, ['Value'], in_place=True)
    df = normalizer.normalize()
    df = normalizer.denormalize(df)



In [3]:
from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.duplicate_detection import DuplicateDetection

# Run the DuplicateDetection component with "Value" as the column list and
# continue the pipeline with its result.
duplicate_detection = DuplicateDetection(df, ["Value"])
df = duplicate_detection.filter()

In [5]:
from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.interval_filtering import IntervalFiltering

# Run IntervalFiltering on the "EventTime" column.
# NOTE(review): the arguments 1 and "minutes" presumably mean a 1-minute interval - confirm.
interval_filtering = IntervalFiltering(spark, df, 1, "minutes", "EventTime")
df = interval_filtering.filter()

In [6]:
from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.missing_value_imputation import MissingValueImputation

# Run MissingValueImputation with its default settings and keep the result.
missing_value_imputation = MissingValueImputation(spark, df)
df = missing_value_imputation.filter()

                                                                                

In [46]:
from pyspark.sql.functions import col, randn
from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.dimensionality_reduction import DimensionalityReduction

# Build a second column that is `Value` plus small Gaussian noise (scaled by 1e-4),
# so the component receives two nearly identical columns.
# The noise is seeded so that re-running the notebook reproduces the same `Value2`.
NOISE_SEED = 42
df_with_noisy_copy = df.withColumn("Value2", col("Value") + randn(seed=NOISE_SEED) * 1e-4)

df = DimensionalityReduction(df_with_noisy_copy, columns=["Value", "Value2"]).filter()

In [4]:
from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.k_sigma_anomaly_detection import KSigmaAnomalyDetection
# Run KSigmaAnomalyDetection over the 'Value' column and keep the result.
k_sigma_detection = KSigmaAnomalyDetection(spark, df, ['Value'])
df = k_sigma_detection.filter()

In [5]:
from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.out_of_range_value_filter import OutOfRangeValueFilter
# Per-tag bounds handed to the filter: one tag, min -4, max 4, inclusive bounds.
tag_value_ranges = {
    "R0:Z24WVP.0S10L": {"min": -4, "max": 4, "inclusive_bounds": True},
}
out_of_range_filter = OutOfRangeValueFilter(df, tag_value_ranges)
df = out_of_range_filter.filter()

2025-02-02 23:48:18,620 - CheckValueRanges - INFO - Found 38 rows in 'Value' column for TagName 'R0:Z24WVP.0S10L' out of range.
2025-02-02 23:48:19,195 - CheckValueRanges - INFO - Out of range row for TagName 'R0:Z24WVP.0S10L': Row(TagName='R0:Z24WVP.0S10L', EventTime=datetime.datetime(2024, 1, 2, 23, 49, 0, 1000), Status='Good', Value=2216.44677734375)
2025-02-02 23:48:19,196 - CheckValueRanges - INFO - Out of range row for TagName 'R0:Z24WVP.0S10L': Row(TagName='R0:Z24WVP.0S10L', EventTime=datetime.datetime(2024, 1, 2, 23, 47, 0, 1000), Status='Good', Value=2258.5576171875)
2025-02-02 23:48:19,196 - CheckValueRanges - INFO - Out of range row for TagName 'R0:Z24WVP.0S10L': Row(TagName='R0:Z24WVP.0S10L', EventTime=datetime.datetime(2024, 1, 2, 10, 45, 0, 1000), Status='Good', Value=2263.8955078125)
2025-02-02 23:48:19,196 - CheckValueRanges - INFO - Out of range row for TagName 'R0:Z24WVP.0S10L': Row(TagName='R0:Z24WVP.0S10L', EventTime=datetime.datetime(2024, 1, 2, 12, 36, 0, 1000), S

In [6]:
from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.gaussian_smoothing import GaussianSmoothing

# Run GaussianSmoothing (sigma 1.0) on "Value", with "TagName" as the id column
# and "EventTime" as the timestamp column.
gaussian_smoothing = GaussianSmoothing(
    df,
    sigma=1.0,
    id_col="TagName",
    timestamp_col="EventTime",
    value_col="Value",
)
df = gaussian_smoothing.filter()

# Monitoring

In [None]:
from rtdip_sdk.pipelines.data_quality.monitoring.spark.check_value_ranges import CheckValueRanges

# Per-tag bounds to check: one tag, min -4, max 4, inclusive bounds.
monitored_ranges = {
    "TT33-01M9Z2L9:P20.AIRO5N": {"min": -4, "max": 4, "inclusive_bounds": True},
}
value_range_check = CheckValueRanges(df, monitored_ranges)
df = value_range_check.check()

2025-01-27 20:04:06,973 - CheckValueRanges - INFO - Found 55 rows in 'Value' column for TagName 'TT33-01M9Z2L9:P20.AIRO5N' out of range.
INFO:CheckValueRanges:Found 55 rows in 'Value' column for TagName 'TT33-01M9Z2L9:P20.AIRO5N' out of range.
2025-01-27 20:04:07,932 - CheckValueRanges - INFO - Out of range row for TagName 'TT33-01M9Z2L9:P20.AIRO5N': Row(TagName='TT33-01M9Z2L9:P20.AIRO5N', EventTime=datetime.datetime(2024, 1, 2, 14, 31, 10, 337000), Status='Good', Value=19411.0)
INFO:CheckValueRanges:Out of range row for TagName 'TT33-01M9Z2L9:P20.AIRO5N': Row(TagName='TT33-01M9Z2L9:P20.AIRO5N', EventTime=datetime.datetime(2024, 1, 2, 14, 31, 10, 337000), Status='Good', Value=19411.0)
2025-01-27 20:04:07,932 - CheckValueRanges - INFO - Out of range row for TagName 'TT33-01M9Z2L9:P20.AIRO5N': Row(TagName='TT33-01M9Z2L9:P20.AIRO5N', EventTime=datetime.datetime(2024, 1, 2, 2, 58, 10), Status='Good', Value=19398.451171875)
INFO:CheckValueRanges:Out of range row for TagName 'TT33-01M9Z2L9:P

In [None]:
from rtdip_sdk.pipelines.data_quality.monitoring.spark.flatline_detection import FlatlineDetection

# Run FlatlineDetection on the "Value" column with tolerance_timespan=2.
flatline_detection = FlatlineDetection(df, ["Value"], tolerance_timespan=2)
df = flatline_detection.check()

2025-01-27 20:04:09,389 - FlatlineDetection - INFO - No flatlining detected.    
INFO:FlatlineDetection:No flatlining detected.


Flatlined Rows:
+-------+---------+------+-----+-------------------+-----------+
|TagName|EventTime|Status|Value|Value_flatline_flag|Value_group|
+-------+---------+------+-----+-------------------+-----------+
+-------+---------+------+-----+-------------------+-----------+



In [None]:
from rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_interval import IdentifyMissingDataInterval

# The component works on the "EventTime" column; it is configured here with an
# interval of '100ms' and a tolerance of '10ms'.
missing_interval_check = IdentifyMissingDataInterval(
    df,
    interval='100ms',
    tolerance='10ms',
)
df = missing_interval_check.check()

25/01/27 20:04:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/27 20:04:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/27 20:04:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/27 20:04:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/27 20:04:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/27 20:04:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/27 2

In [20]:
from rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_pattern import IdentifyMissingDataPattern

# The component works on the "EventTime" column; the pattern passed in lists
# seconds 0, 13 and 49.
expected_patterns = [{'second': 0}, {'second': 13}, {'second': 49}]
missing_pattern_check = IdentifyMissingDataPattern(df, expected_patterns)
df = missing_pattern_check.check()

2025-02-02 23:46:05,810 - IdentifyMissingDataPattern - INFO - Using tolerance: 10.0 ms (0.01 seconds)
2025-02-02 23:46:12,384 - IdentifyMissingDataPattern - INFO - Identified 4378 missing patterns.
2025-02-02 23:46:16,466 - IdentifyMissingDataPattern - INFO - Detected Missing Patterns:
2025-02-02 23:46:16,468 - IdentifyMissingDataPattern - INFO - Missing Pattern at 2024-01-02 00:08:00.000
2025-02-02 23:46:16,469 - IdentifyMissingDataPattern - INFO - Missing Pattern at 2024-01-02 00:08:13.000
2025-02-02 23:46:16,469 - IdentifyMissingDataPattern - INFO - Missing Pattern at 2024-01-02 00:08:49.000
2025-02-02 23:46:16,469 - IdentifyMissingDataPattern - INFO - Missing Pattern at 2024-01-02 00:09:00.000
2025-02-02 23:46:16,469 - IdentifyMissingDataPattern - INFO - Missing Pattern at 2024-01-02 00:09:13.000
2025-02-02 23:46:16,469 - IdentifyMissingDataPattern - INFO - Missing Pattern at 2024-01-02 00:09:49.000
2025-02-02 23:46:16,470 - IdentifyMissingDataPattern - INFO - Missing Pattern at 20

# Forecasts

In [18]:
from rtdip_sdk.pipelines.forecasting.spark.arima import ArimaPrediction

# ARIMA forecast for a single tag: analyse `count` data points and predict half
# as many, then display the resulting frame.
# NOTE(review): the complete `df` is the model input here, not a row-limited
# subset - confirm this is intended.
count = 100
forecast = ArimaPrediction(
    df,
    value_name="Value",
    past_data_style=ArimaPrediction.InputStyle.SOURCE_BASED,
    to_extend_name="-4O7LSSAM_3EA02:2GT7E02I_R_MP",
    number_of_data_points_to_analyze=count,
    number_of_data_points_to_predict=count // 2,
    order=(3, 0, 0),
    seasonal_order=(3, 0, 0, 62),
    timestamp_name="EventTime",
    source_name="TagName",
    status_name="Status",
).filter()
forecast.show()

+--------------------+--------------------+------+---------+
|             TagName|           EventTime|Status|    Value|
+--------------------+--------------------+------+---------+
|-4O7LSSAM_3EA02:2...|2024-01-02 00:26:...|  Good|7344.0903|
|-4O7LSSAM_3EA02:2...|2024-01-02 01:25:...|  Good|7344.0903|
|-4O7LSSAM_3EA02:2...|2024-01-02 01:36:...|  Good|7344.0903|
|-4O7LSSAM_3EA02:2...|2024-01-02 01:37:...|  Good|7344.0903|
|-4O7LSSAM_3EA02:2...|2024-01-02 01:56:...|  Good|7344.0903|
|-4O7LSSAM_3EA02:2...|2024-01-02 02:47:...|  Good|7344.0903|
|-4O7LSSAM_3EA02:2...|2024-01-02 03:03:...|  Good|7344.0903|
|-4O7LSSAM_3EA02:2...|2024-01-02 03:21:...|  Good|7344.0903|
|-4O7LSSAM_3EA02:2...|2024-01-02 03:28:...|  Good|7344.0903|
|-4O7LSSAM_3EA02:2...|2024-01-02 03:38:...|  Good|7344.0903|
|-4O7LSSAM_3EA02:2...|2024-01-02 03:52:...|  Good|7344.0903|
|-4O7LSSAM_3EA02:2...|2024-01-02 03:56:...|  Good|7344.0903|
|-4O7LSSAM_3EA02:2...|2024-01-02 04:00:...|  Good|7344.0903|
|-4O7LSSAM_3EA02:2...|20

In [23]:
from rtdip_sdk.pipelines.forecasting.spark.auto_arima import ArimaAutoPrediction

# Auto-ARIMA forecast on the first `count` rows of `df`: half of `count` is used
# both as the number of points to analyse and as the number to predict.
count = 100
half_count = count // 2
base = df.limit(count)

auto_arima = ArimaAutoPrediction(
    base,
    to_extend_name="-4O7LSSAM_3EA02:2GT7E02I_R_MP",
    number_of_data_points_to_analyze=half_count,
    number_of_data_points_to_predict=half_count,
    seasonal=True,
)
forecast = auto_arima.filter()

In [None]:
from rtdip_sdk.pipelines.forecasting.spark.linear_regression import LinearRegression
from rtdip_sdk.pipelines.transformers.spark.machine_learning import ColumnsToVector, one_hot_encoding

from pyspark.sql.types import StructType, StructField, FloatType

# Machine learning only works on vectors: one-hot encode "Status", then turn
# "Value" into a vector column under the same name.
status_encoded = one_hot_encoding.OneHotEncoding(df, "Status").transform()
base = ColumnsToVector(status_encoded, ["Value"], "Value", override_col=True).transform()

# Train with the "Value" vector as features and "Status_Good" as label.
linearRegression = LinearRegression(base, features_col="Value", label_col="Status_Good")
linearRegression.train(base)

# Two hand-written rows to predict on, vectorised the same way as the training data.
value_schema = StructType([StructField("Value", FloatType(), True)])
forecast_rows = [(3.02,), (4.2,)]
forecast_base = spark.createDataFrame(forecast_rows, value_schema)
forecast_base = ColumnsToVector(
    forecast_base,
    input_cols=["Value"],
    output_col="Value",
    override_col=True,
).transform()

forecast = linearRegression.predict(forecast_base)

25/01/27 20:50:49 WARN Instrumentation: [f8dadf72] regParam is zero, which might cause numerical instability and overfitting.
25/01/27 20:50:49 WARN Instrumentation: [f8dadf72] The standard deviation of the label is zero, so the coefficients will be zeros and the intercept will be the mean of the label; as a result, training is not needed.
                                                                                



In [41]:
from rtdip_sdk.pipelines.forecasting.spark.k_nearest_neighbors import KNearestNeighbors
from pyspark.ml.feature import StandardScaler, VectorAssembler
from rtdip_sdk.pipelines.transformers.spark.machine_learning.one_hot_encoding import  OneHotEncoding

# Feature preparation: assemble "Value" into a vector, one-hot encode "Status",
# then standardise the assembled vector (centred and scaled) into "features".
knn_df = VectorAssembler(inputCols=["Value"], outputCol="assembled_features").transform(df)
knn_df = OneHotEncoding(knn_df, "Status").transform()
scaler = StandardScaler(inputCol="assembled_features", outputCol="features", withStd=True, withMean=True)
scaled_df = scaler.fit(knn_df).transform(knn_df)

knn = KNearestNeighbors(
    df=scaled_df,
    features_col="features",
    label_col="Status_Good",
    timestamp_col="EventTime",
    k=3,
    weighted=True,
    distance_metric="combined",
    temporal_weight=0.3
)

# 50/50 train/test split; seeded so the split (and therefore the predictions)
# can be reproduced when the notebook is re-run.
SPLIT_SEED = 42
train_df, test_df = scaled_df.randomSplit([.5, .5], seed=SPLIT_SEED)
knn.train(train_df)
predictions = knn.predict(test_df)

# Transformers

In [42]:
from rtdip_sdk.pipelines.transformers.spark.machine_learning.one_hot_encoding import  OneHotEncoding

# One-hot encode the "Status" column; the transformed frame is the cell's
# displayed result and is not assigned back to `df`.
status_encoder = OneHotEncoding(df, "Status")
status_encoder.transform()

DataFrame[TagName: string, EventTime: timestamp, Status: string, Value: double, Status_Good: int, Status_ Good: int]

In [None]:
from rtdip_sdk.pipelines.transformers.spark.machine_learning import ColumnsToVector

# Convert "Value" into a vector column, written back under the same name
# (override_col=True); the result is kept in `base`.
value_vectorizer = ColumnsToVector(
    df,
    input_cols=["Value"],
    output_col="Value",
    override_col=True,
)
base = value_vectorizer.transform()

DataFrame[TagName: string, EventTime: timestamp, Status: string, Value: vector]

In [None]:
from rtdip_sdk.pipelines.transformers.spark.machine_learning.polynomial_features import PolynomialFeatures

# `base` is reused from the previous example because the input column has to be
# a vector. Expands "Value" into "ValuePolynomial" with poly_degree=3; the
# transformed frame is the cell's displayed result.
polynomial_features = PolynomialFeatures(base, "Value", "ValuePolynomial", poly_degree=3)
polynomial_features.transform()

DataFrame[TagName: string, EventTime: timestamp, Status: string, Status_Good: int, Value: vector, ValuePolynomial: vector]

