This is a very simple axample of the components implemented in the AMOS project.
We're simply passing a dataframe `data` through all components in order to show their basic functionality and simulate a pipeline.

In order to run this notebook, you need to create a conda environment as described in the repo's setup guide, and then build and install the sdk
```bash
RTDIP_SDK_NEXT_VER=<version> python -m  build
pip install dist/rtdip_sdk-<version>-py3-none-any.whl
```

# Data manipulation

In [80]:
# 
from pyspark.sql import SparkSession

spark =  SparkSession.builder.master("local[*]").appName("Demo").getOrCreate()


In [81]:
# read csv data
df = spark.read.option("header", "true").csv("test_data.csv")

                                                                                

In [82]:
from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization import NormalizationZScore, NormalizationMinMax, NormalizationMean, Denormalization

ZScoreNormalization = NormalizationZScore(df, ['Value'], in_place=True)
df = ZScoreNormalization.filter()
df = Denormalization(df, ZScoreNormalization).filter()

MinMaxNormalization = NormalizationMinMax(df, ['Value'], in_place=True)
df = MinMaxNormalization.normalize()
df = MinMaxNormalization.denormalize(df)

MeanNormalization = NormalizationMean(df, ['Value'], in_place=True)
df = MeanNormalization.normalize()
df = MeanNormalization.denormalize(df)

In [83]:
from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.duplicate_detection import DuplicateDetection

df = DuplicateDetection(df, ["Value"]).filter()

In [84]:
from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.interval_filtering import IntervalFiltering

df = IntervalFiltering(spark, df, 1, "minutes", "EventTime").filter()

In [85]:
from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.missing_value_imputation import MissingValueImputation

df = MissingValueImputation(spark, df).filter()

In [86]:
# TODO: dimensionality reduction

In [87]:
from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.k_sigma_anomaly_detection import KSigmaAnomalyDetection
df = KSigmaAnomalyDetection(spark, df, ['Value']).filter()

In [88]:
from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.out_of_range_value_filter import OutOfRangeValueFilter
df = OutOfRangeValueFilter(df, {
    "R0:Z24WVP.0S10L": {"min": -4, "max": 4, "inclusive_bounds": True},
}).filter()

2025-01-27 20:03:59,275 - CheckValueRanges - INFO - Found 153 rows in 'Value' column for TagName 'R0:Z24WVP.0S10L' out of range.
INFO:CheckValueRanges:Found 153 rows in 'Value' column for TagName 'R0:Z24WVP.0S10L' out of range.
2025-01-27 20:03:59,593 - CheckValueRanges - INFO - Out of range row for TagName 'R0:Z24WVP.0S10L': Row(TagName='R0:Z24WVP.0S10L', EventTime=datetime.datetime(2024, 1, 2, 1, 16, 1, 1000), Status='Good', Value=2305.413330078125)
INFO:CheckValueRanges:Out of range row for TagName 'R0:Z24WVP.0S10L': Row(TagName='R0:Z24WVP.0S10L', EventTime=datetime.datetime(2024, 1, 2, 1, 16, 1, 1000), Status='Good', Value=2305.413330078125)
2025-01-27 20:03:59,594 - CheckValueRanges - INFO - Out of range row for TagName 'R0:Z24WVP.0S10L': Row(TagName='R0:Z24WVP.0S10L', EventTime=datetime.datetime(2024, 1, 2, 1, 28, 1, 1000), Status='Good', Value=2305.181396484375)
INFO:CheckValueRanges:Out of range row for TagName 'R0:Z24WVP.0S10L': Row(TagName='R0:Z24WVP.0S10L', EventTime=datetim

In [89]:
# TODO: gaussian smoothing

# Monitoring

In [90]:
from rtdip_sdk.pipelines.data_quality.monitoring.spark.check_value_ranges import CheckValueRanges

df = CheckValueRanges(df, {
    "TT33-01M9Z2L9:P20.AIRO5N": {"min": -4, "max": 4, "inclusive_bounds": True},
}).check()

2025-01-27 20:04:06,973 - CheckValueRanges - INFO - Found 55 rows in 'Value' column for TagName 'TT33-01M9Z2L9:P20.AIRO5N' out of range.
INFO:CheckValueRanges:Found 55 rows in 'Value' column for TagName 'TT33-01M9Z2L9:P20.AIRO5N' out of range.
2025-01-27 20:04:07,932 - CheckValueRanges - INFO - Out of range row for TagName 'TT33-01M9Z2L9:P20.AIRO5N': Row(TagName='TT33-01M9Z2L9:P20.AIRO5N', EventTime=datetime.datetime(2024, 1, 2, 14, 31, 10, 337000), Status='Good', Value=19411.0)
INFO:CheckValueRanges:Out of range row for TagName 'TT33-01M9Z2L9:P20.AIRO5N': Row(TagName='TT33-01M9Z2L9:P20.AIRO5N', EventTime=datetime.datetime(2024, 1, 2, 14, 31, 10, 337000), Status='Good', Value=19411.0)
2025-01-27 20:04:07,932 - CheckValueRanges - INFO - Out of range row for TagName 'TT33-01M9Z2L9:P20.AIRO5N': Row(TagName='TT33-01M9Z2L9:P20.AIRO5N', EventTime=datetime.datetime(2024, 1, 2, 2, 58, 10), Status='Good', Value=19398.451171875)
INFO:CheckValueRanges:Out of range row for TagName 'TT33-01M9Z2L9:P

In [91]:
from rtdip_sdk.pipelines.data_quality.monitoring.spark.flatline_detection import FlatlineDetection

df = FlatlineDetection(df, ["Value"], tolerance_timespan=2).check()

2025-01-27 20:04:09,389 - FlatlineDetection - INFO - No flatlining detected.    
INFO:FlatlineDetection:No flatlining detected.


Flatlined Rows:
+-------+---------+------+-----+-------------------+-----------+
|TagName|EventTime|Status|Value|Value_flatline_flag|Value_group|
+-------+---------+------+-----+-------------------+-----------+
+-------+---------+------+-----+-------------------+-----------+



In [92]:
from rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_interval import IdentifyMissingDataInterval

# component uses "EventTime" column
df = IdentifyMissingDataInterval(df, interval='100ms', tolerance='10ms').check()

25/01/27 20:04:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/27 20:04:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/27 20:04:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/27 20:04:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/27 20:04:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/27 20:04:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/27 2

In [93]:
from rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_pattern import IdentifyMissingDataPattern

# component uses "EventTime" column
df = IdentifyMissingDataPattern(df, [{'second': 0}, {'second': 13}, {'second': 49}]).check()

                                                                                

# Forecasts

In [94]:
from rtdip_sdk.pipelines.forecasting.spark.arima import ArimaPrediction

count = 100
base = df.limit(count)
print(base.count())
forecast = ArimaPrediction(
    df,
    value_name="Value",
    past_data_style=ArimaPrediction.InputStyle.SOURCE_BASED,
    to_extend_name="-4O7LSSAM_3EA02:2GT7E02I_R_MP",
    number_of_data_points_to_analyze=count,
    number_of_data_points_to_predict=int(count / 2),
    order=(3, 0, 0),
    seasonal_order=(3, 0, 0, 62),
    timestamp_name="EventTime",
    source_name="TagName",
    status_name="Status",
).filter()
forecast.show()

                                                                                

100


  self._init_dates(dates, freq)
  warn('Too few observations to estimate starting parameters%s.'
  return get_prediction_index(
  return get_prediction_index(
                                                                                

+--------------------+--------------------+------+---------+
|             TagName|           EventTime|Status|    Value|
+--------------------+--------------------+------+---------+
|_LT2EPL-9PM0.OROT...|2024-01-02 02:57:...|  Good|19400.285|
|_LT2EPL-9PM0.OROT...|2024-01-02 02:07:...|  Good|19399.516|
|_LT2EPL-9PM0.OROT...|2024-01-02 19:10:...|  Good| 19401.05|
|_LT2EPL-9PM0.OROT...|2024-01-02 19:36:...|  Good|19409.979|
|_LT2EPL-9PM0.OROT...|2024-01-02 06:55:...|  Good|19404.215|
|_LT2EPL-9PM0.OROT...|2024-01-02 14:32:...|  Good|19413.582|
|_LT2EPL-9PM0.OROT...|2024-01-02 18:14:...|  Good|19381.455|
|_LT2EPL-9PM0.OROT...|2024-01-02 23:04:...|  Good|19393.973|
|_LT2EPL-9PM0.OROT...|2024-01-02 16:21:...|  Good|19362.654|
|_LT2EPL-9PM0.OROT...|2024-01-02 22:34:...|  Good|19412.322|
|_LT2EPL-9PM0.OROT...|2024-01-02 12:04:...|  Good|19404.729|
|_LT2EPL-9PM0.OROT...|2024-01-02 06:11:...|  Good|19403.898|
|_LT2EPL-9PM0.OROT...|2024-01-02 05:57:...|  Good| 19403.67|
|_LT2EPL-9PM0.OROT...|20

In [96]:
from rtdip_sdk.pipelines.forecasting.spark.auto_arima import ArimaAutoPrediction

count = 100
base = df.limit(count)
base.show()
forecast = ArimaAutoPrediction(base, to_extend_name='Value', number_of_data_points_to_analyze=int(count/2), number_of_data_points_to_predict=int(count/2), seasonal=True)


                                                                                

+--------------------+--------------------+------+---------+
|             TagName|           EventTime|Status|    Value|
+--------------------+--------------------+------+---------+
|_LT2EPL-9PM0.OROT...|2024-01-02 02:57:...|  Good|19400.285|
|_LT2EPL-9PM0.OROT...|2024-01-02 02:07:...|  Good|19399.516|
|_LT2EPL-9PM0.OROT...|2024-01-02 19:10:...|  Good| 19401.05|
|_LT2EPL-9PM0.OROT...|2024-01-02 19:36:...|  Good|19409.979|
|_LT2EPL-9PM0.OROT...|2024-01-02 06:55:...|  Good|19404.215|
|_LT2EPL-9PM0.OROT...|2024-01-02 14:32:...|  Good|19413.582|
|_LT2EPL-9PM0.OROT...|2024-01-02 18:14:...|  Good|19381.455|
|_LT2EPL-9PM0.OROT...|2024-01-02 23:04:...|  Good|19393.973|
|_LT2EPL-9PM0.OROT...|2024-01-02 16:21:...|  Good|19362.654|
|_LT2EPL-9PM0.OROT...|2024-01-02 22:34:...|  Good|19412.322|
|_LT2EPL-9PM0.OROT...|2024-01-02 12:04:...|  Good|19404.729|
|_LT2EPL-9PM0.OROT...|2024-01-02 06:11:...|  Good|19403.898|
|_LT2EPL-9PM0.OROT...|2024-01-02 05:57:...|  Good| 19403.67|
|_LT2EPL-9PM0.OROT...|20

                                                                                

ValueError: None not found in the DataFrame.



In [122]:
from rtdip_sdk.pipelines.forecasting.spark.linear_regression import LinearRegression
from rtdip_sdk.pipelines.transformers.spark.machine_learning import ColumnsToVector, one_hot_encoding

# machine learning only works for vectors
base = one_hot_encoding.OneHotEncoding(df, "Status").transform()
base = ColumnsToVector(base, ["Value"], "Value", override_col=True).transform()

linearRegression = LinearRegression(base, features_col="Value", label_col="Status_Good")
linearRegression.train(base)

from pyspark.sql.types import StructType, StructField, FloatType
forecast_base = spark.createDataFrame([(3.02,), (4.2,),], StructType([StructField("Value", FloatType(), True)]))
forecast_base = ColumnsToVector(forecast_base, input_cols=["Value"], output_col="Value", override_col=True).transform()

forecast = linearRegression.predict(forecast_base)

25/01/27 20:50:49 WARN Instrumentation: [f8dadf72] regParam is zero, which might cause numerical instability and overfitting.
25/01/27 20:50:49 WARN Instrumentation: [f8dadf72] The standard deviation of the label is zero, so the coefficients will be zeros and the intercept will be the mean of the label; as a result, training is not needed.
                                                                                



In [None]:
# TODO: KNN

# Transformers

In [None]:
from rtdip_sdk.pipelines.transformers.spark.machine_learning.one_hot_encoding import  OneHotEncoding

OneHotEncoding(df, "Status").transform()

In [None]:
from rtdip_sdk.pipelines.transformers.spark.machine_learning import ColumnsToVector

base = ColumnsToVector(df, ["Value"], "Value", override_col=True).transform()

DataFrame[TagName: string, EventTime: timestamp, Status: string, Value: vector]

In [None]:
from rtdip_sdk.pipelines.transformers.spark.machine_learning.polynomial_features import PolynomialFeatures

# base comes from previous example as it needs to be a vector
PolynomialFeatures(base, "Value", "ValuePolynomial", poly_degree=3).transform()

DataFrame[TagName: string, EventTime: timestamp, Status: string, Status_Good: int, Value: vector, ValuePolynomial: vector]

