diff --git a/README.md b/README.md index 8ecbdf6..f4dc861 100644 --- a/README.md +++ b/README.md @@ -13,26 +13,26 @@ A machine learning library for assisting in the generation of machine learning problems for wind farms operations data by analyzing past occurrences of events. - | Important Links | | - | ----------------------------------- | -------------------------------------------------------------------- | - | :computer: **[Website]** | Check out the Sintel Website for more information about the project. | - | :book: **[Documentation]** | Quickstarts, User and Development Guides, and API Reference. | - | :star: **[Tutorials]** | Checkout our notebooks | - | :octocat: **[Repository]** | The link to the Github Repository of this library. | - | :scroll: **[License]** | The repository is published under the MIT License. | - | :keyboard: **[Development Status]** | This software is in its Pre-Alpha stage. | - | ![][Slack Logo] **[Community]** | Join our Slack Workspace for announcements and discussions. | - - [Website]: https://sintel.dev/ - [Documentation]: https://dtail.gitbook.io/zephyr/ - [Repository]: https://github.com/sintel-dev/Zephyr - [Tutorials]: https://github.com/sintel-dev/Zephyr/blob/master/notebooks - [License]: https://github.com/sintel-dev/Zephyr/blob/master/LICENSE - [Development Status]: https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha - [Community]: https://join.slack.com/t/sintel-space/shared_invite/zt-q147oimb-4HcphcxPfDAM0O9_4PaUtw - [Slack Logo]: https://github.com/sintel-dev/Orion/blob/master/docs/images/slack.png - - - Homepage: https://github.com/signals-dev/zephyr +| Important Links | | +| ----------------------------------- | -------------------------------------------------------------------- | +| :computer: **[Website]** | Check out the Sintel Website for more information about the project. | +| :book: **[Documentation]** | Quickstarts, User and Development Guides, and API Reference. 
| +| :star: **[Tutorials]** | Checkout our notebooks | +| :octocat: **[Repository]** | The link to the Github Repository of this library. | +| :scroll: **[License]** | The repository is published under the MIT License. | +| :keyboard: **[Development Status]** | This software is in its Pre-Alpha stage. | +| ![][Slack Logo] **[Community]** | Join our Slack Workspace for announcements and discussions. | + +[Website]: https://sintel.dev/ +[Documentation]: https://dtail.gitbook.io/zephyr/ +[Repository]: https://github.com/sintel-dev/Zephyr +[Tutorials]: https://github.com/sintel-dev/Zephyr/blob/master/notebooks +[License]: https://github.com/sintel-dev/Zephyr/blob/master/LICENSE +[Development Status]: https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha +[Community]: https://join.slack.com/t/sintel-space/shared_invite/zt-q147oimb-4HcphcxPfDAM0O9_4PaUtw +[Slack Logo]: https://github.com/sintel-dev/Orion/blob/master/docs/images/slack.png + +- Homepage: https://github.com/signals-dev/zephyr # Overview @@ -42,17 +42,17 @@ occurrences of events. The main features of **Zephyr** are: -* **EntitySet creation**: tools designed to represent wind farm data and the relationship -between different tables. We have functions to create EntitySets for datasets with PI data -and datasets using SCADA data. -* **Labeling Functions**: a collection of functions, as well as tools to create custom versions -of them, ready to be used to analyze past operations data in the search for occurrences of -specific types of events in the past. -* **Prediction Engineering**: a flexible framework designed to apply labeling functions on -wind turbine operations data in a number of different ways to create labels for custom -Machine Learning problems. -* **Feature Engineering**: a guide to using Featuretools to apply automated feature engineerinig -to wind farm data. +- **EntitySet creation**: tools designed to represent wind farm data and the relationship + between different tables. 
We have functions to create EntitySets for datasets with PI data + and datasets using SCADA data. +- **Labeling Functions**: a collection of functions, as well as tools to create custom versions + of them, ready to be used to analyze past operations data in the search for occurrences of + specific types of events in the past. +- **Prediction Engineering**: a flexible framework designed to apply labeling functions on + wind turbine operations data in a number of different ways to create labels for custom + Machine Learning problems. +- **Feature Engineering**: a guide to using Featuretools to apply automated feature engineering + to wind farm data. # Install @@ -60,8 +60,7 @@ to wind farm data. **Zephyr** has been developed and runs on Python 3.8, 3.9, 3.10, 3.11 and 3.12. -Also, although it is not strictly required, the usage of a [virtualenv]( -https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid interfering +Also, although it is not strictly required, the usage of a [virtualenv](https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid interfering with other software installed in the system where you are trying to run **Zephyr**. ## Download and Install @@ -79,35 +78,38 @@ If you want to install from source or contribute to the project please read the # Quickstart In this short tutorial we will guide you through a series of steps that will help you -getting started with **Zephyr**. +get started with **Zephyr**. For more detailed examples, please refer to the tutorial notebooks in the `notebooks` directory: + +- `feature_engineering.ipynb`: Learn how to create EntitySets and perform feature engineering +- `modeling.ipynb`: Learn how to train and evaluate models +- `visualization.ipynb`: Learn how to visualize your data and results ## 1. Loading the data -The first step we will be to use preprocessed data to create an EntitySet. 
Depending on the -type of data, we will either the `zephyr_ml.create_pidata_entityset` or `zephyr_ml.create_scada_entityset` -functions. +The first step will be to use preprocessed data to create an EntitySet. Depending on the +type of data, we will use either the `generate_entityset` function with `es_type="pidata"`, `es_type="scada"` or `es_type="vibrations"`. **NOTE**: if you cloned the **Zephyr** repository, you will find some demo data inside the -`notebooks/data` folder which has been preprocessed to fit the `create_entityset` data -requirements. +`notebooks/data` folder which has been preprocessed to fit the data requirements. -```python3 +```python import os import pandas as pd -from zephyr_ml import create_scada_entityset +from zephyr_ml import Zephyr data_path = 'notebooks/data' data = { - 'turbines': pd.read_csv(os.path.join(data_path, 'turbines.csv')), - 'alarms': pd.read_csv(os.path.join(data_path, 'alarms.csv')), - 'work_orders': pd.read_csv(os.path.join(data_path, 'work_orders.csv')), - 'stoppages': pd.read_csv(os.path.join(data_path, 'stoppages.csv')), - 'notifications': pd.read_csv(os.path.join(data_path, 'notifications.csv')), - 'scada': pd.read_csv(os.path.join(data_path, 'scada.csv')) + 'turbines': pd.read_csv(os.path.join(data_path, 'turbines.csv')), + 'alarms': pd.read_csv(os.path.join(data_path, 'alarms.csv')), + 'work_orders': pd.read_csv(os.path.join(data_path, 'work_orders.csv')), + 'stoppages': pd.read_csv(os.path.join(data_path, 'stoppages.csv')), + 'notifications': pd.read_csv(os.path.join(data_path, 'notifications.csv')), + 'scada': pd.read_csv(os.path.join(data_path, 'scada.csv')) } -scada_es = create_scada_entityset(data) +zephyr = Zephyr() +scada_es = zephyr.generate_entityset(data, es_type="scada") ``` This will load the turbine, alarms, stoppages, work order, notifications, and SCADA data, and return it @@ -132,15 +134,10 @@ Entityset: SCADA data ## 2. 
Selecting a Labeling Function -The second step will be to choose an adequate **Labeling Function**. - -We can see the list of available labeling functions using the `zephyr_ml.labeling.get_labeling_functions` -function. - -```python3 -from zephyr_ml import labeling +The second step will be to choose an adequate **Labeling Function**. We can see the list of available labeling functions using the `GET_LABELING_FUNCTIONS` method. -labeling.get_labeling_functions() +```python +labeling_functions = zephyr.GET_LABELING_FUNCTIONS() ``` This will return us a dictionary with the name and a short description of each available @@ -158,14 +155,14 @@ amount of power lost over a slice of time. ## 3. Generate Target Times Once we have loaded the data and the Labeling Function, we are ready to start using -the `zephyr_ml.generate_labels` function to generate a Target Times table. +the `generate_label_times` function to generate a Target Times table. - -```python3 -from zephyr_ml import DataLabeler - -data_labeler = DataLabeler(labeling.labeling_functions.total_power_loss) -target_times, metadata = data_labeler.generate_label_times(scada_es) +```python +target_times, metadata = zephyr.generate_label_times( + labeling_fn="total_power_loss", # or any other labeling function name + num_samples=10, + gap="20d" +) ``` This will return us a `compose.LabelTimes` containing the three columns required to start @@ -177,17 +174,16 @@ working on a Machine Learning problem: the turbine ID (COD_ELEMENT), the cutoff ``` ## 4. Feature Engineering -Using EntitySets and LabelTimes allows us to easily use Featuretools for automatic feature generation. -```python3 -import featuretools as ft +Using EntitySets and LabelTimes allows us to easily use Featuretools for automatic feature generation. 
-feature_matrix, features = ft.dfs( - entityset=scada_es, - target_dataframe_name='turbines', +```python +feature_matrix, features, _ = zephyr.generate_feature_matrix( + target_dataframe_name="turbines", cutoff_time_in_index=True, - cutoff_time=target_times, - max_features=20 + agg_primitives=["count", "sum", "max"], + max_features = 20, + verbose=True ) ``` @@ -195,48 +191,46 @@ Then we get a list of features and the computed `feature_matrix`. ``` TURBINE_PI_ID TURBINE_LOCAL_ID TURBINE_SAP_COD DES_CORE_ELEMENT SITE DES_CORE_PLANT ... MODE(alarms.COD_STATUS) MODE(alarms.DES_NAME) MODE(alarms.DES_TITLE) NUM_UNIQUE(alarms.COD_ALARM) NUM_UNIQUE(alarms.COD_ALARM_INT) label -COD_ELEMENT time ... +COD_ELEMENT time ... 0 2022-01-01 TA00 A0 LOC000 T00 LOCATION LOC ... Alarm1 Alarm1 Description of alarm 1 1 1 45801.0 [1 rows x 21 columns] ``` - ## 5. Modeling -Once we have the feature matrix, we can train a model using the Zephyr interface where you can train, infer, and evaluate a pipeline. -First, we need to prepare our dataset for training by creating ``X`` and ``y`` variables and one-hot encoding features. +Once we have the feature matrix, we can train a model using the Zephyr interface. First, we need to prepare our dataset for training by creating a train-test split. -```python3 -y = list(feature_matrix.pop('label')) -X = pd.get_dummies(feature_matrix).values +```python +X_train, X_test, y_train, y_test = zephyr.generate_train_test_split( + test_size=0.2, + random_state=42 +) ``` -In this example, we will use an 'xgb' regression pipeline to predict total power loss. - -```python3 -from zephyr_ml import Zephyr +In this example, we will use an 'xgb' regression pipeline to predict total power loss. To train the pipeline, we simply call the `fit_pipeline` method. 
-pipeline_name = 'xgb_regressor' +```python +zephyr.fit_pipeline( + pipeline="xgb_regressor", + pipeline_hyperparameters=None, -zephyr = Zephyr(pipeline_name) +) ``` -To train the pipeline, we simply use the `fit` function. -```python3 -zephyr.fit(X, y) +After it finishes training, we can make predictions using `predict`. + +```python +y_pred = zephyr.predict(X_test) ``` -After it finished training, we can make prediciton using `predict` +We can also use `evaluate` to obtain the performance of the pipeline. -```python3 -y_pred = zephyr.predict(X) +```python +results = zephyr.evaluate() ``` -We can also use ``zephyr.evaluate`` to obtain the performance of the pipeline. - # What's Next? If you want to continue learning about **Zephyr** and all its -features please have a look at the tutorials found inside the [notebooks folder]( -https://github.com/signals-dev/zephyr/tree/main/notebooks). +features please have a look at the tutorials found inside the [notebooks folder](https://github.com/signals-dev/zephyr/tree/main/notebooks). 
diff --git a/demo.py b/demo.py new file mode 100644 index 0000000..1050458 --- /dev/null +++ b/demo.py @@ -0,0 +1,17 @@ +from os import path +import pandas as pd +from zephyr_ml import create_scada_entityset + +data_path = "notebooks/data" + +data = { + "turbines": pd.read_csv(path.join(data_path, "turbines.csv")), + "alarms": pd.read_csv(path.join(data_path, "alarms.csv")), + "work_orders": pd.read_csv(path.join(data_path, "work_orders.csv")), + "stoppages": pd.read_csv(path.join(data_path, "stoppages.csv")), + "notifications": pd.read_csv(path.join(data_path, "notifications.csv")), + "scada": pd.read_csv(path.join(data_path, "scada.csv")), +} +scada_es = create_scada_entityset(data) + +print(scada_es) diff --git a/notebooks/feature_engineering.ipynb b/notebooks/feature_engineering.ipynb index ea3c726..73667ef 100644 --- a/notebooks/feature_engineering.ipynb +++ b/notebooks/feature_engineering.ipynb @@ -6,10 +6,10 @@ "metadata": {}, "source": [ "# Feature Engineering\n", - "In this tutorial, we will show you how to use zephyr_ml to create EntitySets, generate label times, and do automated feature engineering. This tutorial assumes you have a folder with the mostly pre-processed data in seperate CSVs. If necessary, please update the steps and paths below.\n", + "In this tutorial, we will show you how to use `zephyr_ml`'s `Zephyr` class to create EntitySets, generate label times, and do automated feature engineering. This tutorial assumes you have a folder with the mostly pre-processed data in separate CSVs. If necessary, please update the steps and paths below.\n", "\n", "## 1) Create EntitySet\n", - "zephyr_ml has strict assumptions about the data passed into its `create_pidata_entityset` and `create_scada_entityset` functions. It's the user's responsibility to apply the necessary pre-processing steps to get data into a format acceptable for zephyr_ml. \n", + "zephyr_ml has strict assumptions about the data passed into its `generate_entityset` method. 
It's the user's responsibility to apply the necessary pre-processing steps to get data into a format acceptable for zephyr_ml. \n", "\n", "For example, the demo PI data needs to be converted to a tabular format instead of a `tag` `value` format. The `turbine` column also needs too be renamed to `COD_ELEMENT` to match the rest of the data." ] @@ -191,7 +191,7 @@ { "data": { "text/plain": [ - "Entityset: PI data\n", + "Entityset: pidata\n", " DataFrames:\n", " turbines [Rows: 1, Columns: 10]\n", " alarms [Rows: 2, Columns: 10]\n", @@ -213,8 +213,9 @@ } ], "source": [ - "from zephyr_ml import create_pidata_entityset\n", + "from zephyr_ml import Zephyr\n", "\n", + "zephyr = Zephyr()\n", "data = {\n", " 'turbines': pd.read_csv(path.join(data_path, 'turbines.csv')),\n", " 'alarms': pd.read_csv(path.join(data_path, 'alarms.csv')),\n", @@ -224,7 +225,7 @@ " 'pidata': pidata_df\n", "}\n", "\n", - "pidata_es = create_pidata_entityset(data)\n", + "pidata_es = zephyr.generate_entityset(dfs = data, es_type = \"pidata\")\n", "pidata_es" ] }, @@ -258,6 +259,34 @@ { "cell_type": "code", "execution_count": 5, + "id": "f00c300f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'brake_pad_presence': {'obj': ,\n", + " 'desc': 'Determines if brake pad present in stoppages.'},\n", + " 'converter_replacement_presence': {'obj': ,\n", + " 'desc': 'Calculates the converter replacement presence.'},\n", + " 'gearbox_replace_presence': {'obj': ,\n", + " 'desc': 'Determines if gearbox replacement/exchange is present in stoppages.'},\n", + " 'total_power_loss': {'obj': ,\n", + " 'desc': 'Calculates the total power loss over the data slice.'}}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "zephyr.GET_LABELING_FUNCTIONS()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "e0ee16eb", "metadata": {}, "outputs": [ @@ -303,36 +332,47 @@ "0 0 2022-01-01 45801.0" ] }, - "execution_count": 5, + 
"execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from zephyr_ml import DataLabeler, labeling\n", - "\n", - "data_labeler = DataLabeler(labeling.total_power_loss)\n", - "\n", - "label_times, _ = data_labeler.generate_label_times(pidata_es)\n", + "label_times, _ = zephyr.generate_label_times(\"total_power_loss\")\n", "label_times" ] }, { "cell_type": "markdown", - "id": "ab8eefd3", + "id": "e26ac0cb", "metadata": {}, "source": [ - "## 3) Feature Engineering with SigPro\n", + "## 3) Feature Engineering with SigPro and Featuretools\n", "\n", - "Process signals with [SigPro](https://github.com/sintel-dev/SigPro) for PI signals or SCADA signals.\n", + "The feature engineering process in zephyr_ml combines signal processing with SigPro and automated feature generation with Featuretools into a single method, `generate_feature_matrix`. This unified approach allows for efficient processing of both time series signals and relational data." + ] + }, + { + "cell_type": "markdown", + "id": "a9a3f3a6", + "metadata": {}, + "source": [ + "### Signal Processing with SigPro\n", + "To perform signal processing in the `generate_feature_matrix` method, we pass in the following parameters:\n", + "- `signal_aggregations`: the specifications of the aggregation primitives\n", + "- `signal_transformations`: the specifications of the transformation primitives\n", + "- `signal_dataframe_name`: the name of the dataframe whether `pidata` or `scada`.\n", + "- `signal_column`: the name of the signal column in the dataframe.\n", + "- `signal_window_size`: the size of the bin we want to process the signals over, e.g. each month.\n", + "- `signal_replace_dataframe`: an indicator whether we want to replace the current dataframe or add it as a new one.\n", "\n", - "Processing signals is done by specifying the `transformations` and `aggregations` we wish to apply to the data. 
To look at some of the primitives readily available, we use `get_primitives` function from `SigPro`." + "To look at some of the primitives readily available, we use `get_primitives` function from `SigPro`." ] }, { "cell_type": "code", - "execution_count": 6, - "id": "191a123a", + "execution_count": 7, + "id": "5dfdd53e", "metadata": {}, "outputs": [ { @@ -352,11 +392,12 @@ " 'sigpro.transformations.frequency.band.frequency_band',\n", " 'sigpro.transformations.frequency.fft.fft',\n", " 'sigpro.transformations.frequency.fft.fft_real',\n", + " 'sigpro.transformations.frequency.fftfreq.fft_freq',\n", " 'sigpro.transformations.frequency_time.stft.stft',\n", " 'sigpro.transformations.frequency_time.stft.stft_real']" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -369,7 +410,7 @@ }, { "cell_type": "markdown", - "id": "5b23aff6", + "id": "586cb217", "metadata": {}, "source": [ "Suppose we are interested in finding the amplitude mean for each month of readings in the signal. We first specify the `name` and respective `primitive` we want to apply for both `transformations` and `aggregations`.\n", @@ -379,17 +420,17 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "961af0ef", + "execution_count": 8, + "id": "bd00c9fc", "metadata": {}, "outputs": [], "source": [ - "aggregations = [{\n", + "signal_aggregations = [{\n", " \"name\":\"mean\",\n", " \"primitive\":\"sigpro.aggregations.amplitude.statistical.mean\"\n", "}]\n", "\n", - "transformations = [{\n", + "signal_transformations = [{\n", " \"name\":\"fft\",\n", " \"primitive\":\"sigpro.transformations.amplitude.identity.identity\"\n", "}]" @@ -397,33 +438,83 @@ }, { "cell_type": "markdown", - "id": "a9a3f3a6", + "id": "2520a27e", "metadata": {}, "source": [ - "We use `process_signals` function to accomplish our goal. 
We pass the following:\n", - "- `es`: the entityset we are working with.\n", - "- `signal_dataframe_name`: the name of the dataframe whether `pidata` or `scada`.\n", - "- `signal_column`: the name of the signal column in the dataframe.\n", - "- `window_size`: the size of the bin we want to process the signals over, e.g. each month.\n", - "- `replace_dataframe`: an indicator whether we want to replace the current dataframe or add it as a new one." + "### Automated Feature Generation with Featuretools\n", + "The `generate_feature_matrix` method also leverages Featuretools to automatically generate features from the previously generated EntitySet and use label times as cutoff times, ensuring temporal validity. For example, we can set interesting categorical values in our EntitySet and use them to generate aggregation features grouped by those interesting values. We can also set which primitives we want to use and control which columns and entities those primitives can be applied to. " ] }, { "cell_type": "code", - "execution_count": 8, - "id": "bea94368", + "execution_count": 9, + "id": "2a14d02c", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/Users/sarah/anaconda3/envs/Zephyr/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3474: RuntimeWarning: Mean of empty slice.\n", + "/Users/raymondpan/zephyr/Zephyr-repo/venv/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3464: RuntimeWarning: Mean of empty slice.\n", " return _methods._mean(a, axis=axis, dtype=dtype,\n", - "/Users/sarah/anaconda3/envs/Zephyr/lib/python3.8/site-packages/numpy/core/_methods.py:189: RuntimeWarning: invalid value encountered in double_scalars\n", + "/Users/raymondpan/zephyr/Zephyr-repo/venv/lib/python3.8/site-packages/numpy/core/_methods.py:192: RuntimeWarning: invalid value encountered in scalar divide\n", " ret = ret.dtype.type(ret / rcount)\n" ] - }, + } + ], + "source": [ + "feature_matrix, features, processed_es 
=zephyr.generate_feature_matrix(\n", + " # signal processing parameters\n", + " signal_dataframe_name = \"pidata\",\n", + " signal_column = \"val1\",\n", + " signal_transformations = signal_transformations,\n", + " signal_aggregations = signal_aggregations,\n", + " signal_window_size = \"1m\",\n", + " signal_replace_dataframe = False,\n", + " \n", + " # feature generation parameters\n", + " target_dataframe_name = \"turbines\", \n", + " cutoff_time_in_index=True,\n", + " where_primitives=['count', 'sum'],\n", + " agg_primitives=['count', 'min', 'max', 'sum'],\n", + " trans_primitives=['num_words'],\n", + " ignore_dataframes=['notifications', 'work_orders'],\n", + " add_interesting_values = True,\n", + " interesting_dataframe_name = \"alarms\",\n", + " interesting_values = {'DES_NAME': ['Alarm1', 'Alarm2']}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7a77caa2", + "metadata": {}, + "source": [ + "`generate_feature_matrix` returns three outputs: `feature_matrix`, `features`, and `processed_es`. `processed_es` is a deepcopy of our Zephyr instance's original generated entityset, containing the signal processing and interesting values. `feature_matrix` is the generated feature matrix and `features` is a list of the generated features. " + ] + }, + { + "cell_type": "markdown", + "id": "d8474fa4", + "metadata": {}, + "source": [] + }, + { + "cell_type": "markdown", + "id": "3950f656", + "metadata": {}, + "source": [ + "Based on our original observations of `val1`, we now have `pidata_processed` with an entry for each month and the respective mean value of observations we see in that month.\n", + "\n", + "**Note**: in the months we don't have observations, the value becomes null." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bea94368", + "metadata": {}, + "outputs": [ { "data": { "text/html": [ @@ -484,50 +575,19 @@ "2 2 0 2022-03-31 559" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from zephyr_ml.feature_engineering import process_signals\n", - "\n", - "process_signals(es=pidata_es, \n", - " signal_dataframe_name='pidata', \n", - " signal_column='val1', \n", - " transformations=transformations, \n", - " aggregations=aggregations,\n", - " window_size='1m', \n", - " replace_dataframe=False)\n", - "\n", - "pidata_es['pidata_processed']" - ] - }, - { - "cell_type": "markdown", - "id": "fd88812a", - "metadata": {}, - "source": [ - "Based on our original observations of `val1`, we now have `pidata_processed` with an entry for each month and the respective mean value of observations we see in that month.\n", - "\n", - "**Note**: in the months we don't have observations, the value becomes null." - ] - }, - { - "cell_type": "markdown", - "id": "5aacf99b", - "metadata": {}, - "source": [ - "## 4) Feature Engineering with Featuretools\n", - "Using EntitySets and LabelTimes allows us to easily use Featuretools for automatic feature generation. For example, we can set interesting categorical values in our EntitySet and use them to generate aggregation features grouped by those interesting values. We can also set which primitives we want to use and control which columns and entities those primitives can be applied to. Featuretools can also use label times as cutoff times, ensuring that data after the label times is not used in feature generation. 
\n", - "\n", - "For additonal help using Featuretools, please see the documentation: https://featuretools.alteryx.com/en/stable/index.html" + "processed_es[\"pidata_processed\"]" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "ee020300", + "execution_count": 11, + "id": "be788aaf", "metadata": {}, "outputs": [ { @@ -567,10 +627,10 @@ " ,\n", " ,\n", " ,\n", - " ,\n", " ,\n", - " ,\n", + " ,\n", " ,\n", + " ,\n", " ,\n", " ,\n", " ,\n", @@ -582,35 +642,19 @@ " ]" ] }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import featuretools as ft\n", - "\n", - "interesting_alarms = ['Alarm1', 'Alarm2']\n", - "pidata_es.add_interesting_values(dataframe_name='alarms', values={'DES_NAME': interesting_alarms})\n", - "\n", - "feature_matrix, features = ft.dfs(\n", - " entityset=pidata_es,\n", - " target_dataframe_name='turbines',\n", - " cutoff_time_in_index=True,\n", - " cutoff_time=label_times,\n", - " where_primitives=['count', 'sum'],\n", - " agg_primitives=['count', 'min', 'max', 'sum'],\n", - " trans_primitives=['num_words'],\n", - " ignore_dataframes=['notifications', 'work_orders'] \n", - ")\n", - "\n", "features" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "bdce0acf", + "execution_count": 12, + "id": "d5b00ee9", "metadata": {}, "outputs": [ { @@ -635,26 +679,26 @@ " \n", " \n", " \n", - " TURBINE_PI_ID\n", - " TURBINE_LOCAL_ID\n", - " TURBINE_SAP_COD\n", - " DES_CORE_ELEMENT\n", - " SITE\n", - " DES_CORE_PLANT\n", - " COD_PLANT_SAP\n", - " PI_COLLECTOR_SITE_NAME\n", - " PI_LOCAL_SITE_NAME\n", " COUNT(alarms)\n", + " MAX(alarms.IND_DURATION)\n", + " MIN(alarms.IND_DURATION)\n", + " SUM(alarms.IND_DURATION)\n", + " COUNT(stoppages)\n", + " MAX(stoppages.COD_WO)\n", + " MAX(stoppages.IND_DURATION)\n", + " MAX(stoppages.IND_LOST_GEN)\n", + " MIN(stoppages.COD_WO)\n", + " MIN(stoppages.IND_DURATION)\n", " ...\n", - " MAX(stoppages.NUM_WORDS(DES_COMMENTS))\n", - " 
MAX(stoppages.NUM_WORDS(DES_DESCRIPTION))\n", - " MAX(stoppages.NUM_WORDS(DES_WO_NAME))\n", - " MIN(stoppages.NUM_WORDS(DES_COMMENTS))\n", - " MIN(stoppages.NUM_WORDS(DES_DESCRIPTION))\n", - " MIN(stoppages.NUM_WORDS(DES_WO_NAME))\n", - " SUM(stoppages.NUM_WORDS(DES_COMMENTS))\n", - " SUM(stoppages.NUM_WORDS(DES_DESCRIPTION))\n", - " SUM(stoppages.NUM_WORDS(DES_WO_NAME))\n", + " TURBINE_PI_ID_TA00\n", + " TURBINE_LOCAL_ID_A0\n", + " TURBINE_SAP_COD_LOC000\n", + " DES_CORE_ELEMENT_T00\n", + " SITE_LOCATION\n", + " DES_CORE_PLANT_LOC\n", + " COD_PLANT_SAP_ABC\n", + " PI_COLLECTOR_SITE_NAME_LOC0\n", + " PI_LOCAL_SITE_NAME_LOC0\n", " label\n", " \n", " \n", @@ -687,26 +731,26 @@ " \n", " 0\n", " 2022-01-01\n", - " TA00\n", - " A0\n", - " LOC000\n", - " T00\n", - " LOCATION\n", - " LOC\n", - " ABC\n", - " LOC0\n", - " LOC0\n", " 1\n", + " NaN\n", + " NaN\n", + " 0.0\n", + " 1\n", + " 12345.0\n", + " NaN\n", + " NaN\n", + " 12345.0\n", + " NaN\n", " ...\n", - " 4.0\n", - " 2.0\n", - " 3.0\n", - " 4.0\n", - " 2.0\n", - " 3.0\n", - " 4.0\n", - " 2.0\n", - " 3.0\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", " 45801.0\n", " \n", " \n", @@ -715,62 +759,54 @@ "" ], "text/plain": [ - " TURBINE_PI_ID TURBINE_LOCAL_ID TURBINE_SAP_COD \\\n", - "COD_ELEMENT time \n", - "0 2022-01-01 TA00 A0 LOC000 \n", + " COUNT(alarms) MAX(alarms.IND_DURATION) \\\n", + "COD_ELEMENT time \n", + "0 2022-01-01 1 NaN \n", "\n", - " DES_CORE_ELEMENT SITE DES_CORE_PLANT \\\n", - "COD_ELEMENT time \n", - "0 2022-01-01 T00 LOCATION LOC \n", + " MIN(alarms.IND_DURATION) SUM(alarms.IND_DURATION) \\\n", + "COD_ELEMENT time \n", + "0 2022-01-01 NaN 0.0 \n", "\n", - " COD_PLANT_SAP PI_COLLECTOR_SITE_NAME \\\n", - "COD_ELEMENT time \n", - "0 2022-01-01 ABC LOC0 \n", - "\n", - " PI_LOCAL_SITE_NAME COUNT(alarms) ... \\\n", - "COD_ELEMENT time ... \n", - "0 2022-01-01 LOC0 1 ... 
\n", + " COUNT(stoppages) MAX(stoppages.COD_WO) \\\n", + "COD_ELEMENT time \n", + "0 2022-01-01 1 12345.0 \n", "\n", - " MAX(stoppages.NUM_WORDS(DES_COMMENTS)) \\\n", - "COD_ELEMENT time \n", - "0 2022-01-01 4.0 \n", + " MAX(stoppages.IND_DURATION) \\\n", + "COD_ELEMENT time \n", + "0 2022-01-01 NaN \n", "\n", - " MAX(stoppages.NUM_WORDS(DES_DESCRIPTION)) \\\n", - "COD_ELEMENT time \n", - "0 2022-01-01 2.0 \n", + " MAX(stoppages.IND_LOST_GEN) MIN(stoppages.COD_WO) \\\n", + "COD_ELEMENT time \n", + "0 2022-01-01 NaN 12345.0 \n", "\n", - " MAX(stoppages.NUM_WORDS(DES_WO_NAME)) \\\n", - "COD_ELEMENT time \n", - "0 2022-01-01 3.0 \n", + " MIN(stoppages.IND_DURATION) ... TURBINE_PI_ID_TA00 \\\n", + "COD_ELEMENT time ... \n", + "0 2022-01-01 NaN ... 1 \n", "\n", - " MIN(stoppages.NUM_WORDS(DES_COMMENTS)) \\\n", - "COD_ELEMENT time \n", - "0 2022-01-01 4.0 \n", + " TURBINE_LOCAL_ID_A0 TURBINE_SAP_COD_LOC000 \\\n", + "COD_ELEMENT time \n", + "0 2022-01-01 1 1 \n", "\n", - " MIN(stoppages.NUM_WORDS(DES_DESCRIPTION)) \\\n", - "COD_ELEMENT time \n", - "0 2022-01-01 2.0 \n", + " DES_CORE_ELEMENT_T00 SITE_LOCATION \\\n", + "COD_ELEMENT time \n", + "0 2022-01-01 1 1 \n", "\n", - " MIN(stoppages.NUM_WORDS(DES_WO_NAME)) \\\n", + " DES_CORE_PLANT_LOC COD_PLANT_SAP_ABC \\\n", "COD_ELEMENT time \n", - "0 2022-01-01 3.0 \n", + "0 2022-01-01 1 1 \n", "\n", - " SUM(stoppages.NUM_WORDS(DES_COMMENTS)) \\\n", - "COD_ELEMENT time \n", - "0 2022-01-01 4.0 \n", + " PI_COLLECTOR_SITE_NAME_LOC0 PI_LOCAL_SITE_NAME_LOC0 \\\n", + "COD_ELEMENT time \n", + "0 2022-01-01 1 1 \n", "\n", - " SUM(stoppages.NUM_WORDS(DES_DESCRIPTION)) \\\n", - "COD_ELEMENT time \n", - "0 2022-01-01 2.0 \n", - "\n", - " SUM(stoppages.NUM_WORDS(DES_WO_NAME)) label \n", - "COD_ELEMENT time \n", - "0 2022-01-01 3.0 45801.0 \n", + " label \n", + "COD_ELEMENT time \n", + "0 2022-01-01 45801.0 \n", "\n", "[1 rows x 48 columns]" ] }, - "execution_count": 10, + "execution_count": 12, "metadata": {}, "output_type": 
"execute_result" } @@ -781,11 +817,8 @@ } ], "metadata": { - "interpreter": { - "hash": "2d6fabd7bf745a21519616ebdce3b2479184204dadf576aa19f086ff78438203" - }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "venv", "language": "python", "name": "python3" }, @@ -799,7 +832,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.8.0" } }, "nbformat": 4, diff --git a/notebooks/modeling.ipynb b/notebooks/modeling.ipynb index 69d26b3..4534722 100644 --- a/notebooks/modeling.ipynb +++ b/notebooks/modeling.ipynb @@ -7,7 +7,7 @@ "source": [ "# Modeling\n", "\n", - "In this tutorial, we will show you how to use `zephyr_ml` to train models using the `Zephyr` class. This tutorial builds on top of the previous one where we create EntitySets, generate label times, and do automated feature engineering. To do any of these previous steps, please refer to `feature_engineering` notebook.\n", + "In this tutorial, we will show you how to use `zephyr_ml`'s `Zephyr` class to train models. This tutorial builds on top of the previous one where we create EntitySets, generate label times, and do automated feature engineering. To do any of these previous steps, please refer to `feature_engineering` notebook.\n", "\n", "## 1) Load the Feature Matrix\n", "\n", @@ -19,58 +19,572 @@ "execution_count": 1, "id": "4a6724ad", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
COUNT(alarms)MAX(alarms.IND_DURATION)MIN(alarms.IND_DURATION)SUM(alarms.IND_DURATION)COUNT(stoppages)MAX(stoppages.COD_WO)MAX(stoppages.IND_DURATION)MAX(stoppages.IND_LOST_GEN)MIN(stoppages.COD_WO)MIN(stoppages.IND_DURATION)...DES_CORE_ELEMENT_T12DES_CORE_ELEMENT_T13DES_CORE_ELEMENT_T14DES_CORE_ELEMENT_T15SITE_LOCATIONDES_CORE_PLANT_LOCCOD_PLANT_SAP_ABCCOD_PLANT_SAP_XYZPI_COLLECTOR_SITE_NAME_LOC0PI_LOCAL_SITE_NAME_LOC0
01NaNNaN0.0112345.0NaNNaN12345.0NaN...0000111011
10NaNNaN0.0137452.0NaNNaN37452.0NaN...0000111011
20NaNNaN0.0123432.0NaNNaN23432.0NaN...0000111011
30NaNNaN0.0112452.0NaNNaN12452.0NaN...0000111011
40NaNNaN0.0132435.0NaNNaN32435.0NaN...0000111011
50NaNNaN0.0123534.0NaNNaN23534.0NaN...0000111011
60NaNNaN0.0165431.0NaNNaN65431.0NaN...0000111011
70NaNNaN0.0135742.0NaNNaN35742.0NaN...0000110111
80NaNNaN0.0121343.0NaNNaN21343.0NaN...0000110111
90NaNNaN0.0143565.0NaNNaN43565.0NaN...0000110111
100NaNNaN0.0124525.0NaNNaN24525.0NaN...0100110111
110NaNNaN0.0167432.0NaNNaN67432.0NaN...0010110111
120NaNNaN0.0121342.0NaNNaN21342.0NaN...0001110111
\n", + "

13 rows × 101 columns

\n", + "
" + ], + "text/plain": [ + " COUNT(alarms) MAX(alarms.IND_DURATION) MIN(alarms.IND_DURATION) \\\n", + "0 1 NaN NaN \n", + "1 0 NaN NaN \n", + "2 0 NaN NaN \n", + "3 0 NaN NaN \n", + "4 0 NaN NaN \n", + "5 0 NaN NaN \n", + "6 0 NaN NaN \n", + "7 0 NaN NaN \n", + "8 0 NaN NaN \n", + "9 0 NaN NaN \n", + "10 0 NaN NaN \n", + "11 0 NaN NaN \n", + "12 0 NaN NaN \n", + "\n", + " SUM(alarms.IND_DURATION) COUNT(stoppages) MAX(stoppages.COD_WO) \\\n", + "0 0.0 1 12345.0 \n", + "1 0.0 1 37452.0 \n", + "2 0.0 1 23432.0 \n", + "3 0.0 1 12452.0 \n", + "4 0.0 1 32435.0 \n", + "5 0.0 1 23534.0 \n", + "6 0.0 1 65431.0 \n", + "7 0.0 1 35742.0 \n", + "8 0.0 1 21343.0 \n", + "9 0.0 1 43565.0 \n", + "10 0.0 1 24525.0 \n", + "11 0.0 1 67432.0 \n", + "12 0.0 1 21342.0 \n", + "\n", + " MAX(stoppages.IND_DURATION) MAX(stoppages.IND_LOST_GEN) \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "5 NaN NaN \n", + "6 NaN NaN \n", + "7 NaN NaN \n", + "8 NaN NaN \n", + "9 NaN NaN \n", + "10 NaN NaN \n", + "11 NaN NaN \n", + "12 NaN NaN \n", + "\n", + " MIN(stoppages.COD_WO) MIN(stoppages.IND_DURATION) ... \\\n", + "0 12345.0 NaN ... \n", + "1 37452.0 NaN ... \n", + "2 23432.0 NaN ... \n", + "3 12452.0 NaN ... \n", + "4 32435.0 NaN ... \n", + "5 23534.0 NaN ... \n", + "6 65431.0 NaN ... \n", + "7 35742.0 NaN ... \n", + "8 21343.0 NaN ... \n", + "9 43565.0 NaN ... \n", + "10 24525.0 NaN ... \n", + "11 67432.0 NaN ... \n", + "12 21342.0 NaN ... 
\n", + "\n", + " DES_CORE_ELEMENT_T12 DES_CORE_ELEMENT_T13 DES_CORE_ELEMENT_T14 \\\n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "5 0 0 0 \n", + "6 0 0 0 \n", + "7 0 0 0 \n", + "8 0 0 0 \n", + "9 0 0 0 \n", + "10 0 1 0 \n", + "11 0 0 1 \n", + "12 0 0 0 \n", + "\n", + " DES_CORE_ELEMENT_T15 SITE_LOCATION DES_CORE_PLANT_LOC \\\n", + "0 0 1 1 \n", + "1 0 1 1 \n", + "2 0 1 1 \n", + "3 0 1 1 \n", + "4 0 1 1 \n", + "5 0 1 1 \n", + "6 0 1 1 \n", + "7 0 1 1 \n", + "8 0 1 1 \n", + "9 0 1 1 \n", + "10 0 1 1 \n", + "11 0 1 1 \n", + "12 1 1 1 \n", + "\n", + " COD_PLANT_SAP_ABC COD_PLANT_SAP_XYZ PI_COLLECTOR_SITE_NAME_LOC0 \\\n", + "0 1 0 1 \n", + "1 1 0 1 \n", + "2 1 0 1 \n", + "3 1 0 1 \n", + "4 1 0 1 \n", + "5 1 0 1 \n", + "6 1 0 1 \n", + "7 0 1 1 \n", + "8 0 1 1 \n", + "9 0 1 1 \n", + "10 0 1 1 \n", + "11 0 1 1 \n", + "12 0 1 1 \n", + "\n", + " PI_LOCAL_SITE_NAME_LOC0 \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "5 1 \n", + "6 1 \n", + "7 1 \n", + "8 1 \n", + "9 1 \n", + "10 1 \n", + "11 1 \n", + "12 1 \n", + "\n", + "[13 rows x 101 columns]" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "\n", - "feature_matrix = pd.read_csv('data/feature_matrix.csv')" + "feature_matrix = pd.read_csv('data/feature_matrix.csv')\n", + "feature_matrix" ] }, { "cell_type": "markdown", - "id": "02e2c90a", + "id": "2be92488", "metadata": {}, "source": [ "## 2) Preparing Model Inputs\n", "\n", - "Prepare the data for modeling. Depending on the data, you might need to: normalize the data, impute missing values, create one-hot encodings for categorical values, etc.\n", + "Prepare the data for modeling. 
Depending on the data, you might need to: normalize the data, impute missing values, etc.\n", "\n", "In this part of the notebook, we do the following:\n", "* create `X` and `y` variables from the feature matrix\n", "* impute missing values using a SimpleImpute\n", - "* split the data into training and testing" + "* pass the data into our `Zephyr` instance and split the data into training and testing" ] }, { "cell_type": "code", "execution_count": 2, - "id": "20da6581", + "id": "b3be626a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/raymondpan/zephyr/Zephyr-repo/venv/lib/python3.8/site-packages/sklearn/impute/_base.py:555: UserWarning: Skipping features without any observed values: [ 1 2 6 7 9 10 15 16 17 18]. At least one non-missing value is needed for imputation with strategy='mean'.\n", + " warnings.warn(\n", + "Performing set_feature_matrix. You are skipping the following steps:\n", + "0. generate_entityset or set_entityset\n", + "1. 
generate_label_times or set_label_times\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[True, False, True, False, True, False, False, False, False, True, False, False, False]\n" + ] + } + ], "source": [ "from sklearn.impute import SimpleImputer\n", - "from sklearn.model_selection import train_test_split\n", + "from zephyr_ml import Zephyr\n", "\n", "# pop the target labels\n", "y = list(feature_matrix.pop('label'))\n", + "print(y)\n", "X = feature_matrix.values\n", "\n", + "\n", "# impute missing values\n", "imputer = SimpleImputer()\n", - "X = imputer.fit_transform(X)\n", + "X = pd.DataFrame(imputer.fit_transform(X))\n", "\n", - "# create train and test splits\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)" + "zephyr = Zephyr()\n", + "zephyr.set_feature_matrix(X, labels = y)\n", + "X_train, X_test, y_train, y_test = zephyr.generate_train_test_split(test_size=0.2, random_state=33)" ] }, { "cell_type": "markdown", - "id": "32afe1aa", + "id": "3c8b00e2", "metadata": {}, "source": [ "## 3) Train a Model\n", "\n", - "We train a model using the `Zephyr` interface where you can train, infer, and evaluate a pipeline.\n", + "We train a model using `Zephyr`'s `fit_pipeline` method.\n", "In this notebook, we use an `xgb_classifier` pipeline which consists of two primitives:\n", "\n", "```\n", @@ -80,115 +594,163 @@ "\n", "An `XGBClassifier` primitive is an XGB model that returns the probability of each class, and `FindThreshold` primitive creates binary labels from the output of the XGB model by choosing a threshold that produces the best metric value (F1 Score by default)\n", "\n", - "To use a pipeline, we simply pass the name of the pipeline to `Zephyr`\n", + "To use a pipeline, we simply pass the name of the pipeline to our `Zephyr` instance.\n", "Optionally, you can change the default settings of the primitive by passing a hyperparameter dictionary. 
For example, we can change the number of trees in the classifier to be 50 instead of the default value (100)." ] }, { "cell_type": "code", "execution_count": 3, - "id": "b02986d9", + "id": "edffee03", "metadata": {}, "outputs": [], "source": [ - "from zephyr_ml import Zephyr\n", - "\n", "hyperparameters = {\n", " \"xgboost.XGBClassifier#1\": {\n", " \"n_estimators\": 50\n", " }\n", "}\n", "\n", - "zephyr = Zephyr('xgb_classifier', hyperparameters)" + "zephyr.fit_pipeline(pipeline = \"xgb_classifier\", pipeline_hyperparameters = hyperparameters)" ] }, { "cell_type": "markdown", - "id": "a1297396", + "id": "445afd22", "metadata": {}, "source": [ - "Then, training a pipeline can be done using the `fit` function and passing the training data" + "Now that the pipeline is trained, we can use it to predict the values of the test data using the `predict` function\n" ] }, { "cell_type": "code", "execution_count": 4, - "id": "442c5258", + "id": "78187756", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[1, 0, 1]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "zephyr.fit(X_train, y_train)" + "zephyr.predict()" ] }, { "cell_type": "markdown", - "id": "8d4bf2cc", + "id": "24cda971", "metadata": {}, "source": [ - "Now that the pipeline is trained, we can use it to predict the values of the test data using `predict` function" + "Lastly, we can evaluate the performance of the pipeline using the `evaluate` function\n" ] }, { "cell_type": "code", "execution_count": 5, - "id": "83814cd8", + "id": "cd097853", + "metadata": {}, + "outputs": [], + "source": [ + "res = zephyr.evaluate()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8df0f26c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[1, 0, 1]" + "{'sklearn.metrics.accuracy_score': 0.6666666666666666,\n", + " 'sklearn.metrics.precision_score': 0.5,\n", + " 'sklearn.metrics.f1_score': 
0.6666666666666666,\n", + " 'sklearn.metrics.recall_score': 1.0,\n", + " 'zephyr_ml.primitives.postprocessing.confusion_matrix': (array([[1, 1],\n", + " [0, 1]]),\n", + "
),\n", + " 'zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve': (0.5,\n", + "
)}" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "zephyr.predict(X_test)" + "res" ] }, { "cell_type": "markdown", - "id": "15f257eb", + "id": "e2657da3", "metadata": {}, "source": [ - "Lastly, we can evaluate the performance of the pipeline using `evaluate` function" + "The `confusion_matrix` and `roc_auc_score_and_curve` evaluation metrics return some `matplotlib.figure.Figure` objects, which we can display, as shown below." ] }, { "cell_type": "code", - "execution_count": 6, - "id": "191a123a", + "execution_count": 7, + "id": "b74c3618", "metadata": {}, "outputs": [ { "data": { + "image/png": "", "text/plain": [ - "accuracy 0.666667\n", - "f1 0.666667\n", - "recall 1.000000\n", - "precision 0.500000\n", - "dtype: float64" + "
" ] }, - "execution_count": 6, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%matplotlib inline\n", + "_, conf_matrix_fig = res[\"zephyr_ml.primitives.postprocessing.confusion_matrix\"]\n", + "conf_matrix_fig" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d59e86b1", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "zephyr.evaluate(X_test, y_test)" + "\n", + "_, roc_fig = res[\"zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve\"]\n", + "\n", + "roc_fig\n" ] } ], "metadata": { - "interpreter": { - "hash": "2d6fabd7bf745a21519616ebdce3b2479184204dadf576aa19f086ff78438203" - }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "venv", "language": "python", "name": "python3" }, @@ -202,7 +764,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.8.0" } }, "nbformat": 4, diff --git a/notebooks/visualization.ipynb b/notebooks/visualization.ipynb index 4a6091d..da2627b 100644 --- a/notebooks/visualization.ipynb +++ b/notebooks/visualization.ipynb @@ -7,7 +7,7 @@ "source": [ "# Visualization\n", "\n", - "In this tutorial, we will show you how to use Zephyr class to obtain intermediate results of the pipeline for visualization purposes. To know more about pipelines and Zephyr class please refer to the modeling notebook. We also used a demo feature matrix, to know how you can create features, please refer to feature_engineering notebook.\n", + "In this tutorial, we will show you how to use Zephyr class to obtain intermediate results of the pipeline for visualization purposes during the fitting stage. To know more about pipelines and Zephyr class please refer to the modeling notebook. We also used a demo feature matrix, to know how you can create features, please refer to feature_engineering notebook.\n", "\n", "## Load the Feature Matrix\n", "\n", @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 10, "id": "d6f954db", "metadata": {}, "outputs": [], @@ -33,18 +33,91 @@ "source": [ "## Prepare data\n", "\n", - "Prepare the data for training by creating a `y` variable to hold the labels, imputing missing values, and normlizing the data." 
+ "Prepare the data for training by creating a `y` variable to hold the labels, imputing missing values, and normalizing the data. We then initialize a `Zephyr` instance, set our data, and split it into training and testing." ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 11, "id": "23ec49dd", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/raymondpan/zephyr/Zephyr-repo/venv/lib/python3.8/site-packages/sklearn/impute/_base.py:555: UserWarning: Skipping features without any observed values: [ 1 2 6 7 9 10 15 16 17 18]. At least one non-missing value is needed for imputation with strategy='mean'.\n", + " warnings.warn(\n", + "Performing set_feature_matrix. You are skipping the following steps:\n", + "0. generate_entityset or set_entityset\n", + "1. generate_label_times or set_label_times\n" + ] + }, + { + "data": { + "text/plain": [ + "( 0 1 2 3 4 5 6 7 8 9 ... \\\n", + " 10 -0.288675 0.0 0.0 -0.463185 -0.463185 -0.463185 0.0 0.0 0.0 0.0 ... \n", + " 5 -0.288675 0.0 0.0 -0.521570 -0.521570 -0.521570 0.0 0.0 0.0 0.0 ... \n", + " 3 -0.288675 0.0 0.0 -1.174466 -1.174466 -1.174466 0.0 0.0 0.0 0.0 ... \n", + " 11 -0.288675 0.0 0.0 2.064680 2.064680 2.064680 0.0 0.0 0.0 0.0 ... \n", + " 1 -0.288675 0.0 0.0 0.298409 0.298409 0.298409 0.0 0.0 0.0 0.0 ... \n", + " 9 -0.288675 0.0 0.0 0.658556 0.658556 0.658556 0.0 0.0 0.0 0.0 ... \n", + " 2 -0.288675 0.0 0.0 -0.527579 -0.527579 -0.527579 0.0 0.0 0.0 0.0 ... \n", + " 8 -0.288675 0.0 0.0 -0.650653 -0.650653 -0.650653 0.0 0.0 0.0 0.0 ... \n", + " 7 -0.288675 0.0 0.0 0.197664 0.197664 0.197664 0.0 0.0 0.0 0.0 ... \n", + " 4 -0.288675 0.0 0.0 0.002832 0.002832 0.002832 0.0 0.0 0.0 0.0 ... 
\n", + " \n", + " 80 81 82 83 84 85 86 87 88 89 \n", + " 10 0.0 3.464102 -0.288675 -0.288675 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n", + " 5 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n", + " 3 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n", + " 11 0.0 -0.288675 3.464102 -0.288675 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n", + " 1 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n", + " 9 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n", + " 2 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n", + " 8 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n", + " 7 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n", + " 4 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n", + " \n", + " [10 rows x 90 columns],\n", + " 0 1 2 3 4 5 6 7 8 9 ... \\\n", + " 6 -0.288675 0.0 0.0 1.946791 1.946791 1.946791 0.0 0.0 0.0 0.0 ... \n", + " 12 -0.288675 0.0 0.0 -0.650711 -0.650711 -0.650711 0.0 0.0 0.0 0.0 ... \n", + " 0 3.464102 0.0 0.0 -1.180770 -1.180770 -1.180770 0.0 0.0 0.0 0.0 ... 
\n", + " \n", + " 80 81 82 83 84 85 86 87 88 89 \n", + " 6 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n", + " 12 0.0 -0.288675 -0.288675 3.464102 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n", + " 0 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n", + " \n", + " [3 rows x 90 columns],\n", + " 10 False\n", + " 5 False\n", + " 3 False\n", + " 11 False\n", + " 1 False\n", + " 9 True\n", + " 2 True\n", + " 8 False\n", + " 7 False\n", + " 4 True\n", + " Name: label, dtype: bool,\n", + " 6 False\n", + " 12 False\n", + " 0 True\n", + " Name: label, dtype: bool)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.impute import SimpleImputer\n", "from sklearn.preprocessing import StandardScaler\n", + "from zephyr_ml import Zephyr\n", "\n", "# pop the target labels\n", "y = list(feature_matrix.pop('label'))\n", @@ -56,7 +129,12 @@ "\n", "# normalize the data\n", "scaler = StandardScaler()\n", - "X = scaler.fit_transform(X)" + "X = pd.DataFrame(scaler.fit_transform(X))\n", + "\n", + "zephyr = Zephyr()\n", + "zephyr.set_feature_matrix(feature_matrix=X, labels = y)\n", + "zephyr.generate_train_test_split(test_size=0.2, random_state=33)\n", + "\n" ] }, { @@ -78,25 +156,13 @@ " \"variable\": \"zephyr_ml.primitives.postprocessing.FindThreshold#1.threshold\"\n", " },\n", " {\n", - " \"name\": \"predictions\",\n", - " \"variable\": \"zephyr_ml.primitives.postprocessing.FindThreshold#1.predictions\"\n", + " \"name\": \"scores\",\n", + " \"variable\": \"zephyr_ml.primitives.postprocessing.FindThreshold#1.scores\"\n", " }\n", "]\n", "```\n", "\n", - "Where we have a _name_ and a _variable_ defining the intermediate outputs." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "24511f3c", - "metadata": {}, - "outputs": [], - "source": [ - "from zephyr_ml import Zephyr\n", - "\n", - "zephyr = Zephyr('xgb_classifier')" + "Where we have a _name_ and a _variable_ defining the intermediate outputs. " ] }, { @@ -106,22 +172,22 @@ "source": [ "## Visualize\n", "\n", - "When training the pipeline using the `fit` function, you can specify `zephyr.fit(.., visual=True)` to indicate you are interested in obtaining the intermediate outputs." + "When training the pipeline using the `fit` function, you can specify `zephyr.fit_pipeline(.., visual=True)` to indicate you are interested in obtaining the intermediate outputs." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 12, "id": "683393df", "metadata": {}, "outputs": [], "source": [ - "output = zephyr.fit(X, y, visual=True)" + "output = zephyr.fit_pipeline(pipeline = \"xgb_classifier\", visual=True)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 13, "id": "13221b40", "metadata": {}, "outputs": [ @@ -131,7 +197,7 @@ "dict_keys(['threshold', 'scores'])" ] }, - "execution_count": 5, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -150,13 +216,13 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 14, "id": "c7a88d5b", "metadata": {}, "outputs": [ { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAjsAAAHJCAYAAABjZPjUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/P9b71AAAACXBIWXMAAA9hAAAPYQGoP6dpAABWmElEQVR4nO3de1hU1f4/8PdcGAaBESgRFEmyBDERU1AUFK2oU1qd8ldZaKKgWUbe0DyZec8UJJEwMdG0zM5Rs/T4PcdjmZUVillZplneULmoXEau41x+f8BsmbjIwMzeOL5fz+Mj7NmzWbOYy5vPWnttmclkMoGIiIjIQcmlbgARERGRPTHsEBERkUNj2CEiIiKHxrBDREREDo1hh4iIiBwaww4RERE5NIYdIiIicmgMO0REROTQGHaI2hix1/l0lHVFHeVxSI39SI6IYYeoEaNHj0ZgYKDFv379+mHMmDE4ePCgzX+eTqfDkiVLsHPnzlYfa/To0Rg9enST++Tn52PChAm4cOGCVcd+9dVXMWzYsNY0zyrDhg3Dq6++2ujtWq0WM2fORE5OjrCtOY/fnrZv347AwECcP3++1ccKDAzEqlWrmtzHVo/3X//6F956660b7nf06FGMHj0affr0QWRkJFasWAGdTtfkfYxGI9atW4eYmBiEhITg0UcfxWeffVZvv+3bt2P48OHo1asX7rvvPqSnp0Ov17f4MREBDDtETQoODsbHH3+Mjz/+GJs3b8bSpUvh5OSE8ePH4+TJkzb9WYWFhXj//fdFe2P/9ttvsX//fqvv9+KLLyI9Pd0OLWqZ3377DZ9++imMRqPUTbnprV69GiUlJU3uk5ubi7i4ODg7O+Ptt9/GuHHjsH79eixatKjJ+61cuRKpqakYOXIk1qxZg4EDByIpKQm7du0S9nn//fcxe/Zs3HnnnUhPT8fLL7+M7du3Y+rUqbZ4eHQLU0rdAKK2zM3NDaGhoRbbBg4ciIiICGzfvh2zZs2SpmES8vf3l7oJJKG1a9fC1dUVGRkZUKlUGDJkCNRqNRYuXIgXXngBnTp1qnefyspKbNy4EaNHj8aECRMAABEREfj111+xadMmDB8+HAaDARkZGRg0aBDS0tKE+wYHB2PEiBE4cOAABg0aJNrjJMfCyg6RlVxcXODs7AyZTGaxfe/evXjiiSfQq1cvDBo0CIsWLUJFRYVwe1VVFebNm4fBgwfjnnvuwUMPPYR169YBAM6fP4/77rsPADB79uwmh4mqq6vxzjvv4KGHHkKvXr0QExODzMzMBisb77zzDgYOHIg+ffrgxRdfRG5uLoCaoYLZs2cDAO677z5hmKiqqgopKSmIiYnBPffcg3vvvRdxcXH47bffhGP+dRhr2LBhSEtLw1tvvYWBAwciJCQE48ePx5kzZyzakpOTg9jYWPTu3Rvh4eGYNWsWioqKLPY5fvw44uLi0KdPHwwdOrTBYY66srOzMWbMGADAmDFjLIZyTCYT1q5di+joaISEhODpp5/Gzz//LNy+atUqPPDAA0hPT0d4eDgiIyNRWloKoGY455FHHsE999yD6OhorFq1CgaDQbhvUVERpk+fjkGDBqFXr1547LHHsGPHjnrt++mnn/DMM8+gV69eiI6OxnvvvWdx+9WrV/Hmm2/i/vvvR69evTB8+HBs3bq1ycd88eJFTJ48GX379sWgQYOwfv36Jvc3O378OCZPnowBAwagZ8+eiIqKwqJFi1BVVQWg5vd44cIFfPLJJ00OwX3zzTcYMmQIVCqVsO2hhx6C0WjEN9980+B9VCoVPvroI4wbN85iu5OTE6qrqwEAly9fRklJCaKjoy326d69Ozw9PfHll18263ESNYSVHaImmEwmYVjJZDKhpKQE77//PnQ6HZ588klhv507d2LGjBkYMWIEpkyZggsXLiA1NRV//PEH1q9fD5lMhiVLluCbb77BrFmzcPvtt+Orr77CsmXL4OHhgREjRiA9PR2TJ0/GpEmTEBM
T02h7XnjhBfz444+YPHkygoKCkJ2djbfffhu5ublYuHChsO/hw4dx5coVzJ07FwaDASkpKRgzZgx27tyJ6OhoTJo0CatXr0Z6ejoCAwMBQJj7Mm3aNPj7++Ps2bNYuXIlpk+fjn//+9/1Ap7Zxo0b0bdvX7z55psoLS3F4sWLMWvWLHz88ccAgEOHDiEuLg4DBgzA22+/jdLSUqxcuRJjxozB1q1boVarUVBQgNjYWHTt2hXLly9HWVkZkpOTceXKlUZ/Pz179sTcuXOxYMECzJ07F/3797d4/DqdDq+//jr0ej2WLl2KSZMmYf/+/VAqa976Ll68iP379yM1NRUlJSVo37491qxZg9TUVMTGxmL27Nn47bffsGrVKuTl5WHJkiUAgKSkJFy5cgXz58+Hm5sbPv30U8yaNQs+Pj4YMGCA0IZ58+YhMTERr7zyCv75z39i+fLl6NatG4YOHYqqqio8++yzuHLlChITE9G5c2fs3bsXr732Gi5fvowXXnih3uOtqKhAbGwslEolFi5cCLlcjrS0NJw7dw59+vRptJ8KCwvx3HPPITQ0FEuXLoVKpcJXX32F9evXw9vbGxMmTEB6ejomTJiA4OBgvPjii/D29q53nKqqKly4cAEBAQEW2728vODm5obTp083+PMVCgWCgoIA1DyHr1y5gu3bt+Pbb7/FggULAAAajQZKpRIXL160uG9paSm0Wq0Q1IlagmGHqAmHDh1Cz549622fNm0aunXrBqDmzTs5ORlRUVFITk4W9unatSvGjh2L/fv3Izo6GgcPHsSgQYPwyCOPAAD69++Pdu3a4bbbboNKpUKPHj0A1AwTBQcHN9ier776Ct9++y1WrFghHGfQoEFQq9VCeLj77rsB1HzAZGVlwcfHBwBw55134vHHH8eOHTsQGxsrDEf16NEDfn5+0Ol0KC8vx5w5c/Dwww8DAMLDw1FWVoalS5fi8uXL6NChQ4Pt0mg0yMjIgEKhAACcO3cOq1atQnFxMTw9PZGSkoKAgACsWbNG2Kd379545JFHsG3bNjz33HPYsGEDDAYDMjMz4eXlBQAICAjAU0891ejvx83NDXfddRcA4K677hK+BmqqCZmZmfDw8ABQM5F5zpw5+OOPP4QPXr1ej1mzZqFfv34AaiotGRkZePrppzFnzhwAQGRkJDw8PDBnzhzExcXh7rvvxsGDB/HSSy/h/vvvF/rJw8PDotoB1DxPRo0aBQAIDQ3F//73P3z//fcYOnQotm/fjt9//x1btmwRgkpUVBT0ej0yMjLwzDPPCG03++STT3Dx4kXs2rVLeKy9e/fGAw880GgfAcDvv/+OHj16YOXKlXBzcwNQMxx74MABZGdnCyFHpVLBy8ur3tCt2dWrV4V+/ytXV1eUlZU12Q4A+Pe//43p06cDAKKjo/Hoo48CqKmY/u1vf8MHH3yAu+66Cw888ACuXLmCxYsXQ6FQoLKy8obHJmoMww5RE3r27In58+cDqAk1Wq0WX331FVJTU1FRUYGpU6fi1KlTyM/Px8SJEy0mF4eFhcHNzQ0HDhxAdHQ0+vfvjy1btiA/Px9DhgzBkCFD8NJLL1nVnoMHD0KpVOKhhx6y2P7oo49i5cqVOHjwoBB27r33XiHoADWhpkuXLjh06BBiY2PrHVulUgnDagUFBTh9+jTOnDmDffv2AUCTZ9v06tVLCDEAhJ9bWVkJtVqNn376CePHj7eolHXp0gXdunXDgQMH8Nxzz+Hw4cMIDQ0Vgg5Q80He0ByQ5rjrrrsswoKfnx+A6x/YZuaQCQBHjhxBVVUVhg0bZvG7NA/bHThwAHfffTf69++PVatW4dixY4iKisKQIUManL9lDlFAzYf57bffDq1WC6Dmd9m5c+d6FZlHH30UW7duxU8//YQhQ4ZY3JaTkwN/f3+LUOfr69toODGLjIxEZGQkrl27hj/++ANnz57F77//jqKionqBqik3mgTeWOWvrpCQEHzwwQc4ceIEVq5
cifj4eGzatAkymQzz58+HSqXCnDlz8Nprr0GtViMhIQHl5eVwcXFpdjuJ/ophh6gJrq6u6NWrl8W2yMhIVFRU4L333sOYMWOEs1fmz58vBKO6CgsLAQCvvfYafHx88Nlnn2HhwoVYuHAh+vTpg3nz5gmVhhspLS2Fp6enRbAAIFRc6n6Q33777fXuf9tttwkftg35+uuvsWTJEpw6dQqurq4ICgpCu3btADS9/spfP4jk8prpgEajEVqtFkajEWvXrsXatWvr3dfZ2Vl4bOZA0tBjs5a53Q21qS5XV1fha/Pv0jyJ9q/Mv8vU1FS8++67+L//+z/897//hVwux8CBA7FgwQJ07txZ2L+hfjH3Y2lpaYOPzfx7a+j3ZP79/1WHDh1w+fLlBtsM1DzmFStW4MMPP0RFRQV8fX0REhIi9H1zmSs65eXl9W4rKyuDu7v7DY/h7+8Pf39/4Y+BWbNmIScnB2FhYXB1dcWSJUvw2muv4eLFi+jUqRNcXV2xdetW3HHHHVa1laguhh2iFrjnnnvwr3/9C+fPn4dGowFQM98lPDy83r7t27cHUFM5mTRpEiZNmoSLFy9i3759yMjIEObDNEf79u1RXFwMg8FgEXjMH8J1PwjNk23runTpUqNzO86dOycMzaxZswZdunSBTCbDhx9+iK+//rpZ7WuIq6srZDIZxo4dKwy91WUOBJ6eng1+YN/oVGhbMv8uk5OT0bVr13q3m4OIu7s7kpKSkJSUhFOnTuHzzz9HRkYG5s+fj8zMzGb9rPbt2+Ps2bP1tl+6dAkAGgw1np6eDd7nRn2UmZmJDRs2YP78+YiJiRFCyciRI5vVVjNXV1d07NixXhuuXLmC8vJyYWj3r4qKivDVV18hKioKt912m7DdPFxrfv7u27cPGo0Gffv2FSqUV65cQX5+fqNDu0TNwbOxiFrg559/hkKhQJcuXXDnnXfitttuw/nz59GrVy/hX8eOHZGSkoJjx46hqqoKDz74ILKysgAAnTp1wnPPPYdHHnlEmJD512pNQ8LDw6HX6/Gf//zHYrv5rKW+ffsK2w4fPmxR6fnpp59w4cIFYQKtudJh9ssvv6C6uhoTJkyAv7+/MCRhDjotXVnXzc0NwcHBOHXqlEX/3H333Vi1ahWys7MBAAMGDMCRI0dQUFAg3PePP/644cTU5vRbc/Xu3RtOTk4oKCiwaKtSqcSKFStw/vx5XLhwAUOGDBF+B3feeScSEhIwcODAepNrmxIWFoYLFy7gyJEjFts/++wzODk5ISQkpN59BgwYgPPnz+Po0aPCtqKiIvz4449N/qzDhw/jrrvuwpNPPikEnYKCAvz+++8Wla6/PicaMmjQIHz55ZcWw5r//e9/oVAoLCZn11VVVYVZs2bVO9PswIEDACBMkN+yZQuWLVtmsc/7778PhUKBoUOH3rBtRI1hZYeoCWVlZRYfJDqdDl988QW2bduGp59+WphfMnXqVMydO1d4U9ZqtcjIyEBBQQF69uwJtVqNnj17Ij09HU5OTggMDMTp06fxySef4MEHHwQA4UPou+++Q7du3dC7d+967Rk8eDD69++POXPmoKCgAEFBQTh48CDWrl2Lv//97xZzOYxGIyZMmIAXXngBxcXFSElJQffu3YUJoeYqxv/+9z8MHjwYPXv2hFKpxPLlyzFu3DjodDps375dOOW37mn01po2bRomTJiA6dOn49FHH4XBYEBWVhZ++uknvPjiiwCA559/Hlu3bsX48ePx8ssvw2AwIDU1FU5OTk0e29xvX375Jdq3b9/sIcGGeHp6Ij4+HitXrkRZWRn69++PgoICrFy5EjKZDEFBQXB3d4ePjw8WLVqEsrIy+Pv745dffsH+/fsxceLEZv+sJ554Aps3b8ZLL72ExMRE+Pn5Cc+tyZMnC7+fuh577DFs3LgRkydPxtSpU+Hm5obVq1ffcC5NSEgIMjIykJmZidDQUJw9exZr1qyBTqezmPir0Whw7NgxHDx
4ECEhIVCr1fWOFR8fj3//+9+Ij49HXFwczpw5gxUrVuCpp54S5lfpdDocO3YMPj4+8PHxQadOnfDkk0/inXfegVKpRHBwMHJycpCZmYmRI0cKz9vRo0dj/PjxWLJkCYYNG4bvvvsOa9asQUJCAtd3olZh2CFqwrFjx/D0008L3zs7O8Pf3x9Tp07F+PHjhe3/7//9P7i6uuK9997Dxx9/jHbt2uHee+9FcnIyunTpAgBYsGAB3n77bWRlZeHSpUu47bbbMHLkSLzyyisAaiogcXFx+Pjjj7F//34cOHCg3ge9TCbDmjVrkJaWhg0bNqCoqAh+fn6YNm0a4uLiLPa9//770alTJyQlJUGv12Po0KF47bXXhHka/fv3x8CBA5GSkoLvvvsOmZmZSElJQXp6OiZNmoT27dsjNDQUmzZtwujRo5GTkyP8BW6tyMhIrFu3Dunp6UhMTISTkxN69uyJ9evXC5NrPT098dFHH2Hx4sV49dVX4erqivj4eOzevbvJY999990YPny4MNxWd0XelpgyZQo6dOiAzZs347333kP79u0RERGBadOmCcEqPT0dK1aswMqVK1FcXAxfX19Mnjy50bk+DXFxccGmTZuQkpIihKs777wTixcvbnR4SaVS4f3338eSJUuwePFiyGQyPPXUU+jSpUuTp+hPnDgRxcXF2LhxI9555x34+vriscceE55PWq0WGo0G48aNw5IlSzB+/HisX7/eYoK1Wbdu3ZCVlYVly5YhMTERnp6eGDt2LBITE4V9CgsL8fTTT2Py5Ml4+eWXAdScht+lSxf885//xIULF+Dr64vExESL11FkZCRSUlKwevVqfPzxx+jUqRPmzJkj6aU/yDHITLzqGxERETkwztkhIiIih8awQ0RERA6NYYeIiIgcGsMOEREROTSGHSIiInJoDDtERETk0Bh2iIiIyKFxUUHULINvNNpnuSG5XGa3Y5Ml9rWNGY1Q5J4DABi6+AN1LiXAvhYP+1o87Gvx2KKv5XKZcFmbG2HYAWA0mlBUVP8qvq2lVMrh6ekKrbYCen3Ty7lT67Cv7aC8HB1qL+xYdDoPqL06OPtaPOxr8bCvxWOrvvbycoVC0byww2EsIiIicmgMO0REROTQGHaIiIjIoTHsEBERkUNj2CEiIiKHxrBDREREDo1hh4iIiBwaww4RERE5NIYdIiIicmgMO0REROTQGHaIiIjIoTHsEBERkUNj2CEiIiKHxqueE9ENrdj3J3RqFwCATCaDs7MS1dV6mEwmAIC7sxKjw/ygUTtJ2UwiogYx7BDRDe04mo9KlbrJfW53VeHpezuL1CIiouZj2CGiG3oo2BsdvD0BAHK5DC5qFSqrdDAaTfjudBGO5l1FUeU1iVtJRNQwhh0iuqEnevki6M6OAAClUg5PT1cUF5dDrzfimsGIo3lXUV6tl7iVREQN4wRlIrohjVrR6G2uqpq/mcp0BrGaQ0RkFYYdImqQ3mgUvnZzbrwI7OZcE4RY2SGitophh4gapK26Hl7cVE2EHVZ2iKiNY9ghogaV1Qk7crms0f1cWdkhojaOYYeIGqRtZqXGXNkpZ2WHiNoohh0iatDVquadSm6u7JSxskNEbRTDDhE16GpV88KLefIyww4RtVUMO0TUoKvNDC/mYSydwQSd3niDvYmIxMewQ0QNam5lp53q+ho85TpWd4io7WHYIaIGaaubN+FYIZehnVPtGVmcpExEbRDDDhE1qKy6+de64iRlImrLGHaIqEHaZg5jATz9nIjaNoYdImpQc+fsAKzsEFHbxrBDRA1q7tlYQJ1LRjRzng8RkZgYdoioQVprwo75khE8G4uI2iDJw47RaERaWhqioqIQGhqKhIQE5ObmNrjvqlWrEBgY2OC/2bNni9xyIsdlMpksro11I66s7BBRGyZ52MnIyMDmzZuxcOFCbNmyBUajEfHx8dDpdPX2HTduHL755huLf+PHj0e7du0wduxY8RtP5KCq9UboDKZ
m7+/Kyg4RtWGShh2dToesrCwkJiYiOjoaQUFBSE1NRX5+Pvbs2VNvf1dXV3To0EH4d+nSJWzcuBFz585FYGCgBI+AyDFZcyYWwDk7RNS2SRp2jh8/jvLyckRERAjbNBoNgoODcejQoRvef8GCBejXrx/+/ve/27OZRLcca+brADwbi4jaNqWUPzw/Px8A4Ovra7Hd29tbuK0x+/btw5EjR7Bjxw6btEWptH3uUyjkFv+T/bCvbav8mmWFRqmUA0rLPq7b1xoXJwBAxTWDXV5Ltyo+r8XDvhaPFH0tadiprKwEAKhUKovtzs7OKC0tbfK+69evx9ChQ9GjR49Wt0Mul8HT07XVx2mMRuNit2OTJfa1bRgvXrX43tPTFXC1fI3U7Wsfr5rbqgwmu76WblV8XouHfS0eMfta0rCjVqsB1MzdMX8NANXV1XBxabwTLl68iOzsbGRmZtqkHUajCVpthU2OVZdCIYdG4wKtthIGA68GbU/sa9vKu1Jm8X1xcTlQe85Ag32trxm+Kq3Q1exLNsHntXjY1+KxVV9rNC7Nrg5JGnbMw1eFhYXw9/cXthcWFjY54Xjv3r3w8vLCoEGDbNYWvd5+T26DwWjX49N17GvbKC63vC6WXm8E/tKvdfvapfYN52qVnv1vB3xei4d9LR4x+1rSwcmgoCC4ubkhOztb2KbVanHs2DGEhYU1er+cnByEh4dDqZQ0qxE5LOsnKPPaWETUdkmaFlQqFWJjY5GcnAwvLy907twZy5cvh4+PD2JiYmAwGFBUVAR3d3eLYa5jx47hySeflLDlRI7NmutiAYCb6vo6OyaTCTKZzB7NIiJqEcmnnScmJmLkyJGYM2cORo0aBYVCgXXr1sHJyQl5eXmIjIzE7t27Le5z6dIleHh4SNNgoluAturajXeqw622smM01ZyRRUTUlkg+DqRQKJCUlISkpKR6t/n5+eHEiRP1tv/0009iNI3olmXtooLOSjkUchkMRhPKqw3C5SOIiNoCySs7RNT2WHPFcwCQyWTCUFYZLxlBRG0Mww4R1WNtZQeoM0mZl4wgojaGYYeI6mlR2GFlh4jaKIYdIrJgMplw1coJysD1Scqs7BBRW8OwQ0QWynUGGEzW30+o7PBioETUxjDsEJEF8+RklcK6tXLMlZ0yLixIRG0Mww4RWTDP13FXW3f6uLCwICs7RNTGMOwQkQXzgoLWhh1XVnaIqI1i2CEiC+ZLRbhbuTAgKztE1FYx7BCRhdLasKNhZYeIHATDDhFZuNrSOTvOPBuLiNomhh0isqCtDStuzk5W3c+tdtirnJUdImpjGHaIyIJ5grJGrbDqfq6s7BBRG8WwQ0QWhGEsZ2snKLOyQ0RtE8MOEVnQtjDssLJDRG0Vww4RWdAKZ2O1bM5Otd4IvcFo83YREbUUww4RWRAmKFt76rnq+hwfnn5ORG0Jww4RWTDP2dGorJugrFTIoVbWvKVwKIuI2hKGHSISGIwm4UKg7i7WVXaA6xcD5SRlImpLGHaISFC3IqOxcoIycH0oi5UdImpLGHaISGCu6rg4yaGQW//2wMoOEbVFDDtEJCht4ZlYZqzsEFFbxLBDRIKrwurJ1g9hAazsEFHbxLBDRIKWLihoxsoOEbVFDDtEJLi+oGDrKjtl1azsEFHbwbBDRALzBOWWh52ayk65jpUdImo7GHaISNDSS0WYuarMlR2GHSJqOxh2iEigbfUEZXNlh8NYRNR2MOwQkaD1E5Rrz8ZiZYeI2hCGHSIS2GrODi8ESkRtCcMOEQlsdTYWKztE1JYw7BCRQBjGau0EZVZ2iKgNYdghIoF5gnL71k5QrtbDZDLZrF1ERK0hedgxGo1IS0tDVFQUQkNDkZCQgNzc3Eb3v3btGlJSUoT9Y2Nj8dtvv4nYYiLHpDcYUXnNCKD1E5QNJqBKb7RZ24iIWkPysJORkYHNmzdj4cKF2LJlC4xGI+Lj46HT6Rrcf968edi+fTuWLFmCbdu2wcvLCwk
JCbh69arILSdyLNo682zcWhh2XJzkkMtqvua8HSJqKyQNOzqdDllZWUhMTER0dDSCgoKQmpqK/Px87Nmzp97+ubm52LZtGxYvXoyoqCh069YNixYtgkqlwi+//CLBIyByHNrK66edK8yJxUoymazOwoKct0NEbUPL/nyzkePHj6O8vBwRERHCNo1Gg+DgYBw6dAjDhw+32P/AgQNwd3fH4MGDLfb/4osvRGsz0ZkrFfjxQqnUzbC5i9oqAIB7C+frmLk5K3C1Wo8yXjKCiNoIScNOfn4+AMDX19diu7e3t3BbXadPn0aXLl2wZ88eZGZmoqCgAMHBwXj11VfRrVu3VrVFqbR9kUuhkFv8T/YjVl+bTCZM2vozLpc1PMzqCLzaOdW8Huq8Jup+f6O+rhkCq0aV3miX19WthO8h4mFfi0eKvpY07FRWVgIAVCqVxXZnZ2eUltb/y7msrAxnz55FRkYGZs6cCY1Gg9WrV+PZZ5/F7t27cdttt7WoHXK5DJ6eri26b3NoNC52OzZZsndf5xZV4HKZDkq5DNGB3nb9WVJQyIHYAXfUvB7qvCw9PV0BV8vXSGN97eGqAi6Vw+SktOvr6lbC9xDxsK/FI2ZfSxp21Go1gJq5O+avAaC6uhouLvU7QalUoqysDKmpqUIlJzU1FUOGDMEnn3yC+Pj4FrXDaDRBq61o0X2bolDIodG4QKuthMHAM1PsSay+PnjyEgDgrg6uWP5oD7v9HKkVF5cD5eXwrPt9bTHrRn2tVtTM9ykoKq+5H7UY30PEw74Wj636WqNxaXZ1SNKwYx6+KiwshL+/v7C9sLAQgYGB9fb38fGBUqm0GLJSq9Xo0qULzp8/36q26O14mqzBYLTr8ek6e/f1b3k1Z/3dfbur4/9O6zw+vd5o8T3QeF+3c6pZa0dbec3x+0gkfA8RD/taPGL2taSDk0FBQXBzc0N2drawTavV4tixYwgLC6u3f1hYGPR6PY4ePSpsq6qqQm5uLu644w5R2ky3tpOXaioVd3u7SdyStuv6JSN4NhYRtQ2SVnZUKhViY2ORnJwMLy8vdO7cGcuXL4ePjw9iYmJgMBhQVFQEd3d3qNVq9OvXDwMHDsSsWbOwYMECeHh4IC0tDQqFAo899piUD4VuEb9fKgMAdO/AuSiNcVWZLwbKs7GIqG2QfNp5YmIiRo4ciTlz5mDUqFFQKBRYt24dnJyckJeXh8jISOzevVvYf9WqVQgPD8fkyZMxcuRIlJWVYePGjfDy8pLwUdCtQFt1DXnaagBA9w6s7DTGXNkp46KCRNRGSFrZAQCFQoGkpCQkJSXVu83Pzw8nTpyw2Obm5oZ58+Zh3rx5IrWQqIZ5CKuTxrnVa9E4MvOiguW8GCgRtRGSV3aIbha/m+frsKrTJPPFQFnZIaK2gmGHqJl+L6ydr+PN+TpNYWWHiNoahh2iZjIPY3G+TtNY2SGitoZhh6gZrhmMOHXFfNo5KztNceOFQImojeEsS6JmOFNUgWsGE1xVCnTSqG98h1uYa21lp6TyGpbuPSlsVysVeLZvZ3i7O0vVNCK6RTHsEDXD74W1Q1jebpDJZBK3pm3zcHGCUi6D3mjCtp/yLG5TKmSYHBUgUcuI6FbFsEPUDFxMsPncnJVY8fee+KX20hoAcDi3BIdzS1HOeTxEJAGGHaJm+J2Tk60S0dULEV2vL/QpA3A4txQGk0m6RhHRLYsTlIluwGQy4SRPO28Vhbxm6E9vYNghIvEx7BDdQGGZDqVVeijkMgTcxrDTEsrasMPKDhFJgWGH6AbMiwl29XKBs5IvmZZgZYeIpMR3bqIbuD45mfN1Wkopr3mrYWWHiKTAsEN0A8LKyd4MOy2lVLCyQ0TSYdghugHhmlg87bzFlLVrE+mNDDtEJD6GHaImlOv0yC2pAsBhrNYwV3YMDDtEJAGGHaIm/FE7hOXtpoJHOyeJW3PzUgiVHaPELSGiWxHDDlETfud8HZtgZYeIpMSwQ9SEs0U
VAIAAr3YSt+TmpuCcHSKSEMMOURPytdUAAN/2vNJ5awhnYzHsEJEEGHaImpCnrZmc7KtxlrglNzdhUUGGHSKSAMMOURPyr9ZUdnw0rOy0hnC5CIYdIpIAww5RI8p1emir9ABY2Wkt8wrKrOwQkRQYdogakVc7X0ejVsJVpZS4NTc3DmMRkZQYdogakV87X8fHnVWd1uIwFhFJiWGHqBHmyo4v5+u0Gis7RCQlhh2iRgiVHc7XaTVWdohISgw7RI1gZcd2rld2eLkIIhIfww5RI/K5xo7NsLJDRFJi2CFqhLmywzV2Wk/JOTtEJCGGHaIG6PRGXC7XAWBlxxZY2SEiKTHsEDWgoHblZGelHB4uThK35uZnXlTQaAKMJgYeIhIXww5RA+peE0tWe8VuajnzBGWA1R0iEh/DDlED8jlfx6bMVz0HOG+HiMQnedgxGo1IS0tDVFQUQkNDkZCQgNzc3Eb3/+yzzxAYGFjv3/nz50VsNTk6Xu3cthR1qmN6A8MOEYlL8gv+ZGRkYPPmzVi6dCl8fHywfPlyxMfHY+fOnVCpVPX2P3HiBMLDw7FixQqL7V5eXmI1mW4BeVe5xo4t1a3scBiLiMQmaWVHp9MhKysLiYmJiI6ORlBQEFJTU5Gfn489e/Y0eJ/ff/8dgYGB6NChg8U/hUIhcuvJkXH1ZNuSy2Qwxx0uLEhEYpO0snP8+HGUl5cjIiJC2KbRaBAcHIxDhw5h+PDh9e5z4sQJDBs2zOZtUSptn/sUCrnF/47KZDLho8MXcPpKhWRtUMhleKq/P+72dLHJ8cxzdvw8Xezy3Lgp1HncSqVc+L6lz2ulQoZrBhMgl926fWqlW+U9pC1gX4tHir6WNOzk5+cDAHx9fS22e3t7C7fVVVpaioKCAuTk5GDz5s0oLi5GSEgIkpKSEBAQ0OJ2yOUyeHq6tvj+N6LR2OYDuK3KPnUFyz//Q+pm4NC5EnwxI7rVxzEYTcKp50FdvODp4di/v0bVGUX29HQFXC1fI9Y+r5VyOa4ZDHB1c4GnZztbtPCW4ejvIW0J+1o8Yva1pGGnsrISAOrNzXF2dkZpaWm9/U+ePAmgppLw5ptvoqqqCqtXr8azzz6LnTt34vbbb29RO4xGE7Ra21clFAo5NBoXaLWVMBgct3T/r4PnAAChnTUYECDB3CkTsPbbszh1uRwncovh7VZ/rpc1CrRV0BtNUMplUBkMKC4ut1FDbzLl5fCs/bK4uByoWWOxxc9r8x9xRSXlcJdz3k5z3CrvIW0B+1o8tuprjcal2dUhScOOWl0z+VOn0wlfA0B1dTVcXOonvn79+uG7776Dp6ensPZJeno6oqOjsX37dkyYMKHFbdHr7ffkNhiMdj2+lPQGI/aeuAQAGD/AHwO6SjNR/NvTRfj5ohbfnbqCR4I7tupY54trQri3mwomo+nWPVW6znNWrzdafA9Y/7yuWVjQgCqdwWFfD/biyO8hbQ37Wjxi9rWkg5Pm4avCwkKL7YWFhejYseEPLC8vL4tF3lxcXODn54eCggL7NZQadSi3BCWV1+Dp4oR+/p43voOdhN9R87Ozzxa3+li8JpZ9KHjJCCKSiKRhJygoCG5ubsjOzha2abVaHDt2DGFhYfX2//jjj9G/f39UVFwfciorK8OZM2dw1113idJmsvTf4zVVnfu63y5c/0gK/bvWhp0zxTC18nIEXGPHPngxUCKSiqRhR6VSITY2FsnJyfj8889x/PhxTJ06FT4+PoiJiYHBYMClS5dQVVXz4TN48GAYjUbMnDkTJ0+exNGjR/Hyyy/Dy8sLTzzxhJQP5ZZUrTfiy5OXAQAPBnlL2paQThq4OClQVHENf15u3fwrrp5sHwqGHSKSiOTn2CUmJmLkyJGYM2cORo0aBYVCgXXr1sHJyQl5eXmIjIzE7t27AdQMe23YsAEVFRUYNWoUxo4dC3d3d2zcuBHOzvwrXGzfni5Cuc4AbzcVQjprJG2LSilHWO3k6IPnWje
UxcqOffDK50QkFclXUFYoFEhKSkJSUlK92/z8/HDixAmLbT179kRWVpZYzaMm7KkdwooJ8oa8DVwsM/Ku2/DV75dw6FwJnu3r1+LjsLJjH9crO5z8SUTikryyQzenCp0BX5+6AgCICeogcWtqDLqrZumBH3JLoW/h6Ywmk6lOZYdhx5ZY2SEiqTDsUIt89ecVVOuN6OKhRpC3m9TNAQD08NHAw8UJFdcM+DX/aouOUVqpR1XtqZAd3TmMZUucoExEUmHYoRbZc7xmuYCYIG+LpQCkJJfLEH6HBwDg4NmSFh0j72pNVec2VxWceUkDm2Jlh4ikwndzslpp5TV8d6ZmEnBbGcIy61+73k5LJymb19jh5GTbY2WHiKQi+QRlR2UwmvDliUuoMl1CRYXOof6a/b2wDHqjCXd3cMWdt9nvmmItYV5v52jeVVToDGinUlh1/7zS2qudu3O+jq1xUUEikgrDjp3k5JYgacevUjfDrh4IbFtVHQDo7OGCTu3VuFhahSPnSzHoTusuX8HTzu2n5nIRrOwQkfgYduykp487nr63E65eM0KnM7R6Vd+2pr3aCf8vtJPUzWhQuL8HdhzNx8FzxVaHHfNp577tWdmxNeHUc4NjvRaIqO1j2LETN2clXn2gOzw9XVFcXM4Ly4ko/A5P7Diaj0PnSqy+Lys79iPM2XGw4E9EbR/DDjmcfl3aAwBOXirHjp/zoLLirKoL5jk7XGPH5ljZISKpMOyQw/Fsp0L3Dq74/VI5Fv/vZIuOwcqO7QmnnrOyQ0QiY9ghh5Q45E58dPhCi878Cb/DA64qvjRs7Xplh0O6RCQuvqOTQ+p/h6ew5g61DVxUkIikwkUFiUgUCi4qSEQSYdghIlGwskNEUmHYISJRKBVcVJCIpMGwQ0SiUMg4jEVE0mDYISJRKBUcxiIiaTDsEJEoOEGZiKTCsENEouAEZSKSCsMOEYlCuDaWkYsKEpG4GHaISBSs7BCRVFq8gnJpaSlycnJQWFiIBx98ECUlJQgICICs9owLIqK6OGeHiKTSorCzevVqrFmzBlVVVZDJZAgJCcHbb7+N4uJiZGVlQaPR2LqdRHSTY2WHiKRi9TDWBx98gFWrViEuLg7//Oc/Yaq9gnFsbCxyc3OxcuVKmzeSiG5+SlZ2iEgiVoedTZs2YcKECXjllVfQs2dPYfuQIUMwZcoUfPHFFzZtIBE5Bg5jEZFUrA47Fy9eRHh4eIO33Xnnnbh8+XKrG0VEjkcp5+UiiEgaVocdX19fHDlypMHbfvnlF/j6+ra6UUTkeFjZISKpWD1BeeTIkVi1ahXUajWio6MBABUVFfjvf/+LNWvWIC4uztZtJCIHwAnKRCQVq8NOQkICzp8/j+TkZCQnJwMAxowZAwAYMWIEJk6caNsWEpFDECo7BoYdIhJXi049X7BgAcaNG4fvv/8eJSUlcHd3R1hYGLp3727r9hGRgxAqOyaGHSISl9VhZ8SIEZg+fTqGDh2Krl272qFJROSIrld2eLkIIhKX1ROU8/Ly4OLiYo+2EJEDY2WHiKRiddgZMWIENmzYgMLCQps0wGg0Ii0tDVFRUQgNDUVCQgJyc3Obdd/PPvsMgYGBOH/+vE3aQkT2o1Rwzg4RScPqYawzZ84gJycHQ4YMgYeHB9q1a2dxu0wmw969e5t9vIyMDGzevBlLly6Fj48Pli9fjvj4eOzcuRMqlarR+124cAELFiywtvlEJBGFjJUdIpKG1WHH19cXI0aMsMkP1+l0yMrKwowZM4TT2FNTUxEVFYU9e/Zg+PDhDd7PaDQiKSkJPXv2xPfff2+TthCRfSkVtYsKsrJDRCKzOuy8+eabNvvhx48fR3l5OSIiIoRtGo0GwcHBOHToUKNh591338W1a9cwefJkhh2im4RSxkUFiUgaLTr1HAC++uorHDx4EFqtFp6enujXrx+ioqKsOkZ+fj4A1Ft12dvbW7jtr37++WdkZWVh69atKCgoaFnjG6BUWj1
96YYUtX/Jmv8n+2Ff20Gd14RSKRe+b2lfO6tq9jeYTHZ5vTkiPq/Fw74WjxR9bXXY0el0ePHFF/HNN99AoVDA09MTxcXFyMzMxIABA7BmzZom59rUVVlZCQD19nd2dkZpaWm9/SsqKjBjxgzMmDEDXbt2tVnYkctl8PR0tcmxGqLR8Ow1sbCvbajOy9LT0xVwtXyNWNvXntdqKjoGo8murzdHxOe1eNjX4hGzr60OO6tWrcLhw4exbNkyPPLII1AoFNDr9di1axfmz5+P1atX45VXXmnWsdRqNYCaAGX+GgCqq6sbPL190aJFCAgIwDPPPGNts5tkNJqg1VbY9JhATWrVaFyg1VbCwLVF7Ip9bQfl5fCs/bK4uBzQ1Xzd0r6uKKsCUDOMVVxcbuPGOiY+r8XDvhaPrfpao3FpdnXI6rCza9cuTJ48GY8++uj1gyiVePzxx3HlyhV89NFHzQ475uGrwsJC+Pv7C9sLCwsRGBhYb/9t27ZBpVKhT58+AACDwQAAGD58OF544QW88MIL1j4cgV5vvye3wWC06/HpOva1DdXpR73eaPE90IK+rj0LS8/fkdX4vBYP+1o8Yva11WGnqKgIwcHBDd4WHBxs1dBSUFAQ3NzckJ2dLYQdrVaLY8eOITY2tt7+e/bssfj+p59+QlJSEjIzM3mpCqI2jhcCJSKpWB12/P39cfjwYYszqMwOHTpUb7JxU1QqFWJjY5GcnAwvLy907twZy5cvh4+PD2JiYmAwGFBUVAR3d3eo1WrccccdFvc3T2Lu1KkTPDw8rH0oRCSi6ysoAyaTCbLas7OIiOzN6qnQzzzzDNasWYP33nsPeXl5uHbtGvLy8rB27VqsXbsWTz75pFXHS0xMxMiRIzFnzhyMGjUKCoUC69atg5OTE/Ly8hAZGYndu3db20wiamPM18YCWN0hInHJTCbrljM1Go14/fXXsW3bNou/zEwmE/7+979jyZIlN91fbAaDEUVFtp8wqVTK4enpiuLico4B2xn72g7Ky9EhoKZSe+l0nnA2Vkv7ukJnwJBVBwAAXycOgtpJYfs2Oxg+r8XDvhaPrfray8vVfhOU5XI5Fi9ejHHjxuHgwYMoLS1F+/btER4ejm7dulndWCK6NdSt7HBhQSISU4sWFTx8+DC+//57vPTSSwCAY8eOIS0tDQkJCbjnnnts2kAicgxKhh0ikojVc3b279+P559/Ht98842wTSaT4cyZM3j22WeRk5Nj0wYSkWOok3UYdohIVFaHnVWrVuGRRx7B5s2bhW09evTAp59+ir/97W9YsWKFTRtIRI5BJpPx9HMikoTVYefPP//E448/3uAk5McffxzHjx+3ScOIyPGY5+3ojZwASkTisTrsuLu74/Tp0w3elpubi3bt2rW6UUTkmK5XdiRuCBHdUqwOOw888ABWrlyJffv2WWz/+uuvsXLlSjzwwAM2axwRORYlKztEJAGrz8aaOnUqjh49ikmTJsHJyQkeHh4oKSmBXq9H7969MX36dHu0k4gcgIJzdohIAlaHHTc3N2zZsgX79+/HDz/8gJKSEri7u6Nfv36Ijo6GXG51sYiIbhHXKzsMO0QknhatsyOXyzF06FAMHToUAKDX61FWVsagQ0RNEsKOgWGHiMRjdTrR6/VIT0/Hzp07AQDZ2dkYNGgQIiIi8Pzzz6O0tNTmjSQix6CsXdqdw1hEJCarw05aWhpWr14NrVYLAFi0aBE8PDwwe/ZsnDt3DikpKTZvJBE5BoWMw1hEJD6rw86///1vTJs2Dc899xz+/PNPnDx5EpMmTcKYMWMwdepUfPHFF/ZoJxE5AKWCE5SJSHxWh53CwkL07t0bAPDll19CLpdj8ODBAAAfHx9cvXrVti0kIofByg4RScHqsOPt7Y3z588DAL744gv06NEDXl5eAIAjR47Ax8fHti0kIodhruww7BCRmKwOO8OHD8ebb76J8ePH4/Dhw3jyyScBAIsXL8aqVaswYsQImze
SiByDubJj4KKCRCQiq089nzJlCtq1a4dDhw5h+vTpePbZZwEAR48exbhx4/Diiy/avJFE5BhY2SEiKVgddmQyGSZOnIiJEydabN+yZYvNGkVEjomLChKRFLgKIBGJRsGwQ0QSYNghItEo5VxUkIjEx7BDRKJhZYeIpMCwQ0SiUfKq50QkAYYdIhINKztEJAWGHSISDSs7RCQFhh0iEs31yg4XFSQi8TRrnZ1hw4ZBVrvy6Y3IZDLs3bu3VY0iIsfEyg4RSaFZYScuLg5vvfUW3NzcMHToUHu3iYgcFBcVJCIpNCvsjB49Gl5eXpg+fTruu+8+3H///fZuFxE5IGEYy8CwQ0TiafacnUceeQRPPfUU3nzzTRgMBnu2iYgclLCooIlhh4jEY9W1saZMmQK1Wo1z584hICDAXm0iIgelrP3zipUdIhJTs8JOdXU1nJ2d4eXlhX/84x/2bhMROShWdohICs0axho2bBiOHDkCAEhPT0dBQYFdG0VEjolzdohICs0KO1evXkVhYSEA4J133mHYIaIWEc7GYmWHiETUrGGsXr16Yfr06XjrrbdgMpnw0ksvQaVSNbivtevsGI1GpKen41//+heuXr2KsLAwzJ07F126dGlw/19//RXLli3Dzz//DGdnZ8TExCApKQnu7u7N/plEJI3rlR0uKkhE4mlW2FmxYgU2bNiAkpIS7NixA8HBwfDy8rJJAzIyMrB582YsXboUPj4+WL58OeLj47Fz5856gery5cuIi4vD/fffj3nz5qG4uBivv/46Xn31Vbzzzjs2aQ8R2c/1RQUlbggR3VKaFXY6duyIWbNmAQCys7MxdepUBAUFtfqH63Q6ZGVlYcaMGYiOjgYApKamIioqCnv27MHw4cMt9r9w4QIiIyOxYMECKJVKBAQE4KmnnkJqamqr20JE9qdU8HIRRCQ+q6+N9cUXX9gk6ADA8ePHUV5ejoiICGGbRqNBcHAwDh06VG//3r17Y8WKFVAqazLan3/+iU8//RSDBg2ySXuIyL4UMl4ugojEZ9U6O7aWn58PAPD19bXY7u3tLdzWmAcffBBnzpxB586dkZ6e3uq2KJW2vyaqQiG3+J/sh31tB3VeE0qlXPi+NX2tclIAAAwm+7zmHA2f1+JhX4tHir6WNOxUVlYCQL25Oc7OzigtLW3yvsnJyaisrMTy5csxZswYfPrpp3B1dW1RO+RyGTw9W3bf5tBoXOx2bLLEvrahOi9LT09X4C+vr5b0dXt3NQBAprDva87R8HktHva1eMTsa0nDjlpd88an0+mEr4GaRQxdXJruhF69egGoWfdnyJAh+N///ofHH3+8Re0wGk3QaitadN+mKBRyaDQu0GorYeCMTLtiX9tBeTk8a78sLi4HdDVft6avqytrDlJVra85JjWJz2vxsK/FY6u+1mhcml0dkjTsmIevCgsL4e/vL2wvLCxEYGBgvf1PnTqFc+fOCZOZgZrJ0x4eHq1e+0evt9+T22Aw2vX4dB372obq9KNeb7T4HmhdX18zmPh7sgKf1+JhX4tHzL6WdHAyKCgIbm5uyM7OFrZptVocO3YMYWFh9fb/9ttvkZiYCK1WK2w7d+4ciouL0a1bN1HaTEQtd/3Uc05QJiLxSBp2VCoVYmNjkZycjM8//xzHjx/H1KlT4ePjg5iYGBgMBly6dAlVVVUAgOHDh8PDwwNJSUk4efIkcnJykJiYiJCQEAwdOlTKh0JEzSAsKsiwQ0QiknzaeWJiIkaOHIk5c+Zg1KhRUCgUWLduHZycnJCXl4fIyEjs3r0bAODh4YH3338fADBq1Ci89NJLCA4Oxrp166BQKKR8GETUDKzsEJEUJJ2zAwAKhQJJSUlISkqqd5ufnx9OnDhhsS0gIABr1qwRq3lEZEPCtbG4qCARiUjyyg4R3TpY2SEiKTDsEJFolJyzQ0QSYNghItFwgjIRSYFhh4hEo5TXvOVwGIuIxMSwQ0SiYWWHiKTAsENEouEEZSK
SAsMOEYmGlR0ikgLDDhGJhmdjEZEUGHaISDRKxfVhLJOJgYeIxMGwQ0SiUchkwtcGZh0iEgnDDhGJxlzZAQC9gZeMICJxMOwQkWjqVnY4b4eIxMKwQ0SiUSquv+Xw9HMiEgvDDhGJps4oFis7RCQahh0iEo1MJhPW2mFlh4jEwrBDRKLiWjtEJDaGHSISFS8ZQURiY9ghIlHxkhFEJDaGHSISFSs7RCQ2hh0iEtX1OTtcVJCIxMGwQ0Si4jAWEYmNYYeIRMVhLCISG8MOEYmKlR0iEhvDDhGJSimvedth2CEisTDsEJGoWNkhIrEx7BCRqDhnh4jExrBDRKJiZYeIxMawQ0SiYmWHiMTGsENEouKigkQkNoYdIhKVMIxlYGWHiMTBsENEouIwFhGJjWGHiETFCcpEJDaGHSISlXlRQVZ2iEgskocdo9GItLQ0REVFITQ0FAkJCcjNzW10/5MnT2LChAno378/IiIikJiYiIsXL4rYYiJqDUXtuw4rO0QkFsnDTkZGBjZv3oyFCxdiy5YtMBqNiI+Ph06nq7dvcXEx4uLioFarsWnTJqxduxZFRUWIj49HdXW1BK0nImspFazsEJG4JA07Op0OWVlZSExMRHR0NIKCgpCamor8/Hzs2bOn3v579+5FRUUFli1bhu7du+Oee+7B8uXL8eeff+KHH36Q4BEQkbWUMs7ZISJxSRp2jh8/jvLyckRERAjbNBoNgoODcejQoXr7R0REICMjA2q1Wtgmrx3/12q19m8wEbWaUsGzsYhIXEopf3h+fj4AwNfX12K7t7e3cFtdfn5+8PPzs9iWmZkJtVqNsLCwVrVFqbR97lPUluvN/5P9sK/toM5rQqmUC9+3tq+dao9jhMkurztHwue1eNjX4pGiryUNO5WVlQAAlUplsd3Z2RmlpaU3vP+mTZvwwQcfYM6cOfDy8mpxO+RyGTw9XVt8/xvRaFzsdmyyxL62oTovS09PV8DV8jXS0r52dak5sFLlZNfXnSPh81o87GvxiNnXkoYd83CUTqezGJqqrq6Gi0vjnWAymbBy5UqsXr0akyZNwujRo1vVDqPRBK22olXHaIhCIYdG4wKtthIGA5fGtyf2tR2Ul8Oz9svi4nKg9pyB1va14ZoeAFBWUV1zXGoUn9fiYV+Lx1Z9rdG4NLs6JGnYMQ9fFRYWwt/fX9heWFiIwMDABu9z7do1zJ49G7t27cLs2bMxduxYm7RFr7ffk9tgMNr1+HQd+9qG6vSjXm+0+B5oeV/Lav+/pufvqrn4vBYP+1o8Yva1pIOTQUFBcHNzQ3Z2trBNq9Xi2LFjjc7BmTlzJv7zn/8gJSXFZkGHiMSj5ArKRCQySSs7KpUKsbGxSE5OhpeXFzp37ozly5fDx8cHMTExMBgMKCoqgru7O9RqNbZv347du3dj5syZCA8Px6VLl4RjmfchoraNl4sgIrFJPu08MTERI0eOxJw5czBq1CgoFAqsW7cOTk5OyMvLQ2RkJHbv3g0A2LVrFwBg2bJliIyMtPhn3oeI2jZeLoKIxCZpZQcAFAoFkpKSkJSUVO82Pz8/nDhxQvg+KytLzKYRkR2wskNEYpO8skNEtxbznB1WdohILAw7RCQqTlAmIrEx7BCRqBSs7BCRyBh2iEhU1ys7XMuEiMTBsENEohImKBtY2SEicTDsEJGohAnKJoYdIhIHww4RiUrJyg4RiYxhh4hEpTAvKsjKDhGJhGGHiETFyg4RiY1hh4hExTk7RCQ2hh0iEpVSwcoOEYmLYYeIRKWQsbJDROJi2CEiUV2v7HBRQSISB8MOEYmKVz0nIrEx7BCRqHghUCISG8MOEYmKFwIlIrEx7BCRqJS1iwqyskNEYmHYISJRKWrfdRh2iEgsDDtEJCpzZYfDWEQkFoYdIhJV3QnKJq61Q0QiYNghIlGZJygDAIs7RCQGhh0iEpWyTtjhvB0iEgPDDhGJyjLscBVlIrI/hh0iElXdsMNJykQkBoYdIhK
VgsNYRCQyhh0iEpVMJkPttUBZ2SEiUTDsEJHoeDFQIhITww4RiY4LCxKRmBh2iEh0QmXHwLBDRPbHsENEohNWUeYKykQkAoYdIhKdsnaGsoGVHSISAcMOEYlOITNPUOaigkRkfww7RCQ6c2WHZ2MRkRgkDztGoxFpaWmIiopCaGgoEhISkJub26z7xcfHY9WqVSK0kohs6Xplh2GHiOxP8rCTkZGBzZs3Y+HChdiyZYsQYnQ6XaP30el0+Mc//oGvv/5axJYSka0Ic3YYdohIBJKGHZ1Oh6ysLCQmJiI6OhpBQUFITU1Ffn4+9uzZ0+B9fvjhBzzxxBPIycmBRqMRucVEZAus7BCRmCQNO8ePH0d5eTkiIiKEbRqNBsHBwTh06FCD99m/fz+ioqKwY8cOuLu7i9VUIrIhpYKLChKReJRS/vD8/HwAgK+vr8V2b29v4ba/mjp1ql3aolTaPvcpat/Qzf+T/bCv7aDOa0KplAvf26KvzevsmGT2ee05Cj6vxcO+Fo8UfS1p2KmsrAQAqFQqi+3Ozs4oLS0VrR1yuQyenq52O75G42K3Y5Ml9rUN1XlZenq6Aq6Wr5HW9LXaueatR+3ibNfXnqPg81o87GvxiNnXkoYdtVoNoGbujvlrAKiuroaLi3idYDSaoNVW2Py4CoUcGo0LtNpKGAxcT8Se2Nd2UF4Oz9ovi4vLgdpzBmzS17Xr65Reraw5NjWIz2vxsK/FY6u+1mhcml0dkjTsmIevCgsL4e/vL2wvLCxEYGCgqG3R6+335DYYjHY9Pl3HvrahOv2o1xstvgda19fy2gnK1dcM/H01A5/X4mFfi0fMvpZ0cDIoKAhubm7Izs4Wtmm1Whw7dgxhYWEStoyI7Mk8Z4cTlIlIDJJWdlQqFWJjY5GcnAwvLy907twZy5cvh4+PD2JiYmAwGFBUVAR3d3eLYS4iurkJFwJl2CEiEUg+7TwxMREjR47EnDlzMGrUKCgUCqxbtw5OTk7Iy8tDZGQkdu/eLXUziciGWNkhIjFJWtkBAIVCgaSkJCQlJdW7zc/PDydOnGj0vl988YU9m0ZEdqJgZYeIRCR5ZYeIbj2s7BCRmBh2iEh0rOwQkZgYdohIdEp5zVsPww4RiYFhh4hEx7OxiEhMDDtEJDoF5+wQkYgYdohIdEJlx8CwQ0T2x7BDRKK7PkGZy/ITkf0x7BCR6HjqORGJiWGHiETHU8+JSEwMO0QkOlZ2iEhMDDtEJDpWdohITAw7RCQ6paLmrYeVHSISA8MOEYmOiwoSkZgYdohIdFxUkIjExLBDRKJjZYeIxMSwQ0SiU3JRQSISEcMOEYmOp54TkZgYdohIdDz1nIjExLBDRKJjZYeIxMSwQ0SiY2WHiMTEsENEouPZWEQkJoYdIhKdUl7z1sOwQ0RiYNghItFxUUEiEhPDDhGJjsNYRCQmhh0iEp0wQdnARQWJyP4YdohIdMKp5yzsEJEIGHaISHSs7BCRmBh2iEh01ys7LO0Qkf0x7BCR6K5Xdhh2iMj+GHaISHSs7BCRmBh2iEh0SlZ2iEhEDDtEJDrzCsomAEZWd4jIzhh2iEh0SoVM+JrVHSKyN8nDjtFoRFpaGqKiohAaGoqEhATk5uY2un9xcTGmT5+OsLAwhIeHY/78+aisrBSxxUTUWuYJygBXUSYi+5M87GRkZGDz5s1YuHAhtmzZAqPRiPj4eOh0ugb3T0xMxNmzZ7FhwwasXLkS+/fvx7x588RtNBG1irJO2OH1sYjI3iQNOzqdDllZWUhMTER0dDSCgoKQmpqK/Px87Nmzp97+R44cwcGDB/HWW2+hZ8+eiIiIwIIFC/Dpp5+ioKBAgkdARC1hWdnhwoJEZF9KKX/48ePHUV5ejoiICGGbRqNBcHAwDh06hOHDh1vsn5OTgw4dOqBbt27CtvDwcMhkMhw+fBgPP/xwi9uiVNo+9ykUcov/yX7Y13ZQ5zWhVMqF723V13IZYDQBaV+fhlqpaNWxHJV
MJoNKpYBOZ4CJE7ntin1tP85KOUb17YzOHi4ApHm/ljTs5OfnAwB8fX0ttnt7ewu31VVQUFBvX5VKBQ8PD+Tl5bW4HXK5DJ6eri2+/41oNC52OzZZYl/bkOr6l56eroCr5WuktX3t2U6FK+U67PqFVVkiR+fprsaMBwMtton5fi1p2DFPLFapVBbbnZ2dUVpa2uD+f93XvH91dXWL22E0mqDVVrT4/o1RKOTQaFyg1VbCwGsA2RX72g7Ky+FZ+2VxcTlQO43OVn2d8vee+P5Mcevb6cDkMhmcnZ1QXX2Np+jbGfvaftRKOYb39K55H4Ht3kM0GpdmV4ckDTtqtRpAzdwd89cAUF1dDReX+olPrVY3OHG5uroa7dq1a1Vb9Hr7fUAaDEa7Hp+uY1/bUJ1+1OuNFt8Dre/rnh3d0bOje4vvfytQKuXw9HRFcXE5n9d2xr62v7/2q5jv15JOcDAPSRUWFlpsLywsRMeOHevt7+PjU29fnU6HkpISeHt726+hREREdNOSNOwEBQXBzc0N2dnZwjatVotjx44hLCys3v5hYWHIz8/H2bNnhW0HDx4EAPTt29f+DSYiIqKbjqTDWCqVCrGxsUhOToaXlxc6d+6M5cuXw8fHBzExMTAYDCgqKoK7uzvUajV69+6Ne++9F1OnTsW8efNQUVGBuXPn4vHHH2+wEkREREQk+Xm6iYmJGDlyJObMmYNRo0ZBoVBg3bp1cHJyQl5eHiIjI7F7924ANacGpqenw8/PD88//zymTJmCwYMHc1FBIiIiapTMxAUFYDAYUVRUbvPjcsKbeNjXdlBejg4BNfPqLp3OE049Z1+Lh30tHva1eGzV115ers0+G0vyyg4RERGRPTHsEBERkUNj2CEiIiKHxrBDREREDo1hh4iIiBwaww4RERE5NIYdIiIicmgMO0REROTQuKggAJPJBKPRPt2gUMhbdQl7aj72tY0ZjVDkngMAGLr4A/Lrfxuxr8XDvhYP+1o8tuhruVwGmUzWrH0ZdoiIiMihcRiLiIiIHBrDDhERETk0hh0iIiJyaAw7RERE5NAYdoiIiMihMewQERGRQ2PYISIiIofGsENEREQOjWGHiIiIHBrDDhERETk0hh0iIiJyaAw7RERE5NAYdoiIiMihMey0gtFoRFpaGqKiohAaGoqEhATk5uY2un9xcTGmT5+OsLAwhIeHY/78+aisrBSxxTcva/v65MmTmDBhAvr374+IiAgkJibi4sWLIrb45mVtX9f12WefITAwEOfPn7dzKx2DtX197do1pKSkCPvHxsbit99+E7HFNy9r+/rKlSuYPn06BgwYgP79+2Pq1KkoKCgQscWOYc2aNRg9enST+4jx2ciw0woZGRnYvHkzFi5ciC1btsBoNCI+Ph46na7B/RMTE3H27Fls2LABK1euxP79+zFv3jxxG32Tsqavi4uLERcXB7VajU2bNmHt2rUoKipCfHw8qqurJWj9zcXa57XZhQsXsGDBApFa6Ris7et58+Zh+/btWLJkCbZt2wYvLy8kJCTg6tWrIrf85mNtX0+ZMgUXL17E+vXrsX79ely8eBEvvfSSyK2+uX344Yd4++23b7ifKJ+NJmqR6upqU58+fUwffvihsK20tNQUEhJi2rlzZ739f/jhB1P37t1Nf/zxh7Dt66+/NgUGBpry8/NFafPNytq+/uc//2nq06ePqbKyUth28eJFU/fu3U3ffvutKG2+WVnb12YGg8E0atQo05gxY0zdu3c35ebmitHcm5q1fX3u3DlTYGCgad++fRb7Dx06lM/rG7C2r0tLS03du3c3ff7558K2vXv3mrp3724qLi4Wo8k3tfz8fNPEiRNNoaGhpoceesgUGxvb6L5ifTaystNCx48fR3l5OSIiIoRtGo0GwcHBOHToUL39c3Jy0KFDB3Tr1k3YFh4eDplMhsOHD4vS5puVtX0dERGBjIwMqNVqYZtcXvNU12q19m/wTczavjZ79913ce3
aNUycOFGMZjoEa/v6wIEDcHd3x+DBgy32/+KLLyyOQfVZ29dqtRqurq7YsWMHysrKUFZWhk8//RQBAQHQaDRiNv2m9Ouvv8LJyQmfffYZevfu3eS+Yn02Km12pFtMfn4+AMDX19diu7e3t3BbXQUFBfX2ValU8PDwQF5env0a6gCs7Ws/Pz/4+flZbMvMzIRarUZYWJj9GuoArO1rAPj555+RlZWFrVu3ck6DFazt69OnT6NLly7Ys2cPMjMzUVBQgODgYLz66qsWHxRUn7V9rVKpsHTpUsydOxf9+vWDTCaDt7c3PvjgA+EPJ2rcsGHDMGzYsGbtK9ZnI39rLWSePKVSqSy2Ozs7NzgvpLKyst6+Te1P11nb13+1adMmfPDBB5gxYwa8vLzs0kZHYW1fV1RUYMaMGZgxYwa6du0qRhMdhrV9XVZWhrNnzyIjIwPTpk3D6tWroVQq8eyzz+LKlSuitPlmZW1fm0wm/Pbbb+jTpw8+/PBDvP/+++jUqRNefPFFlJWVidLmW4VYn40MOy1kHiL56+S26upquLi4NLh/QxPhqqur0a5dO/s00kFY29dmJpMJb7/9NhYtWoRJkybd8IwAsr6vFy1ahICAADzzzDOitM+RWNvXSqUSZWVlSE1NRWRkJEJCQpCamgoA+OSTT+zf4JuYtX39f//3f/jggw+wfPly9O3bF+Hh4Xj33Xdx4cIFbN26VZQ23yrE+mxk2Gkhc9mtsLDQYnthYSE6duxYb38fH596++p0OpSUlMDb29t+DXUA1vY1UHOKblJSEt59913Mnj0bU6ZMsXczHYK1fb1t2zZ8++236NOnD/r06YOEhAQAwPDhw/Huu+/av8E3sZa8hyiVSoshK7VajS5duvBU/xuwtq9zcnIQEBAANzc3YVv79u0REBCAs2fP2rextxixPhsZdlooKCgIbm5uyM7OFrZptVocO3aswXkhYWFhyM/Pt3ihHDx4EADQt29f+zf4JmZtXwPAzJkz8Z///AcpKSkYO3asSC29+Vnb13v27MGuXbuwY8cO7NixA4sWLQJQM0eK1Z6mteQ9RK/X4+jRo8K2qqoq5Obm4o477hClzTcra/vax8cHZ8+etRhGqaiowPnz5zlca2NifTZygnILqVQqxMbGIjk5GV5eXujcuTOWL18OHx8fxMTEwGAwoKioCO7u7lCr1ejduzfuvfdeTJ06FfPmzUNFRQXmzp2Lxx9/vNHqBNWwtq+3b9+O3bt3Y+bMmQgPD8elS5eEY5n3oYZZ29d//ZA1T/bs1KkTPDw8JHgENw9r+7pfv34YOHAgZs2ahQULFsDDwwNpaWlQKBR47LHHpH44bZq1ff34449j3bp1mDJlCl555RUAwNtvvw1nZ2c88cQTEj+am5tkn402O4n9FqTX603Lli0zDRgwwBQaGmpKSEgQ1hfJzc01de/e3bRt2zZh/8uXL5tefvllU2hoqKl///6mN954w1RVVSVV828q1vR1XFycqXv37g3+q/v7oIZZ+7yu6/vvv+c6O1awtq+vXr1qeuONN0z9+/c39e7d2xQXF2c6efKkVM2/qVjb13/88Ydp4sSJpvDwcNOAAQNMkydP5vO6BWbNmmWxzo5Un40yk8lksl10IiIiImpbOGeHiIiIHBrDDhERETk0hh0iIiJyaAw7RERE5NAYdoiIiMihMewQERGRQ2PYISIiIofGsENEdsWlvIhIagw7RGQ3n3/+OWbNmgUAyM7ORmBgoMX1icQ2bNgwvPrqq60+zqpVqxAYGNjkPm3h8RJRDV4bi4jsZsOGDVI3gYiIlR0iIiJybAw7RGQXo0ePxsGDB3Hw4EGLIZ9Tp05h/Pjx6N27NwYNGoTk5GTo9Xrh9sDAQKSnp+OJJ55ASEgI0tPTAQAXL17EtGnTEB4ejt69e+P555/HsWPHLH7mrl278OijjyIkJAQDBgzAjBkzUFBQYLHPtWvXsGzZMgwaNAihoaEYN24czp49a7HPgQM
H8Oyzz6Jv377o378/pk+fjry8vCYf75YtW/Dggw8iJCQEsbGxuHjxYov6jYhsj2GHiOzijTfeQHBwMIKDg/Hxxx+jrKwMAPDmm2+ib9++ePfdd/G3v/0Na9euxZYtWyzu++6772LEiBFIS0vDgw8+iKKiIjzzzDP49ddf8frrryMlJQVGoxHPPfcc/vzzTwDA4cOHMXPmTMTExGDt2rWYPXs2vv/+e0yfPt3i2Lt378bJkyexdOlSvPHGG/jll18wdepU4fYdO3Zg3Lhx8PX1xYoVKzB79mwcOXIETz/9NK5cudLgY/3ggw/wxhtvYMiQIcjIyEDv3r3x+uuv27I7iagVOGeHiOzirrvugpubGwAgNDRUmKg7ZswYvPjiiwCAAQMGYO/evfj+++8RGxsr3Ldfv36Ii4sTvk9NTUVJSQk++ugjdO7cGQAwePBgPPzww1i5ciXS0tJw+PBhqNVqTJgwASqVCgDg4eGBo0ePwmQyQSaTAQA6duyIjIwMODk5AQDOnj2L1atXo6ysDO3atUNycjIiIyORkpIi/Px7770XDz/8MNatW4eZM2daPE6TyYSMjAw8/PDD+Mc//gEAiIyMRFlZWb0QR0TSYGWHiETVr18/4WuZTIbOnTtDq9Va7NOjRw+L77/77jv06NEDHTt2hF6vh16vh1wux+DBg/Htt98CAMLCwlBZWYnhw4cjJSUFOTk5iIyMxOTJk4WgAwAhISFC0AEAPz8/AIBWq8Xp06dx6dIlDB8+3OLn+/v7o0+fPjh48GC9x3Pq1ClcuXIFQ4cOtdj+t7/9zZpuISI7YmWHiETl4uJi8b1cLq+3Fk+7du0svi8pKcHZs2fRs2fPBo9ZWVmJPn36IDMzExs2bMD69euRmZmJ22+/HS+88AJGjx7d6LHl8pq/+YxGI0pKSgAAt99+e72fcfvtt9ebIwQApaWlAABPT0+L7R06dGiwrUQkPoYdImrz3N3dER4eXm8Iycw8bBUVFYWoqChUVlbi+++/x8aNG7Fo0SL07t0bISEhN/w5Hh4eAIDLly/Xu+3SpUv1Ag1wPeT8dT6POTgRkfQ4jEVEdmOumrRWeHg4Tp8+jYCAAPTq1Uv49+mnn2Lr1q1QKBR466238OSTT8JkMsHFxQVDhw4VFjRs7plRAQEB6NChA3bt2mWxPTc3Fz/++CPuvffeevfp2rUrfH198Z///Mdi+759+1r4aInI1hh2iMhuNBoNTp8+je+++67evBxrjB07FkajEWPHjsXu3bvx3Xff4fXXX8emTZsQEBAAoGay86+//opXX30VBw4cwJdffolFixbBw8MDAwYMaNbPkcvlmDZtGr755htMnz4d+/fvx44dOxAXF4f27dtbTJo2k8lkmDFjBvbt24c5c+bgm2++QXp6Oj766KMWP14isi2GHSKym+eeew5OTk5ISEhAVVVVi4/TsWNHbNmyBZ07d8a8efPwwgsv4Oeff8bixYsxduxYAMCQIUOQnJyMkydPYvLkyZg2bRpcXFywceNGYXiqOZ544gmkpaXh9OnTeOmll7B06VL06dMHW7dubXQezvDhw5Gamooff/wRkyZNwr59+7BgwYIWP14isi2ZiVfpIyIiIgfGyg4RERE5NIYdIiIicmgMO0REROTQGHaIiIjIoTHsEBERkUNj2CEiIiKHxrBDREREDo1hh4iIiBwaww4RERE5NIYdIiIicmgMO0REROTQ/j9MhQWg0HdyfwAAAABJRU5ErkJggg==\n", + "image/png": "", "text/plain": [ "
" ] @@ -182,11 +248,19 @@ "plt.title(f\"Best obtained threshold at {threshold}\")\n", "plt.show()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa0b487c", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "venv", "language": "python", "name": "python3" }, @@ -200,7 +274,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.8.0" } }, "nbformat": 4, diff --git a/setup.py b/setup.py index a6b7217..91aa469 100644 --- a/setup.py +++ b/setup.py @@ -54,9 +54,9 @@ 'Sphinx>=3,<3.3', 'sphinx_rtd_theme>=0.2.4,<0.5', 'autodocsumm>=0.1.10', - 'mistune>=0.7,<2', + 'mistune>=0.7,<2.0', 'Jinja2>=2,<3.1', - + # fails on Sphinx < v3.4 'alabaster<=0.7.12', # fails on Sphins < v5.0 @@ -65,7 +65,7 @@ 'sphinxcontrib-htmlhelp<2.0.5', 'sphinxcontrib-serializinghtml<1.1.10', 'sphinxcontrib-qthelp<1.0.7', - + # style check 'flake8>=3.7.7,<4', 'isort>=4.3.4,<5', diff --git a/tests/labeling/test_helpers.py b/tests/labeling/test_helpers.py index 080c312..a43ca9b 100644 --- a/tests/labeling/test_helpers.py +++ b/tests/labeling/test_helpers.py @@ -19,7 +19,8 @@ def test_merge_labeling_and_true(): lambda df: True, lambda df: True ] - assert 1 == merge_binary_labeling_functions(functions, and_connected=True)(pd.DataFrame()) + assert 1 == merge_binary_labeling_functions( + functions, and_connected=True)(pd.DataFrame()) def test_merge_labeling_and_false(): @@ -27,7 +28,8 @@ def test_merge_labeling_and_false(): lambda df: True, lambda df: False ] - assert 0 == merge_binary_labeling_functions(functions, and_connected=True)(pd.DataFrame()) + assert 0 == merge_binary_labeling_functions( + functions, and_connected=True)(pd.DataFrame()) def test_merge_labeling_or_true(): @@ -35,7 +37,8 @@ def test_merge_labeling_or_true(): lambda df: False, lambda df: True ] - assert 1 == merge_binary_labeling_functions(functions, 
and_connected=False)(pd.DataFrame()) + assert 1 == merge_binary_labeling_functions( + functions, and_connected=False)(pd.DataFrame()) def test_merge_labeling_or_false(): @@ -43,7 +46,8 @@ def test_merge_labeling_or_false(): lambda df: False, lambda df: False ] - assert 0 == merge_binary_labeling_functions(functions, and_connected=False)(pd.DataFrame()) + assert 0 == merge_binary_labeling_functions( + functions, and_connected=False)(pd.DataFrame()) def test_categorical_presence_true(): diff --git a/tests/primitives/test_postprocessing.py b/tests/primitives/test_postprocessing.py index 2b1c65e..696813c 100644 --- a/tests/primitives/test_postprocessing.py +++ b/tests/primitives/test_postprocessing.py @@ -23,7 +23,8 @@ def _run(self, y, y_hat, value): threshold.fit(y, y_hat) assert threshold._threshold == value - binary_y_hat, detected_threshold, scores = threshold.apply_threshold(y_hat) + binary_y_hat, detected_threshold, scores = threshold.apply_threshold( + y_hat) np.testing.assert_allclose(binary_y_hat, y) def test_1d(self): diff --git a/tests/test___init__.py b/tests/test___init__.py index 72b810e..9a67a96 100644 --- a/tests/test___init__.py +++ b/tests/test___init__.py @@ -96,7 +96,8 @@ def merge_work_orders_notifications_data(): changed_wo_data['WTG'] = ['A001', 'A001'] changed_notif_data = NOTIFICATIONS_DATA.copy() # matching the output of the merge - changed_notif_data['Functional location_y'] = changed_notif_data.pop('Functional location') + changed_notif_data['Functional location_y'] = changed_notif_data.pop( + 'Functional location') changed_notif_data['Functional location description_y'] = ( changed_notif_data.pop('Functional location description')) # matching the notifications update diff --git a/tests/test_core.py b/tests/test_core.py index cc747c7..1925e36 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,15 +1,131 @@ -import os -import pickle - import numpy as np import pandas as pd -import pytest +from mlblocks import MLBlock -from 
zephyr_ml.core import Zephyr +from zephyr_ml.core import DEFAULT_METRICS, Zephyr class TestZephyr: + @staticmethod + def base_dfs(): + alarms_df = pd.DataFrame({ + 'COD_ELEMENT': [0, 0], + 'DAT_START': [pd.Timestamp('2022-01-01 00:00:00'), + pd.Timestamp('2022-03-01 11:12:13')], + 'DAT_END': [pd.Timestamp('2022-01-01 13:00:00'), + pd.Timestamp('2022-03-02 11:12:13')], + 'IND_DURATION': [0.5417, 1.0], + 'COD_ALARM': [12345, 98754], + 'COD_ALARM_INT': [12345, 98754], + 'DES_NAME': ['Alarm1', 'Alarm2'], + 'DES_TITLE': ['Description of alarm 1', 'Description of alarm 2'], + }) + stoppages_df = pd.DataFrame({ + 'COD_ELEMENT': [0, 0], + 'DAT_START': [pd.Timestamp('2022-01-01 00:00:00'), + pd.Timestamp('2022-03-01 11:12:13')], + 'DAT_END': [pd.Timestamp('2022-01-08 11:07:17'), + pd.Timestamp('2022-03-01 17:00:13')], + 'DES_WO_NAME': ['stoppage name 1', 'stoppage name 2'], + 'DES_COMMENTS': ['description of stoppage 1', 'description of stoppage 2'], + 'COD_WO': [12345, 67890], + 'IND_DURATION': [7.4642, 0.2417], + 'IND_LOST_GEN': [45678.0, 123.0], + 'COD_ALARM': [12345, 12345], + 'COD_CAUSE': [32, 48], + 'COD_INCIDENCE': [987654, 123450], + 'COD_ORIGIN': [6, 23], + 'COD_STATUS': ['STOP', 'PAUSE'], + 'COD_CODE': ['ABC', 'XYZ'], + 'DES_DESCRIPTION': ['Description 1', 'Description 2'] + }) + notifications_df = pd.DataFrame({ + 'COD_ELEMENT': [0, 0], + 'COD_ORDER': [12345, 67890], + 'IND_QUANTITY': [1, -20], + 'COD_MATERIAL_SAP': [36052411, 67890], + 'DAT_POSTING': [pd.Timestamp('2022-01-01 00:00:00'), + pd.Timestamp('2022-03-01 00:00:00')], + 'COD_MAT_DOC': [77889900, 12345690], + 'DES_MEDIUM': ['Description of notification 1', 'Description of notification 2'], + 'COD_NOTIF': [567890123, 32109877], + 'DAT_MALF_START': [pd.Timestamp('2021-12-25 18:07:10'), + pd.Timestamp('2022-02-28 06:04:00')], + 'DAT_MALF_END': [pd.Timestamp('2022-01-08 11:07:17'), + pd.Timestamp('2022-03-01 17:00:13')], + 'IND_BREAKDOWN_DUR': [14.1378, 2.4792], + 'FUNCT_LOC_DES': ['location description 1', 
'location description 2'], + 'COD_ALARM': [12345, 12345], + 'DES_ALARM': ['Alarm description', 'Alarm description'], + }) + work_orders_df = pd.DataFrame({ + 'COD_ELEMENT': [0, 0], + 'COD_ORDER': [12345, 67890], + 'DAT_BASIC_START': [pd.Timestamp('2022-01-01 00:00:00'), + pd.Timestamp('2022-03-01 00:00:00')], + 'DAT_BASIC_END': [pd.Timestamp('2022-01-09 00:00:00'), + pd.Timestamp('2022-03-02 00:00:00')], + 'COD_EQUIPMENT': [98765, 98765], + 'COD_MAINT_PLANT': ['ABC', 'ABC'], + 'COD_MAINT_ACT_TYPE': ['XYZ', 'XYZ'], + 'COD_CREATED_BY': ['A1234', 'B6789'], + 'COD_ORDER_TYPE': ['A', 'B'], + 'DAT_REFERENCE': [pd.Timestamp('2022-01-01 00:00:00'), + pd.Timestamp('2022-03-01 00:00:00')], + 'DAT_CREATED_ON': [pd.Timestamp('2022-03-01 00:00:00'), + pd.Timestamp('2022-04-18 00:00:00')], + 'DAT_VALID_END': [pd.NaT, pd.NaT], + 'DAT_VALID_START': [pd.NaT, pd.NaT], + 'COD_SYSTEM_STAT': ['ABC XYZ', 'LMN OPQ'], + 'DES_LONG': ['description of work order', 'description of work order'], + 'COD_FUNCT_LOC': ['!12345', '?09876'], + 'COD_NOTIF_OBJ': ['00112233', '00998877'], + 'COD_MAINT_ITEM': ['', '019283'], + 'DES_MEDIUM': ['short description', 'short description'], + 'DES_FUNCT_LOC': ['XYZ1234', 'ABC9876'], + }) + turbines_df = pd.DataFrame({ + 'COD_ELEMENT': [0], + 'TURBINE_PI_ID': ['TA00'], + 'TURBINE_LOCAL_ID': ['A0'], + 'TURBINE_SAP_COD': ['LOC000'], + 'DES_CORE_ELEMENT': ['T00'], + 'SITE': ['LOCATION'], + 'DES_CORE_PLANT': ['LOC'], + 'COD_PLANT_SAP': ['ABC'], + 'PI_COLLECTOR_SITE_NAME': ['LOC0'], + 'PI_LOCAL_SITE_NAME': ['LOC0'] + }) + pidata_df = pd.DataFrame({ + 'time': [pd.Timestamp('2022-01-02 13:21:01'), + pd.Timestamp('2022-03-08 13:21:01')], + 'COD_ELEMENT': [0, 0], + 'val1': [9872.0, 559.0], + 'val2': [10.0, -7.0] + }) + return { + 'alarms': alarms_df, + 'stoppages': stoppages_df, + 'notifications': notifications_df, + 'work_orders': work_orders_df, + 'turbines': turbines_df, + "pidata": pidata_df + } + + def base_train_test_split(self): + X_train = pd.DataFrame({ + 
'feature 1': np.random.random(300), + 'feature 2': [0] * 150 + [1] * 150, + }) + y_train = X_train['feature 2'].to_list() + X_test = pd.DataFrame({ + 'feature 1': np.random.random((100)), + 'feature 2': [0] * 25 + [1] * 50 + [0] * 25, + }) + y_test = X_test['feature 2'].to_list() + return X_train, X_test, y_train, y_test + @classmethod def setup_class(cls): cls.train = pd.DataFrame({ @@ -17,165 +133,190 @@ def setup_class(cls): 'feature 2': [0] * 150 + [1] * 150, }) cls.train_y = cls.train['feature 2'].to_list() - cls.test = pd.DataFrame({ 'feature 1': np.random.random((100)), 'feature 2': [0] * 25 + [1] * 50 + [0] * 25, }) cls.test_y = cls.test['feature 2'].to_list() - cls.random = pd.DataFrame({ 'feature 1': list(range(100)), 'feature 2': np.random.random(100), 'feature 3': np.random.random(100), }) cls.random_y = [1 if x > 0.5 else 0 for x in np.random.random(100)] - - def setup_method(self): - self.zephyr = Zephyr('xgb_classifier') - - def test_hyperparameters(self): - hyperparameters = { - "xgboost.XGBClassifier#1": { - "max_depth": 2 - }, - "zephyr_ml.primitives.postprocessing.FindThreshold#1": { - "metric": "precision" - } - } - - zephyr = Zephyr('xgb_classifier', hyperparameters) - - assert zephyr._hyperparameters == hyperparameters - - def test_json(self): - file = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - json_zephyr = Zephyr(os.path.join(file, 'zephyr_ml', 'pipelines', 'xgb_classifier.json')) - - json_zephyr_hyperparameters = json_zephyr._mlpipeline.get_hyperparameters() - zephyr_hyperparameters = self.zephyr._mlpipeline.get_hyperparameters() - assert json_zephyr_hyperparameters == zephyr_hyperparameters - - def test_fit(self): - self.zephyr.fit(self.train, self.train_y) - - def test_fit_visual(self): - output = self.zephyr.fit(self.train, self.train_y, visual=True) - - assert isinstance(output, dict) - assert list(output.keys()) == ['threshold', 'scores'] - - def test_fit_no_visual(self): - zephyr = Zephyr(['xgboost.XGBClassifier']) 
- - output = zephyr.fit(self.train, self.train_y, visual=True) + cls.kwargs = { + "generate_entityset": { + "dfs": TestZephyr.base_dfs(), + "es_type": "pidata"}, + "generate_label_times": { + "labeling_fn": "brake_pad_presence", + "num_samples": 10, + "gap": "20d"}, + "generate_feature_matrix": { + "target_dataframe_name": "turbines", + "cutoff_time_in_index": True, + "agg_primitives": [ + "count", + "sum", + "max"], + "verbose": True}, + "generate_train_test_split": {}, + "fit_pipeline": {}, + "evaluate": {}} + + def test_initialize_class(self): + _ = Zephyr() + + def test_generate_entityset(self): + zephyr = Zephyr() + zephyr.generate_entityset( + **self.__class__.kwargs["generate_entityset"]) + es = zephyr.get_entityset() + assert es is not None + assert es.id == 'pidata' + + def test_generate_label_times(self): + zephyr = Zephyr() + zephyr.generate_entityset( + **self.__class__.kwargs["generate_entityset"]) + zephyr.generate_label_times( + **self.__class__.kwargs["generate_label_times"]) + label_times = zephyr.get_label_times(visualize=False) + assert label_times is not None + + def test_generate_feature_matrix_and_labels(self): + zephyr = Zephyr() + zephyr.generate_entityset( + **self.__class__.kwargs["generate_entityset"]) + zephyr.generate_label_times( + **self.__class__.kwargs["generate_label_times"]) + zephyr.generate_feature_matrix( + **self.__class__.kwargs["generate_feature_matrix"]) + feature_matrix, label_col_name, features = zephyr.get_feature_matrix() + assert feature_matrix is not None + assert label_col_name in feature_matrix.columns + assert features is not None + + def test_generate_train_test_split(self): + zephyr = Zephyr() + zephyr.generate_entityset( + **self.__class__.kwargs["generate_entityset"]) + zephyr.generate_label_times( + **self.__class__.kwargs["generate_label_times"]) + zephyr.generate_feature_matrix( + **self.__class__.kwargs["generate_feature_matrix"]) + zephyr.generate_train_test_split( + 
**self.__class__.kwargs["generate_train_test_split"]) + train_test_split = zephyr.get_train_test_split() + assert train_test_split is not None + X_train, X_test, y_train, y_test = train_test_split + assert isinstance(X_train, pd.DataFrame) + assert isinstance(X_test, pd.DataFrame) + assert isinstance(y_train, pd.Series) + assert isinstance(y_test, pd.Series) + + def test_set_train_test_split(self): + zephyr = Zephyr() + zephyr.set_train_test_split(*self.base_train_test_split()) + train_test_split = zephyr.get_train_test_split() + assert train_test_split is not None + X_train, X_test, y_train, y_test = train_test_split + assert isinstance(X_train, pd.DataFrame) + assert isinstance(X_test, pd.DataFrame) + assert isinstance(y_train, list) + assert isinstance(y_test, list) + + def test_fit_pipeline_no_visual(self): + zephyr = Zephyr() + zephyr.set_train_test_split(*self.base_train_test_split()) + output = zephyr.fit_pipeline(**self.__class__.kwargs["fit_pipeline"]) assert output is None - - def test_predict(self): - self.zephyr.fit(self.train, self.train_y) - - predicted = self.zephyr.predict(self.test) - - assert self.test_y == predicted - - def test_predict_visual(self): - self.zephyr.fit(self.train, self.train_y) - - predicted, output = self.zephyr.predict(self.test, visual=True) - - # predictions - assert self.test_y == predicted - - # visualization + pipeline = zephyr.get_fitted_pipeline() + assert pipeline is not None + + def test_fit_pipeline_visual(self): + zephyr = Zephyr() + zephyr.set_train_test_split(*self.base_train_test_split()) + output = zephyr.fit_pipeline( + visual=True, **self.__class__.kwargs["fit_pipeline"]) assert isinstance(output, dict) assert list(output.keys()) == ['threshold', 'scores'] + pipeline = zephyr.get_fitted_pipeline() + assert pipeline is not None def test_predict_no_visual(self): - zephyr = Zephyr(['xgboost.XGBClassifier']) - - zephyr.fit(self.train, self.train_y) - - predicted = zephyr.predict(self.test, visual=True) - assert 
len(self.test_y) == len(predicted) - - def test_fit_predict(self): - predicted = self.zephyr.fit_predict(self.random, self.random_y) + zephyr = Zephyr() + zephyr.set_train_test_split(*self.base_train_test_split()) + zephyr.fit_pipeline(**self.__class__.kwargs["fit_pipeline"]) + predicted = zephyr.predict() + _, _, _, test_y = self.base_train_test_split() + assert predicted == test_y + def test_predict_visual(self): + zephyr = Zephyr() + zephyr.set_train_test_split(*self.base_train_test_split()) + zephyr.fit_pipeline(**self.__class__.kwargs["fit_pipeline"]) + predicted, output = zephyr.predict(visual=True) assert isinstance(predicted, list) - - def test_save_load(self, tmpdir): - path = os.path.join(tmpdir, 'some_path.pkl') - self.zephyr.save(path) - - new_zephyr = Zephyr.load(path) - assert new_zephyr == self.zephyr - - def test_load_failed(self, tmpdir): - path = os.path.join(tmpdir, 'some_path.pkl') - os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, 'wb') as pickle_file: - pickle.dump("something", pickle_file) - - with pytest.raises(ValueError): - Zephyr.load(path) + assert len(predicted) == len(self.test_y) + assert isinstance(output, dict) + assert list(output.keys()) == ['threshold', 'scores'] def test_evaluate(self): - self.zephyr.fit(self.test, self.test_y) - scores = self.zephyr.evaluate(X=self.test, y=self.test_y) - - expected = pd.Series({ - 'accuracy': 1.0, - 'f1': 1.0, - 'recall': 1.0, - 'precision': 1.0, - }) - pd.testing.assert_series_equal(expected, scores) - - def test_evaluate_fit(self): - scores = self.zephyr.evaluate( - X=self.test, - y=self.test_y, - fit=True, - ) - - expected = pd.Series({ - 'accuracy': 1.0, - 'f1': 1.0, - 'recall': 1.0, - 'precision': 1.0, - }) - pd.testing.assert_series_equal(expected, scores) - - def test_evaluate_previously_fitted_with_fit_true(self): - self.zephyr.fit(self.train, self.train_y) - - scores = self.zephyr.evaluate( - X=self.test, - y=self.test_y, - fit=True - ) - - expected = pd.Series({ - 
def test_get_entityset_types(self):
    """``GET_ENTITYSET_TYPES`` lists the three types with string ``obj``/``desc``."""
    types_info = Zephyr().GET_ENTITYSET_TYPES()
    assert isinstance(types_info, dict)

    for expected_type in ("pidata", "scada", "vibrations"):
        assert expected_type in types_info

    for entry in types_info.values():
        assert isinstance(entry, dict)
        # Presence of both keys is checked before either value is inspected.
        for key in ("obj", "desc"):
            assert key in entry
        for key in ("obj", "desc"):
            assert isinstance(entry[key], str)
isinstance(evaluation_metrics, dict) + expected_metrics = DEFAULT_METRICS + for metric in expected_metrics: + assert metric in evaluation_metrics + for metric_name, info in evaluation_metrics.items(): + assert isinstance(info, dict) + assert "obj" in info + assert "desc" in info + assert isinstance(info["obj"], MLBlock) + assert hasattr(info["obj"], "metadata") + assert isinstance(info["desc"], str) diff --git a/tests/test_entityset.py b/tests/test_entityset.py index ae2dd45..ab3ec76 100644 --- a/tests/test_entityset.py +++ b/tests/test_entityset.py @@ -1,7 +1,7 @@ import pandas as pd import pytest -from zephyr_ml import create_pidata_entityset, create_scada_entityset +from zephyr_ml import _create_entityset @pytest.fixture @@ -119,6 +119,14 @@ def scada_dfs(base_dfs): return {**base_dfs, 'scada': scada_df} +def create_pidata_entityset(pidata_dfs): + return _create_entityset(pidata_dfs, es_type="pidata") + + +def create_scada_entityset(scada_dfs): + return _create_entityset(scada_dfs, es_type="scada") + + def test_create_pidata_missing_entities(pidata_dfs): error_msg = 'Missing dataframes for entities notifications.' 
@@ -206,7 +214,7 @@ def test_missing_time_indices(pidata_dfs): def test_default_create_pidata_entityset(pidata_dfs): es = create_pidata_entityset(pidata_dfs) - assert es.id == 'PI data' + assert es.id == 'pidata' assert set(es.dataframe_dict.keys()) == set( ['alarms', 'turbines', 'stoppages', 'work_orders', 'notifications', 'pidata']) @@ -214,6 +222,6 @@ def test_default_create_pidata_entityset(pidata_dfs): def test_default_create_scada_entityset(scada_dfs): es = create_scada_entityset(scada_dfs) - assert es.id == 'SCADA data' + assert es.id == 'scada' assert set(es.dataframe_dict.keys()) == set( ['alarms', 'turbines', 'stoppages', 'work_orders', 'notifications', 'scada']) diff --git a/tests/test_feature_engineering.py b/tests/test_feature_engineering.py index 324e82f..5baf7dd 100644 --- a/tests/test_feature_engineering.py +++ b/tests/test_feature_engineering.py @@ -1,7 +1,7 @@ import pandas as pd import pytest -from zephyr_ml import create_pidata_entityset, create_scada_entityset +from zephyr_ml import _create_entityset from zephyr_ml.feature_engineering import process_signals @@ -122,12 +122,12 @@ def scada_dfs(base_dfs): @pytest.fixture def pidata_es(pidata_dfs): - return create_pidata_entityset(pidata_dfs) + return _create_entityset(pidata_dfs, "pidata") @pytest.fixture def scada_es(scada_dfs): - return create_scada_entityset(scada_dfs) + return _create_entityset(scada_dfs, "scada") @pytest.fixture @@ -170,8 +170,10 @@ def test_process_signals_pidata(pidata_es, transformations, aggregations): "fft.mean.mean_value": [9872, None, 559] }) expected['COD_ELEMENT'] = expected['COD_ELEMENT'].astype('category') - expected['fft.mean.mean_value'] = expected['fft.mean.mean_value'].astype('float64') - processed['fft.mean.mean_value'] = processed['fft.mean.mean_value'].astype('float64') + expected['fft.mean.mean_value'] = expected['fft.mean.mean_value'].astype( + 'float64') + processed['fft.mean.mean_value'] = processed['fft.mean.mean_value'].astype( + 'float64') assert 
pidata_es['pidata_processed'].shape[0] == 3 assert pidata_es['pidata_processed'].shape[1] == 4 @@ -202,8 +204,10 @@ def test_process_signals_pidata_replace(pidata_es, transformations, aggregations "fft.mean.mean_value": [9872, None, 559] }) expected['COD_ELEMENT'] = expected['COD_ELEMENT'].astype('category') - expected['fft.mean.mean_value'] = expected['fft.mean.mean_value'].astype('float64') - processed['fft.mean.mean_value'] = processed['fft.mean.mean_value'].astype('float64') + expected['fft.mean.mean_value'] = expected['fft.mean.mean_value'].astype( + 'float64') + processed['fft.mean.mean_value'] = processed['fft.mean.mean_value'].astype( + 'float64') assert pidata_es['pidata'].shape[0] == 3 assert pidata_es['pidata'].shape[1] == 4 @@ -233,7 +237,8 @@ def test_process_signals_scada(scada_es, transformations, aggregations): "fft.mean.mean_value": [1002, None, 56.8] }) expected['COD_ELEMENT'] = expected['COD_ELEMENT'].astype('category') - expected['fft.mean.mean_value'] = expected['fft.mean.mean_value'].astype('float64') + expected['fft.mean.mean_value'] = expected['fft.mean.mean_value'].astype( + 'float64') after = scada_es['scada'].copy() assert scada_es['scada_processed'].shape[0] == 3 @@ -263,7 +268,8 @@ def test_process_signals_scada_replace(scada_es, transformations, aggregations): "fft.mean.mean_value": [1002, None, 56.8] }) expected['COD_ELEMENT'] = expected['COD_ELEMENT'].astype('category') - expected['fft.mean.mean_value'] = expected['fft.mean.mean_value'].astype('float64') + expected['fft.mean.mean_value'] = expected['fft.mean.mean_value'].astype( + 'float64') assert scada_es['scada'].shape[0] == 3 assert scada_es['scada'].shape[1] == 4 diff --git a/tests/test_metadata.py b/tests/test_metadata.py index ddb816a..8d8f923 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -10,7 +10,8 @@ def test_default_scada_mapped_kwargs(): def test_default_pidata_mapped_kwargs(): - expected = {**DEFAULT_ES_KWARGS, 'pidata': 
DEFAULT_ES_TYPE_KWARGS['pidata']} + expected = {**DEFAULT_ES_KWARGS, + 'pidata': DEFAULT_ES_TYPE_KWARGS['pidata']} actual = get_mapped_kwargs('pidata') assert expected == actual diff --git a/zephyr_ml/__init__.py b/zephyr_ml/__init__.py index 15c12ac..b633a49 100644 --- a/zephyr_ml/__init__.py +++ b/zephyr_ml/__init__.py @@ -9,8 +9,9 @@ import os from zephyr_ml.core import Zephyr -from zephyr_ml.entityset import create_pidata_entityset, create_scada_entityset +from zephyr_ml.entityset import VALIDATE_DATA_FUNCTIONS, _create_entityset from zephyr_ml.labeling import DataLabeler -MLBLOCKS_PRIMITIVES = os.path.join(os.path.dirname(__file__), 'primitives', 'jsons') -MLBLOCKS_PIPELINES = os.path.join(os.path.dirname(__file__), 'pipelines') +MLBLOCKS_PRIMITIVES = os.path.join( + os.path.dirname(__file__), "primitives", "jsons") +MLBLOCKS_PIPELINES = os.path.join(os.path.dirname(__file__), "pipelines") diff --git a/zephyr_ml/core.py b/zephyr_ml/core.py index e89cb3b..309e08b 100644 --- a/zephyr_ml/core.py +++ b/zephyr_ml/core.py @@ -1,270 +1,1211 @@ -"""Zephyr Core module. - -This module defines the Zephyr Class, which is responsible for the -model training and inference with the underlying MLBlocks pipelines. 
class GuideHandler:
    """Track workflow progress for a guided object and warn about out-of-order calls.

    The workflow is an ordered list of steps. Every step offers one or more
    *producer* methods, which create or set the step's result, and zero or
    more *getter* methods, which return it. For each step the handler stores
    the "term" (iteration counter) in which it was last produced:

    * ``-1``: the step has never been produced,
    * equal to ``cur_term``: the step is up to date,
    * anything else: the step's data is stale.

    Args:
        producers_and_getters (list[tuple[list, list]]):
            One ``(producers, getters)`` pair per step, in workflow order.
            Every entry must be a callable exposing ``__name__``.
        set_methods (set[str]):
            Names of the producers that are "set" methods. All other
            producers are treated as key methods.
    """

    def __init__(self, producers_and_getters, set_methods):
        self.cur_term = 0
        self.current_step = -1
        self.start_point = -1
        self.producers_and_getters = producers_and_getters
        self.set_methods = set_methods

        self.producer_to_step_map = {}
        self.getter_to_step_map = {}

        self.terms = []
        for idx, (producers, getters) in enumerate(self.producers_and_getters):
            self.terms.append(-1)

            for prod in producers:
                self.producer_to_step_map[prod.__name__] = idx

            for get in getters:
                self.getter_to_step_map[get.__name__] = idx

    # ------------------------------------------------------------------
    # Helpers that turn steps into readable text
    # ------------------------------------------------------------------

    def _producer_names(self, step):
        """Return the names of all producer methods of ``step``."""
        return [prod.__name__ for prod in self.producers_and_getters[step][0]]

    def _getter_names(self, step):
        """Return the names of all getter methods of ``step`` (may be empty)."""
        return [get.__name__ for get in self.producers_and_getters[step][1]]

    def _describe_producers(self, step):
        """Return the producer names of ``step`` joined with ``' or '``."""
        return " or ".join(self._producer_names(step))

    def _first_producer(self, step, want_set_method):
        """Return the first set (or key) producer name of ``step``, or ``None``."""
        for producer_name in self._producer_names(step):
            if (producer_name in self.set_methods) == want_set_method:
                return producer_name
        return None

    def get_necessary_steps(self, actual_next_step):
        """Describe the producer options from the current step up to ``actual_next_step``.

        Returns:
            str: One ``"<step>. <producers>"`` line per step.
        """
        # ``current_step`` is -1 before anything has run; clamp so that the
        # negative index does not wrap around to the last step.
        first_step = max(self.current_step, 0)
        return self.join_steps([
            f"{step}. {self._describe_producers(step)}"
            for step in range(first_step, actual_next_step)
        ])

    def get_get_steps_in_between(self, cur_step, next_step):
        """List the getters of the steps strictly between ``cur_step`` and ``next_step``.

        Steps without getters are left out.

        Returns:
            list[str]: One ``"<step>. <getters>"`` entry per step.
        """
        step_strs = []
        for step in range(cur_step + 1, next_step):
            getter_names = self._getter_names(step)
            if getter_names:
                step_strs.append(f"{step}. {' or '.join(getter_names)}")
        return step_strs

    def get_last_up_to_date(self, next_step):
        """Return the last step before ``next_step`` whose term is current (0 if none)."""
        latest_up_to_date = 0
        for step in range(next_step):
            if self.terms[step] == self.cur_term:
                latest_up_to_date = step
        return latest_up_to_date

    def join_steps(self, step_strs):
        """Join step descriptions into one newline-separated string."""
        return "\n".join(step_strs)

    def get_steps_in_between(self, cur_step, next_step):
        """List the producer options of the steps strictly between two steps.

        Returns:
            list[str]: One ``"<step>. <producers>"`` entry per step.
        """
        return [
            f"{step}. {self._describe_producers(step)}"
            for step in range(cur_step + 1, next_step)
        ]

    # ------------------------------------------------------------------
    # Warnings
    # ------------------------------------------------------------------

    def try_log_skipping_steps_warning(self, name, next_step):
        """Warn when running ``name`` jumps over steps after the current one."""
        steps_skipped = self.get_steps_in_between(self.current_step, next_step)
        if steps_skipped:
            LOGGER.warning(
                "Performing %s. You are skipping the following steps:\n%s",
                name, self.join_steps(steps_skipped))

    def try_log_making_stale_warning(self, name, next_step):
        """Warn that running ``name`` starts a new iteration and stales later steps."""
        stale_getters = self.join_steps(
            self.get_get_steps_in_between(next_step, self.current_step + 1))
        message = (
            f"Performing {name}. You are beginning a new iteration. "
            "Any data returned by the following get methods will be "
            f"considered stale:\n{stale_getters}."
        )

        following_step = next_step + 1
        if following_step < len(self.producers_and_getters):
            message += (
                " To continue with this iteration, please perform \n"
                f"step {following_step}: {self._describe_producers(following_step)}"
            )

        LOGGER.warning(message)

    def log_get_inconsistent_warning(self, name, next_step):
        """Warn that getter ``name`` cannot run because its step was never produced."""
        LOGGER.warning(
            "Unable to perform %s because %s. %s has not been run yet. "
            "Run steps starting at or before %s ",
            name, next_step, self._describe_producers(next_step),
            self.get_last_up_to_date(next_step))

    def log_get_stale_warning(self, name, next_step):
        """Warn that getter ``name`` returns data from an earlier iteration."""
        LOGGER.warning(
            "Performing %s. This data is potentially stale. "
            "Re-run steps starting at or before %s to ensure data is up to date.",
            name, self.get_last_up_to_date(next_step))

    # ------------------------------------------------------------------
    # Producer steps
    # ------------------------------------------------------------------

    def perform_producer_step(self, zephyr, method, *method_args, **method_kwargs):
        """Run ``method`` and record its step as current and up to date."""
        step_num = self.producer_to_step_map[method.__name__]
        res = method(zephyr, *method_args, **method_kwargs)
        self.current_step = step_num
        self.terms[step_num] = self.cur_term
        return res

    def try_perform_forward_producer_step(self, zephyr, method, *method_args, **method_kwargs):
        """Run a producer whose step is at or after the current step.

        A set method may skip steps (a warning lists them); it moves the
        start point to its own step and begins a new term.
        """
        name = method.__name__
        next_step = self.producer_to_step_map[name]
        if name in self.set_methods:
            self.try_log_skipping_steps_warning(name, next_step)
            self.start_point = next_step
            self.cur_term += 1

        return self.perform_producer_step(zephyr, method, *method_args, **method_kwargs)

    def try_perform_backward_producer_step(self, zephyr, method, *method_args, **method_kwargs):
        """Run a producer whose step lies before the current step.

        Going backwards always begins a new term. Step 0 and set methods
        become the new start point. For a key method, the already-produced
        steps between the start point and this step are carried over into
        the new term.
        """
        name = method.__name__
        next_step = self.producer_to_step_map[name]

        self.cur_term += 1
        if next_step == 0 or name in self.set_methods:
            self.start_point = next_step
        else:
            # ``start_point`` stays -1 until a set method or a backward step
            # to step 0 has run; clamp it so ``terms[-1]`` (the last step) is
            # not refreshed by accident.
            for step in range(max(self.start_point, 0), next_step):
                if self.terms[step] != -1:
                    self.terms[step] = self.cur_term

        self.try_log_making_stale_warning(name, next_step)
        return self.perform_producer_step(zephyr, method, *method_args, **method_kwargs)

    def try_perform_producer_step(self, zephyr, method, *method_args, **method_kwargs):
        """Dispatch a valid producer call to the forward or backward handler."""
        next_step = self.producer_to_step_map[method.__name__]
        if next_step >= self.current_step:
            return self.try_perform_forward_producer_step(
                zephyr, method, *method_args, **method_kwargs)
        return self.try_perform_backward_producer_step(
            zephyr, method, *method_args, **method_kwargs)

    def try_perform_inconsistent_producer_step(
            self, zephyr, method, *method_args, **method_kwargs):
        """Refuse a key method whose previous step is not up to date.

        ``method`` is NOT executed and neither the current step nor the
        terms are changed; a warning explains how to recover.

        Returns:
            None
        """
        name = method.__name__
        next_step = self.producer_to_step_map[name]
        # Step 0 has no previous step, and an up-to-date previous step is
        # not an inconsistency: nothing to report in either case.
        if next_step == 0 or self.terms[next_step - 1] == self.cur_term:
            return None

        prev_step = next_step - 1
        corr_set_method = self._first_producer(next_step, want_set_method=True)

        if next_step >= self.current_step:
            parts = [
                f"Unable to perform {name} because you are performing a key method at "
                f"step {next_step} but the result of the previous step, "
                f"step {prev_step}, is STALE."
            ]
            recover = f"lease perform step {prev_step} with {self._describe_producers(prev_step)}."
            if corr_set_method is not None:
                parts.append(
                    f"If you already have the data for step {next_step}, "
                    f"you can use the corresponding set method: {corr_set_method}.")
                parts.append("Otherwise, p" + recover)
            else:
                parts.append("P" + recover)
        else:
            prev_getters = self._getter_names(prev_step)
            prev_get_method = prev_getters[0] if prev_getters else None
            prev_set_method = self._first_producer(prev_step, want_set_method=True)
            prev_key_method = self._first_producer(prev_step, want_set_method=False)

            parts = [
                f"Unable to perform {name} because you are going backwards and "
                "starting a new iteration by performing a key method at "
                f"step {next_step} but the result of the previous step, "
                f"step {prev_step}, is STALE."
            ]
            if prev_get_method is not None and prev_set_method is not None:
                parts.append(
                    "If you want to use the STALE result of the PREVIOUS step, "
                    f"you can call {prev_get_method} to get the data, then "
                    f"{prev_set_method} to set the data, and then recall this method.")
            if prev_key_method is not None:
                parts.append(
                    "If you want to regenerate the data of the PREVIOUS step, "
                    f"please call {prev_key_method}, and then recall this method.")
            if corr_set_method is not None:
                parts.append(
                    "If you already have the data for THIS step, you can "
                    f"call {corr_set_method} to set the data.")

        LOGGER.warning(" ".join(parts))
        return None

    # ------------------------------------------------------------------
    # Getter steps and entry point
    # ------------------------------------------------------------------

    def try_perform_getter_step(self, zephyr, method, *method_args, **method_kwargs):
        """Run a getter unless its step has never been produced.

        A never-produced step logs a warning and returns ``None`` without
        calling ``method``; a stale step logs a warning and still returns
        the data.
        """
        name = method.__name__
        step_num = self.getter_to_step_map[name]
        step_term = self.terms[step_num]

        if step_term == -1:
            self.log_get_inconsistent_warning(name, step_num)
            return None

        if step_term != self.cur_term:
            self.log_get_stale_warning(name, step_num)
        return method(zephyr, *method_args, **method_kwargs)

    def guide_step(self, zephyr, method, *method_args, **method_kwargs):
        """Run ``method`` on ``zephyr`` under the workflow rules.

        A producer is valid when it belongs to step 0, is a set method, or
        its previous step is up to date; otherwise it is refused with a
        warning. Getters are routed through ``try_perform_getter_step``.
        A method that is not registered as producer or getter is simply
        executed after a warning.

        Returns:
            The return value of ``method``, or ``None`` when it was refused.
        """
        method_name = method.__name__

        if method_name in self.producer_to_step_map:
            next_step = self.producer_to_step_map[method_name]
            is_valid = (
                next_step == 0
                or method_name in self.set_methods
                or self.terms[next_step - 1] == self.cur_term
            )
            if is_valid:
                return self.try_perform_producer_step(
                    zephyr, method, *method_args, **method_kwargs)
            return self.try_perform_inconsistent_producer_step(
                zephyr, method, *method_args, **method_kwargs)

        if method_name in self.getter_to_step_map:
            return self.try_perform_getter_step(
                zephyr, method, *method_args, **method_kwargs)

        # Not part of the workflow: there is nothing to guide, so just run it.
        LOGGER.warning("Method %s does not need to be wrapped", method_name)
        return method(zephyr, *method_args, **method_kwargs)
""" - DEFAULT_PIPELINE = 'xgb_classifier' - def _get_mlpipeline(self): - pipeline = self._pipeline - if isinstance(pipeline, str) and os.path.isfile(pipeline): - with open(pipeline) as json_file: - pipeline = json.load(json_file) + def __init__(self): + """Initialize a new Zephyr instance.""" + self._entityset = None - mlpipeline = MLPipeline(pipeline) - if self._hyperparameters: - mlpipeline.set_hyperparameters(self._hyperparameters) + self._label_times = None + self._label_times_meta = None - return mlpipeline + self._label_col_name = "label" + self._feature_matrix = None + + self._pipeline = None + + self._X_train = None + self._X_test = None + self._y_train = None + self._y_test = None + + # tuple of 2 arrays: producers and attributes + step_order = [ + ([ + self.generate_entityset, self.set_entityset], [ + self.get_entityset]), ([ + self.generate_label_times, self.set_label_times], [ + self.get_label_times]), ([ + self.generate_feature_matrix, self.set_feature_matrix], [ + self.get_feature_matrix]), ([ + self.generate_train_test_split, self.set_train_test_split], [ + self.get_train_test_split]), ([ + self.fit_pipeline, self.set_fitted_pipeline], [ + self.get_fitted_pipeline]), ([ + self.predict, self.evaluate], [])] + set_methods = set([self.set_entityset.__name__, + self.set_label_times.__name__, + self.set_feature_matrix.__name__, + self.set_train_test_split.__name__, + self.set_fitted_pipeline.__name__]) + self.guide_handler = GuideHandler(step_order, set_methods) + + def GET_ENTITYSET_TYPES(self): + """Get the supported entityset types and their required dataframes/columns. + + Returns: + dict: A dictionary mapping entityset types (PI/SCADA/Vibrations) to their + descriptions and value. 
def GET_EVALUATION_METRICS(self):
    """Describe the default evaluation metrics.

    Returns:
        dict: One entry per name in ``DEFAULT_METRICS``. Each value is a
            dictionary holding the primitive loaded for that metric under
            ``"obj"`` and the ``"description"`` field of the primitive's
            metadata under ``"desc"``.
    """
    def describe(metric_name):
        # Load the primitive and read its description in one go so that the
        # metrics are processed strictly one after another.
        loaded = self._get_ml_primitive(metric_name)
        return {"obj": loaded, "desc": loaded.metadata["description"]}

    return {metric_name: describe(metric_name) for metric_name in DEFAULT_METRICS}
@guide
def set_entityset(self, entityset=None, es_type=None, entityset_path=None,
                  custom_kwargs_mapping=None):
    """Set the entityset for this Zephyr instance.

    The entityset is validated with the function registered for ``es_type``
    in ``VALIDATE_DATA_FUNCTIONS`` before it is stored.

    Args:
        entityset (featuretools.EntitySet, optional): An existing entityset to use.
        es_type (str): The type of entityset; must be a key of
            ``VALIDATE_DATA_FUNCTIONS``. Although it defaults to ``None``
            for signature compatibility, a value is required.
        entityset_path (str, optional): Path to a saved entityset to load. When
            given, the loaded entityset takes precedence over ``entityset``.
        custom_kwargs_mapping (dict, optional): Custom keyword arguments for validation.

    Raises:
        ValueError: If no entityset is provided through any of the parameters,
            or if ``es_type`` is missing or not a supported entityset type.
    """
    if entityset_path is not None:
        entityset = ft.read_entityset(entityset_path)

    if entityset is None:
        raise ValueError(
            "No entityset passed in. Please pass in an entityset object "
            "via the entityset parameter or an entityset path via the "
            "entityset_path parameter.")

    # Fail with an actionable message instead of a bare KeyError when the
    # type is omitted or misspelled.
    if es_type not in VALIDATE_DATA_FUNCTIONS:
        supported = ", ".join(str(key) for key in VALIDATE_DATA_FUNCTIONS)
        raise ValueError(
            f"Unrecognized es_type: {es_type!r}. "
            f"Supported entityset types are: {supported}.")

    validate_func = VALIDATE_DATA_FUNCTIONS[es_type]
    validate_func(entityset.dataframe_dict, custom_kwargs_mapping)

    self._entityset = entityset
@guide
def generate_label_times(
    self, labeling_fn, num_samples=-1, subset=None, column_map=None, verbose=False, thresh=None,
    window_size=None, minimum_data=None, maximum_data=None, gap=None, drop_empty=True, **kwargs
):
    """Generate label times using a labeling function.

    This method applies a labeling function to the entityset to generate labels at specific
    timestamps. The labeling function can be either a predefined one (specified by name) or
    a custom callable.

    Args:
        labeling_fn (callable or str): Either a custom labeling function or the
            name of a predefined function (e.g. 'brake_pad_presence').
            The function is called with the entityset and ``column_map`` and
            must return a tuple containing:
            1) A label generation function that processes data slices
            2) A denormalized dataframe containing the source data
            3) Metadata about the labeling process (e.g. target entity, time index)
        num_samples (int, optional): Number of samples to generate. -1 for all. Defaults to -1.
        subset (int or float, optional): Number (int) or fraction (non-integral
            float) of rows of the source dataframe to randomly select before
            labeling. An integral float such as ``2.0`` is treated as a count.
        column_map (dict, optional): Mapping of column names for the labeling function.
            Defaults to an empty mapping.
        verbose (bool, optional): Whether to display progress. Defaults to False.
        thresh (float, optional): Threshold for label binarization. If None, tries to
            use threshold value from labeling function metadata, if any.
        window_size (str, optional): Size of the window for label generation (e.g. '1h').
            If None, tries to use window size value from labeling function metadata, if any.
        minimum_data (str, optional): Minimum data required before cutoff time.
            If None, the value from the labeling function metadata is used, if any.
        maximum_data (str, optional): Maximum data required after cutoff time.
            If None, the value from the labeling function metadata is used, if any.
        gap (str, optional): Minimum gap between consecutive labels.
            If None, the value from the labeling function metadata is used, if any.
        drop_empty (bool, optional): Whether to drop windows with no events. Defaults to True.
        **kwargs: Accepted for backward compatibility. Only entries naming an
            explicit parameter of ``LabelMaker.search`` were ever considered;
            NOTE(review): they are not forwarded to the labeling function --
            confirm whether that is intended.

    Returns:
        tuple: (composeml.LabelTimes, dict) The generated label times and metadata.
            Label times contain the generated labels at specific timestamps.
            Metadata contains information about the labeling process.

    Raises:
        ValueError: If labeling_fn is a string but not a recognized predefined function.
        AssertionError: If entityset has not been generated or set or labeling_fn is
            not a string and not callable.
    """
    assert self._entityset is not None, "entityset has not been set"

    # A fresh dict per call: a shared ``{}`` default could leak state
    # between calls if a labeling function ever mutated it.
    if column_map is None:
        column_map = {}

    if isinstance(labeling_fn, str):  # get predefined labeling function
        labeling_fn_map = get_labeling_functions_map()
        if labeling_fn not in labeling_fn_map:
            raise ValueError(
                f"Unrecognized name argument: {labeling_fn}. "
                "Call GET_LABELING_FUNCTIONS to view predefined labeling functions"
            )
        labeling_fn = labeling_fn_map[labeling_fn]

    assert callable(labeling_fn), "Labeling function is not callable"

    labeling_function, df, meta = labeling_fn(self._entityset, column_map)

    data = df
    if isinstance(subset, float) and not subset.is_integer():
        # ``DataFrame.sample`` only accepts integers as ``n``; a fraction
        # has to go through ``frac``.
        data = data.sample(frac=subset)
    elif isinstance(subset, (int, float)):
        data = data.sample(n=int(subset))

    target_entity_index = meta.get("target_entity_index")
    time_index = meta.get("time_index")
    thresh = meta.get("thresh") if thresh is None else thresh
    window_size = meta.get(
        "window_size") if window_size is None else window_size

    label_maker = cp.LabelMaker(
        labeling_function=labeling_function,
        target_dataframe_name=target_entity_index,
        time_index=time_index,
        window_size=window_size,
    )

    # Every search option is passed exactly once. Explicit arguments win;
    # metadata only fills options that were left at ``None``. Passing the
    # filtered metadata as extra keywords next to the explicit ones would
    # raise "got multiple values for keyword argument".
    search_options = {
        "minimum_data": minimum_data,
        "maximum_data": maximum_data,
        "gap": gap,
        "drop_empty": drop_empty,
        "verbose": verbose,
    }
    accepted = set(getfullargspec(label_maker.search)[0])
    for key, value in {**meta, **kwargs}.items():
        if (key in search_options and key in accepted
                and value is not None and search_options[key] is None):
            search_options[key] = value

    label_times = label_maker.search(
        data.sort_values(time_index), num_samples, **search_options
    )
    if thresh is not None:
        label_times = label_times.threshold(thresh)

    self._label_times = label_times
    self._label_col_name = "label"
    self._label_times_meta = meta

    return label_times, meta
+ """ + assert (isinstance(label_times, cp.LabelTimes)) + self._label_times = label_times + self._label_col_name = label_col_name + self._label_times_meta = meta + + @guide + def get_label_times(self, visualize=False): + """Get the current label times. + + Args: + visualize (bool, optional): Whether to display a distribution plot. Defaults to False. + + Returns: + tuple: (composeml.LabelTimes, dict) The label times and metadata. + """ + if visualize: + cp.label_times.plots.LabelPlots(self._label_times).distribution() + return self._label_times, self._label_times_meta + + @guide + def generate_feature_matrix( + self, + target_dataframe_name=None, + instance_ids=None, + agg_primitives=None, + trans_primitives=None, + groupby_trans_primitives=None, + allowed_paths=None, + max_depth=2, + ignore_dataframes=None, + ignore_columns=None, + primitive_options=None, + seed_features=None, + drop_contains=None, + drop_exact=None, + where_primitives=None, + max_features=-1, + cutoff_time_in_index=False, + save_progress=None, + features_only=False, + training_window=None, + approximate=None, + chunk_size=None, + n_jobs=1, + dask_kwargs=None, + verbose=False, + return_types=None, + progress_callback=None, + include_cutoff_time=True, + add_interesting_values=False, + max_interesting_values=5, + interesting_dataframe_name=None, + interesting_values=None, + signal_dataframe_name=None, + signal_column=None, + signal_transformations=None, + signal_aggregations=None, + signal_window_size=None, + signal_replace_dataframe=False, + **sigpro_kwargs): + """Generate a feature matrix using automated feature engineering. + Note that this method creates a deepcopy + of the generated or set entityset within the Zephyr instance + before performing any signal processing or feature generation. + + Args: + target_dataframe_name (str, optional): Name of target entity for feature engineering. + instance_ids (list, optional): List of specific instances to generate features for. 
+ agg_primitives (list, optional): Aggregation primitives to apply. + trans_primitives (list, optional): Transform primitives to apply. + groupby_trans_primitives (list, optional): Groupby transform primitives to apply. + allowed_paths (list, optional): Allowed entity paths for feature generation. + max_depth (int, optional): Maximum allowed depth of entity relationships. + Defaults to 2. + ignore_dataframes (list, optional): Dataframes to ignore during feature generation. + ignore_columns (dict, optional): Columns to ignore per dataframe. + primitive_options (dict, optional): Options for specific primitives. + seed_features (list, optional): Seed features to begin with. + drop_contains (list, optional): Drop features containing these substrings. + drop_exact (list, optional): Drop features exactly matching these names. + where_primitives (list, optional): Primitives to use in where clauses. + max_features (int, optional): Maximum number of features to return. -1 for all. + cutoff_time_in_index (bool, optional): Include cutoff time in the index. + save_progress (str, optional): Path to save progress. + features_only (bool, optional): Return only features without calculating values. + training_window (str, optional): Data window to use for training. + approximate (str, optional): Approximation method to use. + chunk_size (int, optional): Size of chunks for parallel processing. + n_jobs (int, optional): Number of parallel jobs. Defaults to 1. + dask_kwargs (dict, optional): Arguments for dask computation. + verbose (bool, optional): Whether to display progress. Defaults to False. + return_types (list, optional): Types of features to return. + progress_callback (callable, optional): Callback for progress updates. + include_cutoff_time (bool, optional): Include cutoff time features. Defaults to True. + add_interesting_values (bool, optional): Add interesting values. Defaults to False. + max_interesting_values (int, optional): Maximum interesting values per column. 
+ interesting_dataframe_name (str, optional): Dataframe for interesting values. + interesting_values (dict, optional): Pre-defined interesting values. + signal_dataframe_name (str, optional): Name of dataframe containing signal data. + signal_column (str, optional): Name of column containing signal values. + signal_transformations (list, optional): Signal transformations to apply. + signal_aggregations (list, optional): Signal aggregations to apply. + signal_window_size (str, optional): Window size for signal processing. + signal_replace_dataframe (bool, optional): Replace original signal dataframe. + **sigpro_kwargs: Additional arguments for signal processing. + + Returns: + tuple: (pd.DataFrame, list, featuretools.EntitySet) + Feature matrix, feature definitions, and the processed entityset. + """ + entityset_copy = copy.deepcopy(self._entityset) + # perform signal processing + if signal_dataframe_name is not None and signal_column is not None: + # first make copy of entityset + if signal_transformations is None: + signal_transformations = [] + if signal_aggregations is None: + signal_aggregations = [] + process_signals( + entityset_copy, + signal_dataframe_name, + signal_column, + signal_transformations, + signal_aggregations, + signal_window_size, + signal_replace_dataframe, + **sigpro_kwargs) + + # add interesting values for where primitives + if add_interesting_values: + entityset_copy.add_interesting_values( + max_values=max_interesting_values, + verbose=verbose, + dataframe_name=interesting_dataframe_name, + values=interesting_values) + + feature_matrix, features = ft.dfs( + entityset=entityset_copy, cutoff_time=self._label_times, + target_dataframe_name=target_dataframe_name, + instance_ids=instance_ids, agg_primitives=agg_primitives, + trans_primitives=trans_primitives, + groupby_trans_primitives=groupby_trans_primitives, + allowed_paths=allowed_paths, max_depth=max_depth, + ignore_dataframes=ignore_dataframes, ignore_columns=ignore_columns, + 
primitive_options=primitive_options, seed_features=seed_features, + drop_contains=drop_contains, drop_exact=drop_exact, + where_primitives=where_primitives, max_features=max_features, + cutoff_time_in_index=cutoff_time_in_index, + save_progress=save_progress, features_only=features_only, + training_window=training_window, approximate=approximate, + chunk_size=chunk_size, n_jobs=n_jobs, + dask_kwargs=dask_kwargs, verbose=verbose, + return_types=return_types, progress_callback=progress_callback, + include_cutoff_time=include_cutoff_time + ) + self._feature_matrix = self._clean_feature_matrix( + feature_matrix, label_col_name=self._label_col_name) + self._features = features + + return self._feature_matrix, self._features, entityset_copy + + @guide + def get_feature_matrix(self): + """Get the current feature matrix. + + Returns: + tuple: (pd.DataFrame, str, list) The feature matrix, label column name, + and feature definitions. + """ + return self._feature_matrix, self._label_col_name, self._features + + @guide + def set_feature_matrix(self, feature_matrix, labels=None, label_col_name="label"): + """Set the feature matrix for this Zephyr instance. + + Args: + feature_matrix (pd.DataFrame): The feature matrix to use. + labels (array-like, optional): Labels to add to the feature matrix. + label_col_name (str, optional): Name of the label column. Defaults to "label". + """ + assert isinstance(feature_matrix, pd.DataFrame) and ( + labels is not None or + label_col_name in feature_matrix.columns + ) + if labels is not None: + feature_matrix[label_col_name] = labels + self._feature_matrix = self._clean_feature_matrix( + feature_matrix, label_col_name=label_col_name + ) + self._label_col_name = label_col_name - def fit(self, X: pd.DataFrame, y: Union[pd.Series, np.ndarray], - visual: bool = False, **kwargs): - """Fit the pipeline to the given data. 
+ @guide + def generate_train_test_split( + self, + test_size=None, + train_size=None, + random_state=None, + shuffle=True, + stratify=False, + ): + """Generate a train-test split of the feature matrix. Args: - X (DataFrame): - Input data, passed as a ``pandas.DataFrame`` containing - the feature matrix. - y (Series or ndarray): - Target data, passed as a ``pandas.Series`` or ``numpy.ndarray`` - containing the target values. - visual (bool): - If ``True``, capture the ``visual`` named output from the - ``MLPipeline`` and return it as an output. + test_size (float or int, optional): Proportion or absolute size of test set. + train_size (float or int, optional): Proportion or absolute size of training set. + random_state (int, optional): Random seed for reproducibility. + shuffle (bool, optional): Whether to shuffle before splitting. Defaults to True. + stratify (bool or list, optional): Whether to maintain label distribution. + If True, uses labels for stratification. If list, uses those columns. + Defaults to False. + + Returns: + tuple: (X_train, X_test, y_train, y_test) The split feature matrices and labels. + """ + feature_matrix = self._feature_matrix.copy() + labels = feature_matrix.pop(self._label_col_name) + + if not isinstance(stratify, list): + if stratify: + stratify = labels + else: + stratify = None + + X_train, X_test, y_train, y_test = train_test_split( + feature_matrix, + labels, + test_size=test_size, + train_size=train_size, + random_state=random_state, + shuffle=shuffle, + stratify=stratify, + ) + + self._X_train = X_train + self._X_test = X_test + self._y_train = y_train + self._y_test = y_test + + return X_train, X_test, y_train, y_test + + @guide + def set_train_test_split(self, X_train, X_test, y_train, y_test): + """Set the train-test split for this Zephyr instance. + + Args: + X_train (pd.DataFrame): Training features. + X_test (pd.DataFrame): Testing features. + y_train (array-like): Training labels. + y_test (array-like): Testing labels. 
+ """ + self._X_train = X_train + self._X_test = X_test + self._y_train = y_train + self._y_test = y_test + + @guide + def get_train_test_split(self): + """Get the current train-test split. + + Returns: + tuple or None: (X_train, X_test, y_train, y_test) if split exists, None otherwise. + """ + if (self._X_train is None or self._X_test is None or + self._y_train is None or self._y_test is None): + return None + return self._X_train, self._X_test, self._y_train, self._y_test + + @guide + def set_fitted_pipeline(self, pipeline): + """Set a fitted pipeline for this Zephyr instance. + + Args: + pipeline (MLPipeline): The fitted pipeline to use. + """ + self._pipeline = pipeline + + @guide + def fit_pipeline( + self, pipeline="xgb_classifier", pipeline_hyperparameters=None, + X=None, y=None, visual=False, **kwargs): + """Fit a machine learning pipeline. + + Args: + pipeline (str or dict or MLPipeline, optional): Pipeline to use. Can be: + - Name of a registered pipeline (default: "xgb_classifier") + - Path to a JSON pipeline specification + - Dictionary with pipeline specification + - MLPipeline instance + pipeline_hyperparameters (dict, optional): Hyperparameters for the pipeline. + X (pd.DataFrame, optional): Training features. If None, uses stored training set. + y (array-like, optional): Training labels. If None, uses stored training labels. + visual (bool, optional): Whether to return visualization data. Defaults to False. + **kwargs: Additional arguments passed to the pipeline's fit method. + + Returns: + dict or None: If visual=True, returns visualization data dictionary. 
""" - if not self._fitted: - self._mlpipeline = self._get_mlpipeline() + self._pipeline = self._get_mlpipeline( + pipeline, pipeline_hyperparameters) + + if X is None: + X = self._X_train + if y is None: + y = self._y_train if visual: outputs_spec, visual_names = self._get_outputs_spec(False) else: outputs_spec = None - outputs = self._mlpipeline.fit(X, y, output_=outputs_spec, **kwargs) - self._fitted = True + outputs = self._pipeline.fit(X, y, output_=outputs_spec, **kwargs) if visual and outputs is not None: return dict(zip(visual_names, outputs)) - def predict(self, X: pd.DataFrame, visual: bool = False, **kwargs) -> pd.Series: - """Predict the pipeline to the given data. + @guide + def get_fitted_pipeline(self): + """Get the current fitted pipeline. + + Returns: + MLPipeline: The current fitted pipeline. + """ + return self._pipeline + + @guide + def predict(self, X=None, visual=False, **kwargs): + """Make predictions using the fitted pipeline. Args: - X (DataFrame): - Input data, passed as a ``pandas.DataFrame`` containing - the feature matrix. - visual (bool): - If ``True``, capture the ``visual`` named output from the - ``MLPipeline`` and return it as an output. + X (pd.DataFrame, optional): Features to predict on. If None, uses test set. + visual (bool, optional): Whether to return visualization data. Defaults to False. + **kwargs: Additional arguments passed to the pipeline's predict method. Returns: - Series or ndarray: - Predictions to the input data. + array-like or tuple: Predictions, and if visual=True, also returns visualization data. 
""" + if X is None: + X = self._X_test if visual: outputs_spec, visual_names = self._get_outputs_spec() else: - outputs_spec = 'default' - - outputs = self._mlpipeline.predict(X, output_=outputs_spec, **kwargs) + outputs_spec = "default" + outputs = self._pipeline.predict(X, output_=outputs_spec, **kwargs) if visual and visual_names: prediction = outputs[0] return prediction, dict(zip(visual_names, outputs[-len(visual_names):])) return outputs - def fit_predict(self, X: pd.DataFrame, y: Union[pd.Series, np.ndarray], - **kwargs) -> pd.Series: - """Fit the pipeline to the data and then predict targets. - - This method is functionally equivalent to calling ``fit(X, y)`` - and later on ``predict(X)`` but with the difference that - here the ``MLPipeline`` is called only once, using its ``fit`` - method, and the output is directly captured without having - to execute the whole pipeline again during the ``predict`` phase. + @guide + def evaluate( + self, X=None, y=None, metrics=None, global_args=None, + local_args=None, global_mapping=None, local_mapping=None): + """Evaluate the fitted pipeline's performance. Args: - X (DataFrame): - Input data, passed as a ``pandas.DataFrame`` containing - the feature matrix. - y (Series or ndarray): - Target data, passed as a ``pandas.Series`` or ``numpy.ndarray`` - containing the target values. + X (pd.DataFrame, optional): Features to evaluate on. If None, uses test set. + y (array-like, optional): True labels. If None, uses test labels. + metrics (list, optional): Metrics to compute. If None, uses DEFAULT_METRICS. + global_args (dict, optional): Arguments passed to all metrics. + local_args (dict, optional): Arguments passed to specific metrics. + global_mapping (dict, optional): Mapping applied to all metric inputs. + local_mapping (dict, optional): Mapping applied to specific metric inputs. Returns: - Series or ndarray: - Predictions to the input data. + dict: A dictionary mapping metric names to their computed values. 
""" - if not self._fitted: - self._mlpipeline = self._get_mlpipeline() + if X is None: + X = self._X_test + if y is None: + y = self._y_test - result = self._mlpipeline.fit(X, y, output_='default', **kwargs) - self._fitted = True + final_context = self._pipeline.predict(X, output_=-1) - return result + # remap items, if any + if global_mapping is not None: + for cur, new in global_mapping.items(): + if cur in final_context: + cur_item = final_context.pop(cur) + final_context[new] = cur_item - def evaluate(self, X: pd.DataFrame, y: Union[pd.Series, np.ndarray], fit: bool = False, - train_X: pd.DataFrame = None, train_y: Union[pd.Series, np.ndarray] = None, - metrics: List[str] = METRICS) -> pd.Series: - """Evaluate the performance of the pipeline. + if metrics is None: + metrics = DEFAULT_METRICS - Args: - X (DataFrame): - Input data, passed as a ``pandas.DataFrame`` containing - the feature matrix. - y (Series or ndarray): - Target data, passed as a ``pandas.Series`` or ``numpy.ndarray`` - containing the target values. - fit (bool): - Whether to fit the pipeline before evaluating it. - Defaults to ``False``. - train_X (DataFrame): - Training data, passed as a ``pandas.DataFrame`` containing - the feature matrix. - If not given, the pipeline is fitted on ``X``. - train_y (Series or ndarray): - Target data used for training, passed as a ``pandas.Series`` or - ``numpy.ndarray`` containing the target values. - metrics (list): - List of metrics to used passed as a list of strings. - If not given, it defaults to all the metrics. + if global_args is None: + global_args = {} - Returns: - Series: - ``pandas.Series`` containing one element for each - metric applied, with the metric name as index. 
- """ - if not fit: - method = self._mlpipeline.predict - else: - if not self._fitted: - mlpipeline = self._get_mlpipeline() - else: - mlpipeline = self._mlpipeline + if local_args is None: + local_args = {} - if train_X is not None and train_y is not None: - # fit first and then predict - mlpipeline.fit(train_X, train_y) - method = mlpipeline.predict - else: - # fit and predict at once - method = partial(mlpipeline.fit, y=y, output_='default') + if local_mapping is None: + local_mapping = {} + + results = {} + for metric in metrics: + try: + metric_primitive = self._get_ml_primitive(metric) - result = method(X) + if metric in local_mapping: + metric_context = {} + metric_mapping = local_mapping[metric] + for cur, item in final_context.items(): + new = metric_mapping.get(cur, cur) + metric_context[new] = item + else: + metric_context = final_context - scores = { - metric: METRICS[metric](y, result) - for metric in metrics + if metric in local_args: + metric_args = local_args[metric] + else: + metric_args = {} + + res = metric_primitive.produce( + y_true=self._y_test, **metric_context, **metric_args) + results[metric_primitive.name] = res + except Exception as e: + LOGGER.error( + f"Unable to run evaluation metric: {metric_primitive.name}", + exc_info=e) + self._results = results + return results + + def _clean_feature_matrix(self, feature_matrix, label_col_name="label"): + labels = feature_matrix.pop(label_col_name) + + count_cols = feature_matrix.filter(like="COUNT").columns + feature_matrix[count_cols] = feature_matrix[count_cols].apply( + lambda x: x.astype(np.int64) + ) + + string_cols = feature_matrix.select_dtypes(include="category").columns + feature_matrix = pd.get_dummies(feature_matrix, columns=string_cols) + + feature_matrix[label_col_name] = labels + + return feature_matrix + + def _get_mlpipeline(self, pipeline, hyperparameters=None): + if isinstance(pipeline, str) and os.path.isfile(pipeline): + with open(pipeline) as json_file: + pipeline = 
json.load(json_file) + + mlpipeline = MLPipeline(pipeline) + if hyperparameters: + mlpipeline.set_hyperparameters(hyperparameters) + + return mlpipeline + + def _get_ml_primitive(self, primitive, hyperparameters=None): + if isinstance(primitive, str) and os.path.isfile(primitive): + with open(primitive) as json_file: + primitive = json.load(json_file) + mlprimitive = MLBlock(primitive) + + if hyperparameters: + mlprimitive.set_hyperparameters(hyperparameters) + return mlprimitive + + def _get_outputs_spec(self, default=True): + outputs_spec = ["default"] if default else [] + + try: + visual_names = self._pipeline.get_output_names("visual") + outputs_spec.append("visual") + except ValueError: + visual_names = [] + + return outputs_spec, visual_names + + +if __name__ == "__main__": + obj = Zephyr() + print(obj.GET_EVALUATION_METRICS()) + alarms_df = pd.DataFrame( + { + "COD_ELEMENT": [0, 0], + "DAT_START": [ + pd.Timestamp("2022-01-01 00:00:00"), + pd.Timestamp("2022-03-01 11:12:13"), + ], + "DAT_END": [ + pd.Timestamp("2022-01-01 13:00:00"), + pd.Timestamp("2022-03-02 11:12:13"), + ], + "IND_DURATION": [0.5417, 1.0], + "COD_ALARM": [12345, 98754], + "COD_ALARM_INT": [12345, 98754], + "DES_NAME": ["Alarm1", "Alarm2"], + "DES_TITLE": ["Description of alarm 1", "Description of alarm 2"], + } + ) + stoppages_df = pd.DataFrame( + { + "COD_ELEMENT": [0, 0], + "DAT_START": [ + pd.Timestamp("2022-01-01 00:00:00"), + pd.Timestamp("2022-03-01 11:12:13"), + ], + "DAT_END": [ + pd.Timestamp("2022-01-08 11:07:17"), + pd.Timestamp("2022-03-01 17:00:13"), + ], + "DES_WO_NAME": ["stoppage name 1", "stoppage name 2"], + "DES_COMMENTS": ["description of stoppage 1", "description of stoppage 2"], + "COD_WO": [12345, 67890], + "IND_DURATION": [7.4642, 0.2417], + "IND_LOST_GEN": [45678.0, 123.0], + "COD_ALARM": [12345, 12345], + "COD_CAUSE": [32, 48], + "COD_INCIDENCE": [987654, 123450], + "COD_ORIGIN": [6, 23], + "COD_STATUS": ["STOP", "PAUSE"], + "COD_CODE": ["ABC", "XYZ"], + 
"DES_DESCRIPTION": ["Description 1", "Description 2"], + } + ) + notifications_df = pd.DataFrame( + { + "COD_ELEMENT": [0, 0], + "COD_ORDER": [12345, 67890], + "IND_QUANTITY": [1, -20], + "COD_MATERIAL_SAP": [36052411, 67890], + "DAT_POSTING": [ + pd.Timestamp("2022-01-01 00:00:00"), + pd.Timestamp("2022-03-01 00:00:00"), + ], + "COD_MAT_DOC": [77889900, 12345690], + "DES_MEDIUM": [ + "Description of notification 1", + "Description of notification 2", + ], + "COD_NOTIF": [567890123, 32109877], + "DAT_MALF_START": [ + pd.Timestamp("2021-12-25 18:07:10"), + pd.Timestamp("2022-02-28 06:04:00"), + ], + "DAT_MALF_END": [ + pd.Timestamp("2022-01-08 11:07:17"), + pd.Timestamp("2022-03-01 17:00:13"), + ], + "IND_BREAKDOWN_DUR": [14.1378, 2.4792], + "FUNCT_LOC_DES": ["location description 1", "location description 2"], + "COD_ALARM": [12345, 12345], + "DES_ALARM": ["Alarm description", "Alarm description"], + } + ) + work_orders_df = pd.DataFrame( + { + "COD_ELEMENT": [0, 0], + "COD_ORDER": [12345, 67890], + "DAT_BASIC_START": [ + pd.Timestamp("2022-01-01 00:00:00"), + pd.Timestamp("2022-03-01 00:00:00"), + ], + "DAT_BASIC_END": [ + pd.Timestamp("2022-01-09 00:00:00"), + pd.Timestamp("2022-03-02 00:00:00"), + ], + "COD_EQUIPMENT": [98765, 98765], + "COD_MAINT_PLANT": ["ABC", "ABC"], + "COD_MAINT_ACT_TYPE": ["XYZ", "XYZ"], + "COD_CREATED_BY": ["A1234", "B6789"], + "COD_ORDER_TYPE": ["A", "B"], + "DAT_REFERENCE": [ + pd.Timestamp("2022-01-01 00:00:00"), + pd.Timestamp("2022-03-01 00:00:00"), + ], + "DAT_CREATED_ON": [ + pd.Timestamp("2022-03-01 00:00:00"), + pd.Timestamp("2022-04-18 00:00:00"), + ], + "DAT_VALID_END": [pd.NaT, pd.NaT], + "DAT_VALID_START": [pd.NaT, pd.NaT], + "COD_SYSTEM_STAT": ["ABC XYZ", "LMN OPQ"], + "DES_LONG": ["description of work order", "description of work order"], + "COD_FUNCT_LOC": ["!12345", "?09876"], + "COD_NOTIF_OBJ": ["00112233", "00998877"], + "COD_MAINT_ITEM": ["", "019283"], + "DES_MEDIUM": ["short description", "short description"], + 
"DES_FUNCT_LOC": ["XYZ1234", "ABC9876"], + } + ) + turbines_df = pd.DataFrame( + { + "COD_ELEMENT": [0], + "TURBINE_PI_ID": ["TA00"], + "TURBINE_LOCAL_ID": ["A0"], + "TURBINE_SAP_COD": ["LOC000"], + "DES_CORE_ELEMENT": ["T00"], + "SITE": ["LOCATION"], + "DES_CORE_PLANT": ["LOC"], + "COD_PLANT_SAP": ["ABC"], + "PI_COLLECTOR_SITE_NAME": ["LOC0"], + "PI_LOCAL_SITE_NAME": ["LOC0"], + } + ) + pidata_df = pd.DataFrame( + { + "time": [ + pd.Timestamp("2022-01-02 13:21:01"), + pd.Timestamp("2022-03-08 13:21:01"), + ], + "COD_ELEMENT": [0, 0], + "val1": [9872.0, 559.0], + "val2": [10.0, -7.0], } + ) - return pd.Series(scores) + # obj.create_entityset( + # { + # "alarms": alarms_df, + # "stoppages": stoppages_df, + # "notifications": notifications_df, + # "work_orders": work_orders_df, + # "turbines": turbines_df, + # "pidata": pidata_df, + # }, + # "pidata", + # ) - def save(self, path: str): - """Save this object using pickle. + # obj.set_entityset(entityset_path = + # "/Users/raymondpan/zephyr/Zephyr-repo/brake_pad_es", es_type = 'scada') - Args: - path (str): - Path to the file where the serialization of - this object will be stored. - """ - os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, 'wb') as pickle_file: - pickle.dump(self, pickle_file) + # obj.set_labeling_function(name="brake_pad_presence") - @classmethod - def load(cls, path: str): - """Load an Zephyr instance from a pickle file. + # obj.generate_label_times(labeling_fn="brake_pad_presence", + # num_samples=10, gap="20d") + # # print(obj.get_label_times()) - Args: - path (str): - Path to the file where the instance has been - previously serialized. + # obj.generate_feature_matrix_and_labels( + # target_dataframe_name="turbines", + # cutoff_time_in_index=True, + # agg_primitives=["count", "sum", "max"], + # verbose = True + # ) - Returns: - Orion + # print(obj.get_feature_matrix_and_labels) - Raises: - ValueError: - If the serialized object is not a Zephyr instance. 
- """ - with open(path, 'rb') as pickle_file: - zephyr = pickle.load(pickle_file) - if not isinstance(zephyr, cls): - raise ValueError('Serialized object is not a Zephyr instance') + # obj.generate_train_test_split() + # add_primitives_path( + # path="/Users/raymondpan/zephyr/Zephyr-repo/zephyr_ml/primitives/jsons" + # ) + # obj.set_and_fit_pipeline() - return zephyr + # obj.evaluate() diff --git a/zephyr_ml/core_prev.py b/zephyr_ml/core_prev.py new file mode 100644 index 0000000..ca16567 --- /dev/null +++ b/zephyr_ml/core_prev.py @@ -0,0 +1,285 @@ +"""Zephyr Core module. + +This module defines the Zephyr Class, which is responsible for the +model training and inference with the underlying MLBlocks pipelines. +""" + +import json +import logging +import os +import pickle +from functools import partial +from typing import List, Union + +import numpy as np +import pandas as pd +from mlblocks import MLPipeline +from sklearn import metrics + +LOGGER = logging.getLogger(__name__) + + +_REGRESSION_METRICS = { + "mae": metrics.mean_absolute_error, + "mse": metrics.mean_squared_error, + "r2": metrics.r2_score, +} + +_CLASSIFICATION_METRICS = { + "accuracy": metrics.accuracy_score, + "f1": metrics.f1_score, + "recall": metrics.recall_score, + "precision": metrics.precision_score, +} + +METRICS = _CLASSIFICATION_METRICS + + +class Zephyr: + """Zephyr Class. + + The Zephyr Class provides the main machine learning pipeline functionalities + of Zephyr and is responsible for the interaction with the underlying + MLBlocks pipelines. + + Args: + pipeline (str, dict or MLPipeline): + Pipeline to use. It can be passed as: + * An ``str`` with a path to a JSON file. + * An ``str`` with the name of a registered pipeline. + * An ``MLPipeline`` instance. + * A ``dict`` with an ``MLPipeline`` specification. + hyperparameters (dict): + Additional hyperparameters to set to the Pipeline. 
+ """ + + DEFAULT_PIPELINE = "xgb_classifier" + + def _get_mlpipeline(self): + pipeline = self._pipeline + if isinstance(pipeline, str) and os.path.isfile(pipeline): + with open(pipeline) as json_file: + pipeline = json.load(json_file) + + mlpipeline = MLPipeline(pipeline) + if self._hyperparameters: + mlpipeline.set_hyperparameters(self._hyperparameters) + + return mlpipeline + + def __init__( + self, + pipeline: Union[str, dict, MLPipeline] = None, + hyperparameters: dict = None, + ): + self._pipeline = pipeline or self.DEFAULT_PIPELINE + self._hyperparameters = hyperparameters + self._mlpipeline = self._get_mlpipeline() + self._fitted = False + + def __eq__(self, other): + return ( + isinstance(other, self.__class__) + and self._pipeline == other._pipeline + and self._hyperparameters == other._hyperparameters + and self._fitted == other._fitted + ) + + def _get_outputs_spec(self, default=True): + outputs_spec = ["default"] if default else [] + + try: + visual_names = self._mlpipeline.get_output_names("visual") + outputs_spec.append("visual") + except ValueError: + visual_names = [] + + return outputs_spec, visual_names + + def fit( + self, + X: pd.DataFrame, + y: Union[pd.Series, np.ndarray], + visual: bool = False, + **kwargs + ): + """Fit the pipeline to the given data. + + Args: + X (DataFrame): + Input data, passed as a ``pandas.DataFrame`` containing + the feature matrix. + y (Series or ndarray): + Target data, passed as a ``pandas.Series`` or ``numpy.ndarray`` + containing the target values. + visual (bool): + If ``True``, capture the ``visual`` named output from the + ``MLPipeline`` and return it as an output. 
+ """ + if not self._fitted: + self._mlpipeline = self._get_mlpipeline() + + if visual: + outputs_spec, visual_names = self._get_outputs_spec(False) + else: + outputs_spec = None + + outputs = self._mlpipeline.fit(X, y, output_=outputs_spec, **kwargs) + self._fitted = True + + if visual and outputs is not None: + return dict(zip(visual_names, outputs)) + + def predict(self, X: pd.DataFrame, visual: bool = False, **kwargs) -> pd.Series: + """Predict the pipeline to the given data. + + Args: + X (DataFrame): + + Input data, passed as a ``pandas.DataFrame`` containing + the feature matrix. + visual (bool): + If ``True``, capture the ``visual`` named output from the + ``MLPipeline`` and return it as an output. + + Returns: + Series or ndarray: + Predictions to the input data. + """ + if visual: + outputs_spec, visual_names = self._get_outputs_spec() + else: + outputs_spec = "default" + + outputs = self._mlpipeline.predict(X, output_=outputs_spec, **kwargs) + + if visual and visual_names: + prediction = outputs[0] + return prediction, dict(zip(visual_names, outputs[-len(visual_names):])) + + return outputs + + def fit_predict( + self, X: pd.DataFrame, y: Union[pd.Series, np.ndarray], **kwargs + ) -> pd.Series: + """Fit the pipeline to the data and then predict targets. + + This method is functionally equivalent to calling ``fit(X, y)`` + and later on ``predict(X)`` but with the difference that + here the ``MLPipeline`` is called only once, using its ``fit`` + method, and the output is directly captured without having + to execute the whole pipeline again during the ``predict`` phase. + + Args: + X (DataFrame): + Input data, passed as a ``pandas.DataFrame`` containing + the feature matrix. + y (Series or ndarray): + Target data, passed as a ``pandas.Series`` or ``numpy.ndarray`` + containing the target values. + + Returns: + Series or ndarray: + Predictions to the input data. 
+ """ + if not self._fitted: + self._mlpipeline = self._get_mlpipeline() + + result = self._mlpipeline.fit(X, y, output_="default", **kwargs) + self._fitted = True + + return result + + def evaluate( + self, + X: pd.DataFrame, + y: Union[pd.Series, np.ndarray], + fit: bool = False, + train_X: pd.DataFrame = None, + train_y: Union[pd.Series, np.ndarray] = None, + metrics: List[str] = METRICS, + ) -> pd.Series: + """Evaluate the performance of the pipeline. + + Args: + X (DataFrame): + Input data, passed as a ``pandas.DataFrame`` containing + the feature matrix. + y (Series or ndarray): + Target data, passed as a ``pandas.Series`` or ``numpy.ndarray`` + containing the target values. + fit (bool): + Whether to fit the pipeline before evaluating it. + Defaults to ``False``. + train_X (DataFrame): + Training data, passed as a ``pandas.DataFrame`` containing + the feature matrix. + If not given, the pipeline is fitted on ``X``. + train_y (Series or ndarray): + Target data used for training, passed as a ``pandas.Series`` or + ``numpy.ndarray`` containing the target values. + metrics (list): + List of metrics to used passed as a list of strings. + If not given, it defaults to all the metrics. + + Returns: + Series: + ``pandas.Series`` containing one element for each + metric applied, with the metric name as index. + """ + if not fit: + method = self._mlpipeline.predict + else: + if not self._fitted: + mlpipeline = self._get_mlpipeline() + else: + mlpipeline = self._mlpipeline + + if train_X is not None and train_y is not None: + # fit first and then predict + mlpipeline.fit(train_X, train_y) + method = mlpipeline.predict + else: + # fit and predict at once + method = partial(mlpipeline.fit, y=y, output_="default") + + result = method(X) + + scores = {metric: METRICS[metric](y, result) for metric in metrics} + + return pd.Series(scores) + + def save(self, path: str): + """Save this object using pickle. 
+ + Args: + path (str): + Path to the file where the serialization of + this object will be stored. + """ + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "wb") as pickle_file: + pickle.dump(self, pickle_file) + + @classmethod + def load(cls, path: str): + """Load an Zephyr instance from a pickle file. + + Args: + path (str): + Path to the file where the instance has been + previously serialized. + + Returns: + Orion + + Raises: + ValueError: + If the serialized object is not a Zephyr instance. + """ + with open(path, "rb") as pickle_file: + zephyr = pickle.load(pickle_file) + if not isinstance(zephyr, cls): + raise ValueError("Serialized object is not a Zephyr instance") + + return zephyr diff --git a/zephyr_ml/entityset.py b/zephyr_ml/entityset.py index 3d14c66..514c325 100644 --- a/zephyr_ml/entityset.py +++ b/zephyr_ml/entityset.py @@ -5,170 +5,192 @@ from zephyr_ml.metadata import get_mapped_kwargs -def _create_entityset(entities, es_type, es_kwargs): - # filter out stated logical types for missing columns - for entity, df in entities.items(): - es_kwargs[entity]['logical_types'] = { - col: t for col, t in es_kwargs[entity]['logical_types'].items() - if col in df.columns - } - - turbines_index = es_kwargs['turbines']['index'] - work_orders_index = es_kwargs['work_orders']['index'] - - relationships = [ - ('turbines', turbines_index, 'alarms', turbines_index), - ('turbines', turbines_index, 'stoppages', turbines_index), - ('turbines', turbines_index, 'work_orders', turbines_index), - ('turbines', turbines_index, es_type, turbines_index), - ('work_orders', work_orders_index, 'notifications', work_orders_index) - ] - - es = ft.EntitySet() - - for name, df in entities.items(): - es.add_dataframe( - dataframe_name=name, - dataframe=df, - **es_kwargs[name] - ) - - for relationship in relationships: - parent_df, parent_column, child_df, child_column = relationship - es.add_relationship(parent_df, parent_column, child_df, child_column) - - return 
es - - -def create_pidata_entityset(dfs, new_kwargs_mapping=None): - '''Generate an entityset for PI data datasets - - Args: - data_paths (dict): Dictionary mapping entity names ('alarms', 'notifications', - 'stoppages', 'work_orders', 'pidata', 'turbines') to the pandas dataframe for - that entity. - **kwargs: Updated keyword arguments to be used during entityset creation - ''' - entity_kwargs = get_mapped_kwargs('pidata', new_kwargs_mapping) - _validate_data(dfs, 'pidata', entity_kwargs) - - es = _create_entityset(dfs, 'pidata', entity_kwargs) - es.id = 'PI data' - - return es - - -def create_scada_entityset(dfs, new_kwargs_mapping=None): - '''Generate an entityset for SCADA data datasets - - Args: - data_paths (dict): Dictionary mapping entity names ('alarms', 'notifications', - 'stoppages', 'work_orders', 'scada', 'turbines') to the pandas dataframe for - that entity. - ''' - entity_kwargs = get_mapped_kwargs('scada', new_kwargs_mapping) - _validate_data(dfs, 'scada', entity_kwargs) - - es = _create_entityset(dfs, 'scada', entity_kwargs) - es.id = 'SCADA data' - - return es - - -def create_vibrations_entityset(dfs, new_kwargs_mapping=None): - '''Generate an entityset for Vibrations data datasets - - Args: - data_paths (dict): Dictionary mapping entity names ('alarms', 'notifications', - 'stoppages', 'work_orders', 'vibrations', 'turbines') to the pandas - dataframe for that entity. Optionally 'pidata' and 'scada' can be included. 
- ''' - entities = ['vibrations'] - - pidata_kwargs, scada_kwargs = {}, {} - if 'pidata' in dfs: - pidata_kwargs = get_mapped_kwargs('pidata', new_kwargs_mapping) - entities.append('pidata') - if 'scada' in dfs: - pidata_kwargs = get_mapped_kwargs('scada', new_kwargs_mapping) - entities.append('scada') - - entity_kwargs = { - **pidata_kwargs, - **scada_kwargs, - **get_mapped_kwargs('vibrations', new_kwargs_mapping), - } - _validate_data(dfs, entities, entity_kwargs) - - es = _create_entityset(dfs, 'vibrations', entity_kwargs) - es.id = 'Vibrations data' - - return es - - def _validate_data(dfs, es_type, es_kwargs): - '''Validate data by checking for required columns in each entity - ''' + """Validate data by checking for required columns in each entity""" if not isinstance(es_type, list): es_type = [es_type] - entities = set(chain( - ['alarms', 'stoppages', 'work_orders', 'notifications', 'turbines', *es_type] - )) + entities = set( + chain( + [ + "alarms", + "stoppages", + "work_orders", + "notifications", + "turbines", + *es_type, + ] + ) + ) if set(dfs.keys()) != entities: missing = entities.difference(set(dfs.keys())) extra = set(dfs.keys()).difference(entities) msg = [] if missing: - msg.append('Missing dataframes for entities {}.'.format(', '.join(missing))) + msg.append("Missing dataframes for entities {}.".format( + ", ".join(missing))) if extra: - msg.append('Unrecognized entities {} included in dfs.'.format(', '.join(extra))) + msg.append( + "Unrecognized entities {} included in dfs.".format( + ", ".join(extra)) + ) - raise ValueError(' '.join(msg)) + raise ValueError(" ".join(msg)) - turbines_index = es_kwargs['turbines']['index'] - work_orders_index = es_kwargs['work_orders']['index'] + turbines_index = es_kwargs["turbines"]["index"] + work_orders_index = es_kwargs["work_orders"]["index"] - if work_orders_index not in dfs['work_orders'].columns: + if work_orders_index not in dfs["work_orders"].columns: raise ValueError( - 'Expected index column "{}" 
missing from work_orders entity'.format(work_orders_index)) + 'Expected index column "{}" missing from work_orders entity'.format( + work_orders_index + ) + ) - if work_orders_index not in dfs['notifications'].columns: + if work_orders_index not in dfs["notifications"].columns: raise ValueError( - 'Expected column "{}" missing from notifications entity'.format(work_orders_index)) + 'Expected column "{}" missing from notifications entity'.format( + work_orders_index + ) + ) - if not dfs['work_orders'][work_orders_index].is_unique: - raise ValueError('Expected index column "{}" of work_orders entity is not ' - 'unique'.format(work_orders_index)) + if not dfs["work_orders"][work_orders_index].is_unique: + raise ValueError( + 'Expected index column "{}" of work_orders entity is not ' + "unique".format(work_orders_index) + ) - if turbines_index not in dfs['turbines'].columns: + if turbines_index not in dfs["turbines"].columns: raise ValueError( - 'Expected index column "{}" missing from turbines entity'.format(turbines_index)) + 'Expected index column "{}" missing from turbines entity'.format( + turbines_index + ) + ) - if not dfs['turbines'][turbines_index].is_unique: + if not dfs["turbines"][turbines_index].is_unique: raise ValueError( - 'Expected index column "{}" of turbines entity is not unique.'.format(turbines_index)) + 'Expected index column "{}" of turbines entity is not unique.'.format( + turbines_index + ) + ) for entity, df in dfs.items(): if turbines_index not in df.columns: raise ValueError( 'Turbines index column "{}" missing from data for {} entity'.format( - turbines_index, entity)) + turbines_index, entity + ) + ) - time_index = es_kwargs[entity].get('time_index', False) + time_index = es_kwargs[entity].get("time_index", False) if time_index and time_index not in df.columns: raise ValueError( 'Missing time index column "{}" from {} entity'.format( - time_index, entity)) + time_index, entity + ) + ) - secondary_time_indices = 
es_kwargs[entity].get('secondary_time_index', {}) + secondary_time_indices = es_kwargs[entity].get( + "secondary_time_index", {}) for time_index, cols in secondary_time_indices.items(): if time_index not in df.columns: raise ValueError( 'Secondary time index "{}" missing from {} entity'.format( - time_index, entity)) + time_index, entity + ) + ) for col in cols: if col not in df.columns: - raise ValueError(('Column "{}" associated with secondary time index "{}" ' - 'missing from {} entity').format(col, time_index, entity)) + raise ValueError( + ( + 'Column "{}" associated with secondary time index "{}" ' + "missing from {} entity" + ).format(col, time_index, entity) + ) + + +def validate_scada_data(dfs, new_kwargs_mapping=None): + """ + SCADA data is signal data from the Original Equipment Manufacturer Supervisory Control + And Data Acquisition (OEM-SCADA) system, a signal data source. + """ + entity_kwargs = get_mapped_kwargs("scada", new_kwargs_mapping) + _validate_data(dfs, "scada", entity_kwargs) + return entity_kwargs + + +def validate_pidata_data(dfs, new_kwargs_mapping=None): + """ + PI data is signal data from the operator's historical Plant Information (PI) system. + """ + entity_kwargs = get_mapped_kwargs("pidata", new_kwargs_mapping) + _validate_data(dfs, "pidata", entity_kwargs) + return entity_kwargs + + +def validate_vibrations_data(dfs, new_kwargs_mapping=None): + """ + Vibrations data is vibrations data collected on Planetary gearboxes in turbines. 
+ """ + entities = ["vibrations"] + + pidata_kwargs, scada_kwargs = {}, {} + if "pidata" in dfs: + pidata_kwargs = get_mapped_kwargs("pidata", new_kwargs_mapping) + entities.append("pidata") + if "scada" in dfs: + scada_kwargs = get_mapped_kwargs("scada", new_kwargs_mapping) + entities.append("scada") + + entity_kwargs = { + **pidata_kwargs, + **scada_kwargs, + **get_mapped_kwargs("vibrations", new_kwargs_mapping), + } + _validate_data(dfs, entities, entity_kwargs) + return entity_kwargs + + +VALIDATE_DATA_FUNCTIONS = { + "scada": validate_scada_data, + "pidata": validate_pidata_data, + "vibrations": validate_vibrations_data, +} + + +def _create_entityset(entities, es_type, new_kwargs_mapping=None): + + validate_func = VALIDATE_DATA_FUNCTIONS[es_type] + es_kwargs = validate_func(entities, new_kwargs_mapping) + + # filter out stated logical types for missing columns + for entity, df in entities.items(): + es_kwargs[entity]["logical_types"] = { + col: t + for col, t in es_kwargs[entity]["logical_types"].items() + if col in df.columns + } + + turbines_index = es_kwargs["turbines"]["index"] + work_orders_index = es_kwargs["work_orders"]["index"] + + relationships = [ + ("turbines", turbines_index, "alarms", turbines_index), + ("turbines", turbines_index, "stoppages", turbines_index), + ("turbines", turbines_index, "work_orders", turbines_index), + ("turbines", turbines_index, es_type, turbines_index), + ("work_orders", work_orders_index, "notifications", work_orders_index), + ] + + es = ft.EntitySet() + es.id = es_type + + for name, df in entities.items(): + es.add_dataframe(dataframe_name=name, dataframe=df, **es_kwargs[name]) + + for relationship in relationships: + parent_df, parent_column, child_df, child_column = relationship + es.add_relationship(parent_df, parent_column, child_df, child_column) + + return es diff --git a/zephyr_ml/feature_engineering.py b/zephyr_ml/feature_engineering.py index a9bc3fd..3310e32 100644 --- a/zephyr_ml/feature_engineering.py +++ 
b/zephyr_ml/feature_engineering.py @@ -2,7 +2,7 @@ def process_signals(es, signal_dataframe_name, signal_column, transformations, aggregations, - window_size, replace_dataframe=False, **kwargs): + window_size=None, replace_dataframe=False, **kwargs): ''' Process signals using SigPro. @@ -38,7 +38,8 @@ def process_signals(es, signal_dataframe_name, signal_column, transformations, a old_relationship = relationship groupby_index = relationship.child_column.name - pipeline = SigPro(transformations, aggregations, values_column_name=signal_column, **kwargs) + pipeline = SigPro(transformations, aggregations, + values_column_name=signal_column, **kwargs) processed_df, f_cols = pipeline.process_signal( signal_df, diff --git a/zephyr_ml/labeling/__init__.py b/zephyr_ml/labeling/__init__.py index 018f408..60cb5d7 100644 --- a/zephyr_ml/labeling/__init__.py +++ b/zephyr_ml/labeling/__init__.py @@ -7,8 +7,9 @@ brake_pad_presence, converter_replacement_presence, gearbox_replace_presence, - total_power_loss + total_power_loss, ] + UTIL_FUNCTIONS = [ utils.aggregate_by_column, utils.categorical_presence, @@ -23,8 +24,17 @@ def get_labeling_functions(): functions = {} for function in LABELING_FUNCTIONS: name = function.__name__ - functions[name] = function.__doc__.split('\n')[0] + functions[name] = {"obj": function, "desc": function.__doc__.split("\n")[ + 0]} + + return functions + +def get_labeling_functions_map(): + functions = {} + for function in LABELING_FUNCTIONS: + name = function.__name__ + functions[name] = function return functions @@ -32,7 +42,7 @@ def get_helper_functions(): functions = {} for function in UTIL_FUNCTIONS: name = function.__name__ - functions[name] = function.__doc__.split('\n')[0] + functions[name] = function.__doc__.split("\n")[0] return functions diff --git a/zephyr_ml/labeling/labeling_functions/brake_pad_presence.py b/zephyr_ml/labeling/labeling_functions/brake_pad_presence.py index bcaf7ee..fe32c33 100644 --- 
a/zephyr_ml/labeling/labeling_functions/brake_pad_presence.py +++ b/zephyr_ml/labeling/labeling_functions/brake_pad_presence.py @@ -36,7 +36,8 @@ def label(ds, **kwargs): a = ds[comments] a = a.fillna('') a = a.str.lower() - f = any(a.apply(lambda d: ('brake' in d) and ('pad' in d) and ('yaw' not in d))) + f = any(a.apply(lambda d: ('brake' in d) + and ('pad' in d) and ('yaw' not in d))) return f meta = { diff --git a/zephyr_ml/labeling/labeling_functions/planet_bearing.py b/zephyr_ml/labeling/labeling_functions/planet_bearing.py index 283e5cd..36a5412 100644 --- a/zephyr_ml/labeling/labeling_functions/planet_bearing.py +++ b/zephyr_ml/labeling/labeling_functions/planet_bearing.py @@ -35,7 +35,8 @@ def gearbox_replace_presence(es, column_map={}): def label(ds, **kwargs): label_strings = ['Gearbox replace*', 'Gearbox exchange'] comments_lower = ds[comments].fillna('').str.lower() - f = any(comments_lower.str.contains('|'.join(label_strings), case=False)) + f = any(comments_lower.str.contains( + '|'.join(label_strings), case=False)) return f meta = { diff --git a/zephyr_ml/labeling/utils.py b/zephyr_ml/labeling/utils.py index 23bf0c4..a326e9c 100644 --- a/zephyr_ml/labeling/utils.py +++ b/zephyr_ml/labeling/utils.py @@ -144,7 +144,8 @@ def categorical_function(df): """ return int(df[categorical_column].isin([value]).sum() > 0) - categorical_function.__doc__ = categorical_function.__doc__.format(categorical_column, value) + categorical_function.__doc__ = categorical_function.__doc__.format( + categorical_column, value) return categorical_function @@ -177,7 +178,8 @@ def keyword_function(df): return int(mask.sum() != 0) - keyword_function.__doc__ = keyword_function.__doc__.format(keyword, columns) + keyword_function.__doc__ = keyword_function.__doc__.format( + keyword, columns) return keyword_function @@ -203,7 +205,8 @@ def numerical_function(df): series = df[numerical_column] return int(len(series[series > threshold]) > 0) - numerical_function.__doc__ = 
numerical_function.__doc__.format(numerical_column, threshold) + numerical_function.__doc__ = numerical_function.__doc__.format( + numerical_column, threshold) return numerical_function @@ -228,5 +231,6 @@ def duration_function(df): """ return ((df[end_time] - df[start_time]).dt.total_seconds()).sum() - duration_function.__doc__ = duration_function.__doc__.format(start_time, end_time) + duration_function.__doc__ = duration_function.__doc__.format( + start_time, end_time) return duration_function diff --git a/zephyr_ml/metadata.py b/zephyr_ml/metadata.py index 30aa97e..9482d8e 100644 --- a/zephyr_ml/metadata.py +++ b/zephyr_ml/metadata.py @@ -1,202 +1,209 @@ +import copy + # Default EntitySet keyword arguments for entities DEFAULT_ES_KWARGS = { - 'alarms': { - 'index': '_index', - 'make_index': True, - 'time_index': 'DAT_START', - 'secondary_time_index': {'DAT_END': ['IND_DURATION']}, - 'logical_types': { - 'COD_ELEMENT': 'categorical', # turbine id - 'DAT_START': 'datetime', # start - 'DAT_END': 'datetime', # end - 'IND_DURATION': 'double', # duration - 'COD_ALARM': 'categorical', # alarm code - 'COD_ALARM_INT': 'categorical', # international alarm code - 'DES_NAME': 'categorical', # alarm name - 'DES_TITLE': 'categorical', # alarm description - 'COD_STATUS': 'categorical' # status code - } + "alarms": { + "index": "_index", + "make_index": True, + "time_index": "DAT_START", + "secondary_time_index": {"DAT_END": ["IND_DURATION"]}, + "logical_types": { + "COD_ELEMENT": "categorical", # turbine id + "DAT_START": "datetime", # start + "DAT_END": "datetime", # end + "IND_DURATION": "double", # duration + "COD_ALARM": "categorical", # alarm code + "COD_ALARM_INT": "categorical", # international alarm code + "DES_NAME": "categorical", # alarm name + "DES_TITLE": "categorical", # alarm description + "COD_STATUS": "categorical", # status code + }, + }, + "stoppages": { + "index": "_index", + "make_index": True, + "time_index": "DAT_START", + "secondary_time_index": 
{"DAT_END": ["IND_DURATION", "IND_LOST_GEN"]}, + "logical_types": { + "COD_ELEMENT": "categorical", # turbine id + "DAT_START": "datetime", # start + "DAT_END": "datetime", # end + "DES_WO_NAME": "natural_language", # work order name + "DES_COMMENTS": "natural_language", # work order comments + "COD_WO": "integer_nullable", # stoppage code + "IND_DURATION": "double", # duration + "IND_LOST_GEN": "double", # generation loss + "COD_ALARM": "categorical", # alarm code + "COD_CAUSE": "categorical", # stoppage cause + "COD_INCIDENCE": "categorical", # incidence code + "COD_ORIGIN": "categorical", # origin code + "DESC_CLASS": "categorical", # ???? + "COD_STATUS": "categorical", # status code + "COD_CODE": "categorical", # stoppage code + "DES_DESCRIPTION": "natural_language", # stoppage description + "DES_TECH_NAME": "categorical", # turbine technology + }, }, - 'stoppages': { - 'index': '_index', - 'make_index': True, - 'time_index': 'DAT_START', - 'secondary_time_index': {'DAT_END': ['IND_DURATION', 'IND_LOST_GEN']}, - 'logical_types': { - 'COD_ELEMENT': 'categorical', # turbine id - 'DAT_START': 'datetime', # start - 'DAT_END': 'datetime', # end - 'DES_WO_NAME': 'natural_language', # work order name - 'DES_COMMENTS': 'natural_language', # work order comments - 'COD_WO': 'integer_nullable', # stoppage code - 'IND_DURATION': 'double', # duration - 'IND_LOST_GEN': 'double', # generation loss - 'COD_ALARM': 'categorical', # alarm code - 'COD_CAUSE': 'categorical', # stoppage cause - 'COD_INCIDENCE': 'categorical', # incidence code - 'COD_ORIGIN': 'categorical', # origin code - 'DESC_CLASS': 'categorical', # ???? 
- 'COD_STATUS': 'categorical', # status code - 'COD_CODE': 'categorical', # stoppage code - 'DES_DESCRIPTION': 'natural_language', # stoppage description - 'DES_TECH_NAME': 'categorical' # turbine technology - } + "notifications": { + "index": "_index", + "make_index": True, + "time_index": "DAT_POSTING", + "secondary_time_index": {"DAT_MALF_END": ["IND_BREAKDOWN_DUR"]}, + "logical_types": { + "COD_ELEMENT": "categorical", # turbine id + "COD_ORDER": "categorical", + "IND_QUANTITY": "double", + "COD_MATERIAL_SAP": "categorical", + "DAT_POSTING": "datetime", + "COD_MAT_DOC": "categorical", + "DES_MEDIUM": "categorical", + "COD_NOTIF": "categorical", + "DAT_MALF_START": "datetime", + "DAT_MALF_END": "datetime", + "IND_BREAKDOWN_DUR": "double", + "FUNCT_LOC_DES": "categorical", + "COD_ALARM": "categorical", + "DES_ALARM": "categorical", + }, }, - 'notifications': { - 'index': '_index', - 'make_index': True, - 'time_index': 'DAT_POSTING', - 'secondary_time_index': {'DAT_MALF_END': ['IND_BREAKDOWN_DUR']}, - 'logical_types': { - 'COD_ELEMENT': 'categorical', # turbine id - 'COD_ORDER': 'categorical', - 'IND_QUANTITY': 'double', - 'COD_MATERIAL_SAP': 'categorical', - 'DAT_POSTING': 'datetime', - 'COD_MAT_DOC': 'categorical', - 'DES_MEDIUM': 'categorical', - 'COD_NOTIF': 'categorical', - 'DAT_MALF_START': 'datetime', - 'DAT_MALF_END': 'datetime', - 'IND_BREAKDOWN_DUR': 'double', - 'FUNCT_LOC_DES': 'categorical', - 'COD_ALARM': 'categorical', - 'DES_ALARM': 'categorical' - } + "work_orders": { + "index": "COD_ORDER", + "time_index": "DAT_BASIC_START", + "secondary_time_index": {"DAT_VALID_END": []}, + "logical_types": { + "COD_ELEMENT": "categorical", + "COD_ORDER": "categorical", + "DAT_BASIC_START": "datetime", + "DAT_BASIC_END": "datetime", + "COD_EQUIPMENT": "categorical", + "COD_MAINT_PLANT": "categorical", + "COD_MAINT_ACT_TYPE": "categorical", + "COD_CREATED_BY": "categorical", + "COD_ORDER_TYPE": "categorical", + "DAT_REFERENCE": "datetime", + "DAT_CREATED_ON": 
"datetime", + "DAT_VALID_END": "datetime", + "DAT_VALID_START": "datetime", + "COD_SYSTEM_STAT": "categorical", + "DES_LONG": "natural_language", + "COD_FUNCT_LOC": "categorical", + "COD_NOTIF_OBJ": "categorical", + "COD_MAINT_ITEM": "categorical", + "DES_MEDIUM": "natural_language", + "DES_FUNCT_LOC": "categorical", + }, }, - 'work_orders': { - 'index': 'COD_ORDER', - 'time_index': 'DAT_BASIC_START', - 'secondary_time_index': {'DAT_VALID_END': []}, - 'logical_types': { - 'COD_ELEMENT': 'categorical', - 'COD_ORDER': 'categorical', - 'DAT_BASIC_START': 'datetime', - 'DAT_BASIC_END': 'datetime', - 'COD_EQUIPMENT': 'categorical', - 'COD_MAINT_PLANT': 'categorical', - 'COD_MAINT_ACT_TYPE': 'categorical', - 'COD_CREATED_BY': 'categorical', - 'COD_ORDER_TYPE': 'categorical', - 'DAT_REFERENCE': 'datetime', - 'DAT_CREATED_ON': 'datetime', - 'DAT_VALID_END': 'datetime', - 'DAT_VALID_START': 'datetime', - 'COD_SYSTEM_STAT': 'categorical', - 'DES_LONG': 'natural_language', - 'COD_FUNCT_LOC': 'categorical', - 'COD_NOTIF_OBJ': 'categorical', - 'COD_MAINT_ITEM': 'categorical', - 'DES_MEDIUM': 'natural_language', - 'DES_FUNCT_LOC': 'categorical' - } + "turbines": { + "index": "COD_ELEMENT", + "logical_types": { + "COD_ELEMENT": "categorical", + "TURBINE_PI_ID": "categorical", + "TURBINE_LOCAL_ID": "categorical", + "TURBINE_SAP_COD": "categorical", + "DES_CORE_ELEMENT": "categorical", + "SITE": "categorical", + "DES_CORE_PLANT": "categorical", + "COD_PLANT_SAP": "categorical", + "PI_COLLECTOR_SITE_NAME": "categorical", + "PI_LOCAL_SITE_NAME": "categorical", + }, }, - 'turbines': { - 'index': 'COD_ELEMENT', - 'logical_types': { - 'COD_ELEMENT': 'categorical', - 'TURBINE_PI_ID': 'categorical', - 'TURBINE_LOCAL_ID': 'categorical', - 'TURBINE_SAP_COD': 'categorical', - 'DES_CORE_ELEMENT': 'categorical', - 'SITE': 'categorical', - 'DES_CORE_PLANT': 'categorical', - 'COD_PLANT_SAP': 'categorical', - 'PI_COLLECTOR_SITE_NAME': 'categorical', - 'PI_LOCAL_SITE_NAME': 'categorical' - } - } } 
DEFAULT_ES_TYPE_KWARGS = { - 'pidata': { - 'index': '_index', - 'make_index': True, - 'time_index': 'time', - 'logical_types': { - 'time': 'datetime', - 'COD_ELEMENT': 'categorical' - } + "pidata": { + "index": "_index", + "make_index": True, + "time_index": "time", + "logical_types": {"time": "datetime", "COD_ELEMENT": "categorical"}, }, - 'scada': { - 'index': '_index', - 'make_index': True, - 'time_index': 'TIMESTAMP', - 'logical_types': { - 'TIMESTAMP': 'datetime', - 'COD_ELEMENT': 'categorical' - } + "scada": { + "index": "_index", + "make_index": True, + "time_index": "TIMESTAMP", + "logical_types": {"TIMESTAMP": "datetime", "COD_ELEMENT": "categorical"}, + }, + "vibrations": { + "index": "_index", + "make_index": True, + "time_index": "timestamp", + "logical_types": { + "COD_ELEMENT": "categorical", + "turbine_id": "categorical", + "signal_id": "categorical", + "timestamp": "datetime", + "sensorName": "categorical", + "sensorType": "categorical", + "sensorSerial": "integer_nullable", + "siteName": "categorical", + "turbineName": "categorical", + "turbineSerial": "integer_nullable", + "configurationName": "natural_language", + "softwareVersion": "categorical", + "rpm": "double", + "rpmStatus": "natural_language", + "duration": "natural_language", + "condition": "categorical", + "maskTime": "datetime", + "Mask Status": "natural_language", + "System Serial": "categorical", + "WPS-ActivePower-Average": "double", + "WPS-ActivePower-Minimum": "double", + "WPS-ActivePower-Maximum": "double", + "WPS-ActivePower-Deviation": "double", + "WPS-ActivePower-StartTime": "datetime", + "WPS-ActivePower-StopTime": "datetime", + "WPS-ActivePower-Counts": "natural_language", + "Measured RPM": "double", + "WPS-ActivePower": "double", + "WPS-Gearoiltemperature": "double", + "WPS-GeneratorRPM": "double", + "WPS-PitchReference": "double", + "WPS-RotorRPM": "double", + "WPS-Windspeed": "double", + "WPS-YawAngle": "double", + "overload warning": "categorical", + "bias warning": 
"categorical", + "bias voltage": "double", + "xValueOffset": "double", + "xValueDelta": "double", + "xValueUnit": "categorical", + "yValueUnit": "categorical", + "TotalCount-RPM0": "double", + "TotalCount-RPM1": "double", + "TotalCount-RPM2": "double", + "TotalCount-RPM3": "double", + }, }, - 'vibrations': { - 'index': '_index', - 'make_index': True, - 'time_index': 'timestamp', - 'logical_types': { - 'COD_ELEMENT': 'categorical', - 'turbine_id': 'categorical', - 'signal_id': 'categorical', - 'timestamp': 'datetime', - 'sensorName': 'categorical', - 'sensorType': 'categorical', - 'sensorSerial': 'integer_nullable', - 'siteName': 'categorical', - 'turbineName': 'categorical', - 'turbineSerial': 'integer_nullable', - 'configurationName': 'natural_language', - 'softwareVersion': 'categorical', - 'rpm': 'double', - 'rpmStatus': 'natural_language', - 'duration': 'natural_language', - 'condition': 'categorical', - 'maskTime': 'datetime', - 'Mask Status': 'natural_language', - 'System Serial': 'categorical', - 'WPS-ActivePower-Average': 'double', - 'WPS-ActivePower-Minimum': 'double', - 'WPS-ActivePower-Maximum': 'double', - 'WPS-ActivePower-Deviation': 'double', - 'WPS-ActivePower-StartTime': 'datetime', - 'WPS-ActivePower-StopTime': 'datetime', - 'WPS-ActivePower-Counts': 'natural_language', - 'Measured RPM': 'double', - 'WPS-ActivePower': 'double', - 'WPS-Gearoiltemperature': 'double', - 'WPS-GeneratorRPM': 'double', - 'WPS-PitchReference': 'double', - 'WPS-RotorRPM': 'double', - 'WPS-Windspeed': 'double', - 'WPS-YawAngle': 'double', - 'overload warning': 'categorical', - 'bias warning': 'categorical', - 'bias voltage': 'double', - 'xValueOffset': 'double', - 'xValueDelta': 'double', - 'xValueUnit': 'categorical', - 'yValueUnit': 'categorical', - 'TotalCount-RPM0': 'double', - 'TotalCount-RPM1': 'double', - 'TotalCount-RPM2': 'double', - 'TotalCount-RPM3': 'double' - } - } } def get_mapped_kwargs(es_type, new_kwargs=None): if es_type not in 
DEFAULT_ES_TYPE_KWARGS.keys(): - raise ValueError('Unrecognized es_type argument: {}'.format(es_type)) + raise ValueError("Unrecognized es_type argument: {}".format(es_type)) mapped_kwargs = DEFAULT_ES_KWARGS.copy() mapped_kwargs.update({es_type: DEFAULT_ES_TYPE_KWARGS[es_type]}) if new_kwargs is not None: if not isinstance(new_kwargs, dict): - raise ValueError('new_kwargs must be dictionary mapping entity name to dictionary ' - 'with updated keyword arguments for EntitySet creation.') + raise ValueError( + "new_kwargs must be dictionary mapping entity name to dictionary " + "with updated keyword arguments for EntitySet creation." + ) for entity in new_kwargs: if entity not in mapped_kwargs: - raise ValueError('Unrecognized entity "{}" found in new keyword argument ' - 'mapping.'.format(entity)) + raise ValueError( + 'Unrecognized entity "{}" found in new keyword argument ' + "mapping.".format(entity) + ) mapped_kwargs[entity].update(new_kwargs[entity]) - return mapped_kwargs + + +def get_default_es_type_kwargs(): + return copy.deepcopy(DEFAULT_ES_TYPE_KWARGS) + + +def get_es_types(): + return DEFAULT_ES_TYPE_KWARGS.keys() diff --git a/zephyr_ml/pipelines/xgb_classifier.json b/zephyr_ml/pipelines/xgb_classifier.json index 26059dc..28fd0d5 100644 --- a/zephyr_ml/pipelines/xgb_classifier.json +++ b/zephyr_ml/pipelines/xgb_classifier.json @@ -1,39 +1,34 @@ { - "metadata": { - "name": "xgb", - "data_type": "single_table", - "task_type": "classification" - }, - "primitives": [ - "xgboost.XGBClassifier", - "zephyr_ml.primitives.postprocessing.FindThreshold" - ], - "input_names": { - "zephyr_ml.primitives.postprocessing.FindThreshold#1": { - "y_true": "y" - } - }, - "output_names": { - "xgboost.XGBClassifier#1": { - "y": "y_pred" - } - }, - "outputs": { - "default": [ - { - "name": "y", - "variable": "zephyr_ml.primitives.postprocessing.FindThreshold#1.y" - } - ], - "visual": [ - { - "name": "threshold", - "variable": 
"zephyr_ml.primitives.postprocessing.FindThreshold#1.threshold" - }, - { - "name": "scores", - "variable": "zephyr_ml.primitives.postprocessing.FindThreshold#1.scores" - } - ] + "metadata": { + "name": "xgb", + "data_type": "single_table", + "task_type": "classification" + }, + "primitives": [ + "xgboost.XGBClassifier", + "zephyr_ml.primitives.postprocessing.FindThreshold" + ], + "input_names": { + "zephyr_ml.primitives.postprocessing.FindThreshold#1": { + "y_true": "y" } + }, + "outputs": { + "default": [ + { + "name": "y_pred", + "variable": "zephyr_ml.primitives.postprocessing.FindThreshold#1.y_pred" + } + ], + "visual": [ + { + "name": "threshold", + "variable": "zephyr_ml.primitives.postprocessing.FindThreshold#1.threshold" + }, + { + "name": "scores", + "variable": "zephyr_ml.primitives.postprocessing.FindThreshold#1.scores" + } + ] + } } diff --git a/zephyr_ml/primitives/jsons/sklearn.ensemble.GradientBoostingClassifier.json b/zephyr_ml/primitives/jsons/sklearn.ensemble.GradientBoostingClassifier.json new file mode 100644 index 0000000..02a647f --- /dev/null +++ b/zephyr_ml/primitives/jsons/sklearn.ensemble.GradientBoostingClassifier.json @@ -0,0 +1,187 @@ +{ + "name": "sklearn.ensemble.GradientBoostingClassifier", + "contributors": [ + "Carles Sala ", + "Plamen Valentinov " + ], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html", + "description": "Scikit-learn GradientBoostingClassifier.", + "classifiers": { + "type": "estimator", + "subtype": "classifier" + }, + "modalities": [], + "primitive": "sklearn.ensemble.GradientBoostingClassifier", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict_proba", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + 
"warm_start": { + "type": "bool", + "default": false + }, + "init": { + "type": "object", + "default": null + }, + "verbose": { + "type": "int", + "default": 0 + }, + "presort": { + "type": "bool", + "default": false + } + }, + "tunable": { + "loss": { + "type": "str", + "default": "deviance", + "values": [ + "deviance", + "exponential" + ] + }, + "learning_rate": { + "type": "float", + "default": 0.1, + "range": [ + 0.01, + 10.0 + ] + }, + "n_estimators": { + "type": "int", + "default": 10, + "range": [ + 1, + 500 + ] + }, + "max_depth": { + "type": "int", + "default": 3, + "range": [ + 1, + 30 + ] + }, + "criterion": { + "type": "str", + "default": "friedman_mse", + "values": [ + "friedman_mse", + "friedman_mae" + ] + }, + "min_samples_split": { + "type": "int", + "default": 2, + "range": [ + 2, + 100 + ] + }, + "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [ + 1, + 100 + ] + }, + "min_weight_fraction_leaf": { + "type": "float", + "default": 0.0, + "range": [ + 0.0, + 10.0 + ] + }, + "subsample": { + "type": "float", + "default": 1.0, + "range": [ + 0.001, + 100.0 + ] + }, + "max_features": { + "type": "str", + "default": null, + "values": [ + null, + "auto", + "log2", + "sqrt" + ] + }, + "max_leaf_nodes": { + "type": "int", + "default": null, + "range": [ + 2, + 1000 + ] + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0, + "range": [ + 0.0, + 1000.0 + ] + }, + "validation_fraction": { + "type": "float", + "default": 0.1, + "range": [ + 0.0, + 1.0 + ] + }, + "n_iter_no_change": { + "type": "int", + "default": null, + "range": [ + 1, + 1000 + ] + }, + "tol": { + "type": "float", + "default": 0.0001, + "range": [ + 0.0, + 2.0 + ] + } + } + } +} \ No newline at end of file diff --git a/zephyr_ml/primitives/jsons/sklearn.metrics.accuracy_score.json b/zephyr_ml/primitives/jsons/sklearn.metrics.accuracy_score.json new file mode 100644 index 0000000..ab0a1da --- /dev/null +++ 
b/zephyr_ml/primitives/jsons/sklearn.metrics.accuracy_score.json @@ -0,0 +1,45 @@ +{ + "name": "sklearn.metrics.accuracy_score", + "contributors": [ + "Raymond Pan " + ], + "documentation": "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html", + "description": "Accuracy classification score.", + "classifiers": { + "type": "helper" + }, + "modalities": [], + "primitive": "sklearn.metrics.accuracy_score", + "produce": { + "args": [ + { + "name": "y_true", + "type": "ndarray" + }, + { + "name": "y_pred", + "type": "ndarray" + } + ], + "output": [ + { + "name": "score", + "type": "float or int" + } + ] + }, + + "hyperparameters": { + "fixed": { + "normalize": { + "type": "bool", + "default": true + }, + "sample_weight": { + "type": "ndarray", + "default": null + } + } + } + +} \ No newline at end of file diff --git a/zephyr_ml/primitives/jsons/sklearn.metrics.f1_score.json b/zephyr_ml/primitives/jsons/sklearn.metrics.f1_score.json new file mode 100644 index 0000000..4097d21 --- /dev/null +++ b/zephyr_ml/primitives/jsons/sklearn.metrics.f1_score.json @@ -0,0 +1,56 @@ +{ + "name": "sklearn.metrics.f1_score", + "contributors": [ + "Raymond Pan " + ], + "documentation": "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html", + "description": "Compute the F1 score, also known as balanced F-score or F-measure.", + "classifiers": { + "type": "helper" + }, + "modalities": [], + "primitive": "sklearn.metrics.f1_score", + "produce": { + "args": [ + { + "name": "y_true", + "type": "ndarray" + }, + { + "name": "y_pred", + "type": "ndarray" + } + ], + "output": [ + { + "name": "score", + "type": "float or int" + } + ] + }, + "hyperparameters": { + "fixed": { + "labels": { + "type": "ndarray", + "default": null + }, + "pos_label": { + "type": "int, float, bool or str", + "default": 1 + }, + "average": { + "type": "str", + "default": "binary" + }, + "sample_weight": { + "type": "ndarray", + "default": null + }, + 
"zero_division": { + "type": "int, float or str", + "default": "warn" + } + } + } + +} \ No newline at end of file diff --git a/zephyr_ml/primitives/jsons/sklearn.metrics.precision_score.json b/zephyr_ml/primitives/jsons/sklearn.metrics.precision_score.json new file mode 100644 index 0000000..9c2bc90 --- /dev/null +++ b/zephyr_ml/primitives/jsons/sklearn.metrics.precision_score.json @@ -0,0 +1,58 @@ +{ + "name": "sklearn.metrics.precision_score", + "contributors": [ + "Raymond Pan " + ], + "documentation": "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html", + "description": "Compute the precision.", + "classifiers": { + "type": "helper" + }, + "modalities": [], + "primitive": "sklearn.metrics.precision_score", + "produce": { + "args": [ + { + "name": "y_true", + "type": "ndarray" + }, + { + "name": "y_pred", + "type": "ndarray" + } + ], + "output": [ + { + "name": "score", + "type": "float or int" + } + ] + + }, + + "hyperparameters": { + "fixed": { + "labels": { + "type": "ndarray", + "default": null + }, + "pos_label": { + "type": "int, float, bool or str", + "default": 1 + }, + "average": { + "type": "str", + "default": "binary" + }, + "sample_weight": { + "type": "ndarray", + "default": null + }, + "zero_division": { + "type": "int, float or str", + "default": "warn" + } + } + } + +} \ No newline at end of file diff --git a/zephyr_ml/primitives/jsons/sklearn.metrics.recall_score.json b/zephyr_ml/primitives/jsons/sklearn.metrics.recall_score.json new file mode 100644 index 0000000..37d5cce --- /dev/null +++ b/zephyr_ml/primitives/jsons/sklearn.metrics.recall_score.json @@ -0,0 +1,57 @@ +{ + "name": "sklearn.metrics.recall_score", + "contributors": [ + "Raymond Pan " + ], + "documentation": "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html", + "description": "Compute the recall.", + "classifiers": { + "type": "helper" + }, + "modalities": [], + "primitive": "sklearn.metrics.recall_score", +
"produce": { + "args": [ + { + "name": "y_true", + "type": "ndarray" + }, + { + "name": "y_pred", + "type": "ndarray" + } + ], + "output": [ + { + "name": "score", + "type": "float or int" + } + ] + }, + + "hyperparameters": { + "fixed": { + "labels": { + "type": "ndarray", + "default": null + }, + "pos_label": { + "type": "int, float, bool or str", + "default": 1 + }, + "average": { + "type": "str", + "default": "binary" + }, + "sample_weight": { + "type": "ndarray", + "default": null + }, + "zero_division": { + "type": "int, float or str", + "default": "warn" + } + } + } + +} \ No newline at end of file diff --git a/zephyr_ml/primitives/jsons/sklearn.model_selection.train_test_split.json b/zephyr_ml/primitives/jsons/sklearn.model_selection.train_test_split.json new file mode 100644 index 0000000..482cf6e --- /dev/null +++ b/zephyr_ml/primitives/jsons/sklearn.model_selection.train_test_split.json @@ -0,0 +1,69 @@ +{ + "name": "zephyr_ml.primitives.preprocessing.train_test_split", + "contributors": [ + "Raymond Pan rpan@mit.edu" + ], + "documentation": "https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html", + "description": "Split feature matrix and labels into random train and test subsets.", + "classifiers": { + "type": "preprocessor" + }, + "modalities": [], + "primitive": "sklearn.model_selection.train_test_split", + "produce": { + "args": [ + { + "name": "X", + "type": "pandas.DataFrame" + }, + { + "name": "y", + "type": "pandas.DataFrame" + } + + ], + "output": [ + { + "name": "X_train", + "type": "pandas.DataFrame" + }, + { + "name": "X_test", + "type": "pandas.DataFrame" + }, + { + "name": "y_train", + "type": "pandas.DataFrame" + }, + { + "name": "y_test", + "type": "pandas.DataFrame" + } + + ] + }, + "hyperparameters": { + "fixed": { + "test_size": { + "type": "float or int", + "default": null + }, + "train_size": { + "type": "float or int", + "default": null + }, + "random_state": { + "type": "int", +
"default": null + }, + "shuffle": { + "type": "bool", + "default": true + }, + "stratify": { + "type": "list", + "default": null + } + } + } +} \ No newline at end of file diff --git a/zephyr_ml/primitives/jsons/xgboost.XGBClassifier.json b/zephyr_ml/primitives/jsons/xgboost.XGBClassifier.json index a831109..0440aac 100644 --- a/zephyr_ml/primitives/jsons/xgboost.XGBClassifier.json +++ b/zephyr_ml/primitives/jsons/xgboost.XGBClassifier.json @@ -36,7 +36,7 @@ ], "output": [ { - "name": "y", + "name": "y_proba", "type": "array" } ] diff --git a/zephyr_ml/primitives/jsons/zephyr_ml.primitives.postprocessing.FindThreshold.json b/zephyr_ml/primitives/jsons/zephyr_ml.primitives.postprocessing.FindThreshold.json index e522be5..7420061 100644 --- a/zephyr_ml/primitives/jsons/zephyr_ml.primitives.postprocessing.FindThreshold.json +++ b/zephyr_ml/primitives/jsons/zephyr_ml.primitives.postprocessing.FindThreshold.json @@ -18,7 +18,7 @@ "type": "ndarray" }, { - "name": "y_pred", + "name": "y_proba", "type": "ndarray" } ] @@ -27,13 +27,13 @@ "method": "apply_threshold", "args": [ { - "name": "y_pred", + "name": "y_proba", "type": "ndarray" } ], "output": [ { - "name": "y", + "name": "y_pred", "type": "ndarray" }, { diff --git a/zephyr_ml/primitives/jsons/zephyr_ml.primitives.postprocessing.confusion_matrix.json b/zephyr_ml/primitives/jsons/zephyr_ml.primitives.postprocessing.confusion_matrix.json new file mode 100644 index 0000000..766ca5f --- /dev/null +++ b/zephyr_ml/primitives/jsons/zephyr_ml.primitives.postprocessing.confusion_matrix.json @@ -0,0 +1,49 @@ +{ + "name": "zephyr_ml.primitives.postprocessing.confusion_matrix", + "contributors": ["Raymond Pan "], + "description": "Create and plot confusion matrix.", + "classifiers": { + "type": "helper" + }, + "modalities": [], + "primitive": "zephyr_ml.primitives.postprocessing.confusion_matrix", + "produce": { + "args": [ + { + "name": "y_true", + "type": "ndarray" + }, + { + "name": "y_pred", + "type": "ndarray" + } + ], + 
"output": [ + { + "name": "confusion_matrix", + "type": "ndarray" + }, + { + "name": "figure", + "type": "matplotlib.figure.Figure" + } + ] + }, + + "hyperparameters": { + "fixed": { + "labels": { + "type": "ndarray", + "default": null + }, + "sample_weight": { + "type": "ndarray", + "default": null + }, + "normalize": { + "type": "str", + "default": null + } + } + } +} diff --git a/zephyr_ml/primitives/jsons/zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve.json b/zephyr_ml/primitives/jsons/zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve.json new file mode 100644 index 0000000..778bde9 --- /dev/null +++ b/zephyr_ml/primitives/jsons/zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve.json @@ -0,0 +1,49 @@ +{ + "name": "zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve", + "contributors": ["Raymond Pan "], + "description": "Calculate ROC AUC score and plot curve.", + "classifiers": { + "type": "helper" + }, + "modalities": [], + "primitive": "zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve", + "produce": { + "args": [ + { + "name": "y_true", + "type": "ndarray" + }, + { + "name": "y_proba", + "type": "ndarray" + } + ], + "output": [ + { + "name": "score", + "type": "float" + }, + { + "name": "figure", + "type": "matplotlib.figure.Figure" + } + ] + }, + + "hyperparameters": { + "fixed": { + "pos_label": { + "type": "int, float, bool or str", + "default": null + }, + "sample_weight": { + "type": "ndarray", + "default": null + }, + "drop_intermediate": { + "type": "bool", + "default": true + } + } + } +} diff --git a/zephyr_ml/primitives/postprocessing.py b/zephyr_ml/primitives/postprocessing.py index 3b7aa60..2ae0af1 100644 --- a/zephyr_ml/primitives/postprocessing.py +++ b/zephyr_ml/primitives/postprocessing.py @@ -1,10 +1,14 @@ """ Postprocessing functions. 
""" + import logging +import matplotlib.pyplot as plt import numpy as np +import seaborn as sns import sklearn +from sklearn import metrics LOGGER = logging.getLogger(__name__) @@ -32,42 +36,42 @@ class FindThreshold: String representing which metric to use. """ - def __init__(self, metric='f1'): - self._metric = 'f1' + def __init__(self, metric="f1"): + self._metric = "f1" self._threshold = None - def fit(self, y_true, y_pred): + def fit(self, y_true, y_proba): """Find the threshold that obtains the best metric value. Args: y_true (Series or ndarray): ``pandas.Series`` or ``numpy.ndarray`` ground truth target values. - y_pred (Series or ndarray): - ``pandas.Series`` or ``numpy.ndarray`` predicted target valeus. + y_proba (Series or ndarray): + ``pandas.Series`` or ``numpy.ndarray`` predicted target values' probabilities. """ - if y_pred.ndim > 1: - y_pred = y_pred[:, 1] + if y_proba.ndim > 1: + y_proba = y_proba[:, 1] RANGE = np.arange(0, 1, 0.01) scores = list() scorer = METRICS[self._metric] for thresh in RANGE: - y = [1 if x else 0 for x in y_pred > thresh] + y = [1 if x else 0 for x in y_proba > thresh] scores.append(scorer(y_true, y)) threshold = RANGE[np.argmax(scores)] - LOGGER.info(f'best threshold found at {threshold}') + LOGGER.info(f"best threshold found at {threshold}") self._threshold = threshold self._scores = scores - def apply_threshold(self, y_pred): + def apply_threshold(self, y_proba): """Apply threshold on predicted values. Args: y_pred (Series): - ``pandas.Series`` predicted target valeus. + ``pandas.Series`` predicted target values' probabilities. Return: tuple: @@ -75,8 +79,65 @@ def apply_threshold(self, y_pred): * detected float value for threshold. * list of scores obtained at each threshold. 
""" - if y_pred.ndim > 1: - y_pred = y_pred[:, 1] + if y_proba.ndim > 1: + y_proba = y_proba[:, 1] - binary = [1 if x else 0 for x in y_pred > self._threshold] + binary = [1 if x else 0 for x in y_proba > self._threshold] return binary, self._threshold, self._scores + + +def confusion_matrix( + y_true, + y_pred, + labels=None, + sample_weight=None, + normalize=None): + conf_matrix = metrics.confusion_matrix( + y_true, y_pred, labels=labels, sample_weight=sample_weight, normalize=normalize + ) + fig = plt.figure() + ax = fig.add_axes(sns.heatmap(conf_matrix, annot=True, cmap="Blues")) + + ax.set_title("Confusion Matrix\n") + ax.set_xlabel("\nPredicted Values") + ax.set_ylabel("Actual Values") + + ax.xaxis.set_ticklabels(["False", "True"]) + ax.yaxis.set_ticklabels(["False", "True"]) + + return conf_matrix, fig + + +def roc_auc_score_and_curve( + y_true, y_proba, pos_label=None, sample_weight=None, drop_intermediate=True +): + if y_proba.ndim > 1: + y_proba = y_proba[:, 1] + fpr, tpr, _ = metrics.roc_curve( + y_true, + y_proba, + pos_label=pos_label, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + ) + ns_probs = [0 for _ in range(len(y_true))] + ns_fpr, ns_tpr, _ = metrics.roc_curve( + y_true, + ns_probs, + pos_label=pos_label, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + ) + + auc = metrics.roc_auc_score(y_true, y_proba) + fig, ax = plt.subplots(1, 1) + + ax.plot(fpr, tpr, "ro") + ax.plot(fpr, tpr) + ax.plot(ns_fpr, ns_tpr, linestyle="--", color="green") + + ax.set_ylabel("True Positive Rate") + ax.set_xlabel("False Positive Rate") + ax.set_title("AUC: %.3f" % auc) + + return auc, fig diff --git a/zephyr_ml/primitives/preprocessing.py b/zephyr_ml/primitives/preprocessing.py new file mode 100644 index 0000000..0add5f3 --- /dev/null +++ b/zephyr_ml/primitives/preprocessing.py @@ -0,0 +1,25 @@ +""" +Preprocessing functions +""" + +import sklearn.model_selection + + +def train_test_split( + X, + y, + test_size=None, + 
train_size=None, + random_state=None, + shuffle=True, + stratify=None, +): + """ + Wrapper over sklearn.model_selection.train_test_split() + Used to split only 2 arrays at once: X (features) and y (labels) + + Split arrays or matrices into random train and test subsets. + """ + return sklearn.model_selection.train_test_split( + X, y, test_size=test_size, train_size=train_size, random_state=random_state, shuffle=shuffle, stratify=stratify + )