# **Amazon Lookout for Equipment** - Demonstration on an anonymized compressor dataset
*Part 1: Data preparation*

## Initialization
---
This repository is initially structured as follows:
```
/lookout-equipment-demo/getting_started/
|
├── dataset/                                <<< Original dataset <<<
|   ├── labels.csv
|   ├── tags_description.csv
|   ├── timeranges.txt
|   └── timeseries.zip
|
├── notebooks/
|   ├── 1_data_preparation.ipynb            <<< This notebook <<<
|   ├── 2_dataset_creation.ipynb
|   ├── 3_model_training.ipynb
|   ├── 4_model_evaluation.ipynb
|   ├── 5_inference_scheduling.ipynb
|   └── config.py
|
└── utils/
    ├── aws_matplotlib_light.py
    └── lookout_equipment_utils.py
```

### Notebook configuration update

In [None]:
# Upgrade the tqdm and tsia packages (both are imported in the next cell)
# through a pip shell command; --quiet reduces pip's console output.
!pip install --quiet --upgrade tqdm tsia

In [None]:
import boto3
import config
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import shutil
import sys
import tsia

from botocore.client import ClientError
from tqdm import tqdm

In [None]:
sys.path.append('../utils')
import lookout_equipment_utils as lookout

### Parameters

In [None]:
# Amazon S3 destination settings, taken from the local config module:
BUCKET          = config.BUCKET
PREFIX_TRAINING = config.PREFIX_TRAINING
PREFIX_LABEL    = config.PREFIX_LABEL

# Local folders, expressed relative to the notebook's working directory:
RAW_DATA        = os.path.join('..', 'dataset')
DATA            = os.path.join('..', 'data')
LABEL_DATA      = os.path.join(DATA, 'labelled-data')
TRAIN_DATA      = os.path.join(DATA, 'training-data', 'expander')

# Create every output folder up front; existing folders are left untouched:
for output_dir in (DATA, LABEL_DATA, TRAIN_DATA):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Fail fast when the placeholder bucket name from config.py was never replaced:
if BUCKET == '<<YOUR_BUCKET>>':
    raise Exception('Please update your Amazon S3 bucket name in the config.py file located in the notebooks folder of this repository and restart the kernel for this notebook.')

else:
    # Check access to the configured bucket:
    try:
        s3_resource = boto3.resource('s3')
        s3_resource.meta.client.head_bucket(Bucket=BUCKET)
        print(f'Bucket "{BUCKET}" exists')

    # Expose error reason:
    except ClientError as error:
        # Compare the code as text: converting it to an integer would fail
        # on non-numeric error codes and hide the actual S3 error.
        error_code = str(error.response['Error']['Code'])
        if error_code == '403':
            raise Exception(f'Bucket "{BUCKET}" is private: access is forbidden!') from error

        elif error_code == '404':
            raise Exception(f'Bucket "{BUCKET}" does not exist!') from error

        # Any other failure must not be swallowed, otherwise the notebook
        # would continue without a usable bucket:
        raise

## Loading datasets of interest


In [None]:
# Copy the time ranges definition next to the prepared data and read it back.
# The first four lines are parsed, in order, as: training start, training end,
# evaluation start and evaluation end.
timeranges_fname = os.path.join(DATA, 'timeranges.txt')
shutil.copyfile(os.path.join(RAW_DATA, 'timeranges.txt'), timeranges_fname)
with open(timeranges_fname, 'r') as f:
    timeranges = f.readlines()

# Strip surrounding whitespace instead of blindly dropping the last character:
# the final line of the file may not end with a newline, in which case
# slicing with [:-1] would cut the last digit of the timestamp.
training_start, training_end, evaluation_start, evaluation_end = (
    pd.to_datetime(line.strip()) for line in timeranges[:4]
)

print(f'Training period: from {training_start} to {training_end}')
print(f'Evaluation period: from {evaluation_start} to {evaluation_end}')

### Labels


In [None]:
# Copy the raw labels file into the label folder, then load it. The CSV has
# no header row: its two columns are parsed as timestamps and named
# 'start' and 'end'.
labels_fname = os.path.join(LABEL_DATA, 'labels.csv')
shutil.copyfile(os.path.join(RAW_DATA, 'labels.csv'), labels_fname)

labels_df = pd.read_csv(labels_fname, header=None)
for column_position in (0, 1):
    labels_df[column_position] = pd.to_datetime(labels_df[column_position])
labels_df.columns = ['start', 'end']

labels_df.head()

### Time series

In [None]:
# Extract the raw time series archive into the local training-data folder.
# The -o flag makes unzip overwrite already extracted files without prompting,
# so this cell can be re-run safely.
timeseries_fname = os.path.join(RAW_DATA, 'timeseries.zip')
!unzip -o $timeseries_fname -d $DATA/training-data

In [None]:
# Load the parquet file into a pandas dataframe. The intermediate Arrow table
# is never bound to a name, so it can be reclaimed right after the conversion.
all_tags_fname = os.path.join(DATA, 'training-data', 'expander.parquet')
all_tags_df = pq.read_table(all_tags_fname).to_pandas()

print(all_tags_df.shape)
all_tags_df.head()

### Tags

In [None]:
# Load the tags description table; later cells read its 'Tag', 'UOM' and
# 'Subsystem' columns.
tags_description_fname = os.path.join(RAW_DATA, 'tags_description.csv')
tags_description_df = pd.read_csv(filepath_or_buffer=tags_description_fname)

# Preview the first five rows:
tags_description_df.head(n=5)

In [None]:
# List every tag name, ordered by unit of measure (UOM):
tags_sorted_by_uom = tags_description_df.sort_values(by='UOM')
features = list(tags_sorted_by_uom['Tag'])

## Dataset overview
---

In [None]:
# Time window covered by the overview:
start = pd.to_datetime('2015-04-05 00:00:00')
end = evaluation_end

# Build the tag -> unit of measure lookup once, instead of filtering the whole
# description table again for every feature. setdefault() keeps the first
# description found for a tag, like the per-feature lookup it replaces.
uom_by_tag = {}
for tag_name, tag_uom in zip(tags_description_df['Tag'], tags_description_df['UOM']):
    uom_by_tag.setdefault(tag_name, str(tag_uom))

# df_list receives one single-column dataframe per feature, while
# feature_groups maps each unit of measure to the list of features using it:
df_list = []
feature_groups = dict()
for f in features:
    # Get the unit of measure for the current feature:
    uom = uom_by_tag[f]

    # Add the feature to its group, creating the group on first use:
    feature_groups.setdefault(uom, []).append(f)

    # Add the dataframe to the list, with missing values replaced by zeros:
    current_df = all_tags_df.loc[start:end, [f]]
    current_df = current_df.replace(np.nan, 0.0)
    df_list.append(current_df)

In [None]:
# Plot one signal over the overview window. The plotted dataframe holds a
# single column, renamed to 'Value' before it is handed to the plotting helper.
tag = 'signal-028'
tag_df = all_tags_df.loc[start:end, [tag]].rename(columns={tag: 'Value'})

plot_options = dict(
    fig_width=20,
    tag_split=evaluation_start,
    labels_df=labels_df,
)
fig, axes = lookout.plot_timeseries(tag_df, tag, **plot_options)

## Building and uploading the dataset
---
We will structure our S3 bucket like this:
```
s3://sagemaker-lookout-equipment-demo/
|
├── training-data/
|   |
|   ├── subsystem-01
|   |   └── subsystem-01.csv
|   |
|   ├── subsystem-02
|   |   └── subsystem-02.csv
|   |
|   ├── ...
|   |
|   └── subsystem-24
|       └── subsystem-24.csv
|
└── labelled-data/
    └── labels.csv
```

In [None]:
# Process each subsystem one by one, writing one CSV file per subsystem into
# its own sub-folder of TRAIN_DATA:
components = list(tags_description_df['Subsystem'].unique())
progress_bar = tqdm(components)
for component in progress_bar:
    progress_bar.set_description(f'Component {component}')
    progress_bar.refresh()

    # Check if CSV file already exist and do not overwrite it:
    component_dir = os.path.join(TRAIN_DATA, f'{component}')
    component_tags_fname = os.path.join(component_dir, f'{component}.csv')
    if os.path.exists(component_tags_fname):
        continue

    # Build the dataframe with all the signal timeseries for the current subsystem:
    in_component = tags_description_df['Subsystem'] == component
    component_tags_list = list(tags_description_df.loc[in_component, 'Tag'])
    component_tags_df = all_tags_df[component_tags_list].reset_index()

    # The index is expected to be named 'Timestamp': once reset, it becomes a
    # regular column that is rendered as text before being written.
    component_tags_df['Timestamp'] = component_tags_df['Timestamp'].dt.strftime('%Y-%m-%dT%H:%M:%S.%f')

    # Save to disk. The file is written under a temporary name and renamed
    # once complete: an interrupted run would otherwise leave a truncated CSV
    # behind, which the existence check above would never regenerate.
    os.makedirs(component_dir, exist_ok=True)
    partial_fname = component_tags_fname + '.tmp'
    component_tags_df.to_csv(partial_fname, index=False)
    os.replace(partial_fname, component_tags_fname)

In [None]:
# Uploading training dataset to S3:
# the whole local TRAIN_DATA folder is copied recursively with the AWS CLI,
# under the training prefix of the configured bucket.
training_src_dir = TRAIN_DATA
training_s3_dest_path = f's3://{BUCKET}/{PREFIX_TRAINING}'
!aws s3 cp --recursive $training_src_dir $training_s3_dest_path

In [None]:
# Uploading label dataset to S3:
# a single file (labels.csv) is copied with the AWS CLI.
label_src_fname = os.path.join(LABEL_DATA, 'labels.csv')
# NOTE(review): PREFIX_LABEL is concatenated with the file name without any
# separator, so it presumably ends with a '/' in config.py - TODO confirm.
label_s3_dest_path = f's3://{BUCKET}/{PREFIX_LABEL}labels.csv'
!aws s3 cp $label_src_fname $label_s3_dest_path