# Convert CMAPSS to Draco Format

In this notebook we download [CMAPSS](https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/#turbofan) data and reformat it as Draco pipelines expect.

In [1]:
import datetime
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

## 1. Download Data

In [2]:
import io
import os
import urllib.request
import zipfile

# Location of the zipped CMAPSS data files.
DATA_URL = 'https://d3-ai-greenguard.s3.amazonaws.com/CMAPSSData.zip'

# `import urllib` alone does not guarantee that the `urllib.request` submodule
# is loaded, so it is imported explicitly above. The response is used as a
# context manager so the connection is closed once the payload has been read.
with urllib.request.urlopen(DATA_URL) as response:
    bytes_io = io.BytesIO(response.read())

# Unpack the in-memory archive into the local `CMAPSSData` directory.
with zipfile.ZipFile(bytes_io) as zf:
    zf.extractall('CMAPSSData')

## 2. Read Data

In [3]:
# Column names: two index columns, three operational settings
# and twenty-one sensor measurements (both numbered from 1).

index = ['unit number', 'time, in cycles']
setting = ['operational setting {}'.format(number) for number in range(1, 4)]
sensor = ['sensor measurement {}'.format(number) for number in range(1, 22)]

all_columns = [*index, *setting, *sensor]

In [4]:
def _read_space_separated(path):
    """Read a headerless, single-space separated file and drop every column that contains NaN."""
    frame = pd.read_csv(path, sep=' ', header=None)
    return frame.dropna(axis=1)


# Training and test readings receive the same column names.
train = _read_space_separated('CMAPSSData/train_FD001.txt')
train.columns = all_columns

test = _read_space_separated('CMAPSSData/test_FD001.txt')
test.columns = all_columns

# The RUL file keeps its default integer column labels.
y_test = _read_space_separated('CMAPSSData/RUL_FD001.txt')

## 3. Create columns

### 3.a create `RUL` column
How do we create the **Remaining Useful Life (RUL)** column for the training dataset? We assume that the last entry recorded for each unit in the training dataset marks the end of that unit's life, i.e. its maximum cycle count. The RUL at any given cycle is then that maximum minus the current cycle number.

In [5]:
# Last recorded cycle of every unit, keyed by unit number.
max_cycle_by_unit = train.groupby("unit number")["time, in cycles"].max().to_dict()


def get_max(x):
    """Return the last recorded cycle for unit number ``x``."""
    return max_cycle_by_unit[x]


# Per-row maximum cycle of the row's unit. The lookup dict lives under its own
# name so that `get_max` remains usable after this cell has run.
cycles_max = train['unit number'].map(max_cycle_by_unit)

# RUL = number of cycles left until the unit's last recorded cycle.
train['RUL'] = cycles_max - train["time, in cycles"]

### 3.b create `cutoff_time` column
`cutoff_time` is a datetime column derived from the `cycle` number. We pick a start date and add 10 minutes for every cycle. For the test set, timestamps continue from the last training timestamp of the same unit number.

In [6]:
# Reference date from which all training timestamps are counted.
start = datetime.datetime(2013, 1, 12)


def get_timestamp(x):
    """Convert cycle number ``x`` to ``start`` plus ``x * 10`` minutes."""
    offset = datetime.timedelta(minutes=x * 10)
    return start + offset


train['timestamp'] = train['time, in cycles'].apply(get_timestamp)

In [7]:
# Final training timestamp of each unit, keyed by unit number.
last = train.groupby('unit number')['timestamp'].last().to_dict()


def get_timestamp_test(x):
    """Timestamp for a test row: the unit's last training timestamp plus 10 minutes per cycle."""
    unit_end = last[x['unit number']]
    elapsed = datetime.timedelta(minutes=x['time, in cycles'] * 10)
    return unit_end + elapsed


test['timestamp'] = test.apply(get_timestamp_test, axis=1)

## 4. Format Data

Build the `label_times` tables so that each has exactly three columns: `['turbine_id', 'cutoff_time', 'target']`.

In [8]:
train_label_times = train[['unit number', 'timestamp', 'RUL']].copy()
train_label_times.columns = ['turbine_id', 'cutoff_time', 'target']

# Drop the first 25 readings of every turbine: cumcount() numbers the rows of
# each group starting at 0, so positions 0..24 are filtered out.
# cumcount() only accepts an `ascending` flag, so no column name is passed.
position_in_unit = train_label_times.groupby('turbine_id').cumcount()
train_label_times = train_label_times[position_in_unit > 24]
train_label_times.head()

Unnamed: 0,turbine_id,cutoff_time,target
25,1,2013-01-12 04:20:00,166
26,1,2013-01-12 04:30:00,165
27,1,2013-01-12 04:40:00,164
28,1,2013-01-12 04:50:00,163
29,1,2013-01-12 05:00:00,162


In [9]:
# The last timestamp of each test unit becomes its cutoff time.
test_label_times = (
    test.groupby('unit number')['timestamp']
    .last()
    .reset_index()
    .rename(columns={'unit number': 'turbine_id', 'timestamp': 'cutoff_time'})
)
# Targets are taken from y_test by position and stored as float32.
test_label_times['target'] = np.array(y_test, dtype='float32')
test_label_times.head()

Unnamed: 0,turbine_id,cutoff_time,target
0,1,2013-01-13 13:10:00,112.0
1,2,2013-01-14 08:00:00,98.0
2,3,2013-01-14 02:50:00,69.0
3,4,2013-01-14 01:10:00,82.0
4,5,2013-01-14 13:10:00,91.0


In [10]:
id_columns = ['unit number', 'timestamp']
reading_columns = id_columns + setting + sensor

# Stack train and test readings, then reshape to long format with one row
# per (unit, timestamp, signal) combination.
readings = pd.concat([train, test])[reading_columns]
readings = readings.melt(id_vars=id_columns, var_name='signal_id', value_name='value')
readings = readings.rename(columns={'unit number': 'turbine_id'})

## 5. Save Data

In [11]:
# Write each table as a gzip-compressed CSV without the index column.
outputs = [
    ('rul_readings.csv.gz', readings),
    ('rul_train_target_times.csv.gz', train_label_times),
    ('rul_test_target_times.csv.gz', test_label_times),
]
for filename, table in outputs:
    table.to_csv(filename, compression='gzip', index=False)