In [None]:
import os
import pandas as pd
from pathlib import Path
import re

In [None]:
# Root folder for the downloaded dataset (a path relative to the current working directory).
dataset_dir = '../datasets/nasa'

# exist_ok=True keeps the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(dataset_dir, exist_ok=True)

In [None]:
%%bash
# Stop at the first failing command so that a failed `cd` or download cannot
# let the remaining commands run in the wrong directory or on a stale archive.
set -euo pipefail
cd ../datasets/nasa
# Always fetch a fresh copy of the archive.
rm -f FLTz_2.zip
wget https://c3.nasa.gov/dashlink/static/media/dataset/FLTz_2.zip
# -o overwrites previously extracted files without prompting (a notebook cell
# cannot answer the overwrite prompt on a re-run); -q keeps the cell output short.
unzip -o -q FLTz_2.zip

In [None]:
# Folder that holds 'ParameterList_Header.dat' and the data sub-directories read below
# (assumes FLTz_2.zip unpacks into a top-level 'FLTz' folder -- TODO confirm).
fltz_dir = os.path.join(dataset_dir, 'FLTz')

In [None]:
# Build the list of column names: one entry per line of the header file,
# keeping only the text before the first '%' (presumably a trailing
# description/comment -- verify against the file) with whitespace trimmed.
header_path = os.path.join(fltz_dir, 'ParameterList_Header.dat')
with open(header_path) as header_file:
    header = [raw_line.partition('%')[0].strip() for raw_line in header_file]

In [None]:
def last_numeric_part(value):
    """Return the last run of ASCII digits found in ``str(value)`` as an int.

    Used as a sort key so that paths are ordered by their trailing number
    numerically rather than lexicographically.

    Args:
        value: Any object; it is converted with ``str()`` before matching.

    Returns:
        int: The value of the final digit run, or 0 when the text contains
        no digits at all.
    """
    # Match a digit run that is followed only by non-digits up to the end of
    # the text. re.search returns the leftmost such match, i.e. the start of
    # the final digit run, so the whole run is captured -- including when the
    # run begins at the very first character (e.g. "42" or "12.csv").
    match = re.search(r'([0-9]+)[^0-9]*$', str(value))
    return int(match.group(1)) if match else 0

def get_data_files():
    """List every data file found one level below ``fltz_dir``.

    The sub-directories of ``fltz_dir`` are visited in ascending order of
    ``last_numeric_part``; inside each one, the regular files are ordered by
    the same key.

    Returns:
        list[pathlib.Path]: All files, grouped by sub-directory in visit order.
    """
    subdirs = sorted(
        (entry for entry in Path(fltz_dir).iterdir() if entry.is_dir()),
        key=last_numeric_part,
    )
    collected = []
    for subdir in subdirs:
        files = [entry for entry in subdir.iterdir() if entry.is_file()]
        files.sort(key=last_numeric_part)
        collected.extend(files)
    return collected

def read_data(header, data_file):
    """Load one headerless CSV file into a DataFrame whose fields stay as text.

    Args:
        header: Column names to assign, in file order.
        data_file: Path (str or path-like) of the CSV file to read.

    Returns:
        pandas.DataFrame: One column per entry of ``header``.
    """
    csv_options = dict(
        header=None,
        names=header,
        index_col=False,
        # Keep every field as a string so numeric text is not altered by a
        # round trip through a float type.
        dtype=str,
    )
    return pd.read_csv(str(data_file), **csv_options)

# Read every data file in the order returned by get_data_files() and stack
# the results into one frame with a fresh 0..n-1 index.
dataframes = []
for path in get_data_files():
    dataframes.append(read_data(header, path))
df = pd.concat(dataframes, ignore_index=True)

In [None]:
# Velocity as floats (the frame was read with dtype=str, so it holds text).
velocity = df['g_sen.ub'].astype('float')
# Rolling average of velocity over the last 10 rows (including the current row).
# NOTE(review): df is a concatenation of several files, so the window also
# spans file boundaries -- confirm that this is intended.
df['rolling_velocity'] = velocity.rolling(10).mean()
# The reference for a row is the rolling average as of the previous row.
previous_rolling = df['rolling_velocity'].shift(1)
# class is True when the current velocity exceeds that reference.
df['class'] = velocity.gt(previous_rolling)
# A comparison against NaN yields False, not a missing value, so rows whose
# reference is undefined are removed explicitly; otherwise the first row with
# a defined rolling_velocity would keep a spurious False class. dropna() then
# removes any remaining row that has a missing value in any column.
df = df[previous_rolling.notna()].dropna()

In [None]:
# The exported columns are grouped by kind and concatenated in output order.
# Parameters that were deliberately left out are named in comments together
# with the reason for leaving them out.

# Excluded meta-data, not relevant for prediction:
#   'timeStamp', 'g_exec.runtime', 'g_dyn.lat', 'g_dyn.lon', 'g_dyn.alt'

# NOTE(review): phi is conventionally the roll angle and psi the yaw angle, so
# the "Yaw"/"Roll" labels below may be swapped -- confirm against
# ParameterList_Header.dat.
_sensor_columns = [
    'g_sen.phi',     # Yaw
    'g_sen.psi',     # Roll
    'g_sen.theta',   # Pitch
    'g_sen.ias',     # Indicated airspeed
    'g_sen.hdgmag',  # Magnetic heading
    # 'g_sen.ub' (velocity) is excluded because it is used to generate the class.
    'g_sen.vb',      # Lateral velocity
    'g_sen.wb',      # Vertical velocity
    'g_sen.pb',      # Roll rate
    'g_sen.qb',      # Pitch rate
    'g_sen.rb',      # Yaw rate
]

_acceleration_columns = [
    # 'g_aero.ubd' (forward body-axis acceleration) is excluded because it is
    # related to the velocity that generates the class.
    'g_aero.vbd',  # Lateral body-axis acceleration
    'g_aero.wbd',  # Vertical body-axis acceleration
    'g_aero.pbd',  # Body-axis roll acceleration
    'g_aero.qbd',  # Body-axis pitch acceleration
    'g_aero.rbd',  # Body-axis yaw acceleration
]

_actuator_columns = [
    # The 'g_eos.fail_flag[N]' columns (N = 1, 2, 7, 8, 9, 13) are excluded
    # because each has a constant value.
    'g_act.posn[1]',   # Left aileron
    'g_act.posn[2]',   # Right aileron
    'g_act.posn[7]',   # Left elevators
    'g_act.posn[8]',   # Right elevators
    # 'g_act.posn[9]' (flaps) is excluded because it has a constant value.
    'g_act.posn[13]',  # Rudder
]

# Excluded because engine thrust is highly predictive on its own:
#   'g_eng.thrust[0]', 'g_eng.thrust[1]'

column_set = _sensor_columns + _acceleration_columns + _actuator_columns + ['class']

# Keep only the selected columns; this also discards the helper
# 'rolling_velocity' column, which is not part of column_set.
df = df.loc[:, column_set]
# The row count doubles as the random seed, so the shuffle is reproducible
# for a given dataset size.
row_count = len(df.index)
# frac=1 without replacement returns every row exactly once, in random order.
shuffled_df = df.sample(frac=1, replace=False, random_state=row_count)

In [None]:
# Write both variants next to the raw data, without the index column:
# the rows in their original order and the shuffled rows.
for frame, filename in ((df, 'nasa-concat.csv'), (shuffled_df, 'nasa-shuffled.csv')):
    frame.to_csv(os.path.join(dataset_dir, filename), index=False)