# Setup and libraries

## Load the needed libraries

These are the libraries I will be using for this notebook

In [157]:
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import json

%matplotlib inline

# Write files

In [168]:
# This section produces the data for a generative model of the Lunar Lander
# Create a single dataframe with all the data
# each row is a single run of a single model
# each column is a single timestep of a single variable

# NOTE:  There needs to be some thinking here.  I mean, while the position,
# velocity, and angle are all continuous, the thrust is not.  So, we need to
# thinkg about how to interpolate the thrust. I think the data in this case
# needs to be "ragged" in the sense that each row has a different number of
# entries.  However, perhaps we can also just look at the "shortest" run and
# truncate all the other runs to that length. 


# TODO:  This is getting close, but is not there yet.  I want things like
# 'x,x,x' to be something like 'x1,x2,x3' so that I can use the autoencoder
# more easily.  Is that a matter of combining the columns?  I think so.  
# How about keeping a dict to map times to indices?  That would work I think.

def uniform_data_for_autoencoder(info, entries_per_run=100):
    all_data = []
    for model_name in info['models']:
        for run_idx in range(info['number_of_trajectories']):
            df = pd.read_parquet(f'data/lander/{model_name}_{run_idx}_trajectory.parquet')  
            # There is a nice way to resample the data in pandas, but it requires a datetime index
            df['timestamp'] = pd.to_datetime(df.index, unit='s')
            df.set_index('timestamp', inplace=True)

            # However, we just want the number of seconds since the start of the run
            # so we just keep that as a column
            df['time_seconds'] = (df.index - df.index[0]).total_seconds()

            # We now compute the delta t that gives us 100 total sample points for each run
            # We do this by taking the total time of the run and dividing by 100
            total_time = df.index[-1] - df.index[0]
            delta_t = total_time / entries_per_run
            df = df.resample(delta_t).interpolate()

            # Ok, now things are resampled and interpolated, but we need to get rid of the
            # datetime index and replace it with a simple integer index and the number of seconds.
            #multi_index = pd.MultiIndex.from_arrays([np.arange(len(df))], names=('idx',))
            df.index = np.arange(len(df))

            # Melt makes a mutli-column dataframe into a single column dataframe (well, actually
            # a pair of columns, one for the variable name and one for the value).  
            df_melt = pd.melt(df, 
                  value_vars=['x', 'y', 'vx', 'vy', 'theta', 'vtheta', 'time_seconds'], 
                  var_name='parameter', 
                  ignore_index=False, 
                  value_name=(model_name, run_idx),
                  col_level=0)

            # We now have a dataframe with a single column, but we want to make the index
            # better for later slicing.  In particular, we want to make the index a multi-index
            # with the first index being the row number and the second index being the parameter
            # name.  This will make it easy to slice out all the x values, for example.
            df_melt.index = pd.MultiIndex.from_arrays([df_melt.index, df_melt['parameter']],names=('idx', 'parameter'))
            df_melt.drop(columns=['parameter'], inplace=True)

            # We now have a dataframe with a single column, but we want each experiment to be
            # a single row.  
            experiment = df_melt.T

            # Last but not least we want to add the model name and run index to the dataframe
            experiment.index = pd.MultiIndex.from_tuples(experiment.index, names=('run_idx', 'experiment'))
            
            all_data.append(experiment)
    all_data = pd.concat(all_data)
    return all_data
info = json.load(open('data/lander/info.json', 'r'))
all_data = uniform_data_for_autoencoder(info)

In [174]:
all_data.to_parquet('data/lander/all_data.parquet')

In [170]:
# Example of slicing out x,y values for time stepss 1..4 for all the runs of all the models
all_data.loc[:, (range(1,5),('x','y'))]

Unnamed: 0_level_0,idx,1,1,2,2,3,3,4,4
Unnamed: 0_level_1,parameter,x,y,x,y,x,y,x,y
run_idx,experiment,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
random,0,-0.012747,1.393964,-0.018426,1.385643,-0.024104,1.377323,-0.029783,1.369002
random,1,0.006329,1.401989,0.008917,1.397606,0.011504,1.393223,0.014092,1.388841
random,2,0.000228,1.400376,0.000316,1.395521,0.000404,1.390667,0.000492,1.385813
random,3,-0.009627,1.393494,-0.013567,1.383633,-0.017507,1.373771,-0.021448,1.363909
random,4,-0.012488,1.406938,-0.017625,1.394574,-0.022763,1.382211,-0.027901,1.369847
...,...,...,...,...,...,...,...,...,...
better,11,-0.004091,1.401998,-0.004021,1.387828,-0.003952,1.373659,-0.003882,1.359489
better,12,-0.004860,1.398826,-0.004690,1.384684,-0.004520,1.370542,-0.004350,1.356400
better,13,0.005949,1.387995,0.006160,1.373964,0.006370,1.359933,0.006581,1.345902
better,14,0.019459,1.427793,0.032523,1.436952,0.045586,1.446111,0.058650,1.455270


In [176]:
#  Just the better runs, but all the x values
all_data.loc[('better', slice(None)), (slice(None),('x',))]

Unnamed: 0_level_0,idx,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
Unnamed: 0_level_1,parameter,x,x,x,x,x,x,x,x,x,x,...,x,x,x,x,x,x,x,x,x,x
run_idx,experiment,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
better,0,8.2e-05,0.000679,0.001276,0.001872,0.002469,0.003066,0.003662,0.004259,0.004856,0.005452,...,0.014534,0.014159,0.013784,0.013409,0.013033,0.012658,0.012283,0.011908,0.011533,0.011158
better,1,0.007717,0.026439,0.045162,0.062855,0.080548,0.097449,0.11435,0.130413,0.146477,0.160684,...,-0.08756,-0.08757,-0.087571,-0.087571,-0.087571,-0.087571,-0.087571,-0.087571,-0.087571,-0.087571
better,2,-0.005465,-0.005837,-0.006208,-0.00658,-0.006952,-0.007324,-0.007695,-0.008067,-0.008439,-0.00881,...,-0.039288,-0.039659,-0.040031,-0.040403,-0.040774,-0.041146,-0.041518,-0.041889,-0.042261,-0.042633
better,3,0.003687,0.010454,0.017221,0.023988,0.030755,0.037522,0.044289,0.051055,0.057822,0.064589,...,0.056008,0.056008,0.056008,0.056008,0.056008,0.056008,0.056008,0.056008,0.056008,0.056008
better,4,0.002357,0.003117,0.003877,0.004637,0.005397,0.006157,0.006917,0.007677,0.008437,0.009197,...,0.071524,0.072284,0.073044,0.073804,0.074564,0.075324,0.076084,0.076844,0.077604,0.078364
better,5,0.001581,0.002536,0.003492,0.004447,0.005403,0.006358,0.007314,0.008269,0.009225,0.01018,...,0.018012,0.017247,0.016483,0.015718,0.014954,0.014189,0.013425,0.01266,0.011896,0.011131
better,6,-0.005896,-0.006738,-0.007579,-0.008421,-0.009262,-0.010104,-0.010945,-0.011787,-0.012628,-0.01347,...,-0.08247,-0.083311,-0.084152,-0.084994,-0.085835,-0.086677,-0.087518,-0.08836,-0.089201,-0.090043
better,7,6.8e-05,-0.000168,-0.000404,-0.000641,-0.000877,-0.001114,-0.00135,-0.001587,-0.001823,-0.00206,...,-0.021448,-0.021685,-0.021921,-0.022158,-0.022394,-0.022631,-0.022867,-0.023104,-0.02334,-0.023577
better,8,0.003535,0.004304,0.005073,0.005842,0.006611,0.00738,0.008149,0.008918,0.009687,0.010456,...,0.004291,0.003372,0.002452,0.001533,0.000614,-0.000306,-0.001225,-0.002145,-0.003064,-0.003983
better,9,0.006468,0.006675,0.006881,0.007088,0.007294,0.007501,0.007707,0.007914,0.00812,0.008327,...,0.025261,0.025467,0.025674,0.02588,0.026087,0.026293,0.0265,0.026706,0.026913,0.027119
