# Setup and libraries

## Load the needed libraries

These are the libraries I will be using for this notebook

In [1]:
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import json

%matplotlib inline

# Write files

In [3]:
# This section produces the data for a generative model of the Lunar Lander
# Create a single dataframe with all the data
# each row is a single run of a single model
# each column is a single timestep of a single variable

# NOTE:  There needs to be some thinking here.  I mean, while the position,
# velocity, and angle are all continuous, the thrust is not.  So, we need to
# thinkg about how to interpolate the thrust. I think the data in this case
# needs to be "ragged" in the sense that each row has a different number of
# entries.  However, perhaps we can also just look at the "shortest" run and
# truncate all the other runs to that length. 


# TODO:  This is getting close, but is not there yet.  I want things like
# 'x,x,x' to be something like 'x1,x2,x3' so that I can use the autoencoder
# more easily.  Is that a matter of combining the columns?  I think so.  
# How about keeping a dict to map times to indices?  That would work I think.

def uniform_data_for_autoencoder(info, entries_per_run=100):
    all_data = []
    for model_name in info['models']:
        for run_idx in range(info['number_of_trajectories']):
            # If the data is not there, skip it
            try:
                df = pd.read_parquet(f'data/lander/{model_name}_{run_idx}_trajectory.parquet')
            except:
                continue  
            # There is a nice way to resample the data in pandas, but it requires a datetime index
            df['timestamp'] = pd.to_datetime(df.index, unit='s')
            df.set_index('timestamp', inplace=True)

            # However, we just want the number of seconds since the start of the run
            # so we just keep that as a column
            df['time_seconds'] = (df.index - df.index[0]).total_seconds()

            # We now compute the delta t that gives us 100 total sample points for each run
            # We do this by taking the total time of the run and dividing by 100
            total_time = df.index[-1] - df.index[0]
            delta_t = total_time / entries_per_run
            df = df.resample(delta_t).interpolate()

            # Ok, now things are resampled and interpolated, but we need to get rid of the
            # datetime index and replace it with a simple integer index and the number of seconds.
            #multi_index = pd.MultiIndex.from_arrays([np.arange(len(df))], names=('idx',))
            df.index = np.arange(len(df))

            # Melt makes a mutli-column dataframe into a single column dataframe (well, actually
            # a pair of columns, one for the variable name and one for the value).  
            df_melt = pd.melt(df, 
                  value_vars=['x', 'y', 'vx', 'vy', 'theta', 'vtheta', 'time_seconds'], 
                  var_name='parameter', 
                  ignore_index=False, 
                  value_name=(model_name, run_idx),
                  col_level=0)

            # We now have a dataframe with a single column, but we want to make the index
            # better for later slicing.  In particular, we want to make the index a multi-index
            # with the first index being the row number and the second index being the parameter
            # name.  This will make it easy to slice out all the x values, for example.
            df_melt.index = pd.MultiIndex.from_arrays([df_melt.index, df_melt['parameter']],names=('idx', 'parameter'))
            df_melt.drop(columns=['parameter'], inplace=True)

            # We now have a dataframe with a single column, but we want each experiment to be
            # a single row.  
            experiment = df_melt.T

            # Last but not least we want to add the model name and run index to the dataframe
            experiment.index = pd.MultiIndex.from_tuples(experiment.index, names=('run_idx', 'experiment'))
            
            all_data.append(experiment)
    all_data = pd.concat(all_data)
    return all_data
info = json.load(open('data/lander/info.json', 'r'))
all_data = uniform_data_for_autoencoder(info)

In [8]:
all_data.to_parquet('data/lander_all_data.parquet')

In [9]:
# Example of slicing out x,y values for time stepss 1..4 for all the runs of all the models
all_data.loc[:, (range(1,5),('x','y'))]

Unnamed: 0_level_0,idx,1,1,2,2,3,3,4,4
Unnamed: 0_level_1,parameter,x,y,x,y,x,y,x,y
run_idx,experiment,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
random,0,0.000561,1.398726,0.000738,1.393250,0.000914,1.387774,0.001091,1.382298
random,1,0.003602,1.391796,0.005058,1.382432,0.006515,1.373068,0.007971,1.363704
random,2,-0.020722,1.437806,-0.033625,1.454408,-0.046528,1.471010,-0.059432,1.487612
random,3,-0.013278,1.400356,-0.020517,1.384466,-0.027755,1.368576,-0.034993,1.352686
random,4,0.010832,1.385774,0.015690,1.371940,0.020548,1.358106,0.025406,1.344271
...,...,...,...,...,...,...,...,...,...
better,1019,0.002454,1.395517,0.002832,1.381412,0.003210,1.367306,0.003588,1.353201
better,1020,-0.003768,1.394727,-0.004727,1.380628,-0.005686,1.366530,-0.006645,1.352432
better,1021,0.017744,1.430014,0.029783,1.441164,0.041822,1.452314,0.053861,1.463463
better,1022,-0.001521,1.382835,-0.002581,1.357445,-0.003641,1.332054,-0.004701,1.306664


In [10]:
#  Just the better runs, but all the x values
all_data.loc[('better', slice(None)), (slice(None),('x',))]

Unnamed: 0_level_0,idx,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
Unnamed: 0_level_1,parameter,x,x,x,x,x,x,x,x,x,x,...,x,x,x,x,x,x,x,x,x,x
run_idx,experiment,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
better,0,0.007230,0.007206,0.007183,0.007159,0.007135,0.007111,0.007088,0.007064,0.007040,0.007016,...,0.005069,0.005046,0.005022,0.004998,0.004974,0.004951,0.004927,0.004903,0.004879,0.004856
better,1,-0.001349,-0.002106,-0.002863,-0.003620,-0.004377,-0.005134,-0.005890,-0.006647,-0.007404,-0.008161,...,0.010133,0.011336,0.012539,0.013742,0.014945,0.016148,0.017351,0.018554,0.019757,0.020960
better,2,-0.000076,-0.000826,-0.001577,-0.002328,-0.003079,-0.003829,-0.004580,-0.005331,-0.006082,-0.006832,...,-0.007194,-0.006452,-0.005710,-0.004968,-0.004226,-0.003484,-0.002742,-0.002000,-0.001258,-0.000516
better,3,-0.003848,-0.005713,-0.007578,-0.009443,-0.011308,-0.013173,-0.015038,-0.016903,-0.018768,-0.020633,...,-0.041705,-0.040354,-0.039003,-0.037652,-0.036301,-0.034950,-0.033599,-0.032248,-0.030897,-0.029546
better,4,-0.002006,-0.005120,-0.008235,-0.011350,-0.014465,-0.017579,-0.020694,-0.023809,-0.026924,-0.030038,...,-0.064577,-0.064237,-0.063898,-0.063559,-0.063220,-0.062880,-0.062541,-0.062202,-0.061863,-0.061523
better,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
better,1019,0.002076,0.002454,0.002832,0.003210,0.003588,0.003966,0.004344,0.004722,0.005100,0.005478,...,0.036474,0.036852,0.037230,0.037608,0.037986,0.038364,0.038742,0.039120,0.039498,0.039876
better,1020,-0.002809,-0.003768,-0.004727,-0.005686,-0.006645,-0.007604,-0.008563,-0.009523,-0.010482,-0.011441,...,-0.090083,-0.091042,-0.092001,-0.092960,-0.093920,-0.094879,-0.095838,-0.096797,-0.097756,-0.098715
better,1021,0.005705,0.017744,0.029783,0.041822,0.053861,0.065210,0.076559,0.087908,0.099257,0.110532,...,0.009147,0.009073,0.009073,0.009073,0.009073,0.009073,0.009073,0.009073,0.009073,0.009073
better,1022,-0.000462,-0.001521,-0.002581,-0.003641,-0.004701,-0.005760,-0.006820,-0.007880,-0.008940,-0.010000,...,0.001198,0.001235,0.001272,0.001310,0.001347,0.001385,0.001422,0.001460,0.001497,0.001535
