In [1]:
# ---------------------------------------------------------------------------
# Template values for an SDATA file: header lines, cell header, one pixel.
# ---------------------------------------------------------------------------

# File-level and segment-level header lines.
FILE_HEADER = "SDATA version 2.0"
SEGMENT_HEADER = "2 2 2 : NX NY NT"
empty_line = ""

# Cell header fields.
NPIXELS = 0
TIMESTAMP = "2008-06-14T14:49:28Z"
HEIGHT_OBS = 70000
NSURF = 0
IFGAS = 0

# Pixel structure: grid indices and cloud flag.
ix = 1
iy = 1
icloudy = 1

# Pixel structure: column / row identifiers.
icol = 1360
irow = 1360

# Pixel structure: position and surface description.
# NOTE(review): x / y look like longitude / latitude and MASL like an
# altitude in metres above sea level -- confirm against the SDATA spec.
x = -17.0130005
y = 14.4720001
MASL = 57.0
land_percent = 0.0

# Number of available wavelengths (matches len(wl) below).
nwl = 6
# Wavelengths, in micrometers.
wl = [
    0.442999989,
    0.490000010,
    0.564999998,
    0.670000017,
    0.865000010,
    1.01999998,
]
# Number of measurement types for each wavelength.
nip = [1, 3, 1, 3, 3, 1]

# The remaining per-pixel fields start out empty.
# Measurement types, meas_type(nip, wl).
meas_type = []
# Number of valid measurements.
nbvm = []
# Solar zenith angle, one per wavelength.
sza = []
# Viewing zenith angle, vza(nbvm, nip, wl).
vza = []
# Relative azimuth angle, raa(nbvm, nip, wl).
raa = []
# Measurements, meas(nbvm, nip, wl).
meas = []
# Covariance-matrix flag (1 = available, 0 = not), ifcov(nip, wl).
ifcov = []
# Vertical-profile (mprof) flag (1 = available, 0 = not), ifmp(nip, wl).
ifmp = []

In [2]:
import numpy as np
import pandas as pd
import json

In [3]:
# Load the normalized POLDER example data that every later cell works from.
df = pd.read_csv("example_polder_normalize_data.csv")

In [4]:
# Per-timestamp header values: one row per TIMESTAMP holding the maximum of
# each header column within that group.
# The columns are selected with a list; tuple-style selection
# (groupby(...)['a', 'b']) was deprecated in pandas 1.0 and raises in pandas 2.0.
timestamp_table = df.groupby('TIMESTAMP')[['HEIGHT_OBS', 'NPIXELS', 'NSURF', 'IFGAS']].max()
# Transposing first makes to_dict() key the outer dict by timestamp:
# {timestamp: {column: value}}.
timestamp_dict = timestamp_table.T.to_dict()

In [5]:
# Per-pixel static values: one row per (ix, iy) holding the minimum of each
# listed column within that group.
# The columns are selected with a list; tuple-style selection
# (groupby(...)['a', 'b']) was deprecated in pandas 1.0 and raises in pandas 2.0.
pixel_table = df.groupby(['ix', 'iy'])[
    ['icloudy', 'icol', 'irow', 'x', 'y', 'MASL', 'land_percent', 'nwl']
].min()
# Transposing first makes to_dict() key the outer dict by the (ix, iy) tuple:
# {(ix, iy): {column: value}}.
pixel_dict = pixel_table.T.to_dict()

In [6]:
# Key columns: each distinct combination of these becomes one row of `table`.
par_list = ['TIMESTAMP', 'ix', 'iy', 'wl', 'nip', 'sza', 'meas_type', 'ifcov', 'ifmp']

# Group once and reuse the grouping for each per-observation column; within a
# group, the column's values are collected into a Python list.
measurement_groups = df.groupby(par_list)
vza_values, raa_values, meas_values = (
    measurement_groups[column].apply(list).rename(column)
    for column in ('vza', 'raa', 'meas')
)

# Place the three list-valued columns side by side and turn the group keys
# back into ordinary columns.
table = pd.concat([vza_values, raa_values, meas_values], axis=1).reset_index()

# nbvm is the length of each row's `meas` list.
table['nbvm'] = table['meas'].apply(len)

In [9]:
# Quick check: shape of the array of distinct TIMESTAMP values in `table`.
table['TIMESTAMP'].unique().shape

(30,)

In [7]:
# Assemble one flat record per (timestamp, ix, iy) from `table`, merged with
# the static per-pixel values held in `pixel_dict`.
dict_of_dicts = {}

for timestamp in timestamp_dict:
    # The 2 x 2 pixel grid is hard-coded; indices are 1-based, as in `table`.
    # Dedicated loop names keep the `ix` / `iy` template constants untouched.
    for pixel_ix in (1, 2):
        for pixel_iy in (1, 2):

            # Rows of `table` belonging to this timestamp and pixel.
            r = table[
                (table['TIMESTAMP'] == timestamp)
                & (table['ix'] == pixel_ix)
                & (table['iy'] == pixel_iy)
            ]

            final_dict = {}

            # Concatenate the per-row lists in row order. Explicit flattening
            # yields [] for an empty selection (Series.sum() would return the
            # integer 0) and avoids re-copying the growing list on every row.
            for column in ('vza', 'raa', 'meas'):
                final_dict[column] = [value for values in r[column] for value in values]

            # `columns=` replaces the positional axis argument, which
            # pandas 2.0 no longer accepts.
            r_reset = r.drop(columns=['vza', 'raa', 'meas'])

            # One entry per row of the selection.
            for column in ('nbvm', 'meas_type', 'ifcov', 'ifmp'):
                final_dict[column] = r_reset[column].tolist()

            # One entry per wavelength: collapse the rows sharing a wavelength
            # by taking the per-column maximum.
            wl_gr_df = (
                r_reset
                .groupby(['TIMESTAMP', 'ix', 'iy', 'wl'])
                .max()
                .reset_index()
            )
            # `nip` comes from the data column. An earlier assignment from
            # meas_type.nunique() was always overwritten by this value, so it
            # has been removed; the key order of the record is unchanged.
            for column in ('nip', 'sza', 'wl'):
                final_dict[column] = wl_gr_df[column].tolist()

            # Add the static pixel values (raises KeyError if the pixel is
            # absent from `pixel_dict`).
            final_dict.update(pixel_dict[(pixel_ix, pixel_iy)])
            dict_of_dicts[(timestamp, pixel_ix, pixel_iy)] = final_dict

In [8]:
# Regroup the flat {(timestamp, ix, iy): record} mapping into a nested one:
# {timestamp: [{'<ix>_<iy>': record}, ...]}.
from collections import defaultdict

success_dict = defaultdict(list)
for (stamp, grid_x, grid_y), pixel_record in dict_of_dicts.items():
    pixel_label = '{}_{}'.format(grid_x, grid_y)
    success_dict[stamp].append({pixel_label: pixel_record})

In [9]:
# Write both lookup structures as JSON under intermediate_data/.
# open(..., 'w') does not create directories, so the folder must already exist.
json_outputs = (
    ('intermediate_data/data_Polder.json', success_dict),
    ('intermediate_data/timestamp_info_Polder.json', timestamp_dict),
)

for json_path, payload in json_outputs:
    with open(json_path, 'w') as json_file:
        json.dump(payload, json_file)