In [1]:
import json
import os
from collections import defaultdict

import numpy as np
import pandas as pd

In [2]:
# Load the input measurements; every later cell derives its tables from `df`.
INPUT_CSV = 'example_polder_normalize_data.csv'
df = pd.read_csv(INPUT_CSV)

In [3]:
# List the column names of the loaded frame before any reshaping.
df.columns

Index(['ix', 'iy', 'icloudy', 'icol', 'irow', 'x', 'y', 'MASL', 'land_percent',
       'nwl', 'wl', 'nip', 'meas_type', 'nbvm', 'sza', 'vza', 'raa', 'meas',
       'ifcov', 'ifmp', 'NPIXELS', 'TIMESTAMP', 'HEIGHT_OBS', 'NSURF',
       'IFGAS'],
      dtype='object')

In [4]:
# Wavelength relabelling applied to the `wl` column: each key is a wavelength
# as read from the CSV (after rounding to 4 decimals), each value is the
# wavelength that replaces it for the rest of the notebook.
wl_swap = {0.443: 0.37,
           0.49: 0.41,
           0.565: 0.555,
           0.67: 0.865,
           0.865: 1.378,
           1.02: 1.61}

# Keep the rounded source wavelengths in their own column and always remap from
# it.  Remapping `wl` in place makes the cell non-repeatable: on a second run
# the already-remapped values are looked up again (0.37 is not a key, so the
# lookup raises KeyError; 0.865 is both a key and a value, so it would be
# remapped a second time).
if 'wl_source' not in df.columns:
    df['wl_source'] = df['wl'].round(4)

# Fail with a readable message rather than a bare KeyError raised from inside
# an element-wise lookup.
unknown_wl = sorted(set(df['wl_source'].unique()) - set(wl_swap))
if unknown_wl:
    raise KeyError('wavelengths missing from wl_swap: {}'.format(unknown_wl))

df['wl'] = df['wl_source'].map(wl_swap)

In [5]:
# Per-timestamp metadata: the maximum of each listed column within a TIMESTAMP
# group.  The columns are selected with a list (double brackets); the bare
# tuple form `groupby(...)['a', 'b']` was deprecated in pandas 1.0 and is
# rejected by pandas 2.x.
timestamp_table = df.groupby('TIMESTAMP')[['HEIGHT_OBS', 'NPIXELS', 'NSURF', 'IFGAS']].max()
# Transpose so that to_dict() is keyed by timestamp: {timestamp: {column: value}}.
timestamp_dict = timestamp_table.T.to_dict()

In [6]:
# Per-pixel attributes: the minimum of each listed column within an (ix, iy)
# group.  The columns are selected with a list (double brackets); the bare
# tuple form `groupby(...)['a', 'b']` was deprecated in pandas 1.0 and is
# rejected by pandas 2.x.
pixel_table = df.groupby(['ix', 'iy'])[['icloudy', 'icol', 'irow', 'x', 'y', 'MASL', 'land_percent']].min()
# Transpose so that to_dict() is keyed by the (ix, iy) tuple:
# {(ix, iy): {column: value}}.
pixel_dict = pixel_table.T.to_dict()

In [7]:
# Grouping keys: the result has one row per distinct combination of these columns.
par_list = ['TIMESTAMP','ix', 'iy', 'wl','nip', 'sza', 'meas_type', 'ifcov', 'ifmp']

# Collapse each group to a single row, keeping vza, raa and meas as Python lists.
by_series = df.groupby(par_list)
vza_values, raa_values, meas_values = (
    by_series[column].apply(list).rename(column)
    for column in ('vza', 'raa', 'meas')
)

table = pd.concat([vza_values, raa_values, meas_values], axis=1).reset_index()
# nbvm = number of entries collected into each row's `meas` list.
table['nbvm'] = table['meas'].map(len)

In [8]:
# Spot-check: first four rows for one timestamp at pixel ix == 1, iy == 1.
table[(table['TIMESTAMP'] =='2008-06-14T14:49:28Z') & (table['ix'] == 1) & (table['iy'] == 1)].head(4)

Unnamed: 0,TIMESTAMP,ix,iy,wl,nip,sza,meas_type,ifcov,ifmp,vza,raa,meas,nbvm
0,2008-06-14T14:49:28Z,1,1,0.37,1,24.109501,41,0,0,"[58.837101000000004, 54.179599800000005, 48.83...","[107.692001, 104.415001, 99.9546967, 93.609298...","[0.18758133100000002, 0.17888978100000003, 0.1...",14
1,2008-06-14T14:49:28Z,1,1,0.41,3,24.109501,41,0,0,"[58.973701500000004, 54.3372002, 49.0166016, 4...","[107.779999, 104.532997, 100.114998, 93.844703...","[0.167410657, 0.15861778, 0.149844468, 0.14162...",14
2,2008-06-14T14:49:28Z,1,1,0.41,3,24.109501,42,0,0,"[58.973701500000004, 54.3372002, 49.0166016, 4...","[107.779999, 104.532997, 100.114998, 93.844703...","[-0.041035451, -0.0329722129, -0.0242278314999...",14
3,2008-06-14T14:49:28Z,1,1,0.41,3,24.109501,43,0,0,"[58.973701500000004, 54.3372002, 49.0166016, 4...","[107.779999, 104.532997, 100.114998, 93.844703...","[-0.046323519199999996, -0.0409283862, -0.0348...",14


In [9]:
# (rows, columns) of the grouped table.
table.shape

(1440, 13)

In [10]:
# Adapt to SCANPOL (added spectral channels): every row recorded with nip == 1
# is repeated once for each meas_type 41, 42 and 43 and relabelled nip == 3,
# then recombined with the rows that already had nip == 3.

one_ch = table[table.nip == 1]
three_ch = table[table.nip == 3]

one_ch_changed = pd.concat(
    [one_ch.assign(meas_type=code) for code in (41, 42, 43)]
).assign(nip=3)

table = (
    pd.concat([one_ch_changed, three_ch])
    .sort_values(['TIMESTAMP', 'ix', 'iy', 'wl'])
    .reset_index(drop=True)
)

# Independent copy of the table with a per-row label "scanpol_<wl>_<meas_type>".
wl_label = table['wl'].astype(str)
type_label = table['meas_type'].astype(str)
scanpol = table.assign(chanel_name='scanpol_' + wl_label + '_' + type_label)

In [15]:
# Keep the meas_type == 41 rows, renumber them, then drop the 1.61 wavelength
# so that five wavelengths remain.  A later cell repeats this filtering before
# remapping the wavelengths.
msip_intens = (
    table.loc[table['meas_type'] == 41]
    .reset_index(drop=True)
    .loc[lambda frame: frame['wl'] != 1.61]
)

In [30]:
# Wavelengths currently present in msip_intens.
# NOTE(review): the saved output lists the remapped values (0.443 ... 0.91),
# which only exist once the wavelength-remapping cell further down has run;
# the execution counts show the notebook was run out of order.
msip_intens.wl.unique()

array([0.443, 0.47 , 0.49 , 0.67 , 0.91 ])

In [18]:
# Build the MSIP intensity channels from the meas_type == 41 rows.
# The rows are read from `scanpol` (the copy of `table` taken right after the
# channel expansion) rather than from `table`: a later cell rebinds `table` to
# scanpol + msip, and re-running this cell against that combined table would
# pick up already-remapped wavelengths and fail the lookup below.
msip_intens = scanpol[scanpol.meas_type == 41].reset_index(drop=True)

# remove one wl because we need only 5
# .copy() makes the filtered rows an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning.
msip_intens = msip_intens[msip_intens.wl != 1.61].copy()

# Relabelling of the remaining wavelengths to the MSIP ones.
swam_wl_to_MSIP = {0.37: 0.443,
                   0.41: 0.47,
                   0.555: 0.49,
                   0.865: 0.67,
                   1.378: 0.91}

# Explicit lookup (not Series.map(dict)) so an unexpected wavelength raises
# KeyError instead of silently becoming NaN.
msip_intens['wl'] = msip_intens['wl'].apply(lambda x: swam_wl_to_MSIP[x])

# Replaces the scanpol_* label carried over from `scanpol`.
msip_intens['chanel_name'] = 'msip_intens_' + msip_intens['wl'].astype(str) + '_' + msip_intens['meas_type'].astype(str)

# sort_values returns a new frame, so `msip` is independent of `msip_intens`.
msip = msip_intens.sort_values(['TIMESTAMP', 'ix', 'iy', 'wl'])

In [19]:
# Stack the scanpol rows and the msip rows into the combined table used below.
# Note: this rebinds `table`, and the original row labels are kept (no
# ignore_index), so index labels can repeat between the two parts.
table = pd.concat([scanpol, msip])

In [32]:
# One row per wavelength in `msip` with the distinct channel names seen for it,
# numbered from 1 for display.
f = msip.groupby('wl')['chanel_name'].unique().reset_index()
f.index = list(range(1, len(f) + 1))
f

Unnamed: 0,wl,chanel_name
1,0.443,[msip_intens_0.443_41]
2,0.47,[msip_intens_0.47_41]
3,0.49,[msip_intens_0.49_41]
4,0.67,[msip_intens_0.67_41]
5,0.91,[msip_intens_0.91_41]


In [31]:
# Sanity check: number of non-null `meas` entries per (wl, chanel_name) in msip.
msip.groupby(['wl', 'chanel_name']).meas.count().reset_index()

Unnamed: 0,wl,chanel_name,meas
0,0.443,msip_intens_0.443_41,120
1,0.47,msip_intens_0.47_41,120
2,0.49,msip_intens_0.49_41,120
3,0.67,msip_intens_0.67_41,120
4,0.91,msip_intens_0.91_41,120


In [23]:
# Build one record per (timestamp, ix, iy): the flattened vza / raa / meas
# lists, the per-row bookkeeping lists, the per-wavelength summaries and the
# static per-pixel attributes.
dict_of_dicts = {}

# Rows of a record are always ordered by these keys, so the flattened lists
# line up with the per-row lists (nbvm, meas_type, ...).
sort_keys = ['TIMESTAMP', 'ix', 'iy', 'wl', 'meas_type']

for timestamp in timestamp_dict:
    # Walk the pixels that exist in pixel_dict (keyed by (ix, iy)) instead of
    # assuming a fixed 2 x 2 grid.
    for ix, iy in pixel_dict:

        r = table[(table['TIMESTAMP'] == timestamp) & (table['ix'] == ix) & (table['iy'] == iy)]
        r = r.sort_values(sort_keys).reset_index(drop=True)
        final_dict = {}

        # Concatenate the per-row lists in row order.  Done explicitly rather
        # than with Series.sum(), which returns the integer 0 (not an empty
        # list) when no row matches.
        for column in ('vza', 'raa', 'meas'):
            final_dict[column] = [value for values in r[column] for value in values]

        # `columns=` keyword: the positional axis argument of DataFrame.drop
        # was removed in pandas 2.0.
        r_reset = r.drop(columns=['vza', 'raa', 'meas'])

        # One entry per row of the record.
        final_dict['nbvm'] = r_reset['nbvm'].tolist()
        final_dict['meas_type'] = r_reset['meas_type'].tolist()
        final_dict['ifcov'] = r_reset['ifcov'].tolist()
        final_dict['ifmp'] = r_reset['ifmp'].tolist()

        wl_gr = r_reset.groupby(['TIMESTAMP','ix', 'iy', 'wl'])

        # One entry per wavelength: number of distinct channel names.
        final_dict['nip'] = wl_gr.chanel_name.nunique().tolist()

        wl_gr_df = wl_gr.max().reset_index()

        # One entry per wavelength: maximum sza and the wavelength itself.
        final_dict['sza'] = wl_gr_df['sza'].tolist()
        final_dict['wl'] = wl_gr_df['wl'].tolist()
        final_dict['nwl'] = wl_gr_df['wl'].nunique()

        # Static attributes of this pixel.
        final_dict.update(pixel_dict[(ix, iy)])

        dict_of_dicts[(timestamp, ix, iy)] = final_dict

In [24]:
# NOTE(review): r_reset is left over from the record-building loop, so this
# shows only the frame from its last iteration.
r_reset.head()

Unnamed: 0,TIMESTAMP,ix,iy,wl,nip,sza,meas_type,ifcov,ifmp,nbvm,chanel_name
0,2008-08-19T14:36:24Z,2,2,0.37,3,19.719,41,0,0,14,scanpol_0.37_41
1,2008-08-19T14:36:24Z,2,2,0.37,3,19.719,42,0,0,14,scanpol_0.37_42
2,2008-08-19T14:36:24Z,2,2,0.37,3,19.719,43,0,0,14,scanpol_0.37_43
3,2008-08-19T14:36:24Z,2,2,0.41,3,19.719,41,0,0,14,scanpol_0.41_41
4,2008-08-19T14:36:24Z,2,2,0.41,3,19.719,42,0,0,14,scanpol_0.41_42


In [27]:
# Full display of r_reset as left by the last iteration of the record-building
# loop; rows are ordered by wl, then meas_type.
r_reset

Unnamed: 0,TIMESTAMP,ix,iy,wl,nip,sza,meas_type,ifcov,ifmp,nbvm,chanel_name
0,2008-08-19T14:36:24Z,2,2,0.37,3,19.719,41,0,0,14,scanpol_0.37_41
1,2008-08-19T14:36:24Z,2,2,0.37,3,19.719,42,0,0,14,scanpol_0.37_42
2,2008-08-19T14:36:24Z,2,2,0.37,3,19.719,43,0,0,14,scanpol_0.37_43
3,2008-08-19T14:36:24Z,2,2,0.41,3,19.719,41,0,0,14,scanpol_0.41_41
4,2008-08-19T14:36:24Z,2,2,0.41,3,19.719,42,0,0,14,scanpol_0.41_42
5,2008-08-19T14:36:24Z,2,2,0.41,3,19.719,43,0,0,14,scanpol_0.41_43
6,2008-08-19T14:36:24Z,2,2,0.443,3,19.719,41,0,0,14,msip_intens_0.443_41
7,2008-08-19T14:36:24Z,2,2,0.47,3,19.719,41,0,0,14,msip_intens_0.47_41
8,2008-08-19T14:36:24Z,2,2,0.49,3,19.719,41,0,0,14,msip_intens_0.49_41
9,2008-08-19T14:36:24Z,2,2,0.555,3,19.719,41,0,0,14,scanpol_0.555_41


In [28]:
# To nested dict: regroup the flat {(timestamp, ix, iy): record} mapping into
# {timestamp: [{"<ix>_<iy>": record}, ...]} so that every key is a string.
success_dict = defaultdict(list)
for (timestamp_key, pix_x, pix_y), record in dict_of_dicts.items():
    success_dict[timestamp_key].append({'{}_{}'.format(pix_x, pix_y): record})

In [29]:
# Write the nested measurement records and the per-timestamp metadata as JSON.
# The output folder is created first so that open() does not fail with
# FileNotFoundError when the folder does not exist yet.
os.makedirs('intermediate_data', exist_ok=True)

with open('intermediate_data/data_Scanpol+MSIP.json', 'w') as file:
    json.dump(success_dict, file)

with open('intermediate_data/timestamp_info_Scanpol+MSIP.json', 'w') as file:
    json.dump(timestamp_dict, file)