# Combine the Energy Info with Hits DF

1. Read HD5 files
2. Files are: `mc_info` and `mc_hits`
3. Combine `energy` from `mc_info` with `mc_hits`
4. Save new hits df

## Combining Energy with Hits
The combination is based on index positions in `mc_hits`.

In [1]:
def read_hdf5(hdf5_path):
    """
    Reads and returns the HDF5 file using provided key and path
    """
    info_key = '/data/mc_info'
    hits_key = '/data/mc_hits'
    
    mc_info = pd.read_hdf(hdf5_path, key=info_key)
    mc_hits = pd.read_hdf(hdf5_path, key=hits_key)
    
    return mc_info, mc_hits


def add_energy(mc_info, mc_hits):
    """
    Add energy values from the `mc_info` appropriately
    to the `mc_hits`. The `nu.hits.start` and `nu.hits.end`
    values in `mc_info` correspond to the index positions 
    in `mc_hits`. 
    
    Example:
    
    |-----------------|---------------|
    | `nu.hits.start` | `nu.hits.end' |
    |---------------------------------|
    |0                |5              |
    
    correspond to the first five rows in `mc_hits`
    
    **Note**: pd.cut cannout be used in this case since labels (here, energy values) need to 
    be unique
    """
    hits_df = pd.DataFrame()
    
    for idx, row in mc_info.iterrows():
        slice = mc_hits.iloc[int(row['nu.hits.start']): int(row['nu.hits.end'])]
        slice['energy'] = row['nu_E']
        hits_df = pd.concat([hits_df, slice])
        
    return hits_df   

def save_dataframe(hits_df, output_path):
    """
    Saves dataframe at required Path
    """
    hits_df.to_csv(output_path + 'energy_data.csv', index=False )

In [None]:
if __name__ == '__main__':
    import pandas as pd
    import numpy as np
    import matplotlib 
    import sys

    hdf5_path = '../../data/raw/events.h5'
    output_path = '../../data/energy/'
    
    mc_info, mc_hits = read_hdf5(hdf5_path)
    hits_df = add_energy(mc_info, mc_hits)
#     save_dataframe(hits_df, output_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
hits_df

In [7]:
mc_info

Unnamed: 0,nu_E,type,nu_dir.x,nu.dir.y,nu.dir.z,nu.pos.x,nu.pos.y,nu.pos.z,nu.hits.start,nu.hits.end
0,15.540,-14,-0.630831,0.436518,0.641486,-85.337,154.304,35.735,0,5
1,11.458,14,0.070157,0.942066,-0.328009,19.081,169.073,-92.435,5,6
2,11.485,14,0.367514,0.049922,-0.928677,-61.314,130.036,-64.078,6,11
3,11.571,14,-0.177418,0.344584,0.921838,48.457,76.398,112.710,11,15
4,13.148,-14,-0.997387,0.065683,0.030071,138.488,-166.544,52.433,15,17
...,...,...,...,...,...,...,...,...,...,...
5730,73.038,-14,-0.119695,-0.113570,-0.986294,-102.301,10.814,259.625,489453,489455
5731,79.484,14,-0.220923,0.896138,-0.384876,51.252,56.680,72.604,489455,489817
5732,35.602,14,-0.665612,0.267922,0.696548,191.109,-60.389,-7.956,489817,489820
5733,10.022,14,0.868500,-0.195664,0.455438,-126.753,61.647,-71.618,489820,489897
