# Prepare Dataset
1. Read Energy df
2. Create timeslice groups

In [1]:
def read_df(energy_input_file):
    """
    Load the energy hits CSV into a pandas DataFrame.

    energy_input_file: path (or file-like object) handed straight to
    ``pd.read_csv`` with default parsing options.

    Returns the DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(energy_input_file)
    
    
def add_label(hits_df):
    """
    Mark every row of the hits dataframe as belonging to the hits class.

    A constant ``label`` column with value 1 is written onto ``hits_df``
    itself (the input is modified in place), and that same object is
    returned so the call can be used in an assignment chain.
    """
    hits_class = 1
    hits_df['label'] = hits_class
    return hits_df


def rename_columns(hits_df):
    """
    Strip the ``h.`` prefix style from the hits columns so that they use
    the same names as the noise df from the main work.

    Columns not listed in the mapping are left untouched. The renamed
    dataframe returned by ``DataFrame.rename`` is handed back to the caller.
    """
    # Source column name -> name used by the noise dataframe.
    column_map = {
        'h.dom_id': 'dom_id',
        'h.pmt_id': 'pmt_id',
        'h.tot': 'tot',
        'h.t': 'time',
        'h.pos.x': 'pos_x',
        'h.pos.y': 'pos_y',
        'h.pos.z': 'pos_z',
        'h.dir.x': 'dir_x',
        'h.dir.y': 'dir_y',
        'h.dir.z': 'dir_z',
    }
    renamed_df = hits_df.rename(columns=column_map)
    return renamed_df
    


def add_energy_to_noise(noise_df):
    """
    ***Only use if adding Noise DF to Hits DF***

    Tag every row of the noise dataframe with an energy value of 0 so
    that noise entries can be told apart once merged with the hits.

    The ``energy`` column is written onto ``noise_df`` itself (in place)
    and the same object is returned.
    """
    noise_energy = 0
    noise_df["energy"] = noise_energy
    return noise_df


def combine_dataframes(hits_df, noise_df):
    """
    ***Only use if adding Noise DF to Hits DF***

    Stack the hits dataframe (with energy values) on top of the noise
    dataframe using ``pd.concat`` with its default options, so the
    original row indices of both inputs are kept as they are.
    """
    frames = [hits_df, noise_df]
    return pd.concat(frames)
    
    
def cleanup_time(df):
    """
    Keep only rows whose ``time`` is non-negative and order them by time.

    Returns a new, time-sorted dataframe; the rows keep their original
    index labels and the input dataframe is not reordered.
    """
    is_non_negative = df.time >= 0.0
    return df[is_non_negative].sort_values(by=['time'])
    
    
def timeslice_grouping(df, slice_width=15000, max_time=1000000000):
    """
    Assign every row to a fixed-width time slice.

    A ``group`` column holding the 0-based slice number is written onto
    ``df`` itself (in place) and the same object is returned.

    df: dataframe with a numeric ``time`` column.
    slice_width: width of one slice as a positive integer, in the same
        unit as ``time`` (default 15000, i.e. 15000ns slices).
    max_time: exclusive integer upper bound used to generate the bin
        edges ``0, slice_width, 2 * slice_width, ...`` (default 1e9).

    Slices are closed on the right: a time exactly on an edge belongs to
    the slice that ends there, except time 0 which is included in slice 0.
    Times outside the generated edges fall in no bin and get NaN.

    Raises ValueError if ``slice_width`` is not positive.
    """
    if slice_width <= 0:
        raise ValueError("slice_width must be a positive integer")

    # Build the edges once; pd.cut only needs a plain sequence of scalars.
    bin_edges = list(range(0, max_time, slice_width))

    df["group"] = pd.cut(df.time,
                         bin_edges,
                         include_lowest=True,
                         labels=False)

    return df
    
    
def save_dataframe(df, output_path):
    """
    Write the dataframe as ``df.csv`` under the given location.

    ``output_path`` is used as a plain string prefix: the file name
    ``df.csv`` is appended to it directly, so a directory path has to end
    with its path separator. The index is not written to the file.
    """
    destination = output_path + 'df.csv'
    df.to_csv(destination, index=False)

In [2]:
if __name__ == '__main__':
    # These imports are bound at module level, which is where the helper
    # functions above look up `pd` when they are called below.
    import pandas as pd
    import numpy as np
    import matplotlib 
    import sys

    # Input CSV with the energy hits, and the prefix the result is saved under.
    energy_input_file = "../../data/energy/energy_data.csv"
    output_path = "../../data/energy/"

    # Load the hits, tag them with the hits label and normalise column names.
    hits_df = rename_columns(add_label(read_df(energy_input_file)))

    # Drop negative times, sort by time, then assign the timeslice groups.
    df = timeslice_grouping(cleanup_time(hits_df))

    # Persist the prepared dataset as df.csv.
    save_dataframe(df, output_path)

In [3]:
df

Unnamed: 0,dom_id,pmt_id,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,h.tot,time,energy,label,group
325995,1054,32658,-16.064,-76.599,112.441,0.000,0.955,-0.296,35,28449.0,13.415,1,1
325997,1054,32671,-16.208,-76.707,112.611,-0.719,0.415,0.558,24,28450.0,13.415,1,1
325992,1053,32639,-16.064,-76.624,122.011,0.000,0.830,0.558,12,28450.0,13.415,1,1
325996,1054,32670,-16.064,-76.624,112.611,0.000,0.830,0.558,54,28450.0,13.415,1,1
325998,1054,32664,-15.968,-76.625,112.559,0.478,0.827,0.296,47,28451.0,13.415,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239871,2016,62496,88.433,-60.180,38.011,0.719,-0.415,0.558,23,99980920.0,12.888,1,6665
239869,1953,60527,30.897,-58.959,121.841,0.000,0.955,-0.296,26,99981723.0,12.888,1,6665
238077,1673,51848,61.686,101.731,47.241,-0.827,0.478,-0.296,25,99985448.0,52.453,1,6665
238075,1566,48536,40.629,67.440,37.959,0.478,0.827,0.296,29,99985904.0,52.453,1,6665
