# Prepare Dataset
1. Read the energy (hits) DataFrame and the noise DataFrame
2. Concatenate the two DataFrames
3. Set energy to 0 for all noise rows
4. Create timeslice groups

In [39]:
def read_df(energy_input_file, noise_input_file):
    """
    Load the two input CSV files.

    Parameters
    ----------
    energy_input_file : path of the CSV read as the hits dataframe
    noise_input_file : path of the CSV read as the noise dataframe

    Returns
    -------
    (hits_df, noise_df) : the two dataframes, in that order
    """
    # Tuple elements are evaluated left to right, so the energy file
    # is still read before the noise file.
    return pd.read_csv(energy_input_file), pd.read_csv(noise_input_file)
    
    
def add_label(hits_df):
    """
    Tag every row of the hits dataframe with class label 1.

    The 'label' column is written on the dataframe that was passed in
    (it is modified in place) and that same object is returned.
    """
    hit_class = 1
    hits_df["label"] = hit_class
    return hits_df


def rename_columns(hits_df):
    """
    Return a copy of the hits dataframe whose column names follow the
    noise dataframe's naming scheme.

    Columns not listed in the mapping are left untouched, and the
    dataframe passed in is not modified.
    """
    # Entries that map a name to itself are no-ops; they are kept so
    # the mapping lists every column referenced by the original code.
    column_map = {
        'h.dom_id': 'dom_id',
        'h.pmt_id': 'pmt_id',
        'tot': 'tot',
        'h.t': 'time',
        'h.pos.x': 'pos_x',
        'h.pos.y': 'pos_y',
        'h.pos.z': 'pos_z',
        'h.dir.x': 'dir_x',
        'h.dir.y': 'dir_y',
        'h.dir.z': 'dir_z',
        'energy': 'energy',
        'label': 'label',
    }
    renamed = hits_df.rename(columns=column_map)
    return renamed


def add_energy_to_noise(noise_df):
    """
    Set the 'energy' column of the noise dataframe to 0 for every row,
    marking the rows as noise.

    The dataframe passed in is modified in place and returned.
    """
    noise_energy = 0
    noise_df["energy"] = noise_energy
    return noise_df


def combine_dataframes(hits_df, noise_df):
    """
    Stack the hits dataframe (with energy values) on top of the noise
    dataframe and return the result.

    Rows keep their original index labels; the index is not reset.
    """
    frames = [hits_df, noise_df]
    return pd.concat(frames)
    
    
def cleanup_time(df):
    """
    Drop rows with a negative 'time' and return the remaining rows
    sorted by ascending 'time'.

    The dataframe passed in is left unchanged.
    """
    non_negative = df["time"] >= 0.0
    return df[non_negative].sort_values(by="time")
    
    
def timeslice_grouping(df, slice_width=15000, max_time=1000000000):
    """
    Assign every row to a fixed-width timeslice based on its 'time'.

    Bin edges are 0, slice_width, 2 * slice_width, ... up to (but not
    including) max_time. Bins are closed on the right, and the first
    bin also includes its left edge, so with the defaults time 0..15000
    falls in group 0, 15001..30000 in group 1, and so on. Rows whose
    time lies outside the edges (negative, or above the last edge) get
    NaN as their group.

    Parameters
    ----------
    df : dataframe with a numeric 'time' column (modified in place)
    slice_width : int, width of one timeslice (default 15000, i.e. the
        15000ns slices used originally)
    max_time : int, exclusive upper bound for the bin edges

    Returns
    -------
    The same dataframe, with an added integer-coded 'group' column.

    Raises
    ------
    ValueError : if slice_width is not positive
    """
    if slice_width <= 0:
        raise ValueError("slice_width must be positive")

    # Build the edges once; no need to round-trip through a Series.
    bin_edges = list(range(0, max_time, slice_width))

    df["group"] = pd.cut(df["time"],
                         bin_edges,
                         include_lowest=True,
                         labels=False)

    return df
    
    
def save_dataframe(df, output_path):
    """
    Write the dataframe to '<output_path>df.csv' without the index.

    Parameters
    ----------
    df : dataframe to save
    output_path : string prefix the file name 'df.csv' is appended to;
        it is concatenated as-is, so a directory must end with a path
        separator
    """
    # Write the dataframe that was passed in, not the global `hits_df`.
    df.to_csv(output_path + 'df.csv', index=False)

In [42]:
if __name__ == '__main__':
    import pandas as pd
    import numpy as np
    import matplotlib
    import sys

    energy_input_file = "../../data/energy/energy_data.csv"
    noise_input_file = "../../data/raw/noise.csv"
    output_path = "../../data/energy/"

    # Run the whole preparation pipeline here so that `df` is defined
    # by this block rather than relying on state left over from an
    # earlier run.
    hits_df, noise_df = read_df(energy_input_file, noise_input_file)
    hits_df = add_label(hits_df)
    hits_df = rename_columns(hits_df)
    # Noise rows carry an energy of 0.
    noise_df = add_energy_to_noise(noise_df)
    df = combine_dataframes(hits_df, noise_df)
    df = cleanup_time(df)
    df = timeslice_grouping(df)
    save_dataframe(df, output_path)

In [41]:
df

Unnamed: 0,dom_id,pmt_id,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,tot,time,energy,label,group
7011482,321.0,4.0,-17.661,32.245,65.231,-0.460,-0.266,-0.847,26.0,0.0,0.0,0,0
36188506,1653.0,22.0,11.595,85.465,65.459,-0.955,-0.000,0.296,27.0,0.0,0.0,0,0
6008280,275.0,8.0,-36.464,67.166,160.189,0.415,0.720,-0.556,26.0,0.0,0.0,0,0
36341659,1660.0,22.0,61.660,101.635,169.059,-0.955,-0.000,0.296,26.0,0.0,0.0,0,0
21139713,966.0,15.0,-54.510,-78.323,94.341,-0.827,0.478,-0.296,24.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11438019,523.0,12.0,-57.230,-5.401,196.389,0.415,-0.720,-0.556,28.0,101502104.0,0.0,0,6766
8460308,387.0,11.0,0.724,66.341,121.789,-0.415,-0.720,-0.556,24.0,101516467.0,0.0,0,6767
2062008,95.0,3.0,-26.436,86.737,160.131,-0.460,0.266,-0.847,23.0,101545421.0,0.0,0,6769
16221364,741.0,29.0,-26.931,-21.994,178.511,0.000,-0.830,0.558,27.0,101581891.0,0.0,0,6772
