# In [68]:
'''
Write Modules :

1. Get all HDF files (+)
2. Getting the Unix timestamp of all H5 files and python datetime objects (+)
3. Record for each dataset: Dataset Name, Parent Group Name, HDF File Name, Singleton Value, Size, Shape, Data type
4. Open a HDF File and traverse and add to CSV (+)
5. Progress Bar (+)
6. Write Error Handling Module
7. Caching Module
8. Create a dict for a dataset
9. Searching Modules
10. Query Based Searching
11. Searching Error Handling
12. Get Attributes
13. Load Central CSV Data
'''

import os
import threading
import sys
from datetime import datetime
import pytz
import h5py
import csv
import progressbar
import sys
import multiprocessing
from tempfile import NamedTemporaryFile
import shutil
import pandas as pd
import numpy as np
import csv


# Separator between the fields of a data file name
# (used by convert_unix_time_to_datetype to split the name).
filename_split = "_"
# Time zone name handed to pytz when converting file timestamps.
CERN_timezone = "Europe/Zurich"
# Running progress counter: init_csv_file advances it after each file and
# reports it to the progress bar.
no_of_files_indexed=0
# NOTE(review): no function visible in this section reads this module-level
# value; the functions that use `count` assign their own local first.
count=0

# Header row written at the top of every per-file CSV (see init_csv_file).
fieldnames = ['DatasetName', 'GroupName',
              'DatasetAttributes', 'DatasetValue', 'Shape', 'Datatype']


def get_all_files(dir_filepath, file_type):
    """Return the names of the entries in ``dir_filepath`` ending in ``file_type``.

    Args:
        dir_filepath: Directory to list (passed straight to ``os.listdir``).
        file_type: Extension to keep, with or without the leading dot
            (``"h5"`` and ``".h5"`` are equivalent).

    Returns:
        A list of bare entry names (not full paths), in ``os.listdir`` order.
    """
    wanted = "." + str(file_type).lstrip(".")
    # Compare the *final* extension, so names that contain extra dots
    # ("run.v2.h5") are found and "data.h5.bak" is not mistaken for an h5 file.
    return [
        name for name in os.listdir(dir_filepath)
        if os.path.splitext(name)[1] == wanted
    ]

def get_all_CSV_files(dir_h5_filepath, file_type="csv"):
    """Return the names of the entries in ``dir_h5_filepath`` ending in ``.<file_type>``.

    The original body compared against a name ``file_type`` that was never
    defined in this function; it is now an optional parameter defaulting to
    ``"csv"``, in line with the function name.

    Args:
        dir_h5_filepath: Directory to list (passed to ``os.listdir``).
        file_type: Extension to keep, with or without the leading dot.

    Returns:
        A list of bare entry names, in ``os.listdir`` order.
    """
    suffix = "." + str(file_type).lstrip(".")
    return [
        name for name in os.listdir(dir_h5_filepath)
        if name.endswith(suffix)
    ]


def convert_unix_time_to_datetype(filename):
    """Parse the nanosecond Unix timestamp embedded in a data file name.

    The name is split on ``filename_split``. A leading ``"optimized"`` field
    is skipped, after which the fields are read as
    ``<19-digit timestamp>, <field>, <field>``.

    Args:
        filename: File name such as ``"<timestamp>_<a>_<b>.h5"`` or
            ``"optimized_<timestamp>_<a>_<b>.csv"``.

    Returns:
        ``(utc_dt, cern_dt, field1, field2)`` where ``utc_dt`` is a naive UTC
        ``datetime``, ``cern_dt`` is the same instant as an aware ``datetime``
        in ``CERN_timezone``, and ``field1``/``field2`` are the two name fields
        that follow the timestamp. Returns ``None`` when the name does not
        carry a 19-digit timestamp followed by two more fields.
    """
    parts = filename.split(filename_split)
    # An "optimized" prefix shifts every field one position to the right;
    # the trailing fields must be shifted as well, not only the timestamp.
    offset = 1 if parts[0] == "optimized" else 0
    if len(parts) < offset + 3:
        return None
    stamp = parts[offset]
    if len(stamp) != 19 or not stamp.isdecimal():
        return None
    # Integer arithmetic keeps the whole-second part exact.
    seconds = int(stamp) // 10**9
    cern_tz = pytz.timezone(CERN_timezone)
    aware_utc = datetime.fromtimestamp(seconds, tz=pytz.utc)
    cern_dt = cern_tz.normalize(aware_utc.astimezone(cern_tz))
    # Callers receive a naive UTC datetime, as before.
    utc_dt = aware_utc.replace(tzinfo=None)
    return (utc_dt, cern_dt, parts[offset + 1], parts[offset + 2])
    
def init_csv_file(dir_h5_filepath, dir_csv_name, HDF_thread_arr, progress, no_of_threads):
    """Index every HDF5 file of one work chunk into its own CSV file.

    For each name in ``HDF_thread_arr`` a CSV with the same base name is
    created under ``dir_csv_name``, the ``fieldnames`` header is written, and
    ``write_into_CSV`` appends one row per dataset found in the HDF5 file.

    Args:
        dir_h5_filepath: Prefix prepended to each HDF5 file name (plain string
            concatenation, so include the trailing separator).
        dir_csv_name: Prefix prepended to each CSV file name (same rule).
        HDF_thread_arr: Iterable of HDF5 file names handled by this worker.
        progress: Progress bar object; its ``update`` method is called with the
            running counter after each file.
        no_of_threads: Amount added to the module-level
            ``no_of_files_indexed`` counter after each file.
    """
    global no_of_files_indexed
    for h5_name in HDF_thread_arr:
        csv_file_name = os.path.splitext(h5_name)[0]
        # newline='' is what the csv module asks for on files it writes.
        with open(dir_csv_name + csv_file_name + ".csv", mode='w', newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter=',',
                                quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(fieldnames)
            # The context manager closes the HDF5 handle even if traversal fails.
            with h5py.File(dir_h5_filepath + str(h5_name), 'r') as h5_file:
                write_into_CSV(h5_file, writer, str(h5_file))
        no_of_files_indexed = no_of_files_indexed + no_of_threads
        shown = no_of_files_indexed
        # The counter is only an estimate when the increment is larger than 1,
        # so keep it inside the bar's range.
        # NOTE(review): progressbar2 raises ValueError for values above
        # max_value by default - confirm against the installed version.
        limit = getattr(progress, "max_value", None)
        if isinstance(limit, int):
            shown = min(shown, limit)
        progress.update(shown)

def init_thread_method(dir_h5_filepath, dir_csv_files, no_of_threads):
    """Index all HDF5 files of a directory into CSV files using worker threads.

    The files are partitioned by ``divide_datasets_for_threads`` and one
    thread running ``init_csv_file`` is started per chunk; the call returns
    once every thread has finished.

    Args:
        dir_h5_filepath: Directory prefix of the HDF5 files.
        dir_csv_files: Directory prefix for the generated CSV files.
        no_of_threads: Number of chunks requested from
            ``divide_datasets_for_threads``.
    """
    global no_of_files_indexed
    HDF_thread_arr, max_size = divide_datasets_for_threads(dir_h5_filepath, no_of_threads)
    thread_arr = []
    with progressbar.ProgressBar(max_value=max_size, redirect_stdout=True) as progress:
        progress.update(no_of_files_indexed)
        # One thread per chunk actually returned, so no chunk is skipped and
        # a short chunk list cannot raise IndexError.
        for i, chunk in enumerate(HDF_thread_arr):
            worker = threading.Thread(
                # Pass the callable and its arguments separately; calling
                # init_csv_file(...) here would run it in this thread and
                # hand its None result to Thread.
                target=init_csv_file,
                # Threads share the module-level counter, so each finished
                # file must count exactly once.
                # NOTE(review): the counter update itself is not locked; a
                # lost update would only affect the displayed progress.
                args=(dir_h5_filepath, dir_csv_files, chunk, progress, 1),
                name='t' + str(i),
            )
            thread_arr.append(worker)
            worker.start()
        for worker in thread_arr:
            worker.join()

def init_process_method(dir_h5_filepath, dir_csv_files, no_of_threads):
    """Index all HDF5 files of a directory into CSV files using worker processes.

    The files are partitioned by ``divide_datasets_for_threads`` and one
    process running ``init_csv_file`` is started per chunk; the call returns
    once every process has finished.

    Args:
        dir_h5_filepath: Directory prefix of the HDF5 files.
        dir_csv_files: Directory prefix for the generated CSV files.
        no_of_threads: Number of chunks requested from
            ``divide_datasets_for_threads``; also passed to each worker as
            its per-file progress increment.
    """
    global no_of_files_indexed
    HDF_thread_arr, max_size = divide_datasets_for_threads(dir_h5_filepath, no_of_threads)
    process_arr = []
    with progressbar.ProgressBar(max_value=max_size, redirect_stdout=True) as progress:
        progress.update(no_of_files_indexed)
        # Iterate over the chunks actually returned rather than
        # range(no_of_threads): a remainder chunk is no longer dropped and a
        # short chunk list cannot raise IndexError.
        for i, chunk in enumerate(HDF_thread_arr):
            worker = multiprocessing.Process(
                target=init_csv_file,
                args=(dir_h5_filepath, dir_csv_files, chunk, progress, no_of_threads),
                name='t' + str(i),
            )
            process_arr.append(worker)
            worker.start()
        for worker in process_arr:
            worker.join()

def get_dataset_attr(file):
    """Render a mapping of attributes as the string form of a dict.

    Args:
        file: Mapping-like object offering ``keys()`` and item access
            (``write_into_CSV`` passes a dataset's ``attrs``).

    Returns:
        ``str()`` of a dict mapping each key to ``str(value)``, or an empty
        string when the mapping has no keys.
    """
    stringified = {name: str(file[name]) for name in file.keys()}
    if len(file.keys()) == 0:
        return ""
    return str(stringified)


def divide_datasets_for_threads(dir_h5_filepath, no_of_threads):
    """Split the ``.h5`` files of a directory into one chunk per worker.

    Exactly ``no_of_threads`` contiguous chunks are returned. Their sizes
    differ by at most one, every file appears in exactly one chunk, and
    chunks are empty when there are fewer files than workers.

    Args:
        dir_h5_filepath: Directory scanned with ``get_all_files(..., "h5")``.
        no_of_threads: Number of chunks to produce; must be at least 1.

    Returns:
        ``(chunks, total)`` where ``chunks`` is a list of ``no_of_threads``
        lists of file names and ``total`` is the number of files found.

    Raises:
        ValueError: If ``no_of_threads`` is smaller than 1.
    """
    if no_of_threads < 1:
        raise ValueError("no_of_threads must be at least 1")
    all_HDF_files = list(get_all_files(dir_h5_filepath, "h5"))
    max_size = len(all_HDF_files)
    # The first `extra` chunks take one additional file. A floored segment
    # size would be 0 for fewer files than workers (range() rejects a zero
    # step) and would spill the remainder into an extra chunk otherwise.
    base, extra = divmod(max_size, no_of_threads)
    HDF_thread_arr = []
    start = 0
    for worker_index in range(no_of_threads):
        stop = start + base + (1 if worker_index < extra else 0)
        HDF_thread_arr.append(all_HDF_files[start:stop])
        start = stop
    return HDF_thread_arr, max_size


def create_central_CSV(dir_csv_filepath, dir_csv_central_filepath, central_csv_file):
    """Write the rows shared between the per-file CSVs into one central CSV.

    A row is collected once it has been seen a second time (in the same or
    another CSV) and its fourth column is non-empty. From the collected rows
    only those whose first column is unique among them are written out.

    When the inputs start with an identical header row, as the files written
    by ``init_csv_file`` do, that header is itself a repeated row and is
    written to the central file like any other.

    Args:
        dir_csv_filepath: Directory prefix of the per-file CSVs (plain string
            concatenation, so include the trailing separator).
        dir_csv_central_filepath: Directory prefix of the central CSV.
        central_csv_file: File name of the central CSV, which is overwritten.
    """
    central_path = dir_csv_central_filepath + central_csv_file
    seen_rows = set()       # every distinct row met so far, as a tuple
    collected_keys = set()  # rows already placed in similar_rows
    similar_rows = []
    for file in get_all_files(dir_csv_filepath, "csv"):
        source_path = dir_csv_filepath + file
        # Do not feed a previously written central file back into itself.
        if os.path.abspath(source_path) == os.path.abspath(central_path):
            continue
        with open(source_path, 'r', newline='') as csvfile:
            for row in csv.reader(csvfile):
                key = tuple(row)
                if key not in seen_rows:
                    seen_rows.add(key)
                # Rows with fewer than four columns (e.g. blank lines) have
                # no value column to test and are ignored.
                elif key not in collected_keys and len(row) > 3 and row[3] != "":
                    collected_keys.add(key)
                    similar_rows.append(row)

    # Keep only rows whose first column occurs once among the repeated rows.
    # Done once after all files are read, which also covers an empty directory.
    name_counts = {}
    for row in similar_rows:
        name_counts[row[0]] = name_counts.get(row[0], 0) + 1
    optimize_similar_rows = [row for row in similar_rows if name_counts[row[0]] == 1]

    with open(central_path, "w", newline='') as csvfile:
        csv.writer(csvfile).writerows(optimize_similar_rows)

def optimize_csv_files(dir_csv_filepath, dir_csv_central_filepath, central_csv_file):
    """Replace rows that also exist in the central CSV by a reference to it.

    Each CSV in ``dir_csv_filepath`` is rewritten as ``optimized_<name>`` and
    the original is deleted. For a row whose first column matches a central
    row, column 1 becomes the zero-based position of that central row
    (counted after the central file's first row, which is skipped) and
    columns 2-5 are blanked. All other rows are copied unchanged.

    Files already named ``optimized_*`` and the central file itself are left
    alone, so the function can be run again safely.

    Args:
        dir_csv_filepath: Directory prefix of the per-file CSVs (plain string
            concatenation, so include the trailing separator).
        dir_csv_central_filepath: Directory prefix of the central CSV.
        central_csv_file: File name of the central CSV.
    """
    central_path = dir_csv_central_filepath + central_csv_file
    with open(central_path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        central_rows = list(reader)

    # Map first column -> position, skipping the first central row. Assigning
    # in order keeps the last position for a repeated name, matching a full
    # scan without an early exit.
    central_index = {}
    for position, central_row in enumerate(central_rows[1:]):
        if central_row:
            central_index[central_row[0]] = position

    for file in get_all_files(dir_csv_filepath, "csv"):
        source_path = dir_csv_filepath + file
        if file.startswith("optimized_"):
            continue
        if os.path.abspath(source_path) == os.path.abspath(central_path):
            continue
        with open(source_path, 'r', newline='') as csvfile, \
                open(dir_csv_filepath + "optimized_" + file, 'w', newline='') as csv_file:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer = csv.writer(csv_file, delimiter=',',
                                quotechar='"', quoting=csv.QUOTE_MINIMAL)
            for row in reader:
                # Rows shorter than six columns are copied through untouched.
                if len(row) >= 6 and row[0] in central_index:
                    row[1] = central_index[row[0]]
                    row[2] = row[3] = row[4] = row[5] = ""
                writer.writerow(row)
        os.remove(source_path)

def get_dataset_value(size, file):
    """Return the single element of a one-element dataset in printable form.

    Args:
        size: Element count, as an int or a string accepted by ``int()``.
        file: Object exposing ``dtype`` and integer indexing.

    Returns:
        ``""`` when ``size`` is 0 or greater than 1. Otherwise the first
        element: its ``ord()`` value for dtype ``|S1``, its UTF-8 decoded
        text for other dtypes containing ``"S"``, else ``str()`` of it.
    """
    element_count = int(size)
    if element_count == 0 or element_count > 1:
        return ""

    dtype_name = str(file.dtype)
    first = file[0]
    if dtype_name == "|S1":
        return ord(first)
    if "S" in dtype_name:
        return str(first.decode('utf-8'))
    return str(first)

def write_into_CSV(file, writer, main_file_ptr):
    """Recursively write one CSV row per dataset found below an HDF5 group.

    Each row holds: dataset name, parent group name, stringified attributes,
    single value (see ``get_dataset_value``), shape and dtype. Sub-groups are
    visited depth-first. Nothing happens when ``file`` is not an
    ``h5py.Group``.

    Args:
        file: ``h5py.Group`` (an open ``h5py.File`` qualifies) to traverse.
        writer: ``csv.writer``-like object receiving the rows.
        main_file_ptr: Passed on unchanged to the recursive calls; not
            otherwise used here.
    """
    if not isinstance(file, h5py.Group):
        return
    for sub in file.keys():
        # Resolve the child once instead of on every access.
        child = file[sub]
        if isinstance(child, h5py.Dataset):
            DatasetAttributes = get_dataset_attr(child.attrs)
            # Metadata extraction is best effort: fall back to a placeholder
            # on failure, but let KeyboardInterrupt/SystemExit through (a bare
            # ``except:`` would swallow those too).
            try:
                size = str(child.size)
            except Exception:
                size = "0"
            try:
                dataset_value = get_dataset_value(size, child)
            except Exception:
                dataset_value = ""
            try:
                datatype = str(child.dtype)
            except Exception:
                datatype = ""
            writer.writerow([child.name, file.name, DatasetAttributes,
                             dataset_value, child.shape, datatype])
        elif isinstance(child, h5py.Group):
            write_into_CSV(child, writer, main_file_ptr)
        
# Searching Modules !

def open_file(file_path):
    """Load a CSV file into a pandas DataFrame.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    # The context manager closes the handle once pandas has read the data;
    # previously the file object was never closed.
    with open(file_path, "r") as handle:
        return pd.read_csv(handle)

def get_dataset(filename, parentGroup):
    """Read every member of one group of an HDF5 file into a dict.

    Args:
        filename: Path of the HDF5 file, opened read-only.
        parentGroup: Name (path) of the group inside the file.

    Returns:
        A dict with ``'DatasetName'`` set to the ``name`` attribute of the
        opened file object, plus one entry per member of the group. A member
        that yields exactly one element is stored under its own leaf name
        with that element as value; any other member is stored under
        ``"<group name>/<leaf name>"`` with the list of its elements.
    """
    # The context manager closes the file; the values are copied into
    # Python lists before that happens.
    with h5py.File(filename, 'r') as h5File:
        group = h5File[parentGroup]
        dataset = {'DatasetName': h5File.name}
        for key in group.keys():
            member = group[key]
            leaf = member.name.split("/")[-1]
            val = list(member)
            if len(val) == 1:
                dataset[leaf] = val[0]
            else:
                dataset[group.name + "/" + str(leaf)] = val
    return dataset

def search_by_dataset_name(dir_h5_filepath, dir_csv_filepath, dataset_name):
    """Find a dataset by name in the CSV index and load it from the HDF5 file.

    Every CSV in ``dir_csv_filepath`` is searched for rows whose
    ``DatasetName`` contains ``dataset_name`` (treated as a regular
    expression by pandas). The files with at least one match are listed with
    their timestamp, the user picks one, and the group of its first matching
    row is loaded with ``get_dataset``.

    Args:
        dir_h5_filepath: Directory prefix of the HDF5 files (plain string
            concatenation, so include the trailing separator).
        dir_csv_filepath: Directory prefix of the CSV index files.
        dataset_name: Text or pattern to look for in the ``DatasetName`` column.

    Returns:
        The dict built by ``get_dataset`` for the chosen file, or ``None``
        when no CSV contains a match.

    Raises:
        ValueError: If the entered selection is not an integer within the
            listed range.
    """
    final_solution = []
    for file in get_all_files(dir_csv_filepath, "csv"):
        df = open_file(dir_csv_filepath + file)
        matches = df[df['DatasetName'].str.contains(str(dataset_name), na=False)]
        # Only offer files that really contain the dataset; an empty match
        # set has no GroupName to load.
        if not matches.empty:
            final_solution.append([file, matches])
    if not final_solution:
        return None

    count = len(final_solution)
    for position, (file, _) in enumerate(final_solution, start=1):
        converted = convert_unix_time_to_datetype(file)
        # Fall back to the raw file name when it carries no parsable timestamp.
        label = str(converted[1]) if converted is not None else file
        print (str(position)+" "+label)

    val_selected = int(input("Enter Which file to load (1,"+str(count)+")"))
    if not 1 <= val_selected <= count:
        raise ValueError("selection must be between 1 and " + str(count))
    # Use the entry the user picked rather than always the last one listed.
    chosen_file, chosen_matches = final_solution[val_selected - 1]

    base_name = os.path.splitext(chosen_file)[0]
    if base_name.startswith("optimized_"):
        base_name = base_name[len("optimized_"):]
    filename = base_name + ".h5"
    # NOTE(review): optimize_csv_files overwrites GroupName with a numeric
    # reference for rows shared with the central CSV; such a value is not a
    # usable group path here - confirm how those rows should be resolved.
    return get_dataset(dir_h5_filepath + filename, chosen_matches['GroupName'].values[0])
    
