# I. Set Up

In [1]:
# PYTHON Imports (standard library)
import glob
import json
import math
import os
import pickle
import statistics
import sys
import time
from pathlib import Path
# PYTHON Imports (third-party)
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import ipywidgets as widgets
from IPython.display import clear_output
# ASTROPY Imports
import astropy
from astropy.table import Table
from astropy.io import fits
from sherpa.astro import ui
# CIAO Imports
import ciao_contrib.runtool
from ciao_contrib.runtool import *
# CUSTOM Imports
from data_extraction_functions import *
from data_exploration_functions import *
from data_representation_functions import *

# Specify global path
# Root directory that holds one sub-folder per dataset. It can be overridden
# through the EVENTFILES_DATA_PATH environment variable so the notebook is not
# tied to one machine; when the variable is unset the original location is used.
global_path = os.environ.get(
    'EVENTFILES_DATA_PATH',
    '/Users/steven/Library/Mobile Documents/com~apple~CloudDocs/0-CfA/4-Data/Datasets',
)
# Folder names found under global_path, as returned by the project helper
# list_folders_fun.
global_folders = list_folders_fun(global_path)

# Define a custom encoder that knows how to handle NumPy arrays and scalars
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that can serialise NumPy arrays and NumPy scalars.

    Pass it as ``cls=NumpyEncoder`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, obj):
        """Convert objects the stock encoder cannot handle.

        Parameters
        ----------
        obj : object
            The value ``json`` could not serialise natively.

        Returns
        -------
        A nested list for ``np.ndarray`` and the matching Python scalar for
        NumPy scalar types (e.g. ``np.int64``, ``np.float32``, ``np.bool_``).

        Raises
        ------
        TypeError
            For any other unsupported type (raised by the base class).
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()  # convert numpy array to (nested) list
        if isinstance(obj, np.generic):
            # NumPy scalars such as np.int64 are not Python ints, so the base
            # encoder rejects them; .item() yields the native Python value.
            return obj.item()
        return super().default(obj)

# Select dataset
# Dropdown offering every entry of global_folders; the first one is pre-selected.
set_widget = widgets.Dropdown(
    options=global_folders[:],
    value=global_folders[0],
    description='Set :',
    disabled=False,
)
# Bare expression so the notebook renders the widget as the cell output.
set_widget

Dropdown(description='Set :', options=('Full', 'All', 'New', 'Bona'), value='Full')

# II. Load Data

In [2]:
# Dataset currently selected in the dropdown widget
set_id = set_widget.value
# Read the event-file table of that dataset and group its rows by obsreg_id
eventfiles_csv_path = f'{global_path}/{set_id}/eventfiles-input-{set_id}.csv'
df_eventfiles_input = pd.read_csv(eventfiles_csv_path)
df_eventfiles_group = df_eventfiles_input.groupby('obsreg_id')
# One group per obsreg_id; the group count is reported as the number of eventfiles
print("Number of Eventfiles: ", df_eventfiles_group.ngroups)


Number of Eventfiles:  95473


# III. Create features

2D E-t Histogram

In [4]:
# Binning settings passed to the project helper hist2D
nbins_E = 16
nbins_t = 24  # 24
normalised = 'minmax'  # values used in this notebook: 'none', 'minmax'

# Parallel accumulators: ids and their histograms, appended in the same order
id_list = []
feature_list = []

# Progress bookkeeping
count = 0
count_limit = df_eventfiles_group.ngroups

for id_name, dfi in df_eventfiles_group:
    id_list.append(id_name)
    # Optional debugging plots, left disabled:
    # lc_plotter_fun(dfi,id_name,100)
    # fig,axs=plt.subplots(1,3,figsize=(12,2),constrained_layout = True)
    # plt.subplot(1, 3, 1)
    histogram = hist2D(dfi, id_name, nbins_E, nbins_t, norm=normalised, plot=False)
    feature_list.append(histogram)
    count += 1
    # Replace the previous progress line instead of appending a new one
    clear_output(wait=True)
    print(f'Counter: {count} of {count_limit}')
print('DONE!!!')

# Alternative JSON export (disabled):
# hist_dict = dict(zip(id_list, feature_list))
# with open(f'{global_path}/{set_id}/histEt-{set_id}-nE{nbins_E}-nt{nbins_t}-norm{normalised}.json', 'w') as f:
#     json.dump(hist_dict, f,cls=NumpyEncoder)

# Map each id to its histogram and pickle the dictionary next to the input data
hist_dict = {key: hist for key, hist in zip(id_list, feature_list)}
hist2d_out_path = f'{global_path}/{set_id}/hist2D-{set_id}-nE{nbins_E}-nt{nbins_t}-norm{normalised}.pkl'
with open(hist2d_out_path, 'wb') as f:
    pickle.dump(hist_dict, f)


Counter: 1500 of 95473


KeyboardInterrupt: 

Z: manual z-score normalisation of the 2D histograms (cell currently commented out)

In [5]:
# # Binning Settings
# nbins_E = 16
# nbins_t = 24#24
# normalised = 'none'#'minmax' # 'none' 'minmax' 'Z'

# with open(f'{global_path}/{set_id}/hist2D-{set_id}-nE{nbins_E}-nt{nbins_t}-norm{normalised}.pkl', 'rb') as f:
#     hist_dict = pickle.load(f)
# # Flatten histograms in the dictionary and get IDs
# ids = hist_dict.keys()
# mat_list = hist_dict.values()
# mat_list  = np.array([np.array(h).flatten() for h in mat_list])
# # Z
# mu = np.mean(mat_list, axis=0)
# sigma = np.std(mat_list, axis=0)
# z = (mat_list - mu) / sigma
# zscore_list = [zscore.reshape(mat.shape) for zscore, mat in zip(z, mat_list)]
# hist_dict_Z = dict(zip(ids, zscore_list))

# with open(f'{global_path}/{set_id}/hist2D-{set_id}-nE{nbins_E}-nt{nbins_t}-normZ.pkl', 'wb') as f:
#     pickle.dump(hist_dict_Z, f)

Z2: z-score normalisation of the 2D histograms with scikit-learn's StandardScaler

In [17]:
from sklearn.preprocessing import StandardScaler

# Binning settings; together with `normalised` they select the pickle to read
nbins_E = 16
nbins_t = 24  # 24
normalised = 'none'  # values noted for this setting: 'none', 'minmax', 'Z'

hist2d_in_path = f'{global_path}/{set_id}/hist2D-{set_id}-nE{nbins_E}-nt{nbins_t}-norm{normalised}.pkl'
with open(hist2d_in_path, 'rb') as f:
    hist_dict = pickle.load(f)

# One flattened row per histogram, in the dictionary's key order
ids = hist_dict.keys()
mat_list = np.array([np.asarray(hist).flatten() for hist in hist_dict.values()])

# Standardise every bin (column) across all histograms
scaler = StandardScaler()
Z_scaled = scaler.fit_transform(mat_list)

# Rows are stored flattened (1-D); the histogram's original shape is not
# restored here.
zscore_list = [scaled_row for scaled_row in Z_scaled]
hist_dict_Z = {key: row for key, row in zip(ids, zscore_list)}

hist2d_out_path = f'{global_path}/{set_id}/hist2D-{set_id}-nE{nbins_E}-nt{nbins_t}-normStandard.pkl'
with open(hist2d_out_path, 'wb') as f:
    pickle.dump(hist_dict_Z, f)

INVERT: per-histogram inversion of the 2D histograms (max - h + min)

In [7]:
# Binning settings; together with `normalised` they select the pickle to read
nbins_E = 16
nbins_t = 24  # 24
normalised = 'none'  # values noted for this setting: 'none', 'minmax', 'Z'

# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
with open(f'{global_path}/{set_id}/hist2D-{set_id}-nE{nbins_E}-nt{nbins_t}-norm{normalised}.pkl', 'rb') as f:
    hist_dict = pickle.load(f)
# Flatten histograms in the dictionary and get IDs
ids = hist_dict.keys()
histograms = hist_dict.values()
features = np.array([np.array(h).flatten() for h in histograms])
# NaN bins are treated as empty before taking the per-histogram extrema
features[np.isnan(features)] = 0.0

# Invert every histogram as (max - h + min), using that histogram's own
# extrema. This is computed for all rows at once with NumPy reductions rather
# than Python's built-in max()/min() on each row, which walk the array element
# by element.
total_count = len(features)
if total_count == 0:
    # Nothing to invert; keep the output dictionary empty.
    inverted_hist = []
else:
    row_max = features.max(axis=1, keepdims=True)
    row_min = features.min(axis=1, keepdims=True)
    inverted = row_max - features
    inverted += row_min  # in place, so no second full-size temporary is made
    inverted_hist = list(inverted)  # one flattened (1-D) row per histogram
count = len(inverted_hist)
print(f'{count}/{total_count}')
print('DONE!!!')

hist_dict_invert = dict(zip(ids, inverted_hist))

with open(f'{global_path}/{set_id}/hist2D-{set_id}-nE{nbins_E}-nt{nbins_t}-normINV.pkl', 'wb') as f:
    pickle.dump(hist_dict_invert, f)

95473/95473
DONE!!!


3D Histogram

In [3]:
# Binning settings passed to the project helper hist3D
nbins_E = 16   # other values noted: 20, 17
nbins_t = 24   # other values noted: 24, 30, ?25
nbins_dt = 16  # other values noted: 35, 26
normalised = 'none'  # alternative noted: 'minmax'

# Parallel accumulators: ids and their histograms, appended in the same order
id_list = []
feature_list = []

# Progress bookkeeping
count = 0
count_limit = df_eventfiles_group.ngroups

for id_name, dfi in df_eventfiles_group:
    id_list.append(id_name)
    count += 1
    # Clear the previous progress line before the histogram is computed
    clear_output(wait=True)
    histogram = hist3D(dfi, id_name, nbins_E, nbins_t, nbins_dt, plot=False, norm=normalised)
    feature_list.append(histogram)
    print(f'Counter: {count} of {count_limit}')
print('DONE!!!')

# Map each id to its histogram and pickle the dictionary next to the input data
hist_dict = {key: hist for key, hist in zip(id_list, feature_list)}
hist3d_out_path = f'{global_path}/{set_id}/hist3D-{set_id}-nE{nbins_E}-nt{nbins_t}-ndt{nbins_dt}-norm{normalised}.pkl'
with open(hist3d_out_path, 'wb') as f:
    pickle.dump(hist_dict, f)

Counter: 95473 of 95473
DONE!!!


Z: manual z-score normalisation of the 3D histograms

In [9]:
# Binning settings; together with `normalised` they select the pickle to read
nbins_E = 16
nbins_t = 24  # 24
nbins_dt = 16  # 24
normalised = 'none'  # values noted for this setting: 'none', 'minmax'

# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
with open(f'{global_path}/{set_id}/hist3D-{set_id}-nE{nbins_E}-nt{nbins_t}-ndt{nbins_dt}-norm{normalised}.pkl', 'rb') as f:
    hist_dict = pickle.load(f)
# Flatten histograms in the dictionary and get IDs
ids = hist_dict.keys()
mat_list = np.array([np.array(h).flatten() for h in hist_dict.values()])

# Z-score every bin (column) across all histograms
mu = np.mean(mat_list, axis=0)
sigma = np.std(mat_list, axis=0)
# A bin that has the same value in every histogram has sigma == 0; dividing by
# it would give NaN/inf. Use a divisor of 1 there so those bins come out as 0
# (the same convention scikit-learn's StandardScaler applies).
sigma_safe = np.where(sigma == 0, 1.0, sigma)
z = mat_list - mu
z /= sigma_safe  # in place, so no second full-size temporary is made

# Rows are stored flattened (1-D); the histogram's original shape is not
# restored here.
zscore_list = list(z)
hist_dict_Z = dict(zip(ids, zscore_list))

with open(f'{global_path}/{set_id}/hist3D-{set_id}-nE{nbins_E}-nt{nbins_t}-ndt{nbins_dt}-normZ.pkl', 'wb') as f:
    pickle.dump(hist_dict_Z, f)

Z2: z-score normalisation of the 3D histograms with scikit-learn's StandardScaler

In [15]:
from sklearn.preprocessing import StandardScaler

# Binning settings; together with `normalised` they select the pickle to read
nbins_E = 16
nbins_t = 24  # 24
nbins_dt = 16  # 24
normalised = 'none'  # values noted for this setting: 'none', 'minmax'

hist3d_in_path = f'{global_path}/{set_id}/hist3D-{set_id}-nE{nbins_E}-nt{nbins_t}-ndt{nbins_dt}-norm{normalised}.pkl'
with open(hist3d_in_path, 'rb') as f:
    hist_dict = pickle.load(f)

# One flattened row per histogram, in the dictionary's key order
ids = hist_dict.keys()
mat_list = np.array([np.asarray(hist).flatten() for hist in hist_dict.values()])

# Standardise every bin (column) across all histograms
scaler = StandardScaler()
Z_scaled = scaler.fit_transform(mat_list)

# Rows are stored flattened (1-D); the histogram's original shape is not
# restored here.
zscore_list = [scaled_row for scaled_row in Z_scaled]
hist_dict_Z = {key: row for key, row in zip(ids, zscore_list)}

hist3d_out_path = f'{global_path}/{set_id}/hist3D-{set_id}-nE{nbins_E}-nt{nbins_t}-ndt{nbins_dt}-normStandard.pkl'
with open(hist3d_out_path, 'wb') as f:
    pickle.dump(hist_dict_Z, f)

In [16]:
# Debug check: print the first standardised row. zscore_list comes from
# whichever normalisation cell ran last, and its entries are flattened (1-D)
# arrays, so this prints the full row.
print(zscore_list[0])

[-5.43912421e-02 -1.22061635e-01 -1.15420430e-01 -9.51335163e-02
 -5.95839078e-02 -2.18183399e-02 -5.86287226e-02 -5.24633872e-02
 -4.45369951e-02 -3.59163510e-02 -3.23870109e-02 -2.91397977e-02
 -2.09787645e-02 -1.80223360e-02 -1.96899561e-02 -3.20549986e-02
 -5.35862661e-02 -1.40966397e-01 -1.43467959e-01 -1.11563802e-01
 -8.93574960e-02 -2.16472609e-02 -7.29563376e-02 -6.36175693e-02
 -5.60550054e-02 -4.73040072e-02 -3.69255752e-02 -3.42707503e-02
 -3.12257455e-02 -2.65002248e-02 -2.26604866e-02 -4.29750825e-02
  3.47055284e-01 -1.56306425e-01 -1.82184856e-01 -1.44052787e-01
 -1.16464685e-01 -2.48394753e-02 -1.00926030e-01 -8.26658389e-02
 -7.37418028e-02 -6.01499614e-02 -4.77814691e-02 -4.54717172e-02
 -3.84583107e-02 -3.39630270e-02 -2.70979067e-02 -5.78093057e-02
  2.27092983e-01 -1.59335862e-01 -1.99788562e-01 -1.77476010e-01
 -1.61305837e-01 -3.29264286e-02 -1.29880059e-01 -1.07883370e-01
 -9.36164815e-02 -7.91949523e-02 -6.63172774e-02 -6.11790501e-02
 -5.23569585e-02 -4.35850

INVERT: per-histogram inversion of the 3D histograms (max - h + min)

In [10]:
# Binning settings; together with `normalised` they select the pickle to read
nbins_E = 16
nbins_t = 24  # 24
nbins_dt = 16
normalised = 'none'  # values noted for this setting: 'none', 'minmax'

# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
with open(f'{global_path}/{set_id}/hist3D-{set_id}-nE{nbins_E}-nt{nbins_t}-ndt{nbins_dt}-norm{normalised}.pkl', 'rb') as f:
    hist_dict = pickle.load(f)
# Flatten histograms in the dictionary and get IDs
ids = hist_dict.keys()
histograms = hist_dict.values()
features = np.array([np.array(h).flatten() for h in histograms])
# NaN bins are treated as empty before taking the per-histogram extrema
features[np.isnan(features)] = 0.0

# Invert every histogram as (max - h + min), using that histogram's own
# extrema. This is computed for all rows at once with NumPy reductions rather
# than Python's built-in max()/min() on each row, which walk the array element
# by element.
total_count = len(features)
if total_count == 0:
    # Nothing to invert; keep the output dictionary empty.
    inverted_hist = []
else:
    row_max = features.max(axis=1, keepdims=True)
    row_min = features.min(axis=1, keepdims=True)
    inverted = row_max - features
    inverted += row_min  # in place, so no second full-size temporary is made
    inverted_hist = list(inverted)  # one flattened (1-D) row per histogram
count = len(inverted_hist)
print(f'{count}/{total_count}')
print('DONE!!!')

hist_dict_invert = dict(zip(ids, inverted_hist))

with open(f'{global_path}/{set_id}/hist3D-{set_id}-nE{nbins_E}-nt{nbins_t}-ndt{nbins_dt}-normINV.pkl', 'wb') as f:
    pickle.dump(hist_dict_invert, f)

95473/95473
DONE!!!
