<h1 align="center">Experimental Preprocessing Pipeline</h1>

<h2>Importing all Packages</h2>

In [1]:
import numpy as np
import random
import numpy as np
import tqdm
import json
import matplotlib.pyplot as plt
import h5py
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin #for GridSearchCv
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from pathlib import Path

## Converting Files

In [None]:
class ConvertingFiles():
    """
    Parameters:
    None
    
    Return
    (event_data, event_lengths) (packed np.arrays): data converted into .npy format
    """
    
    def __init__(self, run):
        self.run = run
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
        file_h5py = h5py.File(X)
        keys_ls = list(file_h5py.keys())[0]
        keys = file_h5py[keys_ls]
        
        length_run = len(keys)
        event_lengths = np.zeros(length_run,int)
        
        for i,e in enumerate(keys):
            length = len(keys[e])
            if length > 0: #no point with zero events 
                event_lengths[i] = length
                
        valid_mask = event_lengths > 0
        event_lengths = event_lengths[valid_mask] #removing the point with the zero point cloud
        num_valid = np.sum(valid_mask)
        event_data = np.full((num_valid, np.max(event_lengths), 4), np.nan)

        valid_idx = 0 #only need to iterate through non-zero events
        valid_keys = []
        for idx, key in enumerate(tqdm.tqdm(keys, desc="Stripping Point Clouds of ID information")):
            dataset = keys[key]
            if len(dataset) == 0:
                continue
            valid_keys.append(key)
            for n in range(len(dataset)):
                event_data[valid_idx, n] = dataset[n][:4]
            valid_idx += 1
        
        valid_keys = np.array(valid_keys)
        np.save(f"/Volumes/researchEXT/O16/ml models/data_exp_pred/run{self.run}_valid_nonzero_keys.npy",valid_keys)
        
        
        assert all(len(event_data[i]) == np.max(event_lengths) for i in range(len(event_data))), "Array length does not match number of events"
        
        return (event_data,event_lengths)

<h2>Outlier Detection</h2>

In [3]:
class OutlierDetection:
    """
    Parameters
    None

    Return
    (event_data,new_event_lengths) (packed np.arrays): outliers removed packed data

    """
    def __init__(self):
        pass

    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
        data,event_lengths = X
        event_data = np.full(data.shape, np.nan)
        new_event_lengths = np.full_like(event_lengths, np.nan)
        tot_count = 0

        for i in range(len(data)):
            event_points = data[i,:]
            condition = ((-270 <= event_points[:, 0]) & (event_points[:, 0] <= 270) &   
                (-270 <= event_points[:, 1]) & (event_points[:, 1] <= 270) &
                (0 <= event_points[:, 2]) & (event_points[:, 2]  <= 1500))
            allowed_points = event_points[condition] #only allows points that are not outliers

            event_data[i,:len(allowed_points)] = allowed_points #only assigns the valid points to the new array

            new_event_lengths[i] = len(allowed_points)  #original event number minus the number of outliers
            tot_count+=event_lengths[i] - new_event_lengths[i]

        print(f"Number of outlier points removed: {tot_count}") 
        return (event_data,new_event_lengths)

<h2>Up/Down Scaling</h2>

In [4]:
class UpDownScaling(BaseEstimator,TransformerMixin):
    """
    Parameters
    target_size (int): which is the number of point I want to up/doen sample to 

    Return
    new_data (np.array): up/down sampled data with shape (run_events, target_size,4) 

    """
    def __init__(self,target_size,isotope,dimension=4):
        self.target_size = target_size
        self.pcloud_zeros = 0 #count if there are zero points in an event
        self.dimension = dimension 
        self.isotope = isotope

    def fit(self,X,y=None):
        return self 

    def transform(self,X,y=None): #for up/down scaling
        data,event_lengths = X #with shape (file,event_lenghts) X needs to be the only input to preserve the conventions of custom transformer
        len_run = len(data)
        # new_array_name = isotope + '_size' + str(sample_size) + '_sampled'
        new_data = np.full((len_run, self.target_size, self.dimension), np.nan) 

        for i in tqdm.tqdm(range(len_run),desc="Up/Downscaling in progress"): #
            ev_len = event_lengths[i] #length of event-- i.e. number of instances
            if ev_len == 0: #if event length is 0
                print(f"This event has 0 length: {i}")
                self.pcloud_zeros+=1
                continue
            if ev_len > self.target_size: #upsample
                random_points = np.random.choice(ev_len, self.target_size, replace=False)  #choosing the random instances to sample
                for r in range(len(random_points)):  # #only adds random sample_size points 
                    new_data[i,r] = data[i,random_points[r]]

            else:
                new_data[i,:ev_len,:] = data[i,:ev_len,:] #downsample
                need = self.target_size - ev_len
                random_points = np.random.choice(ev_len, need, replace= True if need > ev_len else False) #only repeats points more points needed than event length 
                count = ev_len
                for r in random_points:
                    new_data[i,count] = data[i,r]
                    if np.isnan(new_data[i, count, 0]):
                        print(f"NaN found at event {i}, index {count}") #need to make sure no nans remain
                    count += 1


        
        
        assert self.pcloud_zeros == 0, f"There are {self.pcloud_zeros} events with no points"
        
        assert new_data.shape == (len_run, self.target_size, self.dimension), 'Array has incorrect shape'
        assert not np.isnan(new_data).any(), "NaNs detected in new_data" #very imporant to make sure there are no nans 
        
        print(f"Resampled shape of data: {new_data.shape}")
        return new_data

<h2>Scaling</h2>

In [5]:
class ScalingData(BaseEstimator,TransformerMixin):
    """
    Parameters
    None

    Return
    X (np.array): MinMaxScaler() applied data for all columns

    """
    def __init__(self,dimension=4):
        self.dimension = dimension
        self.scalers = [MinMaxScaler(feature_range=(-1, 1)) for _ in range(dimension)]

    def fit(self,X,y=None):
        for n in range(self.dimension):
            data = X[:, :, n].reshape(-1, 1)
            self.scalers[n].fit(data)
        return self
    
    def transform(self,X,y=None):
        n_dict = {0:"x",1:"y",2:"z",3:"charge"}
        for n in range(self.dimension):
            data = X[:, :, n].reshape(-1, 1) #need to flatted the 2-D array first new shape (num_events*target_size)
            X[:, :, n] = self.scalers[n].transform(data).reshape(X.shape[0], X.shape[1])
            print(f"Scaler min/max for {n_dict[n]}: {self.scalers[n].data_min_[0]}, {self.scalers[n].data_max_[0]}")

        return X

<h2>Forming the pipeline</h2>

In [None]:
target_size = 800
isotope = "16O"
pipeline_1 = Pipeline([
    ("conversion", ConvertingFiles(run)),
    # ("outlier",OutlierDetection()), #getting rid of the outliers
    ("sampler", UpDownScaling(target_size,isotope)),
]) #up/down sampler 

# The `pipeline_2` is a data processing pipeline that consists of the following steps:
pipeline_2 = Pipeline([
    ("scaling", ScalingData()),
]) #scaling (w/ concatonated dataset)

In [18]:
eps = 1e-8

for run in range(104,105):
    if run < 100:
        file_path = f"/Volumes/researchEXT/O16/no_efield/PointcloudLegacy/run_00{run}.h5"
    else:
        file_path = f"/Volumes/researchEXT/O16/no_efield/PointcloudLegacy/run_0{run}.h5"
    file_exists = Path(file_path)
    if file_exists.exists():
        print(f"Pipeline for Run {run}")
        data_static = pipeline_1.fit_transform(file_path)
        transformed_data = pipeline_2.fit_transform(data_static)
        print()
        print(f"The full transformed data shape: {transformed_data.shape}")

        mask = np.any((transformed_data > 1+eps) | (transformed_data < -1-eps), axis=(1, 2))
        indices = np.argwhere(mask)
        print(indices)
        assert len(indices) == 0, "Points remain that are not within range[-1,1]"
        
        np.save(f"/Volumes/researchEXT/O16/ml models/data_exp_pred/run{run}_{isotope}_size{target_size}_test_features.npy", transformed_data) #FILE CHANGE
        
    else:
        print(f"Skipping run {run}, as it does not exist")


Pipeline for Run 104


Stripping Point Clouds of ID information: 100%|██████████| 10608/10608 [00:02<00:00, 3579.05it/s]


['cloud_1927598' 'cloud_1927613' 'cloud_1927621' ... 'cloud_1992837'
 'cloud_1992840' 'cloud_1992841']


TypeError: tuple indices must be integers or slices, not tuple

<h2>Saving to .npy files</h2>