In [1]:
import os
import numpy as np
import pandas as pd
from IPython.display import display
from matplotlib import pyplot as plt
%matplotlib inline
import time

# Pylians, to calculate power spectrum
import MAS_library as MASL
import Pk_library as PKL

# Brenda lib
from brenda_lib import cySim_lib as cysim
from brenda_lib import pySim_lib as pysim

In [2]:
# Basic simulation parameters (module-level; read by later cells)
h = 0.67  # presumably the dimensionless Hubble parameter -- TODO confirm
Ngrid = 128  # Npart below is the cube of this value
Npart = Ngrid * Ngrid * Ngrid
lbox = 400  # alternative values noted by the author: 502.5/h, 128/h

In [3]:
# Column labels for Gadget snapshot tables
col_names = [
    'id',
    'X', 'Y', 'Z',
    'Vx', 'Vy', 'Vz',
]

# Column labels assigned, in file order, to the Rockstar particle/halo table
Columns_Halos = [
    'Particle_ID',
    'Px', 'Py', 'Pz',
    'PVx', 'PVy', 'PVz',
    'Halo_ID', 'HaloType',
    'Hx', 'Hy', 'Hz',
    'HVx', 'HVy', 'HVz',
]

In [4]:
# Base paths of the Gadget snapshots and Rockstar output.
# The data root defaults to the original checkout location, but it can be
# redirected without editing the notebook by setting the environment variable
# L400_N128_BASE_PATH before starting the kernel.
base_path = os.environ.get('L400_N128_BASE_PATH', '/home/msoumad/Git/L400-N128/')

# Both values are path *prefixes*, not directories: the processing loop
# appends a zero-padded run number directly to them (e.g. ".../output_run0007").
snapshot_base_path = os.path.join(base_path, 'SnapShots/output_run')
rockstar_base_path = os.path.join(base_path, 'Rockstar_Output/output_run')

In [5]:
# Output layout under base_path:
#   Data_Set/Training_Set/{Bin,Ascii}
#   Data_Set/Testing_Set/{Bin,Ascii}
data_set_path = os.path.join(base_path, 'Data_Set')
training_set_path = os.path.join(data_set_path, 'Training_Set')
testing_set_path = os.path.join(data_set_path, 'Testing_Set')

# Create every leaf directory; exist_ok makes re-running this cell harmless
for dir_path in (training_set_path, testing_set_path):
    for sub_dir in ('Bin', 'Ascii'):
        leaf_dir = os.path.join(dir_path, sub_dir)
        os.makedirs(leaf_dir, exist_ok=True)

In [6]:
# Read, merge and save one snapshot together with its Rockstar particle table
def process_and_save(snapshot_path, halo_path, output_dir, index):
    """Join a Gadget-4 snapshot with its Rockstar halo table and write it out.

    The snapshot particles are left-joined (on particle id) with the rows of
    the Rockstar file, a 'Host' column ('yes'/'no') records whether a halo row
    was found for the particle, and the result is written twice: as a
    space-separated text file and as a pandas pickle.

    Parameters
    ----------
    snapshot_path : str
        Path passed to the ``pysim`` Gadget-4 readers.
    halo_path : str
        Whitespace-separated Rockstar file. Its first line is skipped and the
        columns are labelled with the module-level ``Columns_Halos``.
    output_dir : str
        Directory that already contains 'Ascii' and 'Bin' sub-directories.
    index : int
        Run number; formatted as four zero-padded digits in the file names.

    Returns
    -------
    None
        The outputs are ``<output_dir>/Ascii/output_<index>.ascii`` and
        ``<output_dir>/Bin/output_<index>.bin``.

    Notes
    -----
    Relies on the module-level names ``h``, ``Columns_Halos`` and ``pysim``.
    """
    # The header is not used below. The call is kept (unbound) in case the
    # reader validates the file or has other side effects -- TODO confirm and
    # drop it if it is a pure read.
    pysim.read_header_gadget4(fname=snapshot_path)
    gadget4 = pysim.read_dark_matter_gadget4(fname=snapshot_path)

    # Snapshot table indexed and sorted by particle id. Positions are divided
    # by the module-level h; velocities are stored unchanged.
    snapshot = (
        pd.DataFrame({
            "Particle_ID": gadget4["ids"],
            "Px": gadget4["pos"][:, 0] / h,
            "Py": gadget4["pos"][:, 1] / h,
            "Pz": gadget4["pos"][:, 2] / h,
            "PVx": gadget4["vel"][:, 0],
            "PVy": gadget4["vel"][:, 1],
            "PVz": gadget4["vel"][:, 2],
        })
        .set_index("Particle_ID")
        .sort_index(ascending=True)
    )

    # Rockstar table. The separator is a raw string: '\s' in a normal string
    # literal is an invalid escape sequence and triggers a warning on modern
    # Python versions.
    df_halos = pd.read_csv(
        halo_path,
        sep=r'\s+',
        low_memory=False,
        names=Columns_Halos,
        skiprows=1,
    )
    df_halos['Particle_ID'] = df_halos['Particle_ID'].astype(int)
    df_halos = df_halos.set_index('Particle_ID')

    # Left join keeps every snapshot particle. The particle columns that also
    # exist in the Rockstar table receive the '_halo' suffix and are dropped by
    # the column selection below, so the snapshot values win.
    combined_df = snapshot.merge(
        df_halos,
        left_index=True,
        right_index=True,
        how='left',
        suffixes=('', '_halo'),
    )

    # Vectorized membership flag (replaces a per-row Python lambda)
    combined_df['Host'] = combined_df['Halo_ID'].notna().map({True: 'yes', False: 'no'})

    combined_df = combined_df.reset_index()[[
        'Particle_ID', 'Px', 'Py', 'Pz', 'PVx', 'PVy', 'PVz',
        'Host', 'Halo_ID', 'HaloType', 'Hx', 'Hy', 'Hz', 'HVx', 'HVy', 'HVz'
    ]]

    # Formatted index for output file naming
    output_index = f'{index:04d}'

    # Save as ASCII (missing halo fields are written as the text 'NaN')
    ascii_path = os.path.join(output_dir, 'Ascii', f'output_{output_index}.ascii')
    combined_df.to_csv(ascii_path, sep=' ', index=False, na_rep='NaN')
    print(f"Data saved to {ascii_path}")

    # Save as binary (pandas pickle, despite the '.bin' extension)
    bin_path = os.path.join(output_dir, 'Bin', f'output_{output_index}.bin')
    combined_df.to_pickle(bin_path)
    print(f"Data saved to {bin_path}")


In [7]:
# Dataset split: runs 0 .. N_TRAINING_RUNS-1 are written to the training set,
# runs N_TRAINING_RUNS .. N_RUNS-1 to the testing set.
N_RUNS = 150
N_TRAINING_RUNS = 100
BYTES_PER_MB = 1024 * 1024

# Start calculating time
start_time = time.time()

# Initialize total size accumulator
total_size_mb = 0

# Processing files and saving their exact location
print("Processing the files...")
for i in range(N_RUNS):
    run_str = f'{i:04d}'
    # The base paths are prefixes: the run number is appended to the directory
    # name itself (".../output_run0000/snapshot_000").
    snapshot_file = os.path.join(snapshot_base_path + run_str, 'snapshot_000')
    halo_file = os.path.join(rockstar_base_path + run_str, 'Halo_Details.ascii')
    print(f"Working on file {run_str}...")

    output_dir = training_set_path if i < N_TRAINING_RUNS else testing_set_path

    process_and_save(snapshot_file, halo_file, output_dir, i)

    # Rebuild the output file names exactly as process_and_save does, because
    # that function does not return them.
    ascii_path = os.path.join(output_dir, 'Ascii', f'output_{run_str}.ascii')
    bin_path = os.path.join(output_dir, 'Bin', f'output_{run_str}.bin')

    # Accumulate total size, converting bytes to MB
    total_size_mb += (os.path.getsize(ascii_path) + os.path.getsize(bin_path)) / BYTES_PER_MB

# End time
end_time = time.time()
elapsed_time = end_time - start_time
print("File processing finished!.")
# Print total size of all files
print(f"Total size of all output files: {total_size_mb:.2f} MB")
print(f"Total execution time: {elapsed_time:.2f} seconds")


Processing the files...
Working on file 0000...
Allocating memory for 2097152 particles 24.0Mb
Read data in file #0 for 2097152/2097152 particles...
Data saved to /home/msoumad/Git/L400-N128/Data_Set/Training_Set/Ascii/output_0000.ascii
Data saved to /home/msoumad/Git/L400-N128/Data_Set/Training_Set/Bin/output_0000.bin
Working on file 0001...
Allocating memory for 2097152 particles 24.0Mb
Read data in file #0 for 2097152/2097152 particles...
Data saved to /home/msoumad/Git/L400-N128/Data_Set/Training_Set/Ascii/output_0001.ascii
Data saved to /home/msoumad/Git/L400-N128/Data_Set/Training_Set/Bin/output_0001.bin
Working on file 0002...
Allocating memory for 2097152 particles 24.0Mb
Read data in file #0 for 2097152/2097152 particles...
Data saved to /home/msoumad/Git/L400-N128/Data_Set/Training_Set/Ascii/output_0002.ascii
Data saved to /home/msoumad/Git/L400-N128/Data_Set/Training_Set/Bin/output_0002.bin
Working on file 0003...
Allocating memory for 2097152 particles 24.0Mb
Read data in f