# Generating a New `.par` File From an Existing One

This notebook shows how to use `MoleculeLineList` in conjunction with the `line_data_writer` script to filter an existing `.par` file and create a new `.par` file containing only the lines that pass that filter.

In [1]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from IPython.display import display

# Import data types from iSLAT
from iSLAT.Modules.DataTypes.MoleculeLineList import MoleculeLineList

# Report the pandas and NumPy versions in use, one per line
print(
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
    sep="\n",
)

Pandas version: 2.3.3
NumPy version: 2.4.2


## 1. Loading Molecular Data

The `MoleculeLineList` class loads HITRAN format `.par` files. The first load parses the file and creates a binary cache for fast subsequent loads.

In [2]:
from iSLAT.Modules.FileHandling import hitran_data_folder_path

# Location of the H2O .par file under `hitran_data_folder_path`
h2o_par_path = hitran_data_folder_path / "data_Hitran_H2O.par"

# Build the H2O line list from that file
h2o_lines = MoleculeLineList(molecule_id="H2O", filename=h2o_par_path)

# Extract the two tables used in the rest of the notebook
lines_df = h2o_lines.get_pandas_table()
partition_df = h2o_lines.get_partition_table()

# Show the line table first, then the partition table
display(lines_df, partition_df)

Unnamed: 0,nr,lev_up,lev_low,lam,freq,a_stein,e_up,e_low,g_up,g_low
0,0,0_0_0|10_2_9,0_0_0|9_3_6,933.27661,3.212257e+11,6.177000e-06,1861.25073,1845.83411,63,57
1,1,0_0_1|5_1_5,0_0_1|4_2_2,928.22180,3.229750e+11,8.967000e-06,5865.74316,5850.24268,33,27
2,2,0_2_0|6_5_1,0_2_0|7_4_4,926.64453,3.235247e+11,2.590000e-05,6039.06494,6023.53857,13,15
3,3,0_1_0|14_3_12,0_1_0|13_4_9,926.56085,3.235540e+11,9.288000e-06,6021.03809,6005.50977,87,81
4,4,0_0_0|5_1_5,0_0_0|4_2_2,922.00464,3.251529e+11,1.157000e-05,469.94110,454.33624,11,9
...,...,...,...,...,...,...,...,...,...,...
305556,305556,-2-2-2|3_3_0,0_0_0|4_3_1,0.30001,9.992745e+14,2.005000e-07,48509.87500,552.26367,7,9
305557,305557,-2-2-2|1_1_1,0_0_0|2_2_0,0.30001,9.992758e+14,1.953000e-07,48153.58203,195.90945,3,5
305558,305558,-2-2-2|2_1_1,0_0_0|3_1_2,0.30001,9.992813e+14,3.315000e-07,48207.37500,249.43471,15,21
305559,305559,-2-2-2|3_-3_-3,0_0_0|4_3_2,0.30001,9.992901e+14,5.255000e-08,48508.71484,550.35651,21,27


Unnamed: 0,Temperature,Partition_Function
0,1.0,1.00000
1,2.0,1.00000
2,3.0,1.00010
3,4.0,1.00173
4,5.0,1.00968
...,...,...
4995,4996.0,83927.50000
4996,4997.0,83985.20000
4997,4998.0,84043.00000
4998,4999.0,84100.80000


## 2. Filter lines to only v1-1 lines

In [3]:
# Build one boolean mask per level column: True where the label starts with 0_1_0
upper_is_010 = lines_df['lev_up'].str.startswith('0_1_0')
lower_is_010 = lines_df['lev_low'].str.startswith('0_1_0')

# Keep only the rows where both the upper and the lower level match
filtered_lines_df = lines_df.loc[upper_is_010 & lower_is_010]

display(filtered_lines_df)

Unnamed: 0,nr,lev_up,lev_low,lam,freq,a_stein,e_up,e_low,g_up,g_low
3,3,0_1_0|14_3_12,0_1_0|13_4_9,926.56085,3.235540e+11,0.000009,6021.03809,6005.50977,87,81
7,7,0_1_0|5_2_3,0_1_0|6_1_6,891.63464,3.362279e+11,0.000011,2955.20264,2939.06616,33,39
16,16,0_1_0|8_5_4,0_1_0|7_6_1,704.25195,4.256892e+11,0.000044,4200.70020,4180.27051,51,45
20,20,0_1_0|9_6_4,0_1_0|8_7_1,683.32788,4.387241e+11,0.000040,4778.07666,4757.02100,19,17
22,22,0_1_0|8_5_3,0_1_0|7_6_2,680.20728,4.407369e+11,0.000048,4201.41846,4180.26660,17,15
...,...,...,...,...,...,...,...,...,...,...
15272,15272,0_1_0|18_6_12,0_1_0|17_1_17,5.39431,5.557566e+13,0.018740,9220.33887,6553.12891,37,35
15612,15612,0_1_0|13_11_2,0_1_0|12_4_9,5.30813,5.647797e+13,5.915000,8135.96777,5425.45312,81,75
16501,16501,0_1_0|19_8_12,0_1_0|18_1_17,5.09877,5.879695e+13,7.148000,10387.80371,7565.99561,39,37
18991,18991,0_1_0|13_11_2,0_1_0|13_0_13,4.41763,6.786268e+13,0.028260,8135.96777,4879.07373,81,81


## 3. Filter to desired wavelength range

In [4]:
# Use another mask to get lines in the 5-20 micron range
# (`between` is inclusive on both ends by default, i.e. 5 <= lam <= 20)
in_wavelength_range = lines_df['lam'].between(5, 20)

filtered_lines_df = lines_df[in_wavelength_range]

display(filtered_lines_df)

Unnamed: 0,nr,lev_up,lev_low,lam,freq,a_stein,e_up,e_low,g_up,g_low
4952,4952,1_0_0|7_5_3,0_0_1|6_1_6,19.99927,1.499017e+13,0.002354,6756.25977,6036.84521,15,13
4953,4953,0_0_0|13_6_8,0_0_0|13_3_11,19.99860,1.499067e+13,0.176200,3953.90161,3234.46289,27,27
4954,4954,0_0_1|7_3_5,0_2_0|7_3_4,19.98255,1.500271e+13,0.000379,6551.14404,5831.12744,45,45
4955,4955,0_0_0|12_5_8,0_0_0|12_2_11,19.97516,1.500826e+13,0.092070,3273.75366,2553.47070,75,75
4956,4956,0_1_0|15_2_13,0_0_0|16_7_10,19.97361,1.500943e+13,0.000267,6484.18408,5763.84570,93,99
...,...,...,...,...,...,...,...,...,...,...
16848,16848,1_0_0|4_0_4,0_1_0|4_1_3,5.00113,5.994493e+13,0.245700,5575.28467,2698.38232,9,9
16849,16849,0_2_0|17_3_14,0_1_0|16_4_13,5.00112,5.994506e+13,12.200000,10206.16016,7329.25049,105,99
16850,16850,1_1_0|5_4_1,1_0_0|4_1_4,5.00046,5.995303e+13,0.003071,8456.25684,5578.96484,33,27
16851,16851,0_2_0|11_4_7,0_1_0|12_1_12,5.00042,5.995340e+13,0.000147,7401.65723,4524.34668,69,75


## 4. Combine masks

In [5]:
# Select the v1-1 lines that also fall in the 5-20 micron range

# Level condition: both level labels start with 0_1_0
both_levels_010 = (
    lines_df['lev_up'].str.startswith('0_1_0')
    & lines_df['lev_low'].str.startswith('0_1_0')
)

# Wavelength condition: 5 <= lam <= 20 (both bounds included)
wavelength = lines_df['lam']
in_wavelength_range = (wavelength >= 5) & (wavelength <= 20)

filtered_lines_df = lines_df[both_levels_010 & in_wavelength_range]

# Show the selected rows, then their summary statistics
display(filtered_lines_df, filtered_lines_df.describe())

Unnamed: 0,nr,lev_up,lev_low,lam,freq,a_stein,e_up,e_low,g_up,g_low
4964,4964,0_1_0|13_6_7,0_1_0|12_5_8,19.91264,1.505538e+13,22.180000,6392.07178,5669.52783,81,75
4968,4968,0_1_0|11_7_4,0_1_0|11_4_7,19.89919,1.506556e+13,0.119300,5810.36084,5087.32812,69,69
4973,4973,0_1_0|10_10_0,0_1_0|9_9_1,19.88340,1.507753e+13,64.820000,6470.45605,5746.84912,21,19
4974,4974,0_1_0|10_10_1,0_1_0|9_9_0,19.88340,1.507753e+13,64.800000,6470.45605,5746.84912,63,57
4979,4979,0_1_0|9_3_7,0_1_0|8_0_8,19.84255,1.510857e+13,0.924600,4088.18481,3363.08789,19,17
...,...,...,...,...,...,...,...,...,...,...
14358,14358,0_1_0|13_11_2,0_1_0|13_2_11,5.62527,5.329389e+13,4.099000,8135.96777,5578.26514,81,81
14848,14848,0_1_0|14_8_6,0_1_0|13_1_13,5.50097,5.449811e+13,0.003679,7494.57666,4879.08057,29,27
15272,15272,0_1_0|18_6_12,0_1_0|17_1_17,5.39431,5.557566e+13,0.018740,9220.33887,6553.12891,37,35
15612,15612,0_1_0|13_11_2,0_1_0|12_4_9,5.30813,5.647797e+13,5.915000,8135.96777,5425.45312,81,75


Unnamed: 0,nr,lam,freq,a_stein,e_up,e_low,g_up,g_low
count,635.0,635.0,635.0,635.0,635.0,635.0,635.0,635.0
mean,6917.155906,12.992436,25205240000000.0,12.70997,6873.472098,5663.811601,53.453543,50.955906
std,1554.322172,3.642521,8008973000000.0,29.42599,1632.208098,1592.345424,30.137157,28.911231
min,4964.0,5.09877,15055380000000.0,8.26e-07,3461.8999,2614.90649,11.0,9.0
25%,5788.5,9.85711,18742770000000.0,0.009251,5529.80957,4420.96729,27.0,25.0
50%,6699.0,12.85453,23321930000000.0,0.1482,6861.86133,5582.05176,43.0,39.0
75%,7636.0,15.9951,30413830000000.0,4.1275,8135.96777,6814.39453,81.0,75.0
max,16501.0,19.91264,58796950000000.0,149.4,10707.86621,9704.40625,129.0,123.0


## 5. Save Filtered Lines as a New .par File

Use `MoleculeLineList.write_par_file` to save the filtered DataFrame as a correctly formatted `.par` file that can be loaded back into iSLAT.

In [6]:
from pathlib import Path

# Destination of the filtered line list, resolved to an absolute path
output_path = Path("output/data_Hitran_H2O_v1-1_5-20um.par").resolve()

# Make sure the destination folder exists so this cell also works on a fresh
# checkout where "output/" has not been created yet (no-op if it already exists)
output_path.parent.mkdir(parents=True, exist_ok=True)

# Optionally override header fields (all are optional)
header = pd.DataFrame({
    'source': ['Filtered from data_Hitran_H2O.par - v1-1 lines, 5-20 um'],
})

# Write the filtered lines using the original line list's partition function
h2o_lines.write_par_file(
    file_path=output_path,
    header=header,
    lines_df=filtered_lines_df,
)

print(f"Saved {len(filtered_lines_df)} lines to {output_path.name}.")

Saved 635 lines to data_Hitran_H2O_v1-1_5-20um.par.


## 6. Verify the New File

Load the newly created `.par` file back and confirm it matches the filtered data.

In [7]:
# Load the newly written file back in
reloaded = MoleculeLineList(
    molecule_id="H2O_filtered",
    filename=output_path,
)

reloaded_df = reloaded.get_pandas_table()
print(f"Original filtered lines: {len(filtered_lines_df)}")
print(f"Reloaded lines:          {len(reloaded_df)}")
display(reloaded_df)

# Turn the by-eye comparison into hard checks so a broken round trip fails loudly
# under Restart & Run All instead of relying on the reader to compare the output.
assert len(reloaded_df) == len(filtered_lines_df), (
    "Reloaded file has a different number of lines than the filtered table"
)
# Compare by position: the reloaded table has a fresh 0..N-1 index, so the
# original line numbers are compared through the `nr` column values.
assert np.array_equal(
    reloaded_df['nr'].to_numpy(), filtered_lines_df['nr'].to_numpy()
), "Reloaded `nr` values do not match the filtered table"

[CACHE MISS] Parsing H2O_filtered from source file...
Molar_mass: 18.010565
[CACHE SAVED] H2O_filtered cached for faster loading
Original filtered lines: 635
Reloaded lines:          635


Unnamed: 0,nr,lev_up,lev_low,lam,freq,a_stein,e_up,e_low,g_up,g_low
0,4964,0_1_0|13_6_7,0_1_0|12_5_8,19.91264,1.505538e+13,22.180000,6392.07178,5669.52783,81,75
1,4968,0_1_0|11_7_4,0_1_0|11_4_7,19.89919,1.506556e+13,0.119300,5810.36084,5087.32812,69,69
2,4973,0_1_0|10_10_0,0_1_0|9_9_1,19.88340,1.507753e+13,64.820000,6470.45605,5746.84912,21,19
3,4974,0_1_0|10_10_1,0_1_0|9_9_0,19.88340,1.507753e+13,64.800000,6470.45605,5746.84912,63,57
4,4979,0_1_0|9_3_7,0_1_0|8_0_8,19.84255,1.510857e+13,0.924600,4088.18481,3363.08789,19,17
...,...,...,...,...,...,...,...,...,...,...
630,14358,0_1_0|13_11_2,0_1_0|13_2_11,5.62527,5.329389e+13,4.099000,8135.96777,5578.26514,81,81
631,14848,0_1_0|14_8_6,0_1_0|13_1_13,5.50097,5.449811e+13,0.003679,7494.57666,4879.08057,29,27
632,15272,0_1_0|18_6_12,0_1_0|17_1_17,5.39431,5.557566e+13,0.018740,9220.33887,6553.12891,37,35
633,15612,0_1_0|13_11_2,0_1_0|12_4_9,5.30813,5.647797e+13,5.915000,8135.96777,5425.45312,81,75
