In [1]:
from ogs6py import ogs
import numpy as np
import ogs6py
import matplotlib.pyplot as plt
import time
import math
import gmsh
import os
import argparse
import re
import time
import csv
import pandas as pd
import glob
import pyvista as pv
from ogstools.msh2vtu import msh2vtu
import shutil
import xml.etree.ElementTree as ET

pi = math.pi
plt.rcParams["text.usetex"] = True

ModuleNotFoundError: No module named 'ogs6py'

# Swelling


## Problem Description

![Schematic view of hydraulic fracturing problem and Boundary conditions](./figures/Model_propagating_straight.png#one-half "Schematic view of hydraulic fracturing problem and Boundary conditions.")

# Input Data

The simulations were run with the properties listed in the Table below.


| **Name**                       | **Range**                 | **Unit**   | **Symbol** |
|--------------------------------|---------------------------|------------|------------|
| _Young's modulus_              | 500 - 2500                | MPa        | $E$        |
| _Poisson's ratio_              | 0.16 - 0.35               | -          | $\nu$      |
| _Maximum swelling pressure_    | 3.2 - 13                  | MPa        | $\sigma$   |
| _Permeability_                 | 1 $\times$ 10$^{-14}$ - 1 $\times$ 10$^{-12}$ | m$^2$ | $k$        |
| _Air entry pressure_           | 1000 - 3500               | Pa         | $p_b$      |


# Output Directory and Project File

In [2]:
# Name of the OGS project file that every simulation run is derived from
prj_name = "swelling_staufen.prj"

# Results go to the directory named by OGS_TESTRUNNER_OUT_DIR when that
# variable is set, otherwise to a local "_out" folder.
out_dir = os.getenv("OGS_TESTRUNNER_OUT_DIR", "_out")
if not os.path.isdir(out_dir):
    os.makedirs(out_dir, exist_ok=True)

In [3]:
# Mesh files (*.vtu) located in the current working directory
vtu_files = glob.glob("*.vtu")

# Place a copy of every mesh file in the output directory
for vtu_file in vtu_files:
    shutil.copy(src=vtu_file, dst=out_dir)

print(f"Copied {len(vtu_files)} .vtu files to {out_dir}")

Copied 10 .vtu files to _out


In [4]:
# Directory that contains the `ogs` executable. Set the OGS_BIN_DIR environment
# variable to run the notebook on another machine without editing this cell;
# the previous hard-coded location remains the default.
ogs_path = os.environ.get("OGS_BIN_DIR", '/home/reza/OGS/ogs/build/bin')
# Directory that contains the OGS utility programs. Defaults to the same
# directory as the `ogs` executable; override with OGS_UTIL_BIN_DIR.
ogs_util_path = os.environ.get("OGS_UTIL_BIN_DIR", ogs_path)

# Post-processing

In [5]:
def calculate_principal_stresses(sigma_xx, sigma_yy, sigma_xy):
    """Return the largest absolute principal stress of a 2D stress state.

    The two in-plane principal values are the mean normal stress plus/minus
    the Mohr's-circle radius. The third candidate is fixed at zero, i.e. the
    out-of-plane stress is treated as zero and is not an input.

    Parameters
    ----------
    sigma_xx, sigma_yy, sigma_xy : scalar or numpy array
        Normal and shear stress components; arrays are processed element-wise.

    Returns
    -------
    numpy scalar or array
        max(|lambda_1|, |lambda_2|, 0) for every entry.
    """
    centre = (sigma_xx + sigma_yy) / 2
    radius = np.sqrt(((sigma_xx - sigma_yy) / 2) ** 2 + sigma_xy ** 2)

    upper = centre + radius
    lower = centre - radius
    out_of_plane = np.zeros_like(upper)  # sigma_zz = 0

    # Stack the three candidates and reduce over the stacking axis
    return np.abs(np.array([upper, lower, out_of_plane])).max(axis=0)

def calculate_principal_strains(epsilon_xx, epsilon_yy, epsilon_xy):
    """Return the largest absolute principal strain of a 2D strain state.

    ``epsilon_xy`` enters the formula as given (no factor 1/2 is applied
    here). The out-of-plane principal strain is taken as zero.

    Parameters
    ----------
    epsilon_xx, epsilon_yy, epsilon_xy : scalar or numpy array
        Normal and shear strain components; arrays are processed element-wise.

    Returns
    -------
    numpy scalar or array
        max(|lambda_1|, |lambda_2|, 0) for every entry.
    """
    half_sum = (epsilon_xx + epsilon_yy) / 2
    half_difference = (epsilon_xx - epsilon_yy) / 2
    shear_term = np.sqrt(half_difference ** 2 + epsilon_xy ** 2)

    candidates = [half_sum + shear_term, half_sum - shear_term]
    # In 2D, assuming epsilon_zz = 0
    candidates.append(np.zeros_like(candidates[0]))

    return np.max(np.abs(candidates), axis=0)

def calculate_displacement_magnitude(disp_x, disp_y):
    """Return the Euclidean norm of the (disp_x, disp_y) displacement vector.

    Works element-wise when the components are numpy arrays.
    """
    return np.sqrt(np.square(disp_x) + np.square(disp_y))

In [6]:
def post_processing(prefix, E, nu, K, p_b, swelling_stress_rate):
    """Export the per-node results of one simulation to a CSV file.

    Reads ``{out_dir}/{prefix}.pvd`` (``out_dir`` is the module-level output
    directory), walks through every stored time step and writes one CSV row
    per mesh node and time step to ``{out_dir}/{prefix}.csv``.

    Parameters
    ----------
    prefix : str
        Base name of the ``.pvd`` result file and of the CSV that is written.
    E, nu, K, p_b, swelling_stress_rate : scalar
        Input parameters of the run; copied unchanged into every row so that
        each row carries the inputs that produced it.

    Notes
    -----
    The CSV columns are: time_value, E, nu, K, p_b, swelling_stress_rate,
    porosity, saturation, xs, ys, disp_magnitude, epsilon_max, sigma_max.
    """
    reader = pv.get_reader(f"{out_dir}/{prefix}.pvd")

    data = []
    for time_value in reader.time_values:
        reader.set_active_time_value(time_value)
        mesh = reader.read()[0]

        xs = mesh.points[:, 0]
        ys = mesh.points[:, 1]
        porosity = mesh.point_data["porosity"]
        saturation = mesh.point_data["saturation"]
        disp = mesh.point_data["displacement"]
        sigma = mesh.point_data["sigma"]
        epsilon = mesh.point_data["epsilon"]

        # Columns 0, 1 and 3 are used as the xx, yy and xy components
        # (column 2 would be zz and is not needed here).
        # NOTE(review): confirm this component order against the OGS output.
        epsilon_max = calculate_principal_strains(epsilon[:, 0], epsilon[:, 1], epsilon[:, 3])
        sigma_max = calculate_principal_stresses(sigma[:, 0], sigma[:, 1], sigma[:, 3])
        disp_magnitude = calculate_displacement_magnitude(disp[:, 0], disp[:, 1])

        # TODO: also export disp_y, the average porosity and the average
        # saturation. Adding columns changes the CSV layout, so any consumer
        # that indexes columns by position has to be updated together with it.

        # The run inputs are identical for every node of this time step
        run_inputs = [time_value, E, nu, K, p_b, swelling_stress_rate]
        data.extend(
            run_inputs + list(node_values)
            for node_values in zip(
                porosity, saturation, xs, ys, disp_magnitude, epsilon_max, sigma_max
            )
        )

    with open(f'{out_dir}/{prefix}.csv', 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow([
            'time_value', 'E', 'nu', 'K', 'p_b', 'swelling_stress_rate',
            'porosity', 'saturation', 'xs', 'ys', 'disp_magnitude',
            'epsilon_max', 'sigma_max'
        ])
        csvwriter.writerows(data)

In [7]:
class OGSModel:
    """Small editor for an OGS project (``.prj``) XML file.

    The file is parsed once in the constructor; the ``replace_*`` methods
    change element texts in memory and :meth:`save` writes the tree to disk.

    Every ``replace_*`` method returns ``True`` when at least one element was
    updated and ``False`` (after emitting a ``UserWarning``) when the target
    was not found, so a misspelled name no longer passes unnoticed.
    """

    def __init__(self, prj_file):
        """Parse ``prj_file``; raises whatever ``ET.parse`` raises on failure."""
        self.prj_file = prj_file
        self.tree = ET.parse(prj_file)
        self.root = self.tree.getroot()

    def _get_root(self):
        """Return the root element of the parsed project file."""
        return self.root

    @staticmethod
    def _format_values(value):
        """Turn ``value`` into the text stored in the XML element.

        Iterables are written space-separated. Strings and scalars are written
        as a single token (a string is never split into its characters).
        """
        if isinstance(value, str):
            return value
        try:
            items = iter(value)
        except TypeError:
            # Plain number or any other non-iterable object
            return str(value)
        return ' '.join(map(str, items))

    @staticmethod
    def _child_text_is(element, tag, expected):
        """True if ``element`` has a direct child ``tag`` whose text equals ``expected``.

        Elements without such a child are simply not a match (no AttributeError).
        """
        child = element.find(tag)
        return child is not None and child.text == expected

    @staticmethod
    def _warn_missing(description):
        """Report a replacement target that does not exist in the project file."""
        import warnings  # local import keeps the class self-contained

        warnings.warn(f"OGSModel: {description} not found; nothing was replaced.", stacklevel=3)

    def _get_parameter_pointer(self, root, name, parameterpath):
        """Return the first element under ``parameterpath`` whose ``<name>`` is ``name``, else None."""
        for parameter in root.findall(parameterpath):
            if self._child_text_is(parameter, 'name', name):
                return parameter
        return None

    def _set_type_value(self, parameterpointer, value, valuetag="values"):
        """Write ``value`` into child ``valuetag`` of ``parameterpointer``.

        The child is created when it does not exist yet. Returns ``False``
        without changing anything when ``parameterpointer`` is ``None``.
        """
        if parameterpointer is None:
            return False
        value_element = parameterpointer.find(valuetag)
        if value_element is None:
            value_element = ET.SubElement(parameterpointer, valuetag)
        value_element.text = self._format_values(value)
        return True

    def replace_parameter(self, name=None, value=None, valuetag="values"):
        """Set ``valuetag`` of the first ``<parameter>`` named ``name``."""
        parameterpointer = self._get_parameter_pointer(self._get_root(), name, ".//parameter")
        replaced = self._set_type_value(parameterpointer, value, valuetag=valuetag)
        if not replaced:
            self._warn_missing(f"parameter '{name}'")
        return replaced

    def replace_property_value(self, media_id, phase_type, property_name, values, valuetag="values"):
        """Set ``valuetag`` of every matching phase property.

        A property matches when it is named ``property_name`` and lies inside a
        phase of type ``phase_type`` of the medium whose ``id`` attribute equals
        ``media_id`` (an integer id is compared by its string form).
        """
        wanted_id = media_id if media_id is None else str(media_id)
        replaced = False
        for medium in self._get_root().findall(".//media/medium"):
            if medium.get('id') != wanted_id:
                continue
            for phase in medium.findall(".//phase"):
                if not self._child_text_is(phase, 'type', phase_type):
                    continue
                for phase_property in phase.findall(".//property"):
                    if self._child_text_is(phase_property, 'name', property_name):
                        self._set_type_value(phase_property, values, valuetag=valuetag)
                        replaced = True
        if not replaced:
            self._warn_missing(
                f"property '{property_name}' of phase '{phase_type}' in medium '{media_id}'"
            )
        return replaced

    def replace_property_value_general(self, xpath_query, value):
        """Set the text of the first element matching ``xpath_query`` to ``str(value)``."""
        target_element = self._get_root().find(xpath_query)
        if target_element is None:
            self._warn_missing(f"element matching '{xpath_query}'")
            return False
        target_element.text = str(value)
        return True

    def save(self, output_file):
        """Write the (modified) XML tree to ``output_file``."""
        self.tree.write(output_file)


# Run the Simulation 


In [8]:
import pyvista as pv

# Light "document" theme for all plots
pv.set_plot_theme("document")

# Start a virtual framebuffer only when PYVISTA_HEADLESS is defined
if os.environ.get("PYVISTA_HEADLESS") is not None:
    pv.start_xvfb()

# Embed plots as static images in the notebook
pv.set_jupyter_backend("static")

def swelling_numerical(E, nu, K, swelling_pressure, p_b, out_dir):
    """Run one OGS swelling simulation and post-process its results.

    The template project file ``prj_name`` is copied to ``out_dir`` with the
    given material parameters, OGS is executed on that copy, and the results
    are exported to CSV by ``post_processing``.

    Parameters
    ----------
    E, nu, K, swelling_pressure : sequence of numbers
        Values written (space-separated) into the project file. The first
        entry of each sequence is used in the result/log file names and is the
        value recorded in the CSV.
    p_b : number
        Scalar written into the ``p_b`` element of the saturation property.
    out_dir : str
        Directory that receives the project file, the OGS results and the log.

    Notes
    -----
    * OGS is started with ``subprocess.run`` instead of an IPython ``!`` line,
      so the function is plain Python, paths with spaces are passed intact and
      the exit status is available. A non-zero exit status is reported but
      post-processing is still attempted, as before.
    * ``post_processing`` reads the module-level ``out_dir``; this matches the
      argument only when the caller passes that same directory.
    """
    import subprocess  # local import: not part of the notebook's import cell

    prefix = f"swelling_E{E[0]}_nu{nu[0]}_K{K[0]}_p_b{p_b}_S{swelling_pressure[0]}"
    Screenoutput = f"log_{prefix}"
    print(prefix)
    print(Screenoutput)

    # 1) Write the material parameters into a copy of the template project file
    prj_model = OGSModel(prj_name)
    prj_model.replace_parameter(name="E_l1", value=E)
    prj_model.replace_parameter(name="nu1", value=nu)
    prj_model.replace_parameter(name="permeability_l1", value=K)
    # p_b is a scalar, so it is written through the generic single-value setter
    prj_model.replace_property_value_general(".//media/medium[@id='1']//property[name='saturation']//p_b", p_b)
    prj_model.replace_property_value(media_id="1", phase_type="Solid", property_name="swelling_stress_rate", values=swelling_pressure, valuetag="swelling_pressures")

    run_prj_file = f"{out_dir}/{prj_name}"
    prj_model.save(run_prj_file)
    print(f"Saved {prj_name}")

    # 2) Set the output prefix with ogs6py so every run gets its own result files
    ogs_model = ogs.OGS(INPUT_FILE=run_prj_file, PROJECT_FILE=run_prj_file, MKL=True, args=f"-o {out_dir}")
    ogs_model.replace_text(prefix, xpath="./time_loop/output/prefix")
    ogs_model.write_input()

    # 3) Run OGS; its standard output goes to the per-run log file
    log_path = f"{out_dir}/{Screenoutput}"
    t0 = time.time()
    print(">>> OGS started execution ... <<<")
    with open(log_path, "w") as log_file:
        completed = subprocess.run(
            [os.path.join(ogs_path, "ogs"), run_prj_file, "-o", out_dir],
            stdout=log_file,
        )
    tf = time.time()
    print(">>> OGS terminated execution  <<< Elapsed time: ", round(tf - t0, 2), " s.")
    if completed.returncode != 0:
        print(f"Warning: OGS exited with status {completed.returncode}; see '{log_path}'.")

    # 4) Export the results of this run to CSV
    print("Post-processing")
    post_processing(prefix, E[0], nu[0], K[0], p_b, swelling_pressure[0])

In [9]:
# Most probable value of each input parameter (used as the mean when sampling)
most_probable_values = {
    'Young’s modulus (E)': 1000,
    'Poisson’s ratio (ν)': 0.2,
    'Maximum swelling pressure (σ)': 8,
    'Permeability (k)': 8e-13,
    'Air entry pressure (p_b)': 2000,
}

# Admissible (minimum, maximum) interval of each input parameter
ranges = {
    'Young’s modulus (E)': (500, 2500),
    'Poisson’s ratio (ν)': (0.16, 0.35),
    'Maximum swelling pressure (σ)': (3.2, 13),
    'Permeability (k)': (1e-14, 1e-12),
    'Air entry pressure (p_b)': (1000, 3500),
}

# Standard deviation of each parameter: one third of the width of its range
std_devs = dict(
    zip(ranges, ((upper - lower) / 3 for lower, upper in ranges.values()))
)

def generate_values(mean, std_dev, min_val, max_val, num_values=35, seed=None):
    """Sample parameter values around ``mean`` within ``[min_val, max_val]``.

    ``num_values - 2`` values are drawn from a normal distribution, clipped to
    the interval, and the two interval bounds are added. Duplicates (e.g.
    several samples clipped to the same bound) are removed, so the result can
    hold fewer than ``num_values`` entries.

    Parameters
    ----------
    mean, std_dev : float
        Mean and standard deviation of the normal distribution.
    min_val, max_val : float
        Bounds of the admissible interval; both are always part of the result.
    num_values : int, default 35
        Upper bound on the number of returned values; must be at least 2.
    seed : int or None, default None
        Seed for a dedicated random generator, which makes the sample
        reproducible. ``None`` keeps the previous behaviour of drawing from
        numpy's global random state.

    Returns
    -------
    numpy.ndarray
        Unique values in ascending order.

    Raises
    ------
    ValueError
        If ``num_values`` is smaller than 2.
    """
    if num_values < 2:
        raise ValueError("num_values must be at least 2 (the two bounds are always included)")

    sampler = np.random if seed is None else np.random.default_rng(seed)
    samples = sampler.normal(loc=mean, scale=std_dev, size=num_values - 2)
    samples = np.clip(samples, min_val, max_val)

    # np.unique removes duplicates and already returns the values sorted
    return np.unique(np.concatenate(([min_val], samples, [max_val])))

# Draw up to 30 values per parameter, using its most probable value as the mean
generated_values = {}
for param, mean in most_probable_values.items():
    std_dev = std_devs[param]
    min_val, max_val = ranges[param]
    generated_values[param] = generate_values(mean, std_dev, min_val, max_val, num_values=30)

# Young's modulus and swelling pressure are scaled by 1e6 (MPa -> Pa);
# the air entry pressure is used as generated.
swelling_pressure_values = (generated_values['Maximum swelling pressure (σ)'] * 1e6).tolist()
young_modulus_values = (generated_values['Young’s modulus (E)']* 1e6).tolist()
air_pressure_values = (generated_values['Air entry pressure (p_b)']).tolist()
poisson_ratio_values = generated_values["Poisson’s ratio (ν)"].tolist()
permeability_values = generated_values["Permeability (k)"].tolist()

# Parameter sets in the shape expected by the simulation driver:
# E, nu and swelling pressure repeat each value three times, K twice,
# and p_b stays a plain scalar.
parameters = {
    "E": [[val] * 3 for val in young_modulus_values],
    "nu": [[val] * 3 for val in poisson_ratio_values],
    "swelling_pressure": [[val] * 3 for val in swelling_pressure_values],
    "K": [[val] * 2 for val in permeability_values],
    "p_b": list(air_pressure_values),
}

# Print every parameter set for verification
for param, values in parameters.items():
    print(f"{param}:")
    for value in values:
        # p_b entries are scalars; all other entries are lists of numbers
        print(value if param == "p_b" else " ".join(map(str, value)))
    print()


E:
500000000.0 500000000.0 500000000.0
716872910.849898 716872910.849898 716872910.849898
744034358.5792155 744034358.5792155 744034358.5792155
760974422.6982183 760974422.6982183 760974422.6982183
840719262.4058143 840719262.4058143 840719262.4058143
873708391.7631403 873708391.7631403 873708391.7631403
957414030.8117517 957414030.8117517 957414030.8117517
1068700534.4471054 1068700534.4471054 1068700534.4471054
1083404623.8212006 1083404623.8212006 1083404623.8212006
1134492595.6567376 1134492595.6567376 1134492595.6567376
1184314628.22201 1184314628.22201 1184314628.22201
1282477905.9881108 1282477905.9881108 1282477905.9881108
1358742621.347392 1358742621.347392 1358742621.347392
1378974482.7046208 1378974482.7046208 1378974482.7046208
1386680413.2883313 1386680413.2883313 1386680413.2883313
1448785560.074596 1448785560.074596 1448785560.074596
1642070150.459228 1642070150.459228 1642070150.459228
1645602521.0555632 1645602521.0555632 1645602521.0555632
2205775304.9739594 220577530

In [None]:
# Most probable values for all parameters (baseline of every run)
most_probable_E = [1000e6, 1000e6, 1000e6]  # Pa
most_probable_nu = [0.2, 0.2, 0.2]
most_probable_swelling_pressure = [8e6, 8e6, 8e6]  # Pa
most_probable_K = [8e-13, 8e-13]
most_probable_p_b = 2000

# Display the parameter combinations.
# The Young's modulus column is printed in Pa (the values passed to OGS),
# so the header says Pa rather than MPa.
print("=====================================")
print("E (Pa)\t\tν\t\tK (m²)\t\tSwelling Pressure (Pa)\t\tAir entry pressure (Pa)")
print("=====================================")

baseline_run = {
    "E": most_probable_E,
    "nu": most_probable_nu,
    "swelling_pressure": most_probable_swelling_pressure,
    "K": most_probable_K,
    "p_b": most_probable_p_b,
}

# One-at-a-time sweep: vary a single parameter over its generated values while
# all other parameters stay at their most probable value.
for varied_parameter in ("E", "nu", "swelling_pressure", "K", "p_b"):
    for varied_value in parameters[varied_parameter]:
        run = {**baseline_run, varied_parameter: varied_value}
        print(
            f"{run['E'][0]:.2f}\t\t{run['nu'][0]:.2f}\t\t{run['K'][0]:.2e}"
            f"\t\t{run['swelling_pressure'][0]:.2e}\t\t{run['p_b']:.2e}"
        )
        swelling_numerical(
            run["E"], run["nu"], run["K"], run["swelling_pressure"], run["p_b"], out_dir
        )


# Combine all valid CSV files into a single CSV file

In [None]:
# Gather every per-run CSV from the output directory
csv_files = glob.glob(f"{out_dir}/*.csv")
dataframes = []

for file in csv_files:
    try:
        data = pd.read_csv(file)
    except pd.errors.EmptyDataError:
        # Raised by pandas when the file has no parsable content
        print(f"File '{file}' is empty or has formatting issues. Skipping...")
        continue

    if data.empty:
        # Header present but no data rows
        print(f"File '{file}' is empty. Skipping...")
        continue

    dataframes.append(data)

# Stack all tables and write them to the current working directory
if not dataframes:
    print("No valid CSV files found or all files are empty.")
else:
    combined_data = pd.concat(dataframes, ignore_index=True)
    combined_data.to_csv("combined_data.csv", index=False)
    print("CSV files successfully combined and saved as 'combined_data.csv'")

In [None]:
import numpy as np
import os
import csv
import pandas as pd
import glob

pi = math.pi
plt.rcParams["text.usetex"] = True

# Directory containing CSV files
out_dir = "/home/reza/ogs_HM"  # Update this with your actual directory

# Glob to find all CSV files
csv_files = glob.glob(f"{out_dir}/*.csv")

# Output file
output_file = "/home/reza/HM_ML/combined_data.csv"  # Save the output as a CSV file

# Check the output directory up front so that a wrong path is reported before
# any file is read (it is deliberately not created automatically).
output_dir = os.path.dirname(output_file)
output_dir_exists = os.path.exists(output_dir)
if not output_dir_exists:
    print(f"Output directory '{output_dir}' does not exist. Please check the path.")
else:
    print(f"Output will be saved to '{output_file}'")

# Becomes True once the first chunk (with the header line) has been written
header_written = False

if not csv_files:
    print(f"No CSV files found in the directory '{out_dir}'. Please check the path.")
elif not output_dir_exists:
    # Writing would fail on the first chunk, so do not start at all
    print("Nothing was combined because the output directory is missing.")
else:
    for file in csv_files:
        print(f"Processing file: {file}")
        try:
            # Stream the file in chunks so that it never has to fit in memory
            for chunk in pd.read_csv(file, chunksize=10000):  # Adjust chunksize as needed
                # The very first chunk creates the file and writes the header;
                # every later chunk is appended without a header.
                if not header_written:
                    chunk.to_csv(output_file, index=False, mode='w', header=True)
                    header_written = True
                else:
                    chunk.to_csv(output_file, index=False, mode='a', header=False)
        except pd.errors.EmptyDataError:
            print(f"File '{file}' is empty or has formatting issues. Skipping...")

    # Report success only if something was actually written
    if header_written:
        print(f"CSV files successfully combined and saved as '{output_file}'")
    else:
        print(f"No data was found in the CSV files; '{output_file}' was not written.")

Output will be saved to '/home/reza/HM_ML/combined_data.csv'
Processing file: /home/reza/ogs_HM/swelling_E1398279739.223539_nu0.2_K8e-13_p_b2000_S8000000.0.csv


In [1]:
import os
import pandas as pd
import glob
from concurrent.futures import ProcessPoolExecutor

# Directory containing CSV files
out_dir = "/home/reza/ogs_HM"  # Update this with your actual directory

# Glob to find all CSV files
csv_files = glob.glob(f"{out_dir}/*.csv")

# Output file
output_file = "/home/reza/HM_ML/combined_data.parquet"  # Save the output as a Parquet file

# Check the output directory up front so that a wrong path is reported before
# any file is read (it is deliberately not created automatically).
output_dir = os.path.dirname(output_file)
output_dir_exists = os.path.exists(output_dir)
if not output_dir_exists:
    print(f"Output directory '{output_dir}' does not exist. Please check the path.")
else:
    print(f"Output will be saved to '{output_file}'")

# List to store the names of files with NaN values (filled in the parent process)
files_with_nan = []

# Positional columns that must not contain NaN. This is the same set the check
# has always used; columns 6, 11 and 12 are not checked.
NAN_CHECKED_COLUMNS = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10]

# Number of decimals to round to, or None to keep full precision.
# Rounding to 3 decimals turned the permeability column (values around 1e-13)
# into 0.0, so rounding is off by default.
ROUND_DECIMALS = None


def process_file(file):
    """Load and validate one CSV file in a worker process.

    Returns a tuple ``(file, status, dataframe)`` where ``status`` is

    * ``"ok"``    - ``dataframe`` holds the complete, validated file,
    * ``"nan"``   - a checked column contains NaN; ``dataframe`` is None,
    * ``"empty"`` - the file has no data; ``dataframe`` is None.

    The worker only returns data. Lists changed inside a worker process are
    not visible to the parent, and several processes must not write to the
    same Parquet file, so collecting and writing is done by the caller.
    """
    print(f"Processing file: {file}")
    chunks = []
    try:
        # Read in chunks so a file with NaN values is abandoned early
        for chunk in pd.read_csv(file, chunksize=10000):  # Adjust chunksize as needed
            if chunk.iloc[:, NAN_CHECKED_COLUMNS].isna().any().any():
                print(f"File '{file}' contains NaN values. Skipping...")
                return file, "nan", None
            if ROUND_DECIMALS is not None:
                chunk = chunk.round(decimals=ROUND_DECIMALS)
            chunks.append(chunk)
    except pd.errors.EmptyDataError:
        print(f"File '{file}' is empty or has formatting issues. Skipping...")
        return file, "empty", None

    if not chunks:
        return file, "empty", None
    return file, "ok", pd.concat(chunks, ignore_index=True)


if not csv_files:
    print(f"No CSV files found in the directory '{out_dir}'. Please check the path.")
elif not output_dir_exists:
    print("Nothing was combined because the output directory is missing.")
else:
    # Parse and validate the files in parallel. Iterating over the results
    # (instead of discarding them) also re-raises any exception that occurred
    # in a worker, so failures are no longer silent.
    dataframes = []
    with ProcessPoolExecutor() as executor:
        for file, status, file_data in executor.map(process_file, csv_files):
            if status == "nan":
                files_with_nan.append(file)
            elif status == "ok":
                dataframes.append(file_data)

    # Write once from the parent process: DataFrame.to_parquet with the pyarrow
    # engine cannot append to an existing file.
    # NOTE: all valid files are held in memory at the same time.
    if dataframes:
        combined_df = pd.concat(dataframes, ignore_index=True)
        combined_df.to_parquet(output_file, index=False, engine='pyarrow')
        print(f"CSV files successfully combined and saved as '{output_file}' in Parquet format.")
    else:
        print("No valid dataframes to save.")

    # Report files with NaN values
    if files_with_nan:
        print(f"\nNumber of files with NaN values: {len(files_with_nan)}")
        print("Files with NaN values:")
        for f in files_with_nan:
            print(f)
    else:
        print("\nNo files with NaN values were found.")


Output will be saved to '/home/reza/HM_ML/combined_data.parquet'
Processing file: /home/reza/ogs_HM/swelling_E1398279739.223539_nu0.2_K8e-13_p_b2000_S8000000.0.csvProcessing file: /home/reza/ogs_HM/swelling_E1000000000.0_nu0.2_K8.840140773476137e-13_p_b2000_S8000000.0.csvProcessing file: /home/reza/ogs_HM/swelling_E1000000000.0_nu0.2_K8e-13_p_b2000_S11183994.486068025.csvProcessing file: /home/reza/ogs_HM/swelling_E2414228227.5026784_nu0.2_K8e-13_p_b2000_S8000000.0.csvProcessing file: /home/reza/ogs_HM/swelling_E1000000000.0_nu0.2_K8e-13_p_b2000_S11111819.631589593.csvProcessing file: /home/reza/ogs_HM/swelling_E1212602357.8720138_nu0.2_K8e-13_p_b2000_S8000000.0.csvProcessing file: /home/reza/ogs_HM/swelling_E1000000000.0_nu0.2_K8e-13_p_b2000_S6842960.482066397.csv

Processing file: /home/reza/ogs_HM/swelling_E1000000000.0_nu0.2_K7.592060660313023e-13_p_b2000_S8000000.0.csvProcessing file: /home/reza/ogs_HM/swelling_E1000000000.0_nu0.2_K8e-13_p_b2000_S4323208.325158917.csv
Processing f

In [3]:
# Location of the combined CSV file
file_path = "/home/reza/HM_ML/combined_data.csv"

# Count the rows chunk by chunk, so the file never has to be loaded as a whole
row_count = 0
for chunk in pd.read_csv(file_path, chunksize=10000):
    row_count = row_count + chunk.shape[0]

print(f"Number of rows in the combined file: {row_count}")

# Preview: parse only the first ten data rows
first_ten_rows = pd.read_csv(file_path, nrows=10)

print("First ten rows of the combined file:")
print(first_ten_rows)

Number of rows in the combined file: 307920270
First ten rows of the combined file:
   time_value             E        nu             K   p_b   
0         0.0  1.000000e+09  0.199586  8.000000e-13  2000  \
1         0.0  1.000000e+09  0.199586  8.000000e-13  2000   
2         0.0  1.000000e+09  0.199586  8.000000e-13  2000   
3         0.0  1.000000e+09  0.199586  8.000000e-13  2000   
4         0.0  1.000000e+09  0.199586  8.000000e-13  2000   
5         0.0  1.000000e+09  0.199586  8.000000e-13  2000   
6         0.0  1.000000e+09  0.199586  8.000000e-13  2000   
7         0.0  1.000000e+09  0.199586  8.000000e-13  2000   
8         0.0  1.000000e+09  0.199586  8.000000e-13  2000   
9         0.0  1.000000e+09  0.199586  8.000000e-13  2000   

   swelling_stress_rate  porosity  saturation     xs     ys  disp_magnitude   
0             8000000.0    0.0770    0.119444    0.0    0.0             0.0  \
1             8000000.0    0.0770    0.119444  240.0    0.0             0.0   
2      

In [1]:
import os
import pandas as pd
import glob
from concurrent.futures import ProcessPoolExecutor

# Directory containing CSV files
out_dir = "/home/reza/ogs_HM"

# Glob to find all CSV files
csv_files = glob.glob(f"{out_dir}/*.csv")

# Output file
output_file = "/home/reza/HM_ML/combined_data.parquet"

# Check the output directory up front so that a wrong path is reported before
# any file is read (it is deliberately not created automatically).
output_dir = os.path.dirname(output_file)
output_dir_exists = os.path.exists(output_dir)
if not output_dir_exists:
    print(f"Output directory '{output_dir}' does not exist. Please check the path.")
else:
    print(f"Output will be saved to '{output_file}'")

# List to store the names of files with NaN values
files_with_nan = []

# List to store the validated DataFrame of every file
dataframes = []

# Positional columns that must not contain NaN. This is the same set the check
# has always used; columns 6, 11 and 12 are not checked.
NAN_CHECKED_COLUMNS = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10]

# Number of decimals to round to, or None to keep full precision.
# Rounding to 3 decimals turned the permeability column (values around 1e-13)
# into 0.0, so rounding is off by default.
ROUND_DECIMALS = None


def process_file(file):
    """Read one CSV file and queue it for the combined Parquet file.

    A file with NaN in one of ``NAN_CHECKED_COLUMNS`` is recorded in
    ``files_with_nan`` and skipped; an empty file is skipped with a message.
    Every other file is appended to ``dataframes``. Both lists are module-level,
    so this function has to run in the notebook process itself.
    """
    print(f"Processing file: {file}")
    try:
        df = pd.read_csv(file)
    except pd.errors.EmptyDataError:
        print(f"File '{file}' is empty or has formatting issues. Skipping...")
        return

    # Check for NaN values in specific columns
    if df.iloc[:, NAN_CHECKED_COLUMNS].isna().any().any():
        files_with_nan.append(file)
        print(f"File '{file}' contains NaN values. Skipping...")
        return  # Skip the rest of the processing for this file

    if ROUND_DECIMALS is not None:
        df = df.round(decimals=ROUND_DECIMALS)

    # Print to check if the dataframe is being created and has data
    print(f"Appending DataFrame with shape {df.shape} from file: {file}")

    # Append the valid dataframe to the list
    dataframes.append(df)


if not csv_files:
    print(f"No CSV files found in the directory '{out_dir}'. Please check the path.")
elif not output_dir_exists:
    # Without the directory the final write would fail after all files
    # have been read, so do not start at all.
    print("Nothing was combined because the output directory is missing.")
else:
    # Process the files one after another in this process
    for file in csv_files:
        process_file(file)

    # Concatenate all dataframes and write to a Parquet file.
    # NOTE: all valid files are held in memory at the same time.
    if dataframes:
        combined_df = pd.concat(dataframes, ignore_index=True)
        combined_df.to_parquet(output_file, index=False, engine='pyarrow')
        print(f"CSV files successfully combined and saved as '{output_file}' in Parquet format.")
    else:
        print("No valid dataframes to save.")

    # Report files with NaN values
    if files_with_nan:
        print(f"\nNumber of files with NaN values: {len(files_with_nan)}")
        print("Files with NaN values:")
        for f in files_with_nan:
            print(f)
    else:
        print("\nNo files with NaN values were found.")


Output will be saved to '/home/reza/HM_ML/combined_data.parquet'
Processing file: /home/reza/ogs_HM/swelling_E1398279739.223539_nu0.2_K8e-13_p_b2000_S8000000.0.csv
Appending DataFrame with shape (2247453, 13) from file: /home/reza/ogs_HM/swelling_E1398279739.223539_nu0.2_K8e-13_p_b2000_S8000000.0.csv
Processing file: /home/reza/ogs_HM/swelling_E1000000000.0_nu0.2_K8e-13_p_b2000_S11183994.486068025.csv
Appending DataFrame with shape (2247453, 13) from file: /home/reza/ogs_HM/swelling_E1000000000.0_nu0.2_K8e-13_p_b2000_S11183994.486068025.csv
Processing file: /home/reza/ogs_HM/swelling_E1000000000.0_nu0.2_K8.840140773476137e-13_p_b2000_S8000000.0.csv
Appending DataFrame with shape (2247453, 13) from file: /home/reza/ogs_HM/swelling_E1000000000.0_nu0.2_K8.840140773476137e-13_p_b2000_S8000000.0.csv
Processing file: /home/reza/ogs_HM/swelling_E2414228227.5026784_nu0.2_K8e-13_p_b2000_S8000000.0.csv
Appending DataFrame with shape (2247453, 13) from file: /home/reza/ogs_HM/swelling_E2414228227