## Connection to MinIO/S3

In [None]:
# Endpoint URL handed to the S3A connector (fs.s3a.endpoint) below.
S3 = "http://server:9000"
# Master URL passed to SparkSession.builder.master() below.
SPARK_MASTER = 'spark://server:7077'

In [None]:
import os
from pyspark.sql import SparkSession

# Resolve the MinIO credentials before building the session.
# os.environ.get() returns None for an unset variable, and PySpark does not
# reject a None config value (it may end up as the literal string "None"), so
# a missing variable would otherwise only surface later as an opaque S3A
# authentication failure. Fail fast with the variable name instead.
_minio_credentials = {
    name: os.environ.get(name)
    for name in ('MINIO_ACCESS_KEY', 'MINIO_SECRET_KEY')
}
_missing_credentials = [
    name for name, value in _minio_credentials.items() if not value
]
if _missing_credentials:
    raise RuntimeError(
        'Missing required environment variable(s): '
        + ', '.join(_missing_credentials)
    )

# Spark session connected to SPARK_MASTER, with the S3A filesystem pointed at
# the S3 endpoint using path-style access.
spark = (
    SparkSession.builder.appName('Quantum Pipeline Feature Processing')
    .master(SPARK_MASTER)
    .config('spark.hadoop.fs.s3a.endpoint', S3)
    .config('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
    .config('spark.hadoop.fs.s3a.access.key', _minio_credentials['MINIO_ACCESS_KEY'])
    .config('spark.hadoop.fs.s3a.secret.key', _minio_credentials['MINIO_SECRET_KEY'])
    .config('spark.hadoop.fs.s3a.path.style.access', 'true')
    .getOrCreate()
)

## Read the DataFrame

In [None]:
# Base S3A path under which each topic has its own sub-directory.
S3_BUCKET = "s3a://local-vqe-results/experiments/"

In [None]:
from py4j.java_gateway import java_import
from pyspark.sql import SparkSession

# Register the Hadoop filesystem classes on the session's JVM view.
for _hadoop_class in (
    "org.apache.hadoop.fs.FileSystem",
    "org.apache.hadoop.fs.Path",
    "org.apache.hadoop.conf.Configuration",
):
    java_import(spark._jvm, _hadoop_class)

def list_available_topics():
    """Return the names of the directories directly under ``S3_BUCKET``.

    The listing goes through the Hadoop ``FileSystem`` API on the Spark JVM,
    using the session's Hadoop configuration.

    Returns:
        A list of directory names (one per topic). The list is empty when
        ``S3_BUCKET`` does not exist or is not a directory.
    """
    hadoop_fs = spark._jvm.org.apache.hadoop.fs
    bucket_uri = spark._jvm.java.net.URI.create(S3_BUCKET)
    filesystem = hadoop_fs.FileSystem.get(
        bucket_uri, spark._jsc.hadoopConfiguration()
    )
    root = hadoop_fs.Path(S3_BUCKET)

    # Nothing to list unless the base path is an existing directory.
    if not (filesystem.exists(root) and filesystem.isDirectory(root)):
        return []

    topic_names = []
    for status in filesystem.listStatus(root):
        # Plain files sitting next to the topic directories are skipped.
        if status.isDirectory():
            topic_names.append(status.getPath().getName())
    return topic_names

In [None]:
# List the topic directories once; later cells index into this list.
available_topics = list_available_topics()
print("Available Topics:", available_topics)

In [None]:
import requests

def read_avro_by_topic(topic_name):
    """Load every Avro partition stored for one topic.

    Args:
        topic_name: Name of the topic directory under ``S3_BUCKET``.

    Returns:
        The DataFrame produced by reading ``<S3_BUCKET><topic_name>/partition=*/``
        with the ``avro`` format.
    """
    # The glob matches every "partition=..." sub-directory of the topic.
    partition_glob = f"{S3_BUCKET}{topic_name}/" + "partition=*/"
    return spark.read.format("avro").load(partition_glob)

In [None]:
# Guard against an empty listing: indexing [0] on an empty list would raise a
# bare IndexError that gives no hint that the data itself is missing.
if not available_topics:
    raise RuntimeError(f"No topic directories found under {S3_BUCKET}")

# Load the first discovered topic and print one row as a quick sanity check.
df = read_avro_by_topic(available_topics[0])
df.show(1)

In [None]:
from pyspark.sql.functions import col, explode, size, expr, lit, monotonically_increasing_id

# Fields lifted out of the nested vqe_result struct, as (field, output name).
_vqe_result_fields = [
    ("initial_data", "initial_data"),
    ("iteration_list", "iteration_list"),
    ("minimum", "minimum_energy"),
    ("optimal_parameters", "optimal_parameters"),
    ("maxcv", "maxcv"),
    ("minimization_time", "minimization_time"),
]

# Flattened view of the raw records; every derived table below selects from it.
base_df = df.select(
    "molecule_id",
    "basis_set",
    *[
        col(f"vqe_result.{field}").alias(name)
        for field, name in _vqe_result_fields
    ],
    "hamiltonian_time",
    "mapping_time",
    "vqe_time",
    "total_time",
    col("molecule.molecule_data").alias("molecule_data"),
)

# Fields of the molecule_data struct, as (field, output column name).
_molecule_fields = [
    ("symbols", "atom_symbols"),
    ("coords", "coordinates"),
    ("multiplicity", "multiplicity"),
    ("charge", "charge"),
    ("units", "coordinate_units"),
    ("masses", "atomic_masses"),
]

# One row per record, carrying the molecule description.
df_molecule = base_df.select(
    "molecule_id",
    *[
        col(f"molecule_data.{field}").alias(name)
        for field, name in _molecule_fields
    ],
)

# Ansatz definition and its repetition count, per record.
df_ansatz = base_df.selectExpr(
    "molecule_id",
    "initial_data.ansatz as ansatz",
    "initial_data.ansatz_reps as ansatz_reps",
)

# Sum of the three recorded stage timings, kept alongside the recorded total_time.
_stage_time_sum = col("hamiltonian_time") + col("mapping_time") + col("vqe_time")

# Timing metrics per record.
df_metrics = base_df.select(
    "molecule_id",
    "basis_set",
    "hamiltonian_time",
    "mapping_time",
    "vqe_time",
    "total_time",
    "minimization_time",
    _stage_time_sum.alias("computed_total_time"),
)

# Fields of initial_data that keep their own name in the output table.
_vqe_setup_fields = [
    "backend",
    "num_qubits",
    "optimizer",
    "noise_backend",
    "default_shots",
    "ansatz_reps",
]

# Run configuration and headline result per record.
df_vqe = base_df.select(
    "molecule_id",
    "basis_set",
    *[col(f"initial_data.{field}").alias(field) for field in _vqe_setup_fields],
    "minimum_energy",
    "maxcv",
    # Number of entries in iteration_list.
    size(col("iteration_list")).alias("total_iterations"),
)

# posexplode() emits each array element together with its 0-based position.
# The previous approach numbered exploded rows with
# "row_number() over (... order by 1)": ordering by a constant leaves the row
# order undefined, so the index was not tied to the array position, and rows
# sharing (molecule_id, backend, num_qubits) were numbered as one sequence.
from pyspark.sql.functions import posexplode

# One row per initial parameter; parameter_index is its position in
# initial_data.initial_parameters.
df_initial_parameters = base_df.select(
    col("molecule_id"),
    col("initial_data.backend").alias("backend"),
    col("initial_data.num_qubits").alias("num_qubits"),
    posexplode(col("initial_data.initial_parameters")).alias(
        "parameter_index", "initial_parameter_value"
    ),
).select(
    # Re-select to keep the original column order (value before index).
    "molecule_id",
    "backend",
    "num_qubits",
    "initial_parameter_value",
    "parameter_index",
)

# One row per optimal parameter; parameter_index is its position in
# optimal_parameters.
df_optimal_parameters = base_df.select(
    col("molecule_id"),
    col("initial_data.backend").alias("backend"),
    col("initial_data.num_qubits").alias("num_qubits"),
    posexplode(col("optimal_parameters")).alias(
        "parameter_index", "optimal_parameter_value"
    ),
).select(
    "molecule_id",
    "backend",
    "num_qubits",
    "optimal_parameter_value",
    "parameter_index",
)

# First explode iteration_list so that each iteration struct gets its own row.
_iteration_rows = base_df.select(
    "molecule_id",
    col("initial_data.backend").alias("backend"),
    col("initial_data.num_qubits").alias("num_qubits"),
    explode(col("iteration_list")).alias("iteration"),
)

# Then flatten the struct into step / energy / standard-deviation columns.
df_iterations = _iteration_rows.select(
    "molecule_id",
    "backend",
    "num_qubits",
    col("iteration.iteration").alias("iteration_step"),
    col("iteration.result").alias("iteration_energy"),
    col("iteration.std").alias("energy_std_dev"),
)

# posexplode() emits each array element together with its 0-based position.
# The previous "row_number() over (... order by 1)" window ordered by a
# constant, so the resulting index was not tied to the array position.
from pyspark.sql.functions import posexplode

# One row per (iteration, parameter); parameter_index is the parameter's
# position in that iteration's parameters array.
df_iteration_parameters = base_df.select(
    col("molecule_id"),
    col("initial_data.backend").alias("backend"),
    col("initial_data.num_qubits").alias("num_qubits"),
    explode(col("iteration_list")).alias("iteration"),
).select(
    col("molecule_id"),
    col("backend"),
    col("num_qubits"),
    col("iteration.iteration").alias("iteration_step"),
    posexplode(col("iteration.parameters")).alias(
        "parameter_index", "parameter_value"
    ),
).select(
    # Re-select to keep the original column order (value before index).
    "molecule_id",
    "backend",
    "num_qubits",
    "iteration_step",
    "parameter_value",
    "parameter_index",
)

# One row per Hamiltonian term of each record.
_hamiltonian_terms = base_df.select(
    "molecule_id",
    col("initial_data.backend").alias("backend"),
    explode(col("initial_data.hamiltonian")).alias("hamiltonian_term"),
)

# 0-based rank of each term within (molecule_id, backend), ordered by label.
_term_rank = expr(
    "row_number() over (partition by molecule_id, backend order by term_label) - 1"
)

# Flatten the term struct into label and coefficient columns, then add the rank.
df_hamiltonian = _hamiltonian_terms.select(
    "molecule_id",
    "backend",
    col("hamiltonian_term.label").alias("term_label"),
    col("hamiltonian_term.coefficients.real").alias("coeff_real"),
    col("hamiltonian_term.coefficients.imaginary").alias("coeff_imag"),
).withColumn("term_index", _term_rank)

In [None]:
import time
# Smoke test: count a small synthetic dataset spread over 10 partitions and
# report how long the job took.
spark.sparkContext.setLogLevel("INFO")
test_df = spark.range(1000).repartition(10)
# perf_counter() is a monotonic clock meant for measuring intervals;
# time.time() follows the wall clock and can jump if the system time changes.
start_time = time.perf_counter()
count = test_df.count()
duration = time.perf_counter() - start_time
print(f"Count: {count}, Duration: {duration:.2f} seconds")

In [None]:
# Preview the first three rows of every derived table without truncating
# cell contents, in the same order as before.
_preview_frames = (
    df_molecule,
    df_metrics,
    df_iterations,
    df_iteration_parameters,
    df_hamiltonian,
    df_vqe,
    df_initial_parameters,
    df_optimal_parameters,
)
for _frame in _preview_frames:
    _frame.show(3, truncate=False)