<div style="background-color: #222; padding: 24px;">
    <h1 style="color: #d4bbff; margin-bottom: 8px;">Query with Spark</h1>
    <h3 style="color: #fff; margin-top: 0;">Testing Lab.</h3>
</div>

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
import os
from dotenv import load_dotenv

# Read variables from the .env file located in the current working directory.
env_path = os.path.join(os.getcwd(), '.env')
load_dotenv(env_path)

# Data locations used by this notebook (input first, output second).
# The directories themselves are checked further down, not here.
INPUT_DATA_PATH, OUTPUT_DATA_PATH = (
    "/home/jovyan/data/bronze",
    "/home/jovyan/data/gold",
)

# Verify directories exist and are accessible
def verify_directory(path):
    """Ensure ``path`` is a directory the current process can fully use.

    The directory (and any missing parents) is created when it does not
    exist. Afterwards the path must be readable, writable and searchable.

    Args:
        path: Filesystem path of the directory to verify.

    Raises:
        NotADirectoryError: If ``path`` exists but is not a directory.
        PermissionError: If the directory is not readable, writable and
            searchable by the current process.
        OSError: If the directory cannot be created or its mode changed.
    """
    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)
        # NOTE(review): 0o777 lets every local user modify this directory.
        # Presumably required so Spark workers running as another user can
        # write here - confirm, and tighten the mode if that is not the case.
        os.chmod(path, 0o777)  # RWX for all
    elif not os.path.isdir(path):
        # A regular file at this location would otherwise pass the access
        # check below and only fail later, when data is read or written.
        raise NotADirectoryError(f"Path exists but is not a directory: {path}")
    # X_OK is needed on a directory to traverse it and create entries in it.
    if not os.access(path, os.R_OK | os.W_OK | os.X_OK):
        raise PermissionError(f"Insufficient permissions for path: {path}")

# Check the input directory first, then the output directory. The first
# failure is reported and re-raised so the cell stops.
try:
    for data_dir in (INPUT_DATA_PATH, OUTPUT_DATA_PATH):
        verify_directory(data_dir)
except Exception as err:
    print(f"Directory verification failed: {err!s}")
    raise

# Spark master URL: taken from SPARK_MASTER when set, otherwise the fallback below.
spark_master = os.getenv("SPARK_MASTER", "spark://spark-master:7077")

# Session-wide Spark settings, applied in the order listed.
# NOTE(review): confirm that "spark.sql.sources.ignoreNonExistentPaths" is a
# key recognised by the Spark version in use.
conf = SparkConf().setAll([
    ("spark.hadoop.fs.permissions.umask-mode", "000"),
    ("spark.sql.sources.ignoreNonExistentPaths", "true"),
    ("spark.executor.extraJavaOptions", "-Djava.io.tmpdir=/tmp"),
    ("spark.driver.extraJavaOptions", "-Djava.io.tmpdir=/tmp"),
    ("spark.sql.warehouse.dir", "/tmp/spark-warehouse"),
    ("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"),
    ("spark.executor.memory", "2g"),
    ("spark.driver.memory", "2g"),
    ("spark.sql.catalogImplementation", "hive"),
])

# Build (or reuse) the Spark session; any failure is reported and re-raised.
try:
    spark = (
        SparkSession.builder
        .config(conf=conf)
        .appName("DataProcessing")
        .master(spark_master)
        .enableHiveSupport()
        .getOrCreate()
    )

    # Lower log verbosity, then report the version of the running session.
    spark.sparkContext.setLogLevel("WARN")
    print(f"Spark session created successfully. Version: {spark.version}")

except Exception as err:
    print(f"Failed to initialize Spark session: {err!s}")
    raise


Spark session created successfully. Version: 3.5.0


In [2]:
# Get a SparkSession handle; getOrCreate() returns the already-running
# session when one exists instead of building a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [3]:
# JDBC connection settings for the PostgreSQL database on silver-postgres.
# Each value can be overridden through an environment variable (for example
# from the .env file); the fallbacks are the values this notebook used before.
jdbc_url = os.getenv(
    "SILVER_JDBC_URL",
    "jdbc:postgresql://silver-postgres:5432/data_forge_silver",
)

connection_props = {
    "user": os.getenv("SILVER_DB_USER", "postgres"),
    # NOTE(review): the fallback password is kept only so the existing local
    # setup keeps working; set SILVER_DB_PASSWORD anywhere else.
    "password": os.getenv("SILVER_DB_PASSWORD", "postgres"),
    "driver": "org.postgresql.Driver"
}

In [5]:
# Search filters: each entry pairs an item substring ("item") with a store
# substring ("loja"). Edit this list to change what is queried.
filters = [
    dict(item="guisado", loja="RIGHI"),
    dict(item="tomate", loja="RIGHI"),
]

### Predicates

In [None]:
# One SQL condition per filter: case-insensitive substring match on item and
# store. The values are spliced into the SQL text, so single quotes are
# doubled to keep them inside the string literal.
predicate_conditions = [
    "(LOWER(\"DS_ITEM\") LIKE LOWER('%{item}%') AND LOWER(\"STORE\") LIKE LOWER('%{loja}%'))".format(
        item=f["item"].replace("'", "''"),
        loja=f["loja"].replace("'", "''"),
    )
    for f in filters
]

# Spark turns every predicate into its own partition, so a row that matches
# several filters would be fetched once per match. Each predicate therefore
# also excludes rows already claimed by an earlier one; IS NOT TRUE keeps the
# exclusion correct when an earlier condition evaluates to NULL.
predicates = [
    " AND ".join(
        [condition]
        + [f"{earlier} IS NOT TRUE" for earlier in predicate_conditions[:position]]
    )
    for position, condition in enumerate(predicate_conditions)
]

df_filtered_predicates = spark.read.jdbc(
    url=jdbc_url,
    table="purchases",
    predicates=predicates,
    properties=connection_props
)

df_filtered_predicates.show()

### Database filtering

In [None]:
# Generate one WHERE condition per filter: case-insensitive substring match on
# item and store. The values are spliced into the SQL text, so single quotes
# are doubled to keep them inside the string literal.
conditions = [
    "(LOWER(\"DS_ITEM\") LIKE LOWER('%{item}%') AND LOWER(\"STORE\") LIKE LOWER('%{loja}%'))".format(
        item=f["item"].replace("'", "''"),
        loja=f["loja"].replace("'", "''"),
    )
    for f in filters
]

# Combine with OR for all filters. With no filters there is nothing to match,
# so fall back to FALSE instead of emitting a dangling "WHERE".
where_clause = " OR ".join(conditions) if conditions else "FALSE"

# Push the filter to PostgreSQL via JDBC by reading from a filtered subquery.
query = f"(SELECT * FROM purchases WHERE {where_clause}) AS filtered_purchases"

df_filtered = spark.read.jdbc(
    url=jdbc_url,
    table=query,
    properties=connection_props
)

df_filtered.show()

In [None]:
# df_filtered.explain(True)  # uncomment to print the query plan and check that the filter is pushed down to PostgreSQL