# Data Collector

In [74]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max, to_date


In [3]:
def spark_session(hdfs_uri="hdfs://localhost:9000/", app_name="hdfs_query", log_level="ERROR"):
    """Build (or fetch) a SparkSession whose default filesystem is HDFS.

    Called with no arguments this behaves as before: application name
    "hdfs_query", default filesystem "hdfs://localhost:9000/", log level
    "ERROR".

    Args:
        hdfs_uri: Value for the ``spark.hadoop.fs.defaultFS`` setting.
        app_name: Application name given to the session builder.
        log_level: Level passed to ``sparkContext.setLogLevel``.

    Returns:
        The session returned by ``SparkSession.builder.getOrCreate()``.
    """
    # NOTE(review): getOrCreate() is documented to hand back an already
    # running session when one exists, so on repeated calls the arguments
    # may not fully take effect — confirm if sessions are ever reconfigured.
    spark = (
        SparkSession.builder
        .appName(app_name)
        .config("spark.hadoop.fs.defaultFS", hdfs_uri)
        .getOrCreate()
    )

    # Keep notebook output readable by silencing Spark logs below this level
    spark.sparkContext.setLogLevel(log_level)

    return spark

# Notebook-level session; later cells pass it around as `spark`
spark = spark_session()

In [75]:
def get_spark_df(spark_session, query=None, file_path=None):
    """Return a Spark DataFrame built from a SQL query or from parquet files.

    Exactly one of ``query`` and ``file_path`` must be given.

    Args:
        spark_session: Session used to run the query or read the files.
        query: SQL text, passed unchanged to ``spark_session.sql``.
        file_path: Path (or glob) passed to ``spark_session.read.parquet``.

    Returns:
        The DataFrame produced by the selected source.

    Raises:
        ValueError: If neither or both of ``query`` and ``file_path`` are given.
    """
    # Ensure that exactly one of query or file_path is provided
    if query is None and file_path is None:
        raise ValueError("Either 'query' or 'file_path' must be provided.")
    if query is not None and file_path is not None:
        raise ValueError("Only one of 'query' or 'file_path' should be provided.")

    # Branch on "is not None" (same test as the validation above) so an empty
    # string still reaches Spark and is reported there instead of falling
    # through with no result.
    if query is not None:
        # Run on the session that was passed in, and send the SQL text as-is:
        # str.format would choke on (or rewrite) any braces inside the query.
        return spark_session.sql(query)

    # Otherwise load the parquet file(s) at file_path
    return spark_session.read.parquet(file_path)

In [107]:
# Glob covering every parquet file under /thesis/peru/exports
path = '/thesis/peru/exports/*.parquet'

# Column expression: take 6 characters of BATCH_WEEK starting at position 3
# (Column.substr is 1-based) and parse them as a ddMMyy date.
# NOTE(review): assumes BATCH_WEEK carries a 2-character prefix before the
# date digits — confirm against the data.
batch_date = to_date(col("BATCH_WEEK").substr(3,6), "ddMMyy")

df = get_spark_df(spark_session=spark,file_path=path).withColumn("date", batch_date)

# `max` here is pyspark.sql.functions.max (imported above), not the builtin;
# the aggregate yields a single row whose first field is the latest date.
latest_row = df.select(max("date")).first()
max_date = latest_row[0]
print("Most recent date:", max_date)

Most recent date: 2023-03-12
