### Set up Variables to be used throughout
Variable Descriptions Below
- relativePath - Where your NYC Taxi FOIL 2013 files are located. The Lakehouse path will most likely start with Files/, and you can define the rest of your path from there.
- lakehouseName - The name of the Lakehouse in your Workspace.
- outputTable - The name of the table that will be created after loading the CSV files.

In [None]:
# Notebook parameters: fill these in before running the cells below.
relativePath: str = ""   # directory that holds the source CSV files
lakehouseName: str = ""  # name of the Lakehouse in the workspace
outputTable: str = ""    # Delta table the CSV files are loaded into

### Define a Function to Update Control Table
- The control table used in this process tracks the processing state of each file as it flows into the Lakehouse.
- This merge happens in multiple parts of the process, so it is abstracted into a function.

In [None]:
def updateFileLog(updateFileName, updatePath, updateStatus):
    """Insert or update the ``control_filelog`` row for a single file.

    A one-row source is staged in the temp view ``v_fileLog_inProcess`` and
    merged into ``control_filelog`` on (fileName, relativePath):

    * no matching row  -> the staged row is inserted, with ``loadStartDate``
      set to the current time and ``loadEndDate`` left NULL;
    * matching row     -> only ``loadStatus`` and ``loadEndDate`` are updated.

    Args:
        updateFileName: Name of the file being tracked.
        updatePath: Directory the file was found in.
        updateStatus: Status text to record in ``loadStatus``.

    Relies on the notebook-provided ``spark`` session.
    """
    # Build the source row from Python values instead of splicing them into
    # SQL text, so a name containing a quote cannot break or alter the query.
    stagedRow = spark.createDataFrame(
        [(updateFileName, updatePath, updateStatus)],
        "fileName string, relativePath string, loadStatus string",
    ).selectExpr(
        "fileName",
        "relativePath",
        "loadStatus",
        # current_timestamp(), not current_date(): the control columns are
        # timestamps, and a date would collapse every entry to midnight.
        "to_utc_timestamp(current_timestamp(), current_timezone()) as loadStartDate",
        # Typed NULL so the staged schema matches the timestamp column.
        "CAST(NULL AS TIMESTAMP) as loadEndDate",
    )
    stagedRow.createOrReplaceTempView("v_fileLog_inProcess")

    mergeQuery = """
    MERGE INTO control_filelog dest
    USING v_fileLog_inProcess src
    ON  dest.fileName = src.fileName
    AND dest.relativePath = src.relativePath
    WHEN MATCHED THEN
        UPDATE
        SET loadStatus = src.loadStatus,
            loadEndDate = to_utc_timestamp(current_timestamp(),current_timezone())
    WHEN NOT MATCHED THEN 
        INSERT *
    """

    spark.sql(mergeQuery)

### Collect all the files in the directory into a List for Evaluating/Processing
- Reference mssparkutils [here](https://learn.microsoft.com/en-us/fabric/data-engineering/microsoft-spark-utilities) for a list of useful operations that can be performed.
- In this case I only want to evaluate files so I'm iterating through the list that is produced from the mssparkutils ls command and taking only the files.

In [None]:
# List the directory once, then keep just the names of the entries that are
# plain files (sub-directories are skipped).
listDirContents = mssparkutils.fs.ls(relativePath)
listFiles = [entry.name for entry in listDirContents if not entry.isDir]

### Load New Files Into Bronze Table

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.functions import lit

# Load every file that is not yet recorded in the control table, tracking its
# state (inprocess -> complete / error) and moving it to Archive/ or Error/.
for fileName in listFiles:
    # Check for the file in the log. A DataFrame filter is used instead of an
    # interpolated SQL string so names containing quotes are compared safely.
    loggedRows = spark.table("control_filelog").where(
        (F.col("fileName") == fileName) & (F.col("relativePath") == relativePath)
    )

    # Files already in the log (in any status) are not processed again.
    if loggedRows.count() != 0:
        continue

    sourceFile = f"{relativePath}/{fileName}"

    # 1 - Add to log
    updateFileLog(fileName, relativePath, 'inprocess')

    try:
        # 2 - Load file to dataframe and remove spaces from column names
        inprocessDf = spark.read.option("header", True).csv(sourceFile)
        inprocessDfCleanColNames = inprocessDf.select(
            [F.col(col).alias(col.replace(' ', '_')) for col in inprocessDf.columns]
        )
        inprocessDfAddFilename = inprocessDfCleanColNames.withColumn("filename", lit(fileName))

        # 3 - Determine if output table exists. If it does, append the data.
        #     If not, create the table from the first file being processed.
        checkTableExists = spark.catalog.tableExists(outputTable)
        writeMode = "append" if checkTableExists else "overwrite"
        inprocessDfAddFilename.write.mode(writeMode).format("delta").save(f"Tables/{outputTable}")

        # 4 - Update status and end date in control table and move the file
        #     to the archive directory
        updateFileLog(fileName, relativePath, 'complete')
        mssparkutils.fs.mkdirs(f"{relativePath}/Archive/")
        mssparkutils.fs.mv(sourceFile, f"{relativePath}/Archive/{fileName}")
    except Exception as exc:
        # Catch Exception (not a bare except) so interrupts still stop the
        # run, and report the cause before flagging the file; the loop then
        # carries on with the remaining files.
        print(f"Failed to load {sourceFile}: {exc!r}")
        updateFileLog(fileName, relativePath, "error")
        mssparkutils.fs.mkdirs(f"{relativePath}/Error/")
        mssparkutils.fs.mv(sourceFile, f"{relativePath}/Error/{fileName}")

### Pre-requisite control table DDL

In [None]:
# CREATE TABLE control_fileLog 
# (
#     fileName        string,
#     relativePath    string,
#     loadStatus      string,
#     loadStartDate   timestamp,
#     loadEndDate     timestamp
# )
# USING delta
