# Environment variables

In [1]:
# Environment selector for this run. The configuration cell compares this
# value against 'td' to choose between the test and prod storage accounts, and
# also embeds it in the key vault and SQL server names.
env = "td"

# Import libraries

In [2]:
import pytz
import time
import pyodbc
import pandas as pd
from datetime import *
from delta.tables import *

# Functions for the program

In [3]:
# Function to load active records of delta table to dataframe
def load_delta_data_in_temp_views(source, source_container, storageaccount, table_name):
    """Expose the active rows of a Delta table as a Spark temporary view.

    The table is read from
    ``abfss://<source_container>@<storageaccount>.dfs.core.windows.net/<source>/data/<table_name>``,
    restricted to rows where ``IsActive_Record == 'Y'`` and registered as a
    temporary view named after ``table_name``.

    Args:
        source: Folder name that precedes ``/data/`` in the storage path.
        source_container: Storage container holding the Delta table.
        storageaccount: Storage account name (without the DNS suffix).
        table_name: Delta table folder name; also used as the view name.

    Returns:
        None. The result is the registered temporary view.
    """
    print("loading the dataframe to create the temp view")

    account_host = f"{storageaccount}.dfs.core.windows.net"
    delta_location = f"abfss://{source_container}@{account_host}/{source}/data/{table_name}"

    all_rows = spark.read.load(path=delta_location, format="delta")
    # Keep only the rows flagged as active
    active_rows = all_rows.filter(all_rows.IsActive_Record == 'Y')

    view_name = f"{table_name}"
    active_rows.createOrReplaceTempView(view_name)
    print(f"temporary view created for the table {table_name}")

In [4]:
# Function to write the output of spark sql to delta table in curated layer
def write_to_delta_table(df, source, destination_container, destination_table_name, storageaccount):
    """Overwrite a Delta table at an explicit storage path and register it by name.

    The data is written in ``overwrite`` mode, in ``delta`` format, to
    ``abfss://<destination_container>@<storageaccount>.dfs.core.windows.net/<source>/data/<destination_table_name>``
    and saved as the table ``destination_table_name``.

    Args:
        df: DataFrame whose contents are written.
        source: Folder name that precedes ``/data/`` in the storage path.
        destination_container: Storage container receiving the table.
        destination_table_name: Table name; also the last path segment.
        storageaccount: Storage account name (without the DNS suffix).

    Returns:
        None.
    """
    print("-------------------------")
    print("Writing the data to delta table")

    target_location = (
        f"abfss://{destination_container}@{storageaccount}.dfs.core.windows.net"
        f"/{source}/data/{destination_table_name}"
    )

    # Same writer chain as before, built step by step: path -> mode -> format
    writer = df.write.option('path', target_location)
    writer = writer.mode("overwrite")
    writer = writer.format("delta")
    writer.saveAsTable(destination_table_name)

    print(f"Delta table created: {destination_table_name}")

# Special blocks for temporary purpose

In [5]:
# Load st_mandant from the enriched layer and expose it as the temp view
# "st_mandant".
#
# The storage account is derived from `env` with the same 'td' -> test,
# otherwise -> prod mapping the configuration cell uses. Previously the test
# account was hard-coded here, so a non-'td' run would still have read test data.
st_mandant_storageaccount = 'storageaccounttest' if env == 'td' else 'storageaccountprod'
path_st_mandant = f"abfss://enriched@{st_mandant_storageaccount}.dfs.core.windows.net/<source_system>/data/st_mandant"

# NOTE(review): "header" looks like a CSV reader option and is presumably
# ignored for Delta; kept unchanged - confirm before removing.
df_st_mandant = spark.read.format("delta").option("header", "true").load(path_st_mandant)

# The active-record filter stays disabled for this table, as in the original cell:
##df_st_mandant = df_st_mandant.filter(df_st_mandant.IsActive_Record == 'Y')
df_st_mandant.createOrReplaceTempView("st_mandant")

In [6]:
# Load st_wechselkurs_akt from the enriched layer and expose it as the temp
# view "st_wechselkurs_akt".
#
# The storage account is derived from `env` with the same 'td' -> test,
# otherwise -> prod mapping the configuration cell uses. Previously the test
# account was hard-coded here, so a non-'td' run would still have read test data.
st_wechselkurs_akt_storageaccount = 'storageaccounttest' if env == 'td' else 'storageaccountprod'
path_st_wechselkurs_akt = f"abfss://enriched@{st_wechselkurs_akt_storageaccount}.dfs.core.windows.net/<source_system>/data/st_wechselkurs_akt"

# NOTE(review): "header" looks like a CSV reader option and is presumably
# ignored for Delta; kept unchanged - confirm before removing.
df_st_wechselkurs_akt = spark.read.format("delta").option("header", "true").load(path_st_wechselkurs_akt)

# The active-record filter stays disabled for this table, as in the original cell:
##df_st_wechselkurs_akt = df_st_wechselkurs_akt.filter(df_st_wechselkurs_akt.IsActive_Record == 'Y')
df_st_wechselkurs_akt.createOrReplaceTempView("st_wechselkurs_akt")

# Run process

## Other variables assignment

In [7]:
# Storage account for the selected environment: 'td' -> test, anything else -> prod
storageaccount = 'storageaccounttest' if env == 'td' else 'storageaccountprod'

# Source
source_container = 'enriched'
source = '<source_system>'

# Destination
destination_container = 'curated'
destination_table_name = '<table_name>'

# Metadata path - SAP Table DD03L
meta_path = f"abfss://{source_container}@{storageaccount}.dfs.core.windows.net/{source}/data/dd03l"

# Key vault (vault name embeds the upper-cased environment code)
key_vault_name = f"KV-{env.upper()}WEUGDWHDATAHUB"
ls_key_vault_name = 'ls_AzureKeyVault'

# Metadata DB (server name embeds the environment code as-is)
meta_server = f"tcp:sql-{env}-weu-GDWH-datahub.database.windows.net"
meta_database = 'SQLDB-METADATASTOREDB'
meta_username = 'sqlserveradmin'

# Metadata DB password: fetched from the key vault, passing the SQL user name
# as the second argument and the linked service name as the third
meta_password = mssparkutils.credentials.getSecret(
    key_vault_name,
    meta_username,
    ls_key_vault_name,
)

# Timezone used for the timestamps in the log output
timezone = pytz.timezone('Europe/Amsterdam')

## Connection to metadata table

In [8]:
# Logging start - printed before the connection attempt so the messages
# describe what is about to happen (they used to appear only after the
# connection had already been established).
print(f"------------LOGGING START-------------")
print(f"Connecting to Azure SQL metadata table")

# Connect to metadata DB
driver = '{ODBC Driver 17 for SQL Server}'
cnxn = pyodbc.connect('DRIVER='+driver+';SERVER='+meta_server+';PORT=1433;DATABASE='+meta_database+';UID='+meta_username+';PWD='+ meta_password)

# Read metadata for tables using:
# 1) SAP Extraction Table
tables_query = '''SELECT 
	[TABLE_ID] AS [ID]
	,[TABLE_NAME]
	,[SOURCE_SYSTEM]
	,[EXTRACTION_TYPE]
	,[IS_ACTIVE]
	,'SAP TABLE' AS [CONNECTOR_TYPE]
FROM [MDL].[EXTRACTION_SAPTABLE]
WHERE [IS_ACTIVE] = '1'
AND TABLE_NAME in ('EKET','EKKO','EKPO','EKES')
	'''

try:
    tables = pd.read_sql(tables_query, cnxn)
finally:
    # The connection is only needed for this one query; close it even when
    # the read fails so the session does not keep an open handle.
    cnxn.close()

# Table names to load; duplicates in the metadata result are collapsed
distinct_tables = tables['TABLE_NAME'].unique()

## Load data in temporary views

In [9]:
# Timezone applied to the per-table start/end timestamps
timezone = pytz.timezone('Europe/Amsterdam')

# Register one temporary view per table name returned by the metadata query
for table in distinct_tables:
    # The storage folder and the view are addressed with the lower-cased name
    table_name = table.lower()

    # Start of this table's load
    ts_start = datetime.now(tz=timezone)
    print("-------------------------")
    print(f"Source table: {table_name} from source system {source} is started at {ts_start}")

    load_delta_data_in_temp_views(source, source_container, storageaccount, table_name)

    # End of this table's load
    ts_end = datetime.now(tz=timezone)
    print(f"Source table: {table_name} is finished at {ts_end}")

# Closing separator after the last table
print("-------------------------")

## Block to modularize code in notebooks

In [10]:
# Referencing another notebook with %run raises an error in the Synapse pipeline, so the call below is disabled.
# %run Delta Lakehouse/Custom package/Spark SQL/nb_SparkSQL_for_FACT_110_ORDER_BALANCE_AND_115_INCOMING_ORDERS

In [None]:
# Run the custom Spark SQL; the resulting DataFrame `df` is what the
# write step below persists.
print("-------------------------")
print("Starting the custom Spark SQL")

custom_sql = """SELECT * FROM TEMP.VIEW
"""
df = spark.sql(custom_sql)

print("The custom Spark SQL successfully finised")
print("-------------------------")


## Write data to delta tables

In [11]:
# Global Unmanaged/External Table: persist the Spark SQL result at the
# curated-layer path and register it under destination_table_name
write_to_delta_table(
    df=df,
    source=source,
    destination_container=destination_container,
    destination_table_name=destination_table_name,
    storageaccount=storageaccount,
)
print("-------------------------")
print("------------LOGGING END-------------")
