In [1]:
from datetime import date, datetime
from pyspark.sql import SparkSession
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
from pyspark.sql.functions import to_utc_timestamp

import os
import json

In [2]:
path = os.getcwd()
initialLoadDate = datetime(2012, 12, 31, 0, 0, 0)
initialLoad = initialLoadDate.strftime("%Y-%m-%d %H:%M:%S")

In [3]:
newCutoff = ""
fromNotebook = True
source = ""
destination = ""
tables = ""
sparkMaster = "local[*]"
retriesMax = 2

In [None]:
jars = ""

if fromNotebook:
    f = open('load_wwi_bq.json',)
    config = json.load(f)
    f.close()

    newCutoff = config["cutoff_date"]
    jars = "../resources/jars/mssql-jdbc-13.2.0.jre11.jar,../resources/jars/spark-bigquery-with-dependencies_2.12-0.42.2.jar"
    source = config["source"]
    destination = config["destination"]
    tables = config["tables"]

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../resources/credentials/bigquery_token.json"
else:
    jars = "{0}/resources/jars/mssql-jdbc-13.2.0.jre11.jar,{0}/resources/jars/spark-bigquery-with-dependencies_2.12-0.42.2.jar".format(path)

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = f"{path}/resources/credentials/bigquery_token.json"

print("jars", jars)
print("source", source)
print("desstination", destination)
print("tables", tables)

jars ../resources/jars/mssql-jdbc-13.2.0.jre11.jar,../resources/jars/spark-bigquery-with-dependencies_2.12-0.42.2.jar
source {'database': 'WideWorldImporters', 'url': 'jdbc:sqlserver://localhost\\MSSQLSERVER05;database=WideWorldImporters;user=sa;password=P@$$w0rd;encrypt=false'}
desstination {'database': 'dbt-tutorial-462014', 'schema': 'WideWorldImporters', 'url': ''}
tables [{'source': {'schema': 'Application', 'table': 'Cities', 'type': 'ValidDateRange', 'columns': [{'name': 'CityID', 'value': 'CityID'}, {'name': 'CityName', 'value': 'CityName'}, {'name': 'StateProvinceID', 'value': 'StateProvinceID'}, {'name': 'Location', 'value': 'Location'}, {'name': 'LatestRecordedPopulation', 'value': 'LatestRecordedPopulation'}, {'name': 'LastEditedBy', 'value': 'LastEditedBy'}, {'name': 'ValidFrom', 'value': 'CONVERT(NVARCHAR, ValidFrom, 20)'}, {'name': 'ValidTo', 'value': 'CONVERT(NVARCHAR, ValidTo, 20)'}]}, 'destination': {'schema': 'WideWorldImporters', 'table': 'Application_Cities', 'crea

In [5]:
def get_spark_session():
    if fromNotebook:
        return (
            SparkSession.builder 
                .config(
                    "spark.driver.host", 
                    "localhost"
                )
                .master(sparkMaster)
                .appName("load_wwi")
                .config("spark.jars", jars)    
                .getOrCreate()
        )
    else:
        return (
            SparkSession.builder 
                .master(sparkMaster)
                .appName("load_wwi")
                .config("spark.jars", jars)
                .config("spark.executor.memory", "4g")
                .config("spark.driver.memory", "4g")
                .config("spark.executor.cores", "6")
                .config("spark.cores.max", "6")
                .config("spark.network.timeout", "600s")
                .config("spark.executor.heartbeatInterval", "599s")
                .getOrCreate()
        )

In [6]:
def stop_spark_session():
    active_spark_session = SparkSession.getActiveSession()
    
    if active_spark_session:
        active_spark_session.stop()

In [7]:
bqClient = bigquery.Client(project=destination["database"])
bqClient

<google.cloud.bigquery.client.Client at 0x1bf00547ca0>

In [11]:
import pandas as pd

def get_max_loaddates():
    destDatabase = destination["database"]
    destSchema =  destination["schema"]
    loadHistoryTable = f"`{destDatabase}`.`{destSchema}`.`LoadHistory`"
    maxLoadHistoryTable = f"""
        SELECT 
            TableName,
            CAST(MAX(LoadDate) AS DATEtIME) AS `LoadDate`
        FROM 
            {loadHistoryTable}
        GROUP BY
            TableName
    """

    retries = 0
    isSuccessful = False

    while retries < retriesMax:

        retries = retries + 1

        try:
            
            spark = get_spark_session()

            # read 
            df = (
                spark
                    .read
                    .format("bigquery")
                    .option("query", maxLoadHistoryTable)
                    .option("viewsEnabled", "true")
                    .load()
            )

            retries = retriesMax

            isSuccessful = True

            return df
            
        except Exception as ex:
            stop_spark_session()

            if retries >= retriesMax:
                raise ex

    if isSuccessful == False:
        raise Exception("Unable to get max load dates")
    
dfMaxLoadDates = get_max_loaddates().toPandas()
dfMaxLoadDates["LoadDate"] = pd.to_datetime(dfMaxLoadDates["LoadDate"])

In [None]:
def get_source_db_table(sourceDatabase, sourceSchema, sourceTable, sourceType, lastCutoff, columns):
    sourceDbTable = ""
    columnsLst = []
    columnsStr = "*"
    
    if (len(columns)) > 0:
        for col in columns:
            name = col["name"]
            value = col["value"]
            columnsLst.append(f"{name} = {value}") 
        columnsStr = """,
        """.join(columnsLst)

    if sourceType == 'ValidDateRange':
        sourceDbTable = f"""
            (
                SELECT
                    {columnsStr},
                    [LoadDate] = '@NewCutoff'
                FROM 
                    @Database.@Schema.@Table 
                WHERE 
                    ValidFrom > '@LastCutoff' AND	
                    ValidFrom <= '@NewCutoff' 
            ) AS @Table
        """
    else:
        sourceDbTable = f"""
            (
                SELECT 
                    {columnsStr},
                    [LoadDate] = '@NewCutoff'
                FROM 
                    @Database.@Schema.@Table 
                WHERE 
                    LastEditedWhen > '@LastCutoff' AND
                    LastEditedWhen <= '@NewCutoff'
            ) AS @Table
        """

    sourceDbTable = sourceDbTable.replace(
        "@Database", 
        sourceDatabase
    ).replace(
        "@Schema", 
        sourceSchema
    ).replace(
        "@Table",
        sourceTable
    ).replace(
        "@LastCutoff",
        lastCutoff
    ).replace(
        "@NewCutoff",
        newCutoff
    ).replace(
        "@InitialLoad",
        initialLoad
    )

    return sourceDbTable

def load_wwi_to_wh(table, lastCutoff):
    sourceDatabase = source["database"]
    sourceSchema = table["source"]["schema"]
    sourceTable = table["source"]["table"]
    sourceType = table["source"]["type"]
    sourceDbTable = get_source_db_table(
        sourceDatabase,
        sourceSchema,
        sourceTable,
        sourceType,
        lastCutoff,
        table["source"]["columns"]
    )
    
    destDatabase = destination["database"]
    destSchema = table["destination"]["schema"]
    destTable = table["destination"]["table"]
    destDbTable = f"{destDatabase}.{destSchema}.{destTable}"

    retries = 0
    isSuccessful = False

    while retries < retriesMax:
        retries = retries + 1

        try:
            
            spark = get_spark_session()
            
            # create destination table
            create_table(table)

            # delete 
            if (retries - 1) > 0:
                delete_duplicates(table)

            # read 
            df = (
                spark.read
                    .format("jdbc")
                    .option("url", source["url"])
                    .option("dbtable", sourceDbTable)
                    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
                    .load()
            )

            # write
            (
                df.write 
                    .format("bigquery") 
                    .option("mode", "append") 
                    .option("writeMethod", "direct") 
                    .option("writeAtLeastOnce", "true")
                    .mode("append") 
                    .save(destDbTable)
            )

            retries = retriesMax

            isSuccessful = True

        except Exception as ex:
            stop_spark_session()

            if retries >= retriesMax:
                raise ex

    if isSuccessful == False:
        raise Exception("Unable to load table to warehouse")
    else:
        # save success load history
        insert_load_history(table, 'Successful')

def create_table(table):
    destDatabase = destination["database"]
    destSchema = table["destination"]["schema"]
    destTable = table["destination"]["table"]
    destDbTableFrom = f"`{destDatabase}`.`{destSchema}`.`{destTable}`"
    destDbTable = f"{destDatabase}.{destSchema}.{destTable}"
    
    retries = 0
    isSuccessful = False

    while retries < retriesMax:    
        
        retries = retries + 1
        
        try:
    
            try:
                bqClient.get_table(destDbTable)

                retries = retriesMax
                isSuccessful = True
            except NotFound:
                createTablePath = table["destination"]["createTable"]
                if fromNotebook == False:
                    createTablePath = f"{path}/notebooks/{createTablePath}"

                # create table if not exists
                createTableSql = ''
                with open(createTablePath, 'r', encoding='utf-8') as f:
                    createTableSql = f.read()

                query = createTableSql.replace("@DestinationTable", destDbTableFrom)
                createTableJob = bqClient.query(query)
                createTableJob.result()

                retries = retriesMax
                isSuccessful = True
            except Exception as ex:
                raise ex

        except Exception as ex:
            stop_spark_session()

            if retries >= retriesMax:
                raise ex

    if isSuccessful == False:
        raise Exception("Unable to create table") 

def delete_duplicates(table):
    destDatabase = destination["database"]
    destSchema = table["destination"]["schema"]
    destTable = table["destination"]["table"]
    destDbTableFrom = f"`{destDatabase}`.`{destSchema}`.`{destTable}`"
    destDbTable = f"{destDatabase}.{destSchema}.{destTable}"

    retries = 0
    isSuccessful = False

    while retries < retriesMax:    
        
        retries = retries + 1
        
        try:
    
            # check if table exists
            try:

                bqClient.get_table(destDbTable)

                deleteJob = bqClient.query(f"""
                    DELETE FROM 
                        {destDbTableFrom} 
                    WHERE 
                        CAST(LoadDate AS DATETIME) = CAST('{newCutoff}' AS DATETIME)
                """)
                
                deleteJob.result()

                retries = retriesMax
                isSuccessful = True

            except NotFound:
                retries = retriesMax
                isSuccessful = True
            except Exception as ex:
                raise ex

        except Exception as ex:
            stop_spark_session()

            if retries >= retriesMax:
                raise ex

    if isSuccessful == False:
        raise Exception("Unable to delete duplicates")
    
def insert_load_history(table, status):
    destDatabase = destination["database"]
    destSchema = table["destination"]["schema"]
    destTable = table["destination"]["table"]
    loadHistoryTableFrom = f"`{destDatabase}`.`{destSchema}`.`LoadHistory`"

    retries = 0
    isSuccessful = False

    while retries < retriesMax:
        
        retries = retries + 1
        
        try:
            
            insert_history_job = bqClient.query(f"""
                INSERT INTO {loadHistoryTableFrom}
                (
                    TableName,
                    LoadDate,
                    Status,
                    Details
                )
                VALUES 
                (
                    '{destTable}',
                    CAST('{newCutoff}' AS DATETIME),
                    'Sucessful',
                    NULL
                )
            """)
            insert_history_job.result()

            retries = retriesMax

            isSuccessful = True

        except Exception as ex:            
            stop_spark_session()

            if retries >= retriesMax:
                raise ex

    if isSuccessful == False:
        raise Exception("Unable to insert load history")

for table in tables:
    sourceTable = table["source"]["table"]
    destinationTable = table["destination"]["table"]
    print(f"processsing table {sourceTable} for {newCutoff}")

    lastCutoff = initialLoad
    df = dfMaxLoadDates[(dfMaxLoadDates["TableName"] == destinationTable)]
    if len(df) > 0:
        lastCutoff = dfMaxLoadDates.iloc[0, 1].strftime("%Y-%m-%d %H:%M:%S")

    if datetime.strptime(newCutoff, "%Y-%m-%d %H:%M:%S") > datetime.strptime(lastCutoff, "%Y-%m-%d %H:%M:%S"):
        load_wwi_to_wh(
            table,
            lastCutoff
        )
    else:
        print("already processed for the date")
        

processsing table Cities for 2014-01-01 00:00:00
processsing table Cities_Archive for 2014-01-01 00:00:00
processsing table Countries for 2014-01-01 00:00:00
processsing table Countries_Archive for 2014-01-01 00:00:00
processsing table DeliveryMethods for 2014-01-01 00:00:00
processsing table DeliveryMethods_Archive for 2014-01-01 00:00:00
processsing table PaymentMethods for 2014-01-01 00:00:00
processsing table PaymentMethods_Archive for 2014-01-01 00:00:00
processsing table People for 2014-01-01 00:00:00
processsing table People_Archive for 2014-01-01 00:00:00
processsing table StateProvinces for 2014-01-01 00:00:00
processsing table StateProvinces_Archive for 2014-01-01 00:00:00
processsing table TransactionTypes for 2014-01-01 00:00:00
processsing table TransactionTypes_Archive for 2014-01-01 00:00:00
processsing table PurchaseOrderLines for 2014-01-01 00:00:00
processsing table PurchaseOrders for 2014-01-01 00:00:00
processsing table SupplierCategories for 2014-01-01 00:00:00
pro