In [0]:
# OAuth (service-principal) settings used to access the ADLS Gen2 account over
# ABFS. The client secret is read from a Databricks secret scope rather than
# being written into the notebook.
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": "fa2073f0-882e-4087-baec-64e8e7d3d33a",
    "fs.azure.account.oauth2.client.secret": dbutils.secrets.get(scope="kradls", key="ADLSAppKey"),
    "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/a3000236-0406-45f9-b95f-d7f32bc39886/oauth2/token",
}

# Optionally, you can add <directory-name> to the source URI of your mount point.
sales_source = "abfss://development@teststorage19923.dfs.core.windows.net/"
sales_mount_point = "/mnt/Sales"

# dbutils.fs.mount fails when the mount point already exists, which breaks
# "Run All" on a cluster where this cell has been executed before. Only mount
# when the mount point is not present yet.
# NOTE(review): an existing mount is reused as-is, even if it points at a
# different source - unmount it manually if the source or credentials changed.
if not any(mount.mountPoint == sales_mount_point for mount in dbutils.fs.mounts()):
    dbutils.fs.mount(
        source=sales_source,
        mount_point=sales_mount_point,
        extra_configs=configs,
    )

In [0]:
# List the files in the SAP data directory of the mounted storage.
sap_data_dir = "/mnt/Sales/sap/data-files/"
dbutils.fs.ls(sap_data_dir)

In [0]:
# Read the tab-separated SAP personal-data file: the first row supplies the
# column names and the column types are inferred from the data.
df_pers = spark.read.csv(
    "/mnt/Sales/sap/data-files/Personal_data_sap.txt",
    sep="\t",
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the first five rows of the loaded data.
df_pers.show(n=5)

In [0]:
# Print the column names and data types of df_pers.
df_pers.printSchema()

In [0]:
# Add a SerialNumber column: a 1-based running number assigned in ascending
# order of the 'P0000-PENR' column.
from pyspark.sql import Window
from pyspark.sql import functions as F

# NOTE(review): the window has no partitionBy, so Spark is expected to process
# all rows in a single partition for this step - confirm this is acceptable
# for the data volume.
window = Window.orderBy(F.col('P0000-PENR'))
serial_number = F.row_number().over(window)

df_pers = df_pers.withColumn('SerialNumber', serial_number)

df_pers.show(n=5)

In [0]:
#RENAME COLUMNS
from functools import reduce

# Source columns to rename: every column except the generated SerialNumber.
# Selecting by name (instead of dropping "the last column") keeps this correct
# even if SerialNumber is not the final column.
oldColumns = [name for name in df_pers.columns if name != "SerialNumber"]

newColumns = ["PersNo", "EmployeeName", "GenderKey", "Gender", "DOB"]

# Fail fast with a readable message. Index-based pairing would otherwise raise
# an opaque IndexError (more source columns than new names) or silently rename
# only part of the columns (fewer source columns than new names).
if len(oldColumns) != len(newColumns):
    raise ValueError(
        "Expected {0} source columns to rename but found {1}: {2}".format(
            len(newColumns), len(oldColumns), oldColumns
        )
    )

# Apply the renames one pair at a time, threading the DataFrame through reduce.
df_pers = reduce(
    lambda data, names: data.withColumnRenamed(names[0], names[1]),
    zip(oldColumns, newColumns),
    df_pers,
)
df_pers.printSchema()
df_pers.show()

In [0]:
# Add the validity / audit columns:
#   StartDate, CreatedDate - current timestamp
#   EndDate                - fixed far-future date 31.12.9999
now_ts = F.current_timestamp()
open_end_ts = F.to_timestamp(F.lit("31.12.9999"), "dd.MM.yyyy")

df_pers = (
    df_pers
    .withColumn("StartDate", F.lit(now_ts))
    .withColumn("EndDate", open_end_ts)
    .withColumn("CreatedDate", F.lit(now_ts))
)
df_pers.show()

In [0]:
# Add IsActive: True when the current timestamp lies between StartDate and
# EndDate (both inclusive), False otherwise (including when the comparison
# evaluates to null).
current_ts = F.current_timestamp()
is_within_validity = (df_pers.EndDate >= current_ts) & (df_pers.StartDate <= current_ts)

df_pers = df_pers.withColumn(
    "IsActive",
    F.when(is_within_validity, F.lit(True)).otherwise(F.lit(False)),
)
df_pers.show()

In [0]:
# Replace DOB with a timestamp parsed using the pattern "dd.MM.yyyy".
dob_as_timestamp = F.to_timestamp(F.col("DOB"), "dd.MM.yyyy")
df_pers = df_pers.withColumn("DOB", dob_as_timestamp)

df_pers.show()
df_pers.printSchema()

In [0]:
#Add BatchID
import random


def generate_batch_id(low=0, high=1000):
    """Return a pseudo-random integer batch ID in the inclusive range [low, high].

    Args:
        low: Smallest ID that may be returned (default 0).
        high: Largest ID that may be returned (default 1000).

    Returns:
        int: A value N with low <= N <= high.

    Raises:
        ValueError: If ``low`` is greater than ``high`` (raised by
            ``random.randint`` for an empty range).

    Note:
        Each call draws a fresh random value, so IDs can repeat across runs;
        do not rely on them being unique.
    """
    return random.randint(low, high)

# Draw one batch ID and stamp it on every row as a literal column.
batch_id = generate_batch_id()
df_pers = df_pers.withColumn("BatchID", F.lit(batch_id))
df_pers.show()

In [0]:
# Add two MD5 row hashes built from the tab-joined column values:
#   HashAllTS         - over every current column
#   HashWithStartDate - over every current column except EndDate
# NOTE(review): concat_ws skips null values, so rows that differ only in which
# column is null can produce the same hash - confirm this is acceptable.
cols_with_allTS = df_pers.columns
cols_with_SD = [name for name in cols_with_allTS if name != "EndDate"]

hash_all_columns = F.md5(F.concat_ws("\t", *cols_with_allTS))
hash_without_end_date = F.md5(F.concat_ws("\t", *cols_with_SD))

df_pers = (
    df_pers
    .withColumn("HashAllTS", hash_all_columns)
    .withColumn("HashWithStartDate", hash_without_end_date)
)
df_pers.show()


In [0]:
# JDBC connection settings for the Azure SQL database used by the reads and
# writes in this notebook.
jdbcHostname = "testserver19923.database.windows.net"
jdbcDatabase = "testDB"
jdbcPort = 1433
jdbcUrl = "jdbc:sqlserver://{0}:{1};database={2}".format(jdbcHostname, jdbcPort, jdbcDatabase)
connectionProperties = {
    "user": "azureuser",
    # Never keep the database password in the notebook source: read it from a
    # Databricks secret scope, the same way the ADLS client secret is read.
    # NOTE(review): "SqlPassword" is an assumed key name - create it in the
    # "kradls" scope (or point this at the real key), and rotate the password
    # that was previously stored here in plain text.
    "password": dbutils.secrets.get(scope="kradls", key="SqlPassword"),
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
}

In [0]:
# Load the [dbo].[Gender_M] table from the database over JDBC and preview it.
df_gender = spark.read.jdbc(
    url=jdbcUrl,
    table="[dbo].[Gender_M]",
    properties=connectionProperties,
)
df_gender.show()

In [0]:
# Left-join the personal data to the gender table on GenderKey = GenderCode,
# so every row of df_pers is kept even when no gender row matches.
gender_match = df_pers.GenderKey == df_gender.GenderCode
joined_df = df_pers.join(df_gender, on=gender_match, how="left")
joined_df.printSchema()

In [0]:
# Keep only the columns required downstream, in this order.
req_cols = [
    "PersNo",
    "EmployeeName",
    "DOB",
    "GenderMasterID",
    "IsActive",
    "StartDate",
    "EndDate",
    "CreatedDate",
    "BatchID",
]
final_df = joined_df.select(*req_cols)

In [0]:
# Preview the rows of final_df before they are written to the database.
final_df.show()

In [0]:
# Append the rows of final_df to the [dbo].[PersonalData] table over JDBC.
(
    final_df.write
    .mode("append")
    .jdbc(url=jdbcUrl, table="[dbo].[PersonalData]", properties=connectionProperties)
)