In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.functions import *
from functools import reduce  
from pyspark.sql import DataFrame
from pyspark.sql.window import Window

In [0]:
# Schema applied to every input CSV: three nullable string columns.
dataSchema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ("employee_id", "task_id", "completed")
    ]
)

In [0]:
def extract_df(element):
    """Read one CSV location into a DataFrame using the shared ``dataSchema``.

    Args:
        element: An object exposing a ``path`` attribute, such as one entry of
            the listing produced by ``dbutils.fs.ls``.

    Returns:
        The DataFrame read from ``element.path`` with the ``header`` option
        enabled and ``dataSchema`` applied.
    """
    csv_reader = spark.read.option("header", True).schema(dataSchema)
    return csv_reader.csv(element.path)

In [0]:
# List the input folder and build one DataFrame per listed entry.
data_loc_arr = dbutils.fs.ls("/mnt/rupamemailautomation/employeesdata")
list_df = [extract_df(file_info) for file_info in data_loc_arr]

In [0]:
# display the data
# [display(df) for df in list_df]

In [0]:
# Combine the per-file DataFrames into a single DataFrame.
# Every frame is read with dataSchema, so a positional union lines the columns up.
# reduce() without an initial value raises TypeError on an empty list, which
# happens when the folder listing is empty; in that case fall back to an empty
# DataFrame with the same schema so the rest of the notebook runs as a no-op.
df = reduce(DataFrame.union, list_df) if list_df else spark.createDataFrame([], dataSchema)

In [0]:
# display(df)

In [0]:
# Replace `completed` with a lower-cased copy named `lower_completed`, so the
# later comparison against "yes" is case-insensitive.
df1 = (
    df
    .withColumn("lower_completed", lower(col("completed")))
    .drop("completed")
)

In [0]:
# display(df1)

In [0]:
# Fetch the database URL, user and password from the "databricks-scope" secret scope.
sqldburl, user, password = [
    dbutils.secrets.get(scope="databricks-scope", key=secret_key)
    for secret_key in (
        "databricks-app-sqldb-url",
        "databricks-scope-sqldb-user",
        "databricks-scope-sqldb-password",
    )
]

In [0]:
# JDBC settings used by every read and write of the task table in this notebook.
jdbc_url = sqldburl
table_name = "task_details"
properties = dict(
    user=user,
    password=password,
    driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
)

In [0]:
# Load the current contents of the task table over JDBC.
task_details_df = spark.read.jdbc(jdbc_url, table_name, properties=properties)

In [0]:
# Mark the table snapshot for caching: it feeds the join, the displays and the
# count below, and the same table is overwritten near the end of the notebook.
# NOTE(review): cache() is lazy in Spark — confirm the data is actually
# materialised before the overwrite runs.
task_details_df.cache()

In [0]:
# Print the column names and types of the DataFrame read from the JDBC table.
task_details_df.printSchema()

In [0]:
# Attach the reported status from the uploaded files to each task row.
#
# The only value read from `lower_completed` later on is "yes", so just those
# reports are kept, and they are de-duplicated on the join keys. Without this,
# a task that appears in more than one file would match several right-hand rows
# and the left join would multiply the table row, and the multiplied rows are
# what gets written back to the table.
# Rows with no "yes" report get a null `lower_completed`.
completed_reports = (
    df1
    .filter(col("lower_completed") == "yes")
    .dropDuplicates(["employee_id", "task_id"])
)
updated_df = task_details_df.join(completed_reports, ["employee_id", "task_id"], "left_outer")

In [0]:
# Preview the joined rows; the helper column `lower_completed` is still present here.
display(updated_df)

In [0]:
# Set `completed` to True wherever the uploaded files report "yes";
# every other row keeps the value it already had.
reported_done = col("lower_completed") == "yes"
updated_df2 = updated_df.withColumn(
    "completed",
    when(reported_done, True).otherwise(col("completed")),
)

In [0]:
# Preview the rows after the `completed` column has been recomputed.
display(updated_df2)

In [0]:
# The helper column has served its purpose; remove it before writing back.
updated_df3 = updated_df2.drop("lower_completed")

In [0]:
# Preview the final rows, now without the helper column `lower_completed`.
display(updated_df3)

In [0]:
from pyspark.sql.functions import col

In [0]:
# Count the rows whose `completed` value equals True and print the total.
# `== True` is intentional: it builds a Spark column expression, not a Python bool.
completed_rows = updated_df3.where(col("completed") == True)
c = completed_rows.count()
print(c)

In [0]:
# Bind a second name to the same DataFrame object (plain assignment, not a copy).
updated_df3_copy = updated_df3

In [0]:
# Write the merged rows back over the task table.
#
# updated_df3 is computed from a read of this very table and Spark evaluates
# lazily, so the rows are snapshotted first: an eager local checkpoint computes
# them now and cuts the lineage, so the write can never re-read a table it has
# already emptied. Local checkpoints live on the executors; if one is lost the
# write fails loudly instead of silently writing partial data.
#
# The "truncate" option makes overwrite mode empty the existing table instead of
# dropping and re-creating it, so the table keeps its own column types, keys and
# indexes.
rows_to_write = updated_df3.localCheckpoint(eager=True)
(
    rows_to_write.write
    .option("truncate", "true")
    .jdbc(url=jdbc_url, table=table_name, mode="overwrite", properties=properties)
)


In [0]:
# Read the table back over JDBC to inspect what was written.
test_df = spark.read.jdbc(jdbc_url, table_name, properties=properties)

In [0]:
# Show the rows read back from the table.
display(test_df)

In [0]:
# Keep only the read-back rows whose `completed` value equals True.
# `== True` is intentional: it builds a Spark column expression, not a Python bool.
test_df_completed = test_df.where(test_df["completed"] == True)

In [0]:
# Show only the read-back rows where `completed` equals True.
display(test_df_completed)

In [0]:
# End the notebook run, passing "success" as its exit value.
dbutils.notebook.exit("success")