### Start the kernel specific to SPARK

* conda env list
* conda activate spark
* In the Visual Studio Code IDE, select `python-spark` as the notebook environment (kernel)
* Check that pyspark is installed in the selected environment

In [1]:
!pip3 show pyspark
!pip3 show findspark

[0m

In [None]:
# The following lines help Jupyter locate the Spark binaries needed to run the job
import findspark
findspark.init()

In [None]:
# Sample program to validate that the pySpark library is available.
# Creates the SparkContext `sc` that later cells reuse.
import pyspark
# 'local[*]' is Spark's master URL for running locally on all available cores.
sc = pyspark.SparkContext('local[*]')

# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
txt = sc.textFile('file:////Users/sxxx/github/spark-scala/README.md')
# Total number of lines in the file.
print(txt.count())

# Number of lines mentioning "python" (case-insensitive).
python_lines = txt.filter(lambda line: 'python' in line.lower())
print(python_lines.count())


#big_list = range(10000)
#>>> rdd = sc.parallelize(big_list, 2)
#>>> odds = rdd.filter(lambda x: x % 2 != 0)
#>>> odds.take(5)

In [None]:
# Another random code snippet to check if the Spark session is still alive after the previous cell execution
big_list = range(10000)
rdd = sc.parallelize(big_list, 2)
odds = rdd.filter(lambda x: x % 2 != 0)
odds.take(5)

In [None]:
# Load the properties file ~/nexus.prop into the `keys` dict.
# The file holds `name = value` pairs (JDBC URL, credentials, driver jar path,
# temp directory) that later cells look up by name.
from os.path import expanduser, join
home = expanduser("~")

separator = "="
keys = {}

with open(join(home, 'nexus.prop')) as f:
    for line in f:
        stripped = line.strip()

        # Skip blank lines and '#' comment lines so a commented-out entry
        # does not end up in the dict under a '#...' key.
        if not stripped or stripped.startswith('#'):
            continue

        if separator in stripped:
            # Split on the first separator only: values (e.g. JDBC URLs)
            # may themselves contain '='.
            name, value = stripped.split(separator, 1)

            # strip() removes white space around the name and the value
            keys[name.strip()] = value.strip()

#print(keys)

In [None]:
# This section adds the MySQL JDBC driver jar to the PySpark shell arguments.
# The jar path is read from the "jdbc-connector-mysql" entry of the properties file.
# NOTE(review): an earlier cell already created a SparkContext; presumably
# PYSPARK_SUBMIT_ARGS is only read when PySpark launches its JVM, in which case
# this cell would have to run before that one -- confirm.
import os
jdbc_connector_mysql=keys["jdbc-connector-mysql"]

os.environ["PYSPARK_SUBMIT_ARGS"] = f"--jars file://{jdbc_connector_mysql} pyspark-shell"


In [None]:
#Create sparksession instance
from pyspark.sql import SparkSession

# Stop the session left over from a previous run of this cell. On a fresh
# kernel `spark` does not exist yet, so the unguarded spark.stop() raised
# NameError and broke Restart & Run All.
if "spark" in globals():
    spark.stop()

spark = SparkSession.builder.getOrCreate()

In [None]:
# Fetch 1 workflow records from the templates table
#database = 'reach'
#url='jdbc:mysql://{}/{}'.format(host, database)
#table = 'reach.tasks'

table = "(SELECT workflowId, REPLACE(CONVERT(REPLACE(workflow,'&amp;','&') using utf8),'','') as workflow FROM reach.templates LIMIT 3) AS t"

# Connection settings come from the properties file loaded earlier.
url = keys["reach-dev-url"]
#query='select taskid from tasks limit 10'
user = keys["reach-dev-userid"]
password = keys["reach-dev-password"]

# JDBC connection properties passed to spark.read.jdbc below;
# later cells reuse `url` and `properties`.
properties = {
    'user': user,
    'password': password,
    'driver': 'com.mysql.jdbc.Driver',
    #'query': 'select taskid from tasks limit 10',
    'fetchsize': '10'
}

# `table` is the parenthesised sub-query defined above, aliased as t.
df = spark.read.jdbc(url, table, properties=properties)
#df = sqlContext.read.jdbc(url, query, properties=properties)
df.show()

In [None]:
#Parse the json column
df.select("workflow").show()

In [None]:
#from pyspark.sql.functions import from_json, col
#json_schema = spark.read.json(df.select("workflow").rdd.map(lambda row: row.json)).schema
#df.withColumn('json', from_json(col('json'), json_schema))

# RDD of Row objects holding only the "workflow" column.
workflowRDD = df.select("workflow").rdd

# The original `lambda r: r.` was a syntax error; "workflow" is the only
# column selected above, so read that field from each Row.
workflowRDD.map(lambda r: r.workflow).take(2)
#workflowRDD.map(lambda r: r.toJSON).take(2)
#workflowRDD.take(2)

#new_df = spark.read.json(.map(lambda r: r.json))
#new_df.printSchema()

#df.show()

In [None]:
workflowRDD2 = df.select("workflow").toJSON()


In [None]:
workflowRDD2.map(lambda r: type(r)).take(2)

In [None]:
workflowRDD2.map(lambda r: r[1:30]).take(2)

In [None]:
json_schema = spark.read.json(workflowRDD2).schema

In [None]:
print(json_schema.fields)

In [None]:
from pyspark.sql.functions import from_json, col
# Parse the "workflow" JSON string with the inferred schema and print the result's schema.
# printSchema must be called; the bare attribute only displayed the bound method.
df.withColumn('json', from_json(col('workflow'), json_schema)).printSchema()

In [None]:
workflowRDD3 = df.select("workflow").rdd.map(list)
workflowRDD3.cache()

In [None]:
workflowRDD3.map(lambda r: type(r)).take(2)

In [None]:
workflowRDD4 = workflowRDD3.map(lambda r: r[0])

In [None]:
workflowRDD4.map(lambda r: r[1:30]).take(2)

In [None]:
json_schema2 = spark.read.json(workflowRDD4).schema

In [None]:
print(json_schema2.fields)

In [None]:
from pyspark.sql.functions import from_json, col
# Same check as before, using the schema inferred from workflowRDD4.
# printSchema must be called; the bare attribute only displayed the bound method.
df.withColumn('json', from_json(col('workflow'), json_schema2)).printSchema()

In [None]:
new_df=df.withColumn('json', from_json(col('workflow'), json_schema2))
new_df.cache()

In [None]:
from pyspark.sql.functions import explode
new_df.select("json.id", "json.description", explode("json.startADGroups").alias("start_adgroup")).show(truncate=False)

In [None]:
new_df2 = new_df.select(col("json.id").alias("id"), explode("json.sections").alias("section")).select("id", explode("section.fields").alias("field")).select(col("field.id").alias("fieldId")).distinct().limit(5)

In [None]:
new_df2.show(10)

In [None]:
from pyspark.sql.functions import lit
new_df3 = new_df2.select("fieldId", lit(1).alias("id"))

In [None]:
new_df3.show()

In [None]:
from pyspark.sql.functions import lit
pivotDF = new_df3.groupBy("id").pivot("fieldId").min("id")

In [None]:
pivotDF.show()

In [None]:
#new_df3.join(pivotDF,on="id", how="inner").show()
cross_df = new_df3.crossJoin(pivotDF)

In [None]:
cross_df.show()

In [None]:
column_list = cross_df.drop("fieldId", "id").columns

In [None]:
print(column_list)

In [None]:
pre_fin_df = cross_df.drop("id").select(*[lit(column).alias(column) if column in column_list else column for column in cross_df.drop("id").columns])

In [None]:
pre_fin_df.show()

In [None]:
fin_df = pre_fin_df.drop("fieldId").distinct()

In [None]:
fin_df.show()

In [None]:
# Load data from templates table for workflowid=a611477c-41f1-4a9e-9721-d7afeaf55099
#REPLACE(CONVERT(REPLACE(workflow,'&amp;','&') using utf8),'','') as workflow
table = "(SELECT workflowId, workflowVersion, publishStatus, isActive, lastUpdatedTs, lastPublishedTs, REPLACE(CONVERT(REPLACE(lastUpdatedBy,'&amp;','&') using utf8),'','') as lastUpdatedBy, REPLACE(CONVERT(REPLACE(lastPublishedBy,'&amp;','&') using utf8),'','') as lastPublishedBy, REPLACE(CONVERT(REPLACE(createdBy,'&amp;','&') using utf8),'','') as createdBy, createdTs, name, description, REPLACE(CONVERT(REPLACE(workflow,'&amp;','&') using utf8),'','') as workflow FROM reach.templates where workflowid='a611477c-41f1-4a9e-9721-d7afeaf55099' and workflowversion < 11) AS t"

#table = "(SELECT workflowid, name, count(1) as cnt from reach.templates group by workflowid, name) AS t"

url = keys["reach-dev-url"]
user = keys["reach-dev-userid"]
password = keys["reach-dev-password"]

properties = {
    'user': user,
    'password': password,
    'driver': 'com.mysql.jdbc.Driver',
    'fetchsize': '10'
}

templatesDF = spark.read.jdbc(url, table, properties=properties)
templatesDF.cache()

In [None]:
templatesDF.select("workflowid", "workflowversion", "publishstatus", "isactive", "lastupdatedts", "lastpublishedts", "name", "description").show(truncate=False)

In [None]:
#Flatten out the workflowMetadata information from templates table
#templatesFlatten1 = templatesDF.select("workflowid", "workflowversion", "workflow")
templatesFlatten1 = templatesDF.select("workflow")
templatesFlatten2 = templatesFlatten1.rdd.map(list)
templatesFlatten2.cache()

In [None]:
templatesFlatten3 = templatesFlatten2.map(lambda r : r[0])
#templatesFlatten3.map(lambda r : type(r)).take(2)
workflow_schema = spark.read.json(templatesFlatten3).schema
print(workflow_schema.fields)

In [None]:
from pyspark.sql.functions import from_json, col, explode
# Parse the "workflow" JSON string into a struct column named "json",
# using the schema inferred in the previous cell.
templateParsed = templatesFlatten1.withColumn('json', from_json(col('workflow'), workflow_schema))
# printSchema must be called; the bare attribute only displayed the bound method.
templateParsed.printSchema()

In [None]:
###****************DEBUG****************###
#templateParsed.count()
templateParsed.select(col("json.id").alias("workflowid"), col("json.version").alias("workflowVersion"), explode("json.sections").alias("section")).select("workflowid", "workflowversion", col("section.id").alias("sectionid")).filter(col("workflowversion")==10).show(truncate=False)

In [None]:
###****************DEBUG****************###
from pyspark.sql.functions import collect_set, sort_array
templateSectionsDF = templateParsed.select(col("json.id").alias("workflowid"), col("json.version").alias("workflowVersion"), col("json.sections").alias("sections"))

templateSectionExplodedDF = templateSectionsDF.select("workflowid", "workflowVersion", explode("sections").alias("section")).select("workflowid", "workflowversion", col("section.id").alias("sectionid"), col("section.description").alias("sectiondesc"), col("section.order").alias("sectionorder"), col("section.title").alias("sectiontitle"), col("section.type").alias("sectiontype"), col("section.fields").alias("fields"))

templateSectionExplodedDF.groupBy(col("workflowid"), col("sectionid")).agg(sort_array(collect_set(col("workflowversion")))).show(truncate=False)

In [None]:
###****************DEBUG****************###
templateSectionExplodedDF.select("workflowid", "workflowversion", "sectionid", "sectionorder", "sectiontitle", "sectiontype").filter(col("workflowversion").isin({1, 2})).show(truncate=False)

In [None]:
###****************DEBUG****************###
from pyspark.sql.functions import substring
templateFieldExplodedDF = templateSectionExplodedDF.select("workflowid", "workflowversion", "sectionid", "sectionorder", "sectiontitle", "sectiontype", explode("fields").alias("field")).select("workflowid", "workflowversion", "sectionid", "sectionorder", "sectiontitle", "sectiontype", col("field.id").alias("fieldid"), col("field.helpertext").alias("fieldhelpertext"), col("field.hidden").alias("fieldishidden"), col("field.includeyear").alias("fieldincludeyear"), col("field.isfilterable").alias("fieldisfilterable"), col("field.label").alias("fieldlabel"), col("field.options").alias("fieldoptions"), col("field.order").alias("fieldorder"), col("field.placeholder").alias("fieldplaceholder"), col("field.prefix").alias("fieldprefix"), col("field.required").alias("fieldidrequired"), col("field.responses").alias("fieldresponses"), col("field.type").alias("fieldtype"))

#templateFieldExplodedDF.select(col("workflowversion").alias("wid"), substring("sectionid", 0, 3).alias("sid"), col("sectionorder").alias("sord"), substring("sectiontitle",0, 20).alias("sttl"), col("sectiontype").alias("styp"), substring("fieldid", 0, 10).alias("fid"), substring("fieldhelpertext", 0, 15).alias("fhlp"), col("fieldishidden").alias("fhid"), col("fieldincludeyear").alias("fiy"), col("fieldisfilterable").alias("fif"), substring("fieldlabel", 0, 15).alias("flbl"), substring(col("fieldoptions").cast("string"), 0, 30).alias("fopt"), "fieldorder", "fieldidrequired", "fieldresponses", "fieldtype").filter(col("wid") == 1).distinct().sort(col("sid"), col("fieldorder")).show(50, truncate=False)

templateFieldExplodedDF.select(col("workflowversion").alias("wid"), substring("sectionid", 0, 3).alias("sid"), col("sectionorder").alias("sord"), substring("sectiontitle",0, 50).alias("sttl"), col("sectiontype").alias("styp"), substring("fieldid", 0, 10).alias("fid"), substring("fieldlabel", 0, 15).alias("flbl"), "fieldorder", "fieldtype").filter(col("wid") == 1).distinct().sort(col("sid"), col("fieldorder")).show(50, truncate=False)
#sectiontitle_coalesce(fieldlabel, fieldtype)

In [None]:
#Printing a single JSON workflow metadata to look at all records in full
tmpdir = keys["tmp-dir"]
templateParsed.select("json").limit(1).write.mode("overwrite").json(tmpdir + "/templates")

In [None]:
parsedWorkflowMeta1 = templateParsed.select(col("json.id").alias("workflowid"), col("json.version").alias("workflowVersion"), explode("json.sections").alias("section")).select("workflowid", "workflowversion", col("section.id").alias("sectionid"), explode("section.fields").alias("field")).select("workflowid", "workflowversion", "sectionid", col("field.id").alias("fieldId")).distinct()

In [None]:
parsedWorkflowMeta1.show(truncate= False)

In [None]:
import re
from pyspark.sql.functions import udf, lit, isnull
# StringType lives in pyspark.sql.types; importing it from pyspark.sql.functions
# only worked as an accidental re-export.
from pyspark.sql.types import StringType
#Create a UDF to transform the sectiontitle, fieldlabel, fieldtype columns into a concatenated column
def generate_col(sectiontitle, fieldlabel, fieldtype):
    """Build a dynamic column name from a section title and a field label.

    Each part is normalised by dropping every character that is not an ASCII
    letter, digit or space, collapsing runs of spaces into a single '_', and
    lower-casing. The result is 'dyn_<section>_<label>'.

    Args:
        sectiontitle: Section title; None is treated as an empty string.
        fieldlabel: Field label; when None, ``fieldtype`` is used instead.
        fieldtype: Fallback for the label part; None is treated as an empty string.

    Returns:
        The generated column name as a str.
    """
    def _slug(text):
        # `text or ''` keeps a NULL column value from raising TypeError in re.sub.
        cleaned = re.sub('[^A-Za-z0-9 ]+', '', text or '')
        return re.sub('[ ]+', '_', cleaned).lower()

    # Only a missing (None) label falls back to the field type; an empty
    # label is kept as-is, matching the original behaviour.
    label_source = fieldtype if fieldlabel is None else fieldlabel

    return 'dyn_' + _slug(sectiontitle) + '_' + _slug(label_source)
#print(generate_col("Inspector's Contact Information", "null", "c"))    


generate_col_udf = udf(generate_col, StringType())
#templateFieldExplodedDF.withColumn("title", generate_col_udf(col("sectiontitle"), col("fieldlabel"), col("fieldtype")))\
#    .select("workflowversion", "sectionid", "fieldid", "fieldorder", "title", "fieldlabel")\
#    .filter(col("workflowversion") == 1)\
#    .show(50, truncate=False)

#.filter((col("workflowversion") == 1) & (isnull(col("fieldlabel"))))\

workflowTemplateSchema = templateFieldExplodedDF.withColumn("title", generate_col_udf(col("sectiontitle"), col("fieldlabel"), col("fieldtype")))\
    .select("workflowid", "workflowversion", "fieldid", "title")

workflowTemplateSchema.filter(col("workflowversion")==1).show(50, truncate=False)

In [None]:
#Checking if the fieldId and title are PK combination for a given workflow version
from pyspark.sql.functions import countDistinct, count
workflowTemplateSchema.groupBy("workflowid", "workflowversion")\
    .agg(countDistinct("fieldId"), countDistinct("title"), count("title"))\
    .sort(col("workflowversion"))\
    .show(truncate=False)
#(countDistinct("fieldid").alias("fieldid_cnt"), count(col("fieldid")).alias("tot_cnt")).show()

In [None]:
#Pivoting the workflowTemplateSchema dataframe
from pyspark.sql.functions import first

workflowSchemaPivot = workflowTemplateSchema\
    .groupBy("workflowid", "workflowversion")\
    .pivot("title")\
    .agg(first("fieldid"))\
    .sort("workflowversion")

workflowSchemaPivot.select("workflowid", 'workflowversion', 'dyn_observation_status','dyn_observation_dependentdropdownlist').show(truncate=False)

In [None]:
#Creating a dataframe for workflow attributes
templatesNonSchemaAttribute = templatesDF.select("workflowid", "workflowversion", col("name").alias("workflowname"), col("description").alias("workflowdescription"))

In [None]:
# Load data from submissions table for workflowid='a611477c-41f1-4a9e-9721-d7afeaf55099' (workflowversion < 11)
#table = "(SELECT workflowid, workflowversion, count(1) as cnt from reach.submissions where workflowid='a611477c-41f1-4a9e-9721-d7afeaf55099' group by workflowid, workflowversion) AS t"
table = "(SELECT submissionid, currentstep, totalsteps, workflowid, workflowversion, createdts, lastupdatedts, createdby, lastupdatedname, createdbyname, recordid, countrycode, REPLACE(CONVERT(REPLACE(stepmetadata,'&amp;','&') using utf8),'','') as stepmetadata, tzoffset from reach.submissions where workflowid='a611477c-41f1-4a9e-9721-d7afeaf55099' and workflowversion<11) AS t"

submissionsDF = spark.read.jdbc(url, table, properties=properties)
submissionsDF.cache()

In [None]:
submissionsDF.select("submissionid", col("currentstep").alias("submissionCurrentStep"), "totalsteps", "workflowid", "workflowversion", col("createdts").alias("submissionCreatedTs"), col("lastupdatedts").alias("submissionLastUpdatedTs"), "recordid", "countrycode", "tzoffset").show(11, truncate=False)

In [None]:
submissionAttribute = submissionsDF.select("submissionid", col("currentstep").alias("submissionCurrentStep"), col("workflowid").alias("submissionWorkflowId"), col("workflowversion").alias("submissionWorkflowVersion"), col("createdts").alias("submissionCreatedTs"), col("lastupdatedts").alias("submissionLastUpdatedTs"), "recordid", col("countrycode").alias("submissionCountryCode"))

submissionAttribute.show(truncate=False)
submissionAttribute.count()

In [None]:
# Load data from responses table for workflowid='a611477c-41f1-4a9e-9721-d7afeaf55099' (workflowVersion < 11)
#table = "(SELECT workflowid, count(1) as cnt from reach.responses group by workflowid) AS t"
table = "(SELECT responseid, submissionid, fieldid, submittedby, submitter, lastupdatedts, siteid, REPLACE(CONVERT(REPLACE(value,'&amp;','&') using utf8),'','') as value, `order`, currentstep, createdTs, submittedByName, lastUpdatedBy, taskid from reach.responses where workflowid='a611477c-41f1-4a9e-9721-d7afeaf55099' and workflowVersion<11) AS t"

responsesDF = spark.read.jdbc(url, table, properties=properties)
responsesDF.cache()

In [None]:
responsesDF.select("responseid", "submissionid", "fieldid", "lastupdatedts", "siteid", col("value").alias("responseValue"), "order", "currentstep", "taskid").show(truncate=False)
responsesDF.count()

In [None]:
responseAttribute = responsesDF.select("responseid", col("submissionid").alias("responseSubmissionId"), "fieldid", col("value").alias("responseValue"), col("order").alias("responseOrder"), col("taskid").alias("responseTaskId"))
responseAttribute.show(truncate=False)

In [None]:
# Load data from tasks table for workflowid='a611477c-41f1-4a9e-9721-d7afeaf55099' (workflowVersion < 11)
#table = "(SELECT workflowid, count(1) as cnt from reach.tasks group by workflowid) AS t"
table = "(SELECT taskId, title, description, status, submissionid, sectionId from reach.tasks where workflowid='a611477c-41f1-4a9e-9721-d7afeaf55099' and workflowVersion<11) AS t"

tasksDF = spark.read.jdbc(url, table, properties=properties)
tasksDF.cache()

In [None]:
tasksDF.select("taskId", col("title").alias("taskTitle"), col("description").alias("taskDescription"), col("status").alias("taskStatus"), "submissionId").show(truncate=False)
tasksDF.count()

In [None]:
taskAttribute = tasksDF.select("taskId", col("title").alias("taskTitle"), col("description").alias("taskDescription"), col("status").alias("taskStatus"), col("submissionId").alias("taskSubmissionId"))
taskAttribute.show(truncate=False)

In [None]:
# Print the schema of every DataFrame that takes part in the joins below.
# printSchema() writes the schema itself; wrapping the uncalled method in
# print() only showed the bound-method repr.
workflowSchemaPivot.printSchema()
templatesNonSchemaAttribute.printSchema()
submissionAttribute.printSchema()
responseAttribute.printSchema()
taskAttribute.printSchema()

In [None]:
#workflowSchemaPivot
#templatesNonSchemaAttribute
#submissionAttribute
#responseAttribute
#taskAttribute

#Join templatesNonSchemaAttribute with submissionAttribute
templateSubmissionJoin = templatesNonSchemaAttribute.join(submissionAttribute, (templatesNonSchemaAttribute.workflowid == submissionAttribute.submissionWorkflowId) & (templatesNonSchemaAttribute.workflowversion == submissionAttribute.submissionWorkflowVersion), how="inner")\
    .select("workflowid", "workflowversion", "workflowname", "workflowdescription", "submissionid", "submissionCurrentStep", "submissionCreatedTs", "submissionLastUpdatedTs", "recordId", "submissionCountryCode")

templateSubmissionJoin.show(10, truncate=False)

In [None]:
from pyspark.sql.functions import concat, coalesce
#Self aggregate response to create a map
responseAttributeConcat = responseAttribute.withColumn("fieldIdValue", concat('fieldId', lit(':'), 'responseValue'))\
    .withColumn("responseTaskIdCoalesced", coalesce("responseTaskId", lit("")))\
    .select('responseId', 'responseSubmissionId', 'responseOrder', 'responseTaskIdCoalesced', 'fieldIdValue')
#responseAttribute.filter(isnull("responseTaskId")).show(truncate=False)

responseAttributeConcat.show(truncate=False)


In [None]:
from pyspark.sql.functions import collect_set
#Aggregate all the fieldIdValue for a combination of submissionId, taskId into a single record
# Produces one row per (responseSubmissionId, responseTaskIdCoalesced) with the
# distinct "fieldId:value" strings collected into the array column fieldIdValues.
# NOTE(review): limit(20) restricts the aggregation to 20 response rows with no
# ordering, so downstream joins see only a partial sample -- looks like a debug
# leftover; confirm before relying on the joined output.
responseAggregateRecord = responseAttributeConcat.limit(20).groupBy("responseSubmissionId", "responseTaskIdCoalesced")\
                            .agg(collect_set("fieldIdValue").alias("fieldIdValues"))
                        
responseAggregateRecord.show(truncate=False)

In [None]:
#Join templateSubmissionJoin with responseAggregateRecord
submissionResponseJoin = templateSubmissionJoin.join(responseAggregateRecord, (templateSubmissionJoin.submissionid == responseAggregateRecord.responseSubmissionId), how='inner')\
    .select(col("workflowId").alias('submissionWorkflowId'), col("workflowVersion").alias('submissionWorkflowVersion'), "submissionid", "responseTaskIdCoalesced", "fieldIdValues")

submissionResponseJoin.show(truncate=False)

In [None]:
#Trying to flatten the table for following 3 columns
#4f842804-7da0-4a18-afb3-696c7b1a0991 > dyn_inspector39s_contact_information_inspector_phone_number
#57c50f5e-df29-4a11-af9d-06e29ab11747 > dyn_assignment_notes_insufficient_paperwork
#b98305e1-e911-4a98-9541-df6094937a75 > dyn_additional_details_contact_type

#Need to figure out the column names for following fieldIds
workflowTemplateSchema.filter(col("fieldid").isin('4f842804-7da0-4a18-afb3-696c7b1a0991','57c50f5e-df29-4a11-af9d-06e29ab11747','b98305e1-e911-4a98-9541-df6094937a75'))\
    .show(10, truncate=False)

In [None]:
###*************************DEBUG*************************###
workflowSchemaPivot.select("workflowid", "workflowversion", "dyn_inspector39s_contact_information_inspector_phone_number", "dyn_assignment_notes_insufficient_paperwork", "dyn_additional_details_contact_type").show(truncate=False)

In [None]:
###*************************DEBUG*************************###
for elem in workflowSchemaPivot.schema.names:
    print(elem)

In [None]:
###*************************DEBUG*************************###
debugDF01 = submissionResponseJoin.join(workflowSchemaPivot, (submissionResponseJoin.submissionWorkflowId == workflowSchemaPivot.workflowid) & (submissionResponseJoin.submissionWorkflowVersion == workflowSchemaPivot.workflowversion), how='inner')\
    .drop('submissionWorkflowId')\
    .drop('submissionWorkflowVersion')
    

debugDF01.select('workflowId', 'workflowVersion','submissionId', 'responseTaskIdCoalesced', 'fieldIdValues', 'dyn_inspector39s_contact_information_inspector_phone_number', 'dyn_assignment_notes_insufficient_paperwork', 'dyn_additional_details_contact_type')\
    .show(30, truncate=False)

In [None]:
#Map the fieldId value to the respective column
def search_and_return(fieldId, listKeyValue):
    """Look up the value stored for ``fieldId`` in a list of 'key:value' strings.

    Args:
        fieldId: Key to search for; compared via ``str(fieldId)``.
        listKeyValue: Iterable of 'key:value' strings. None or an empty list
            yields ''.

    Returns:
        The value of the last entry whose key matches, or '' when nothing
        matches.
    """
    returnValue = ''
    if not listKeyValue:
        return returnValue

    wanted = str(fieldId)
    for keyValue in listKeyValue:
        # Split on the FIRST ':' only, so values that contain ':' (URLs,
        # timestamps) are not truncated; an entry without ':' gets an empty
        # value instead of raising IndexError.
        key, _, value = keyValue.partition(':')
        if key == wanted:
            returnValue = value
    return returnValue
    


sampleKeyValueList = ['12:a', '11:b', '13:c']
print(search_and_return(13, sampleKeyValueList))

In [None]:
#Create a UDF out of the function
search_and_return_udf = udf(search_and_return, StringType())

debugDF01.withColumn("dyn_inspector39s_contact_information_inspector_phone_number", search_and_return_udf(col('dyn_inspector39s_contact_information_inspector_phone_number'), 'fieldIdValues'))\
    .withColumn("dyn_assignment_notes_insufficient_paperwork", search_and_return_udf(col('dyn_assignment_notes_insufficient_paperwork'), 'fieldIdValues'))\
    .withColumn("dyn_additional_details_contact_type", search_and_return_udf(col('dyn_additional_details_contact_type'), 'fieldIdValues'))\
    .select('workflowid', 'workflowversion', 'submissionId', 'responseTaskIdCoalesced', 'dyn_inspector39s_contact_information_inspector_phone_number', 'dyn_assignment_notes_insufficient_paperwork', 'dyn_additional_details_contact_type')\
    .show(30, truncate=False)

In [None]:
#Making the replace process automated
# All column names of the pivoted workflow schema.
workflowSchemaDFColumnList = workflowSchemaPivot.schema.names
# Keep only the names containing 'dyn_'; these drive the automated replacement below.
dynamicColumnList = [name for name in workflowSchemaDFColumnList if 'dyn_' in name]

for colName in dynamicColumnList:
    print(colName)

In [None]:
#debugDF01.withColumn(F.col('dyn_inspector39s_contact_information_inspector_phone_number'), search_and_return_udf(F.col('dyn_inspector39s_contact_information_inspector_phone_number'), 'fieldIdValues'))\
debugDF03 = debugDF01

for rName in dynamicColumnList:
    debugDF03 = debugDF03.withColumn(rName, search_and_return_udf(col(rName), 'fieldIdValues'))
#debugDF03 = debugDF03.withColumn(F.col('dyn_inspector39s_contact_information_inspector_phone_number'), search_and_return_udf(F.col('dyn_inspector39s_contact_information_inspector_phone_number'), 'fieldIdValues'))
#debugDF03 = debugDF03.withColumn(F.col('dyn_inspector39s_contact_information_inspector_phone_number'), search_and_return_udf(F.col('dyn_inspector39s_contact_information_inspector_phone_number'), 'fieldIdValues'))

#debugDF03 = debugDF03.withColumn('workflowId', lit('workflowId'))
#debugDF03 = debugDF03.withColumn('workflowId', col('workflowId'))

#debugDF03.withColumn('dyn_inspector39s_contact_information_inspector_phone_number', search_and_return_udf(F.col('dyn_inspector39s_contact_information_inspector_phone_number'), 'fieldIdValues'))\
debugDF03.select('workflowid', 'workflowversion', 'submissionId', 'responseTaskIdCoalesced', 'dyn_inspector39s_contact_information_inspector_phone_number', 'dyn_assignment_notes_insufficient_paperwork', 'dyn_additional_details_contact_type')\
    .show(30, truncate=False)