In [0]:
#importing spark session
from pyspark.sql.session import SparkSession
# getOrCreate() hands back the session that is already running (as on a
# managed notebook cluster) and only builds a new one when none exists.
spark=SparkSession.builder.getOrCreate()

from pyspark.sql.types import StructType,StructField, StringType, IntegerType, ShortType, LongType, DoubleType,FloatType,DateType,BooleanType,TimestampType

from pyspark.sql.functions import *

from pyspark.sql.functions import lit,initcap,col,initcap

# Explicit schema for the customer file. Every column is declared as a
# nullable string here; id and age are cast to numeric types further down.
struct=StructType([
    StructField(colname,StringType(), True)
    for colname in ("id","firstname","lastname","age","profession")
])

#reading the csv file and converting it into a spark dataframe with the schema above
rawdf1=spark.read.schema(struct).csv("/Volumes/workspace/wd36schema/ingestion_volume/source/custsmodified",sep=',',mode="permissive")
rawdf1.show(20)
# A bare `rawdf1.count()` in the middle of a cell runs a Spark job and then
# throws the number away (only a cell's last expression is echoed), so print it.
print("raw row count:",rawdf1.count())

#----Active Data Munging----

#cleansing
#na.drop(how='all') removes a row only when every considered column is null;
#with subset given, only the listed columns are considered, so a row is dropped
#when both firstname and lastname are null
cleanseddf1=rawdf1.na.drop(how='all',subset=["firstname","lastname"])
display(cleanseddf1.count())

#scrubbing

#na.fill replaces nulls with the provided value in the specified columns
scrubbeddf1=cleanseddf1.na.fill("not provided",subset=["lastname","profession"])

#na.replace swaps one value for another within the given subset of columns
scrubbeddf2=scrubbeddf1.na.replace("not provided","NA",subset=["lastname"])
repl_dict_list={"Actor":"Celebrity","Pilot":"Captain"}
scrubbeddf3=scrubbeddf2.na.replace(repl_dict_list,subset=["profession"])
# show() prints the rows itself and returns None, so it must not be wrapped
# in display() (that would only hand display() a None).
scrubbeddf3.show(10)

#De-duplication

#removing row level duplicates (all columns identical) using distinct
dedupdf1=scrubbeddf3.distinct()
# show() prints and returns None; call it directly rather than via display()
dedupdf1.show(10)


#removing column level duplicates using dropDuplicates: one row is kept per id
#coalesce(1) brings all records into a single partition before the duplicates are removed;
#this is only advisable for small data, on large data a single partition becomes a performance bottleneck
dedupdf2=dedupdf1.coalesce(1).dropDuplicates(['id'])
dedupdf2.show(10)

#display(dedupdf1.where("id in ('4000003')"))
#below will delete the duplicate records prioritised on id and retain the record with the highest age for id=4000003, using order by
#dedupdf1.coalesce(1).where("id in('4000003')").orderBy(["id","age"], ascending=[True,False]).show()
#dedupdf1.coalesce(1).where("id in('4000003')").orderBy(["id","age"], ascending=[True,False]).dropDuplicates(['id']).show()

#Data Standardization

#Standardization 1 - Column Enrichment
#tag every row with a constant column naming the system it came from
stddf1=dedupdf2.withColumn("sourcesystem",lit("source"))
# show() prints and returns None; call it directly rather than via display()
stddf1.show(10)

#standardization 2 - Column Uniformity
#initcap capitalises the first letter of each word so profession values share one casing
stddf2 =stddf1.withColumn("profession",initcap(stddf1.profession))
stddf2.show(10)

#standardization 3 - Column Transformation

#Exploration helpers kept for reference:
#stddf2.select(col("age")).distinct().show()
#stddf2.select(col("id")).distinct().orderBy("id").show()
#stddf2.where("id like 't%'").show()

#rlike is the regular-expression flavour of like: a row qualifies when the pattern
#matches anywhere in the value. The square brackets matter - '[a-z]' is a character
#class, whereas 'a-z' without brackets is just the literal text a-z.
stddf2.filter("id rlike '[a-zA-Z]'").show()   #ids that contain letters
stddf2.filter("age rlike '[^0-9]'").show()    #ages that contain anything other than digits

#single-value alternative: stddf2.withColumn("id",replace(col("id"),"ten","10"))
#dictionary of spelled-out numbers and the digits that should replace them in id
repl_list={
    "one":"1",
    "two":"2",
    "three":"3",
    "four":"4",
    "five":"5",
    "six":"6",
    "seven":"7",
    "eight":"8",
    "nine":"9",
    "ten":"10",
}
stdf3=stddf2.na.replace(repl_list,subset=["id"])

#strip hyphens from age with a regular-expression replace, e.g. 7-7 becomes 77
stddf3=stdf3.withColumn("age",regexp_replace(col("age"),"-",""))

#re-run both checks to see what is still dirty after the fixes
stdf3.filter("id rlike '[a-zA-Z]'").show()
stddf3.filter("age rlike '[^0-9]'").show()

#standardization 4 - Datatype Standardization
#id and age were read as strings; cast them to numeric types now that they are cleaned.
#col() is used so each cast resolves against the frame it is applied to rather than
#reaching back to attributes of the parent frame.
stddf4=(
    stddf3
    .withColumn("id",col("id").cast(IntegerType()))
    .withColumn("age",col("age").cast(ShortType()))
)
stddf4.printSchema()

#standardization 5 - ColumnRename
stddf5=stddf4.withColumnsRenamed({"id":"custid","sourcesystem":"srcsys"})
# show() prints and returns None; call it directly rather than via display()
stddf5.show(10)

#standardization 6 - ReorderColumn
stddf6=stddf5.select("custid","age","firstname","lastname","profession","srcsys")
#mungeddf is the name the later cells use for the fully munged data
mungeddf=stddf6
display(mungeddf)

In [0]:
#DataEnrichment
#performing EDA on munged data
mungeddf.printSchema()
display(mungeddf.take(20))
# count() is computed by Spark and returns only the number; len(collect())
# would first ship every row to the driver just to count them.
display("total rows",mungeddf.count())
display(mungeddf.summary())

# (the bare mungeddf.take(20) that stood here was removed: its result was
# discarded and the same rows are already displayed above)
mungeddf.show(20)

# adding date columns
# The same two columns (a literal data date and today's load date) are added in
# three equivalent ways; each assignment overwrites the previous enrichdf1.
enrichdf1=mungeddf.withColumns(
    {
        "datadt":lit("25/30/12"),
        "loaddt":current_date(),
    }
)
display(enrichdf1)
# or, as SQL expressions
enrichdf1=mungeddf.selectExpr(
    "*",
    " '25/30/12' as datadt",
    "current_date() as loaddt",
)
# or, as column objects with aliases
enrichdf1=mungeddf.select(
    "*",
    lit('25/30/12').alias('datadt'),
    current_date().alias('loaddt'),
)
enrichdf1.printSchema()

# derive columns
# profflag is the first character of profession - again three equivalent spellings
enrichdf2=enrichdf1.withColumn("profflag",substring("profession",1,1))
# or
enrichdf2=enrichdf1.select(
    "*",
    substring("profession",1,1).alias("profflag"),
)
# or
enrichdf2=enrichdf1.selectExpr(
    "*",
    "substring(profession,1,1) as profflag",
)
display(enrichdf2)

# rename of columns
enrichdf3=enrichdf2.withColumnsRenamed(
    {
        "profflag":"professionflag",
        "srcsys":"sourcesystem",
    }
)
display(enrichdf3)

# modify/replace columns
# First form: overwrite profession with the flag value.
enrichdf4=enrichdf3.withColumn("profession",col("professionflag"))
# Second form (the one that takes effect, since it reassigns enrichdf4): keep
# profession, add profession_flag as "<profession>-<flag>", and leave
# professionflag out of the projection.
enrichdf4=enrichdf3.select(
    "custid",
    "age",
    "firstname",
    "lastname",
    "profession",
    concat("profession",lit("-"),"professionflag").alias("profession_flag"),
    "sourcesystem",
    "datadt",
    "loaddt",
)
display(enrichdf4)

# remove column
enrichdf5=enrichdf4.drop("profession_flag")
display(enrichdf5)

#splitting data
# profession_flag holds "<profession>-<flag>"; split it on '-' and take element 0
# back into profession and element 1 back into profession_flag. Indexing the split
# expression directly avoids temporarily storing an array in the profession column.
splitdf=(
    enrichdf4
    .withColumn("profession",split("profession_flag",'-')[0])
    .withColumn("profession_flag",split("profession_flag",'-')[1])
)
display(splitdf)
#or
# shortprof = first three characters of profession, upper-cased. The column name
# comes from withColumn, so the alias that used to be nested inside upper() had
# no effect and has been dropped.
splitdf1=(
    enrichdf4
    .withColumn("shortprof",upper(substring("profession",1,3)))
    .withColumn("profession_flag",split("profession_flag",'-')[1])
)
display(splitdf1)

#merging of column
# fullname = firstname + " " + lastname; this form keeps the two source columns
mergeddf=splitdf1.withColumn("fullname",concat("firstname",lit(" "),"lastname"))
display(mergeddf)
#or
# this form projects fullname in place of firstname/lastname and is the one
# that later steps see, since it reassigns mergeddf
mergeddf=splitdf1.select("custid","age",concat("firstname",lit(" "),"lastname").alias("fullname"),"profession","profession_flag","shortprof","sourcesystem","datadt","loaddt")
display(mergeddf)

#column formatting/type casting
# datadt is a yy/dd/MM string. In Spark datetime patterns upper-case MM is the
# month and lower-case mm is minute-of-hour, so the pattern must use MM for the
# last field to be read as the month: 25/30/12 -> 2025-12-30
formatteddf=mergeddf.withColumn("datadt",to_date("datadt","yy/dd/MM"))
display(formatteddf)
formatteddf.printSchema()

In [0]:
# Embedding a Python variable in a SQL expression string and passing the result
# to selectExpr: the value is wrapped in single quotes so SQL reads it as a
# string literal, and "as owner" names the new column.
name='suganya'
sqlexpression=f"'{name}' as owner"
print(sqlexpression)
display(mungeddf.selectExpr("*",sqlexpression))

In [0]:
#Data Customization

#Defining a UDF (user-defined function): a plain Python function promoted so Spark can apply it to a column, used when no built-in Spark function performs the required operation
#Step 1: write the custom logic as an ordinary Python function

#Calculating age category from the given age of the customer
def agecat(age):
    """Map a customer's age to an age-category label.

    Returns "Unknown" when age is None; otherwise the label of the first
    bracket whose inclusive upper bound is not exceeded:
    <=10 "child", <=18 "teenager", <=30 "young", <=50 "middleaged",
    and "senior" for everything else.
    """
    if age is None:
        return "Unknown"
    # Brackets are listed in ascending order, so the first match wins and the
    # lower bound of each bracket is implied by the previous one failing.
    brackets = ((10, "child"), (18, "teenager"), (30, "young"), (50, "middleaged"))
    for upper_bound, label in brackets:
        if age <= upper_bound:
            return label
    return "senior"
    
# quick sanity check of the plain Python function before handing it to Spark
print(agecat(10))

from pyspark.sql.functions import udf
# Step 2: promote the Python function to a Spark-ready user-defined function
udfagecat=udf(agecat)
# Step 3: apply it to the age column to derive the agecat column
customdf=formatteddf.withColumn(
    "agecat",
    udfagecat(col("age")),
)
display(customdf.take(10))


In [0]:
#Data Curation

