In [0]:
# Import SparkSession and obtain the session used by every read below.
# builder.getOrCreate() returns the already-active session when there is one
# and only builds a new session otherwise.
from pyspark.sql.session import SparkSession
spark=SparkSession.builder.getOrCreate()

from pyspark.sql.types import StructType,StructField, StringType, IntegerType, ShortType, LongType, DoubleType,FloatType,DateType,BooleanType,TimestampType

from pyspark.sql.functions import *

from pyspark.sql.functions import lit,initcap,col,initcap

# Explicit schema for the customer file: all five columns are declared as
# nullable strings (numeric typing is applied later, after clean-up).
struct = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ('id', 'firstname', 'lastname', 'age', 'profession')
])

# Read the comma-separated source into a DataFrame using the schema above
# rather than letting Spark infer one.
source_path = "/Volumes/workspace/wd36schema/ingestion_volume/source/custsmodified"
rawdf1 = spark.read.schema(struct).csv(
    source_path,
    sep=',',
    mode="permissive",
)

# Quick sanity checks on the raw data. The count() result is not captured
# or printed here.
rawdf1.show(20)
rawdf1.count()

#----Active Data Munging----

#cleansing
# With how='all' and a subset, a row is dropped only when every column in the
# subset is null, i.e. when BOTH firstname and lastname are missing. Rows with
# just one of the two missing are kept.
name_columns = ["firstname", "lastname"]
cleanseddf1 = rawdf1.na.drop(how='all', subset=name_columns)
display(cleanseddf1.count())

#scrubbing
# na.fill substitutes the given value for nulls in the listed columns only.
scrubbeddf1 = cleanseddf1.na.fill(
    "not provided",
    subset=["lastname", "profession"],
)

# na.replace swaps one cell value for another within the given subset; here
# the placeholder written above is shortened to "NA" for lastname only.
scrubbeddf2 = scrubbeddf1.na.replace(
    "not provided",
    "NA",
    subset=["lastname"],
)

# A dict maps several old values to their replacements in a single call.
repl_dict_list = {
    "Actor": "Celebrity",
    "Pilot": "Captain",
}
scrubbeddf3 = scrubbeddf2.na.replace(repl_dict_list, subset=["profession"])

# NOTE: show() prints the rows itself and returns None, so display() is
# handed None here.
display(scrubbeddf3.show(10))

#De-duplication

# Row-level de-duplication: distinct() collapses rows that are identical in
# every column into a single row.
dedupdf1 = scrubbeddf3.distinct()
display(dedupdf1.show(10))


# Key-level de-duplication: keep one row per id after collapsing the data to
# a single partition.
# NOTE(review): nothing here chooses WHICH of the duplicate rows survives --
# confirm that an arbitrary survivor is acceptable.
dedupdf2 = (
    dedupdf1
    .coalesce(1)
    .dropDuplicates(['id'])
)
display(dedupdf2.show(10))

# Exploration kept for reference: inspect the duplicates for id 4000003 and
# try sorting by id / age (descending) before dropDuplicates so the
# highest-age row comes first.
# NOTE(review): age is still a string at this point, so the sort is
# lexicographic -- verify before relying on it.
#display(dedupdf1.where("id in ('4000003')"))
#dedupdf1.coalesce(1).where("id in('4000003')").orderBy(["id","age"], ascending=[True,False]).show()
#dedupdf1.coalesce(1).where("id in('4000003')").orderBy(["id","age"], ascending=[True,False]).dropDuplicates(['id']).show()

#Data Standardization

#Standardization 1 - Column Enrichment
# Add a constant "sourcesystem" column carrying the literal value "source".
source_label = lit("source")
stddf1 = dedupdf2.withColumn("sourcesystem", source_label)
display(stddf1.show(10))

#standardization 2 - Column Uniformity
# initcap capitalises the first letter of each word, giving profession a
# consistent casing.
profession_initcap = initcap(col("profession"))
stddf2 = stddf1.withColumn("profession", profession_initcap)
display(stddf2.show(10))

#standardization 3 - Column Transformation

# Optional exploration of the raw values:
#stddf2.select(col("age")).distinct().show()
#stddf2.select(col("id")).distinct().orderBy("id").show()
#stddf2.where("id like 't%'").show()

# rlike is a regular-expression match, used here to find values that will not
# survive a numeric cast. A character class needs square brackets: the pattern
# 'a-z' without them only matches the literal text "a-z".
stddf2.where("id rlike '[a-zA-Z]'").show()  # ids containing at least one letter
stddf2.where("age rlike '[^0-9]'").show()  # ages containing any non-digit character

# Spelled-out ids and the digits they stand for. na.replace compares whole
# cell values, so only an id that is exactly one of these words is changed.
repl_list = {
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "nine": "9",
    "ten": "10",
}
stdf3 = stddf2.na.replace(repl_list, subset=["id"])

# Strip hyphens from age (e.g. "7-7" becomes "77"). This must build on stdf3,
# the id-normalised frame: deriving it from stddf2 instead would silently
# discard the id replacement above.
stddf3 = stdf3.withColumn("age", regexp_replace("age", "-", ""))

#standardization 4 - Datatype Standardization
# Convert the cleaned string columns to numeric types: id -> integer,
# age -> short.
# NOTE(review): what happens to values that still are not numeric (null vs.
# error) depends on the cluster's ANSI setting -- confirm.
id_as_int = col("id").cast(IntegerType())
age_as_short = col("age").cast(ShortType())
stddf4 = (
    stddf3
    .withColumn("id", id_as_int)
    .withColumn("age", age_as_short)
)
stddf4.printSchema()

#standardization 5 - ColumnRename
# Rename several columns in one call via an {old name: new name} mapping.
rename_map = {
    "id": "custid",
    "sourcesystem": "srcsys",
}
stddf5 = stddf4.withColumnsRenamed(rename_map)
display(stddf5.show(10))

#standardization 6 -