# Key Terminology

#### Validation
- Permissive
- DropMalformed
- FailFast

#### When do you go for Schema Merging/Melting and Schema Evolution?
 - `unionByName` with `allowMissingColumns=True` (multiple files from different locations)

#### Schema Evolution 
- mergeSchema=True

##### Rejection Strategy 
 - columnNameOfCorruptRecord="corruptdata"

#####  Multiple files in multiple paths or sub paths
- recursiveFileLookup=True,pathGlobFilter

In [0]:
from pyspark.sql.session import SparkSession

# Notebook-wide Spark entry point: getOrCreate() hands back the session that is
# already active when there is one, otherwise it builds a new session carrying
# the application name below.
spark = (
    SparkSession.builder
    .appName("Spark DataFrames")
    .getOrCreate()
)

# Important passive munging – EDA of schema/structure functions we can use

In [0]:
# List the entries found under the shared "commondata" volume folder.
commondata_dir = "/Volumes/lakehouse1/dbread/read_volume/commondata/"
dbutils.fs.ls(commondata_dir)

In [0]:
# Look at the file as unparsed text first (each line becomes one row) so the
# delimiter and field layout can be seen before any CSV options are chosen.
custs_raw_text = spark.read.text("/Volumes/lakehouse1/dbread/read_volume/commondata/custsmodified")
display(custs_raw_text)


In [0]:
# Load the customer file without a header row, let Spark infer the column
# types, then replace the positional column names with readable ones.
customer_columns = ["id", "name", "lname", "age", "prof"]
df1 = (
    spark.read
    .csv(
        "/Volumes/lakehouse1/dbread/read_volume/commondata/custsmodified",
        header=False,
        inferSchema=True,
    )
    .toDF(*customer_columns)
)
display(df1)

# Other passive inspection helpers that can be tried on df1:
# df1.printSchema()
# display([col for col in df1.columns])
# print(df1.schema)
# display(df1.dtypes)
# display(df1.describe())
# display(df1.summary())


In [0]:
# Row counts before and after de-duplication. The frames are only defined here;
# each count() runs inside the loop, immediately before its line is printed.
count_checks = [
    ("actual count of the data", df1),
    # Whole-row de-duplication, written two equivalent ways.
    ("de-duplicated record (all columns) count", df1.distinct()),
    ("de-duplicated record (all columns) count", df1.dropDuplicates()),
    # De-duplication on the id column only: one row is kept per distinct id.
    ("de-duplicated given cid column count", df1.dropDuplicates(["id"])),
]
for label, frame in count_checks:
    print(label, frame.count())

# Statistical profile of the columns: describe() first, then summary().
display(df1.describe())
display(df1.summary())

In [0]:
# Schema supplied as a DDL string; every field is declared as a string.
struct1 = "id string, firstname string, lastname string, age string, profession string"

# 1. Single file
rawdf1 = spark.read.schema(struct1).csv(
    path="/Volumes/lakehouse1/dbread/read_volume/commondata/dummy1/custs"
)
# display(rawdf1.count())

# 2. Several explicit locations in one read, with a file-name glob and
#    recursive lookup applied on top. This replaces the rawdf1 defined above.
customer_paths = [
    "/Volumes/lakehouse1/dbread/read_volume/commondata/custsmodified",
    "/Volumes/lakehouse1/dbread/read_volume/commondata/custsmodified_NY",
    "/Volumes/lakehouse1/dbread/read_volume/commondata/dummy1/custs",
]
rawdf1 = spark.read.schema(struct1).csv(
    path=customer_paths,
    pathGlobFilter="custs*",
    recursiveFileLookup=True,
)
display(rawdf1.count())

In [0]:
# Both reads walk the commondata volume recursively and keep only the files
# whose names match the glob; every column is read as a string.

# Files matching custsmodified_N*: the schema declares a lastname column and
# no city column.
strt1 = "id string, firstname string, lastname string, age string, profession string"
rawdf1 = spark.read.schema(strt1).csv(
    path=["/Volumes/lakehouse1/dbread/read_volume/commondata/"],
    recursiveFileLookup=True,
    pathGlobFilter="custsmodified_N*",
)
display(rawdf1.count())
display(rawdf1)

# Files matching custsmodified_T*: the schema declares a city column and no
# lastname column, so this read must use strt2. Reading it with strt1 would
# label the columns wrongly and leave nothing for a by-name union to reconcile.
strt2 = "id string, firstname string, age string, profession string,city string"
rawdf2 = spark.read.schema(strt2).csv(
    path=["/Volumes/lakehouse1/dbread/read_volume/commondata/"],
    recursiveFileLookup=True,
    pathGlobFilter="custsmodified_T*",
)
display(rawdf2.count())
display(rawdf2)

In [0]:
# Stack the two frames, matching columns by name instead of by position.
# allowMissingColumns=True keeps a column that exists in only one frame and
# fills it with NULL for the rows coming from the other frame.
rawdf_merged = rawdf1.unionByName(
    rawdf2,
    allowMissingColumns=True,
)
display(rawdf_merged)

### Combining Data + Schema Evolution/Merging (Structuring) - Preliminary Data Munging


#### **Single File**

In [0]:
# All three reads share one all-string schema, given as a DDL string.
# rawdf1 is re-bound by each read; only the last one is displayed.
struct1 = "id string, firstname string, lastname string, age string, profession string"

# Single file
rawdf1 = spark.read.schema(struct1).csv(
    path="/Volumes/lakehouse1/dbread/read_volume/custsmodified"
)
print("Single file total count", rawdf1.count())
# display(rawdf1)

# Multiple files (with different names), each listed explicitly
explicit_paths = [
    "/Volumes/lakehouse1/dbread/read_volume/custsmodified",
    "/Volumes/lakehouse1/dbread/read_volume/custsmodified_NY",
    "/Volumes/lakehouse1/dbread/read_volume/sub/custsmodified_TX",
]
rawdf1 = spark.read.schema(struct1).csv(path=explicit_paths)
print("Multiple files (with different names)", rawdf1.count())
# display(rawdf1)

# Multiple files (with different names, recursive): start from the volume root,
# descend into sub-folders and keep the files whose names match the glob.
rawdf1 = spark.read.schema(struct1).csv(
    path="/Volumes/lakehouse1/dbread/read_volume/",
    pathGlobFilter="custs*",
    recursiveFileLookup=True,
)
print("Multiple files (with different names, recursive)", rawdf1.count())
display(rawdf1)



In [0]:
# NY file: schema declares a lastname column and no city column.
schem1 = "id string, firstname string, lastname string, age int, profession string"
rawdf1 = spark.read.csv(
    "/Volumes/lakehouse1/dbread/read_volume/sub/custsmodified_NY",
    schema=schem1,
)
display(rawdf1)

# TX file: schema declares a city column and no lastname column.
schem2 = "id string, firstname string, age int, profession string, city string"
rawdf2 = spark.read.csv(
    "/Volumes/lakehouse1/dbread/read_volume/sub/custsmodified_TX",
    schema=schem2,
)
display(rawdf2)

# Union by column name; a column missing on one side is filled with NULL.
rawdf_merged = rawdf1.unionByName(rawdf2, allowMissingColumns=True)
display(rawdf_merged)

#### Validation – Data Exploration through Cleansing and Scrubbing

- **Scrubbing**: Applied **Permissive mode** to handle unexpected data types by converting invalid values to **NULL**.
- **Cleansing**: Applied **Drop Malformed mode** to eliminate records containing invalid or malformed data.


In [0]:
from pyspark.sql.types import *

custs_path = "/Volumes/lakehouse1/dbread/read_volume/sub/custsmodified"

# Schema with the five data columns only (DDL form). Used by the reads that do
# not capture the raw text of bad rows.
strt1 = "id int, firstname string, lastname string, age int, profession string"

# Untyped look at the file with default options, for reference.
df_raw = spark.read.csv(custs_path)
df_raw.show(20)

# The same five columns plus one extra string column reserved for the raw text
# of malformed rows. It only behaves as a corrupt-record column when the read
# names it through columnNameOfCorruptRecord; otherwise Spark parses it as a
# sixth data field, which makes every five-field row look malformed.
strt11 = StructType([
    StructField('id', IntegerType(), True),
    StructField('firstname', StringType(), True),
    StructField('lastname', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('profession', StringType(), True),
    StructField("corruptdata", StringType(), True),
])

# In each method below count() and len(collect()) are printed side by side.
# NOTE(review): they can disagree because count() does not need to parse any
# column, while collect() parses all of them - confirm on the target runtime.

# Method 1 - PERMISSIVE: values that do not fit the declared type become NULL
# and the row is kept. Uses the five-column schema (see the note on strt11).
dfmethod1 = spark.read.csv(custs_path, schema=strt1, mode="PERMISSIVE", header=False)

print("dfmethod1 entire count of data", dfmethod1.count())
print("dfmethod1 after scrubbing, count of data", len(dfmethod1.collect()))
display(dfmethod1)

# Method 2 - dropMalformed: rows that do not fit the schema are discarded.
# With the six-column strt11 every row of the file would be discarded, so the
# five-column schema is required here as well.
dfmethod2 = spark.read.csv(custs_path, schema=strt1, mode="dropMalformed", header=False)

print("dfmethod2 entire count of data", dfmethod2.count())
print("dfmethod2 after scrubbing, count of data", len(dfmethod2.collect()))
display(dfmethod2)

# Method 3 - PERMISSIVE plus a named corrupt-record column: bad rows are kept
# and their original text is stored in "corruptdata".
dfmethod3 = spark.read.csv(
    custs_path,
    schema=strt11,
    mode="PERMISSIVE",
    header=False,
    columnNameOfCorruptRecord="corruptdata",
)

print("dfmethod3 entire count of data", dfmethod3.count())
print("dfmethod3 after scrubbing, count of data", len(dfmethod3.collect()))
display(dfmethod3)


In [0]:
# Before actively cleansing or scrubbing, set up a rejection strategy: rows
# that fail to parse are written out separately so they can be corrected at
# the source instead of being lost.
strt11 = StructType([
    StructField("id", IntegerType(), True),
    StructField("firstname", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("profession", StringType(), True),
    # Receives the raw text of a row that could not be parsed; NULL otherwise.
    StructField("corruptdata", StringType(), True)
])

dfmethod3 = spark.read.schema(strt11).csv(
    "/Volumes/lakehouse1/dbread/read_volume/sub/custsmodified",
    mode="PERMISSIVE",
    header=False,
    columnNameOfCorruptRecord="corruptdata"
)

display(dfmethod3)


print("entire count of data", dfmethod3.count())

# Rejected rows are the ones for which the corrupt-record column was filled.
df_reject = dfmethod3.where("corruptdata is not null")

# Persist the parsed columns of the rejected rows (the raw-text column is
# dropped before writing), replacing any earlier reject output.
df_reject.drop("corruptdata").write.mode("overwrite").option("header", True).option("delimiter", ",").csv("/Volumes/lakehouse1/dbread/read_volume/sub/rejects/")

# Count through collect() rather than count(): a count over this filter refers
# to the corrupt-record column alone, and Spark (2.3+) refuses queries on raw
# CSV/JSON files that reference only that column. collect() reads every column,
# so it is allowed; the reject set is expected to be small enough for the driver.
print("Data to reject or update the source", len(df_reject.collect()))

##### Cleansing
Cleansing is the process of removing unusable data so that what remains is clean, e.g. cutting away the damaged portion of a potato.

In [0]:
# Drop a row when any of its data columns is NULL.
# The corrupt-record column is left out of the check: it is NULL for every
# well-formed row, so including it would make how="any" discard all valid rows.
data_columns = [name for name in dfmethod3.columns if name != "corruptdata"]
cleansed_df1 = dfmethod3.na.drop(how="any", subset=data_columns)
#cleansed_df1=dfmethod3.na.drop(how="any",subset=["id","age"])#drop the row, if any one column id/age contains null
print("cleansed any DF count", len(cleansed_df1.collect()))
display(cleansed_df1.take(50))

In [0]:
# Variant over every column: drop a row only when all of its columns are NULL.
# cleansed_df2 = dfmethod3.dropna(how="all")

# Variant used here: drop a row only when both id and profession are NULL.
key_columns = ["id", "profession"]
cleansed_df2 = dfmethod3.dropna(how="all", subset=key_columns)

cleansed_all_rows = cleansed_df2.collect()
print("cleansed all DF count", len(cleansed_all_rows))
display(cleansed_df2.take(50))

In [0]:
# Final cleansing pass applied before scrubbing:
#   1. remove the rows that have no id,
#   2. remove the rows in which every column is NULL.
rows_with_id = dfmethod3.dropna(subset=["id"])
cleansed_df = rows_with_id.dropna(how="all")

final_rows = cleansed_df.collect()
print("Final cleansed DF", len(final_rows))
display(cleansed_df.take(15))

In [0]:
# Scrubbing, step 1: put placeholder text where the name or profession is NULL.
scrubbed_df1 = (
    cleansed_df
    .fillna("na", subset=["firstname", "lastname"])
    .fillna("not provided", subset=["profession"])
)

# Scrubbing, step 2: expand two profession values, one replace() per value.
scrubbed_df2 = (
    scrubbed_df1
    .replace("IT", "Information Technologies", subset=["profession"])
    .replace("Pilot", "Aircraft Pilot", subset=["profession"])
)
display(scrubbed_df2.take(15))

In [0]:
# Same kind of value replacement driven by a single lookup table:
# each key found in the profession column is swapped for its value.
dict1 = {
    "IT": "Information Technologies",
    "Pilot": "Aircraft Pilot",
    "Actor": "Celebrity",
}
scrubbed_df = scrubbed_df1.replace(dict1, subset=["profession"])

scrubbed_rows = scrubbed_df.collect()
print("scrubbed DF", len(scrubbed_rows))
display(scrubbed_df.take(15))