In [0]:
# Sample product catalogue; each tuple is (productname, price, category).
# NOTE(review): "categrory1" looks like a typo for "category1". It is kept
# unchanged here because the value flows into the displayed result.
columns = ["productname", "price", "category"]
data = [
    ("product1", 100, "categrory1"),
    ("product2", 200, "category2"),
    ("product3", 300, "category3"),
]

# Build the DataFrame from the in-memory rows and show it.
df = spark.createDataFrame(data, schema=columns)
display(df)

productname,price,category
product1,100,categrory1
product2,200,category2
product3,300,category3


In [0]:
from pyspark.sql.functions import transform,col
def transform_percentage(df, discount_percentage):
    """Add a ``discountprice`` column: ``price`` reduced by a percentage.

    Args:
        df: DataFrame that exposes a ``price`` column.
        discount_percentage: Discount in percent points; it is divided by
            100 before use, so ``20`` means a 20 % reduction.

    Returns:
        The result of ``df.withColumn`` with ``discountprice`` added.
    """
    # Fraction of the original price that remains after the discount.
    remaining_fraction = 1 - discount_percentage / 100
    discounted_price = col("price") * remaining_fraction
    return df.withColumn("discountprice", discounted_price)
# Discount, in percent points, applied to every product.
percentage = 20

# The lambda binds the extra `percentage` argument so that the callable
# handed to DataFrame.transform takes the DataFrame alone.
# Alternative kept for reference (needs a Spark version whose
# DataFrame.transform forwards extra arguments -- presumably >= 3.3, confirm):
#transform_df=df.transform(transform_percentage,percentage)
transform_df = df.transform(
    lambda frame: transform_percentage(frame, percentage)
)
transform_df.display()

productname,price,category,discountprice
product1,100,categrory1,80.0
product2,200,category2,160.0
product3,300,category3,240.0


In [0]:
from pyspark.sql.functions import col,trim,log,when,concat_ws,lit
# Small sample; each tuple is (Name, Age).
data = [
    ("Alice", 34),
    ("Bob", 45),
    ("Cathy", 29),
]

# Column names are supplied as a plain list.
df = spark.createDataFrame(data, ["Name", "Age"])
display(df)

Name,Age
Alice,34
Bob,45
Cathy,29


In [0]:
# Tunable parameters consumed by the pipeline steps in this cell.
age_threshold = 40  # ages strictly below this value are labelled "Young"
log_base = 10  # base of the logarithm used for the LogAge column

def clean_data(df):
    """Return ``df`` with the ``Name`` column replaced by ``trim(Name)``."""
    trimmed_name = trim(col("Name"))
    return df.withColumn("Name", trimmed_name)

def conditiona_transform(df, threshold):
    """Add an ``Agecategory`` column derived from ``Age``.

    Rows whose ``Age`` is strictly below ``threshold`` are labelled
    "Young"; every other row falls through to "Mature".

    NOTE(review): the function name looks like a misspelling of
    "conditional_transform"; it is kept because callers use this spelling.
    """
    is_young = col("Age") < lit(threshold)
    category = when(is_young, "Young").otherwise("Mature")
    return df.withColumn("Agecategory", category)

def feature_engineering(df, base):
    """Add a ``LogAge`` column: the logarithm of ``Age`` in base ``base``.

    The value is obtained by change of base, log(Age) / log(base).
    """
    log_of_age = log(col("Age"))
    log_of_base = log(lit(base))
    return df.withColumn("LogAge", log_of_age / log_of_base)

def combine_columns(df):
    """Add ``Name_Age``: ``Name`` and ``Age`` joined with "_" as separator."""
    parts = [col("Name"), col("Age")]
    return df.withColumn("Name_Age", concat_ws("_", *parts))

# Apply the four steps in order. Steps that need an extra argument are
# wrapped in a lambda so each callable receives just the DataFrame.
df_transformed = (
    df
    .transform(clean_data)
    .transform(lambda frame: conditiona_transform(frame, age_threshold))
    .transform(lambda frame: feature_engineering(frame, log_base))
    .transform(combine_columns)
)
df_transformed.display()

Name,Age,Agecategory,LogAge,Name_Age
Alice,34,Young,1.5314789170422551,Alice_34
Bob,45,Mature,1.6532125137753435,Bob_45
Cathy,29,Young,1.462397997898956,Cathy_29


In [0]:
from pyspark.sql.functions import col,lit,when,avg,sum as _sum
# Sample rows; each tuple is (Name, Age, Date, Salary). The set contains
# missing values (None) and one exact duplicate (the two "Alice" rows).
data = [
    ("Alice", 34, "2023-06-01", 3000.0),
    ("Bob", 45, "2023-06-02", None),
    ("Cathy", None, "2023-06-03", 2500.0),
    ("Alice", 34, "2023-06-01", 3000.0),
    (None, 45, None, 4000.0),
]
df = spark.createDataFrame(data, ["Name", "Age", "Date", "Salary"])

In [0]:
# Replacement values used for nulls, one per column.
name_fill = "Unknown"
age_fill = 0
date_fill = "1900-01-01"
salary_fill = 0.0

# Row filter: only rows strictly above both thresholds are kept.
age_threshold = 30
salary_threshold = 2000

# Despite the name this is a fraction (0.10 = 10 %): it is multiplied with
# Salary directly and is not divided by 100.
bonus_percentage = 0.10

def handle_nulls(df, name_fill, age_fill, date_fill, salary_fill):
    """Fill nulls in the Name, Age, Date and Salary columns.

    Args:
        df: Object exposing ``fillna`` (a DataFrame in this notebook).
        name_fill: Replacement for nulls in ``Name``.
        age_fill: Replacement for nulls in ``Age``.
        date_fill: Replacement for nulls in ``Date``.
        salary_fill: Replacement for nulls in ``Salary``.

    Returns:
        Whatever ``df.fillna`` returns for the column -> replacement mapping.
    """
    target_columns = ("Name", "Age", "Date", "Salary")
    fill_values = (name_fill, age_fill, date_fill, salary_fill)
    replacements = dict(zip(target_columns, fill_values))
    return df.fillna(replacements)

def remove_duplicates(df):
    """Return the result of ``df.dropDuplicates()`` (no column subset given)."""
    deduplicated = df.dropDuplicates()
    return deduplicated

def standardize_datatypes(df):
    """Cast ``Age``, ``Date`` and ``Salary`` to fixed types.

    ``Age`` is cast to "integer", ``Date`` to "date" and ``Salary`` to
    "double". Each column is overwritten in place, in that order.
    """
    # Insertion order of the dict fixes the order of the withColumn calls.
    target_types = {"Age": "integer", "Date": "date", "Salary": "double"}
    converted = df
    for column_name, type_name in target_types.items():
        converted = converted.withColumn(column_name, col(column_name).cast(type_name))
    return converted

def filter_rows(df, age_threshold, salary_threshold):
    """Keep rows where ``Age`` and ``Salary`` both exceed their thresholds.

    Both comparisons are strict (``>``) and are combined with a logical AND.
    """
    old_enough = col("Age") > lit(age_threshold)
    paid_enough = col("Salary") > lit(salary_threshold)
    return df.filter(old_enough & paid_enough)

def add_bonus_column(df, bonus_percentage):
    """Add a ``Bonus`` column equal to ``Salary * bonus_percentage``.

    ``bonus_percentage`` is used as a plain multiplier; it is not divided
    by 100 here, so 0.10 yields a bonus of 10 % of the salary.
    """
    bonus = col("Salary") * lit(bonus_percentage)
    return df.withColumn("Bonus", bonus)

def group_and_aggregate(df):
    """Group by ``Name`` and aggregate Age, Salary and Bonus.

    Output columns: ``Avg_age`` (average of Age), ``Total_salary`` (sum of
    Salary) and ``Total_Bonus`` (sum of Bonus).
    """
    aggregations = [
        avg("Age").alias("Avg_age"),
        _sum("Salary").alias("Total_salary"),
        _sum("Bonus").alias("Total_Bonus"),
    ]
    return df.groupBy("Name").agg(*aggregations)

In [0]:
# Full cleaning pipeline, one step per line: fill nulls, drop duplicates,
# cast types, filter rows, add the bonus column, then aggregate per name.
# Steps that take extra arguments are bound through lambdas.
df_transform = (
    df
    .transform(lambda frame: handle_nulls(frame, name_fill, age_fill, date_fill, salary_fill))
    .transform(remove_duplicates)
    .transform(standardize_datatypes)
    .transform(lambda frame: filter_rows(frame, age_threshold, salary_threshold))
    .transform(lambda frame: add_bonus_column(frame, bonus_percentage))
    .transform(group_and_aggregate)
)
display(df_transform)

Name,Avg_age,Total_salary,Total_Bonus
Unknown,45.0,4000.0,400.0
Alice,34.0,3000.0,300.0
