In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import IntegerType, DateType

# Obtain the shared Spark session for this notebook (creates one if none exists).
spark = (
    SparkSession.builder
    .appName("DataWrangling")
    .getOrCreate()
)

In [0]:
# Sample employee records: (id, name, age, date of joining as an ISO string).
# Two rows deliberately carry a missing age.
data = [
    (1, "Alice", None, "2023-01-10"),
    (2, "Bob", 30, "2023-01-15"),
    (3, "Charlie", 22, "2023-02-20"),
    (4, "David", None, "2023-02-25"),
    (5, "Eve", 45, "2023-03-05"),
]
columns = ["id", "name", "age", "doj"]

df = spark.createDataFrame(data, schema=columns)
df.show()

In [0]:
# Replace missing ages with 32; only the `age` column is touched.
df_filled = df.fillna(32, subset=["age"])

In [0]:
# Show the frame in which null `age` values were filled.
df_filled.show()

In [0]:
# Rename the joining-date column from `doj` to `doh`, then preview a single
# row without truncating cell contents.
df_renamed = df.withColumnRenamed("doj", "doh")
df_renamed.show(n=1, truncate=False)


In [0]:
# Cast `age` to a 32-bit integer and inspect the resulting schema.
age_as_int = f.col("age").cast(IntegerType())
df_casted = df.withColumn("age", age_as_int)
df_casted.printSchema()

In [0]:
# Cast several columns in one pass, driven by a {column name: target type} map.
column_casting_map = {"age": IntegerType(), "doj": DateType()}

# Start from the source frame and build on the previous result each iteration.
# Re-deriving from `df` on every pass would discard the earlier casts and leave
# only the last column in the map converted.
df_casted = df
for col_name, d_type in column_casting_map.items():
    df_casted = df_casted.withColumn(col_name, f.col(col_name).cast(d_type))

# Print the schema once, after every cast has been applied.
df_casted.printSchema()

In [0]:
# Derive a `seniority` label from `age`:
#   age > 40          -> "Senior"
#   any other age     -> "Junior"
#   age is null       -> "Unknown"
# The `when` branches are evaluated in order, so the null check only needs to
# separate known ages from missing ones. Labels use consistent capitalisation.
seniority_label = (
    f.when(f.col("age") > 40, "Senior")
    .when(f.col("age").isNotNull(), "Junior")
    .otherwise("Unknown")
)
df_seniority = df_casted.withColumn("seniority", seniority_label)
df_seniority.show()

In [0]:
# Print the schema of the frame that carries the derived `seniority` column.
df_seniority.printSchema()

In [0]:
# Extract the calendar year of the joining date.
df_dates = df_casted.withColumn("join_year", f.year("doj"))
df_dates.show()

# Number of days between today (evaluated at run time) and the joining date.
df_diff = df_dates.withColumn(
    "days_since_joined",
    f.datediff(f.current_date(), "doj"),
)
df_diff.show()

In [0]:
# Summary statistics over `age`. `count("age")` skips nulls while `count("*")`
# counts every row, so their difference is the number of missing ages.
age_summary_exprs = [
    f.avg("age").alias("avg_age"),
    f.max("age").alias("max_age"),
    f.min("age").alias("min_age"),
    f.count("age").alias("total_non_null_age"),
    f.count("*").alias("total_records"),
    (f.count("*") - f.count("age")).alias("null_age_count"),
]
df_agg = df.agg(*age_summary_exprs)
display(df_agg)

In [0]:
from pyspark.sql import Window

In [0]:
# Sample sales records: (region, employee, sales amount).
sales_data = [
    ("West", "Alice", 1000),
    ("East", "Bob", 1500),
    ("West", "Charlie", 1200),
    ("East", "David", 800),
    ("West", "Eve", 1000),
]
sales_df = spark.createDataFrame(
    sales_data,
    schema=["region", "employee", "sales"],
)


In [0]:
# Rank employees within each region by sales, highest first. `rank()` gives
# tied rows the same rank and leaves a gap after them.
window_spec = (
    Window
    .partitionBy("region")
    .orderBy(f.col("sales").desc())
)
ranked_df = sales_df.withColumn("rank", f.rank().over(window_spec))
display(ranked_df)