In [0]:
# Load the ggplot2 "diamonds" sample CSV from the Databricks datasets mount.
# header=True takes column names from the first line of the file;
# inferSchema=True lets Spark derive column types from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv")
)

# Preview the first five rows, then print the inferred column types.
df.show(n=5)
df.printSchema()


+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|_c0|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|
+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|  1| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
|  2| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
|  3| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
|  4| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
|  5| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|
+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+
only showing top 5 rows
root
 |-- _c0: integer (nullable = true)
 |-- carat: double (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: double (nullable = true)
 |-- table: double (nullable = true)
 |-- price: integer (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z

STRING FUNCTIONS

In [0]:
from pyspark.sql import functions as F

# One derived column per string function, all computed from `cut`
# (plus `color` for the concatenation example).
string_df = df.select(
    F.col("cut"),
    F.lower(F.col("cut")).alias("lowercase"),
    F.upper(F.col("cut")).alias("uppercase"),
    F.length(F.col("cut")).alias("length"),
    F.trim(F.col("cut")).alias("trimmed"),  # strips leading/trailing spaces
    F.concat_ws("-", F.col("cut"), F.col("color")).alias("combined"),
    F.col("cut").substr(1, 3).alias("first3"),  # start position is 1-based
    F.regexp_replace(F.col("cut"), " ", "_").alias("no_spaces"),
)

# truncate=False prints full cell values instead of shortening long strings.
string_df.show(n=10, truncate=False)


+---------+---------+---------+------+---------+-----------+------+---------+
|cut      |lowercase|uppercase|length|trimmed  |combined   |first3|no_spaces|
+---------+---------+---------+------+---------+-----------+------+---------+
|Ideal    |ideal    |IDEAL    |5     |Ideal    |Ideal-E    |Ide   |Ideal    |
|Premium  |premium  |PREMIUM  |7     |Premium  |Premium-E  |Pre   |Premium  |
|Good     |good     |GOOD     |4     |Good     |Good-E     |Goo   |Good     |
|Premium  |premium  |PREMIUM  |7     |Premium  |Premium-I  |Pre   |Premium  |
|Good     |good     |GOOD     |4     |Good     |Good-J     |Goo   |Good     |
|Very Good|very good|VERY GOOD|9     |Very Good|Very Good-J|Ver   |Very_Good|
|Very Good|very good|VERY GOOD|9     |Very Good|Very Good-I|Ver   |Very_Good|
|Very Good|very good|VERY GOOD|9     |Very Good|Very Good-H|Ver   |Very_Good|
|Fair     |fair     |FAIR     |4     |Fair     |Fair-E     |Fai   |Fair     |
|Very Good|very good|VERY GOOD|9     |Very Good|Very Good-H|Ver 

In [0]:
# Append a `today` column holding the current date to every existing column of df.
df2 = df.select("*", F.current_date().alias("today"))


In [0]:
# Break the `today` date into calendar parts and derive two related dates.
date_df = df2.select(
    F.col("today"),
    F.year(F.col("today")).alias("year"),
    F.month(F.col("today")).alias("month"),
    F.dayofmonth(F.col("today")).alias("day"),
    F.weekofyear(F.col("today")).alias("week"),
    F.date_add(F.col("today"), 10).alias("after10days"),  # today + 10 days
    F.last_day(F.col("today")).alias("month_end"),  # last day of today's month
)

# With no arguments show() prints up to 20 rows.
date_df.show()


+----------+----+-----+---+----+-----------+----------+
|     today|year|month|day|week|after10days| month_end|
+----------+----+-----+---+----+-----------+----------+
|2025-12-11|2025|   12| 11|  50| 2025-12-21|2025-12-31|
|2025-12-11|2025|   12| 11|  50| 2025-12-21|2025-12-31|
|2025-12-11|2025|   12| 11|  50| 2025-12-21|2025-12-31|
|2025-12-11|2025|   12| 11|  50| 2025-12-21|2025-12-31|
|2025-12-11|2025|   12| 11|  50| 2025-12-21|2025-12-31|
|2025-12-11|2025|   12| 11|  50| 2025-12-21|2025-12-31|
|2025-12-11|2025|   12| 11|  50| 2025-12-21|2025-12-31|
|2025-12-11|2025|   12| 11|  50| 2025-12-21|2025-12-31|
|2025-12-11|2025|   12| 11|  50| 2025-12-21|2025-12-31|
|2025-12-11|2025|   12| 11|  50| 2025-12-21|2025-12-31|
|2025-12-11|2025|   12| 11|  50| 2025-12-21|2025-12-31|
|2025-12-11|2025|   12| 11|  50| 2025-12-21|2025-12-31|
|2025-12-11|2025|   12| 11|  50| 2025-12-21|2025-12-31|
|2025-12-11|2025|   12| 11|  50| 2025-12-21|2025-12-31|
|2025-12-11|2025|   12| 11|  50| 2025-12-21|2025

TIMESTAMP

In [0]:
# Append a `now` column holding the current timestamp to every existing column of df2.
df3 = df2.select("*", F.current_timestamp().alias("now"))


In [0]:
# Split the `now` timestamp into clock parts and round-trip it through epoch seconds.
timestamp_df = df3.select(
    F.col("now"),
    F.hour(F.col("now")).alias("hour"),
    F.minute(F.col("now")).alias("minute"),
    F.second(F.col("now")).alias("second"),
    F.unix_timestamp(F.col("now")).alias("epoch"),  # whole seconds since 1970-01-01 UTC
    # from_unixtime formats the epoch seconds as a 'yyyy-MM-dd HH:mm:ss' string,
    # so `back_to_ts` is a string column rather than a timestamp column.
    F.from_unixtime(F.unix_timestamp(F.col("now"))).alias("back_to_ts"),
)

# With no arguments show() prints up to 20 rows and shortens long cell values.
timestamp_df.show()


+--------------------+----+------+------+----------+-------------------+
|                 now|hour|minute|second|     epoch|         back_to_ts|
+--------------------+----+------+------+----------+-------------------+
|2025-12-11 08:49:...|   8|    49|     8|1765442948|2025-12-11 08:49:08|
|2025-12-11 08:49:...|   8|    49|     8|1765442948|2025-12-11 08:49:08|
|2025-12-11 08:49:...|   8|    49|     8|1765442948|2025-12-11 08:49:08|
|2025-12-11 08:49:...|   8|    49|     8|1765442948|2025-12-11 08:49:08|
|2025-12-11 08:49:...|   8|    49|     8|1765442948|2025-12-11 08:49:08|
|2025-12-11 08:49:...|   8|    49|     8|1765442948|2025-12-11 08:49:08|
|2025-12-11 08:49:...|   8|    49|     8|1765442948|2025-12-11 08:49:08|
|2025-12-11 08:49:...|   8|    49|     8|1765442948|2025-12-11 08:49:08|
|2025-12-11 08:49:...|   8|    49|     8|1765442948|2025-12-11 08:49:08|
|2025-12-11 08:49:...|   8|    49|     8|1765442948|2025-12-11 08:49:08|
|2025-12-11 08:49:...|   8|    49|     8|1765442948

In [0]:
# Numeric function examples on `carat` and `price`.
numeric_df = df.select(
    F.col("carat"),
    F.col("price"),
    # `price` was inferred as an integer column, so rounding it to one decimal
    # place leaves the values unchanged.
    F.round(F.col("price"), 1).alias("rounded_price"),
    F.abs(F.col("carat")).alias("abs_carat"),
    F.sqrt(F.col("price")).alias("sqrt_price"),
    F.pow(F.col("carat"), 2).alias("carat_squared"),
)

numeric_df.show(n=10)


+-----+-----+-------------+---------+------------------+-------------------+
|carat|price|rounded_price|abs_carat|        sqrt_price|      carat_squared|
+-----+-----+-------------+---------+------------------+-------------------+
| 0.23|  326|          326|     0.23| 18.05547008526779|             0.0529|
| 0.21|  326|          326|     0.21| 18.05547008526779|0.04409999999999999|
| 0.23|  327|          327|     0.23|18.083141320025124|             0.0529|
| 0.29|  334|          334|     0.29|18.275666882497067|             0.0841|
| 0.31|  335|          335|     0.31|18.303005217723125|             0.0961|
| 0.24|  336|          336|     0.24| 18.33030277982336|             0.0576|
| 0.24|  336|          336|     0.24| 18.33030277982336|             0.0576|
| 0.26|  337|          337|     0.26| 18.35755975068582|0.06760000000000001|
| 0.22|  337|          337|     0.22| 18.35755975068582|             0.0484|
| 0.23|  338|          338|     0.23|18.384776310850235|             0.0529|

In [0]:
# Per-cut price summary: mean, minimum, maximum and number of rows.
agg_df = (
    df.groupBy(F.col("cut"))
    .agg(
        F.avg(F.col("price")).alias("avg_price"),
        F.min(F.col("price")).alias("min_price"),
        F.max(F.col("price")).alias("max_price"),
        F.count("*").alias("total_records"),  # counts every row in the group
    )
)

# No ordering is applied, so the groups print in whatever order Spark returns them.
agg_df.show()


+---------+------------------+---------+---------+-------------+
|      cut|         avg_price|min_price|max_price|total_records|
+---------+------------------+---------+---------+-------------+
|    Ideal| 3457.541970210199|      326|    18806|        21551|
|     Fair| 4358.757763975155|      337|    18574|         1610|
|  Premium|4584.2577042999055|      326|    18823|        13791|
|Very Good|3981.7598907465654|      336|    18818|        12082|
|     Good| 3928.864451691806|      327|    18788|         4906|
+---------+------------------+---------+---------+-------------+



JOINS IN PYSPARK 

In [0]:
# Read the diamonds CSV into `df`, using the header row for column names
# and letting Spark infer the column types.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv")
)

# Preview the first five rows.
df.show(n=5)



+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|_c0|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|
+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|  1| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
|  2| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
|  3| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
|  4| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
|  5| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|
+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+
only showing top 5 rows


In [0]:
# Second name for the full diamonds DataFrame; plain assignment, so no copy is
# made and `big_df` and `df` refer to the same DataFrame object.
big_df = df


In [0]:
# Lookup-sized frame: one row per distinct value of `cut`.
small_df = df.select("cut").dropDuplicates()


In [0]:
# Report the row count of each frame; each count() call runs a Spark job.
for frame_label, frame in (("big_df count:", big_df), ("small_df count:", small_df)):
    print(frame_label, frame.count())


big_df count: 53940
small_df count: 5


BROADCAST JOIN 


In [0]:
# Read the diamonds CSV into `df`, using the header row for column names
# and letting Spark infer the column types.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv")
)

# Preview the first five rows.
df.show(n=5)


+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|_c0|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|
+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|  1| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
|  2| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
|  3| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
|  4| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
|  5| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|
+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+
only showing top 5 rows


In [0]:
# Left join input: the join key `cut` plus the color and price columns.
df_large1 = df.select(["cut", "color", "price"])
df_large1.show(n=5)


+-------+-----+-----+
|    cut|color|price|
+-------+-----+-----+
|  Ideal|    E|  326|
|Premium|    E|  326|
|   Good|    E|  327|
|Premium|    I|  334|
|   Good|    J|  335|
+-------+-----+-----+
only showing top 5 rows


In [0]:
# Right join input: the join key `cut` plus the clarity and carat columns.
df_large2 = df.select(["cut", "clarity", "carat"])
df_large2.show(n=5)


+-------+-------+-----+
|    cut|clarity|carat|
+-------+-------+-----+
|  Ideal|    SI2| 0.23|
|Premium|    SI1| 0.21|
|   Good|    VS1| 0.23|
|Premium|    VS2| 0.29|
|   Good|    SI2| 0.31|
+-------+-------+-----+
only showing top 5 rows


In [0]:
# Inner-join the two projections on `cut`. The right side carries an explicit
# broadcast hint, which asks Spark to plan a broadcast join (copy df_large2 to
# every executor) instead of shuffling both inputs; without the hint the choice
# is left to the optimizer. Use joined.explain() to check the chosen plan.
#
# NOTE: both inputs are projections of the same 53,940-row frame and `cut` has
# only five distinct values, so this is a many-to-many join: each row is paired
# with every row that shares its cut. Preview the result with show(); do not
# collect it to the driver.
joined = df_large1.join(df_large2.hint("broadcast"), on="cut", how="inner")
joined.show(10)


+---------+-----+-----+-------+-----+
|      cut|color|price|clarity|carat|
+---------+-----+-----+-------+-----+
|    Ideal|    D| 2757|    SI2| 0.23|
|  Premium|    H| 2757|    SI1| 0.21|
|     Good|    D| 2757|    VS1| 0.23|
|  Premium|    H| 2757|    VS2| 0.29|
|     Good|    D| 2757|    SI2| 0.31|
|Very Good|    D| 2757|   VVS2| 0.24|
|Very Good|    D| 2757|   VVS1| 0.24|
|Very Good|    D| 2757|    SI1| 0.26|
|     Fair|    D| 2747|    VS2| 0.22|
|Very Good|    D| 2757|    VS1| 0.23|
+---------+-----+-----+-------+-----+
only showing top 10 rows
