<a href="https://colab.research.google.com/github/rani-sikdar/PySpark/blob/main/pyspark_intermediate_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### working with dates and time

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    current_date, current_timestamp, date_format, year, month, dayofmonth,
    dayofweek, dayofyear, weekofyear, quarter, date_add, date_sub,
    to_date, to_timestamp, datediff, months_between, next_day, last_day
)


In [2]:
sess = SparkSession.builder.appName('demo').getOrCreate()
sess

In [3]:
current_date()

Column<'current_date()'>

In [5]:
df = sess.range(1).withColumn("current_date", current_date()).withColumn("current_timestamp", current_timestamp())
df.show()


+---+------------+--------------------+
| id|current_date|   current_timestamp|
+---+------------+--------------------+
|  0|  2025-09-25|2025-09-25 04:05:...|
+---+------------+--------------------+



In [6]:
df = sess.range(1).withColumn("current_date", current_date()).withColumn("current_timestamp", current_timestamp())
df.show(truncate=False)

+---+------------+-------------------------+
|id |current_date|current_timestamp        |
+---+------------+-------------------------+
|0  |2025-09-25  |2025-09-25 04:06:17.79182|
+---+------------+-------------------------+



In [7]:
df_dates = sess.createDataFrame([("2025-09-25",)], ["date_str"])
df_dates = df_dates.withColumn("date", to_date("date_str"))
df_dates.select(
    "date",
    year("date").alias("year"),
    month("date").alias("month"),
    dayofmonth("date").alias("day"),
    weekofyear("date").alias("week"),
    quarter("date").alias("quarter")
).show()


+----------+----+-----+---+----+-------+
|      date|year|month|day|week|quarter|
+----------+----+-----+---+----+-------+
|2025-09-25|2025|    9| 25|  39|      3|
+----------+----+-----+---+----+-------+



In [8]:
df_dates.select(date_format("date", "yyyy/MM/dd").alias("formatted_date")).show()

+--------------+
|formatted_date|
+--------------+
|    2025/09/25|
+--------------+



In [9]:
df_dates.select(
    next_day("date", "Sunday").alias("next_sunday"),
    last_day("date").alias("month_end")
).show()


+-----------+----------+
|next_sunday| month_end|
+-----------+----------+
| 2025-09-28|2025-09-30|
+-----------+----------+



In [10]:
from pyspark.sql.functions import to_date, to_timestamp, unix_timestamp, from_unixtime

In [12]:
date = [("2025-09-08 14:35:00",)]
df = sess.createDataFrame(date, ["date_str"])

In [13]:
df.show()

+-------------------+
|           date_str|
+-------------------+
|2025-09-08 14:35:00|
+-------------------+



In [18]:
df_converted = df.withColumn("as_date", to_date("date_str")) \
                 .withColumn("as_timestamp", to_timestamp("date_str"))\
                 .withColumn("as_timestamp_2", to_timestamp("date_str", "dd/MM/yyyy hh:mm a"))\
                 .withColumn("as_unix_timestamp", unix_timestamp("date_str"))\
                 .withColumn("as_human_timestamp", from_unixtime("as_unix_timestamp"))

df_converted.show(truncate=False)

+-------------------+----------+-------------------+--------------+-----------------+-------------------+
|date_str           |as_date   |as_timestamp       |as_timestamp_2|as_unix_timestamp|as_human_timestamp |
+-------------------+----------+-------------------+--------------+-----------------+-------------------+
|2025-09-08 14:35:00|2025-09-08|2025-09-08 14:35:00|NULL          |1757342100       |2025-09-08 14:35:00|
+-------------------+----------+-------------------+--------------+-----------------+-------------------+



### date arthmetic

In [19]:
from pyspark.sql.functions import to_date, date_add, date_sub

In [20]:
df = sess.createDataFrame([("2025-09-08",)], ["date_str"])
df = df.withColumn("date", to_date("date_str"))
df.show()

+----------+----------+
|  date_str|      date|
+----------+----------+
|2025-09-08|2025-09-08|
+----------+----------+



In [21]:
df_arithmetic = df.select(
    "date",
    date_add("date", 7).alias("add_7_days"),
    date_sub("date", 7).alias("minus_7_days")
)
df_arithmetic.show()


+----------+----------+------------+
|      date|add_7_days|minus_7_days|
+----------+----------+------------+
|2025-09-08|2025-09-15|  2025-09-01|
+----------+----------+------------+



In [22]:
# datediff(end_date, start_date)

In [28]:
from pyspark.sql.functions import datediff

df2 = sess.createDataFrame([("2025-09-08", "2025-10-01")], ["start_date", "end_date"])
df2.show()

+----------+----------+
|start_date|  end_date|
+----------+----------+
|2025-09-08|2025-10-01|
+----------+----------+



In [30]:
df2 = df2.withColumn("start", to_date("start_date")) \
                 .withColumn("end", to_date("end_date"))
df2.show()

+----------+----------+----------+----------+
|start_date|  end_date|     start|       end|
+----------+----------+----------+----------+
|2025-09-08|2025-10-01|2025-09-08|2025-10-01|
+----------+----------+----------+----------+



In [31]:

df2 = df2.withColumn("days_between", datediff("end", "start"))
df2.show()

+----------+----------+----------+----------+------------+
|start_date|  end_date|     start|       end|days_between|
+----------+----------+----------+----------+------------+
|2025-09-08|2025-10-01|2025-09-08|2025-10-01|          23|
+----------+----------+----------+----------+------------+



In [33]:
df2.select("start_date", "end_date", "days_between").show()

+----------+----------+------------+
|start_date|  end_date|days_between|
+----------+----------+------------+
|2025-09-08|2025-10-01|          23|
+----------+----------+------------+



In [35]:
from pyspark.sql.functions import last_day, next_day

df3 = df2.select(
    last_day("start_date").alias("month_end"),
    next_day("start_date", "Friday").alias("next_friday")
)
df3.show()


+----------+-----------+
| month_end|next_friday|
+----------+-----------+
|2025-09-30| 2025-09-12|
+----------+-----------+



In [None]:
"""
Use Case:
  ETL Pipelines – Partitioning and Filtering Data

Scenario:
  You receive daily logs in a data lake (/logs/2025/09/08/) and want to process only yesterday’s data.

Solution:
"""
from pyspark.sql.functions import current_date, date_sub

# Filter only yesterday's data
yesterday = date_sub(current_date(), 1)
logs_df = sess.read.parquet("/content/house-price.parquet")  # sample log attached here
logs_df = logs_df.filter(logs_df.event_date == yesterday)


In [None]:
"""
Use Case:
  Optimizes ETL by processing only the required partition.

Scenario:
  Time-Series Analytics – Trend Over Time
  You want to calculate daily active users (DAU) for the last 30 days.

Solution:
"""
from pyspark.sql.functions import current_date, date_sub

start_date = date_sub(current_date(), 30)  # last 30 days date
dau_df = logs_df.filter(logs_df.login_date >= start_date) \
                  .groupBy("login_date") \
                  .count() \
                  .orderBy("login_date")


In [37]:
"""
Use Case:
  Rolling windows for reporting, dashboards, and analytics.

Scenario:
  SLA Monitoring – Calculating Processing Time
  You want to check how long each job took to complete.

Solution:
"""
from pyspark.sql.functions import unix_timestamp

jobs_df = sess.createDataFrame([
    ("2025-09-08 10:00:00", "2025-09-08 14:30:00"),
    ("2025-09-08 12:15:00", "2025-09-08 13:00:00")
], ["start_time", "end_time"])

jobs_df = jobs_df.withColumn("duration_minutes",
    (unix_timestamp("end_time") - unix_timestamp("start_time")) / 60
)
jobs_df.show()


+-------------------+-------------------+----------------+
|         start_time|           end_time|duration_minutes|
+-------------------+-------------------+----------------+
|2025-09-08 10:00:00|2025-09-08 14:30:00|           270.0|
|2025-09-08 12:15:00|2025-09-08 13:00:00|            45.0|
+-------------------+-------------------+----------------+



In [39]:
"""
Use Case:
  SLA compliance, performance monitoring, and alerts.

Scenario:
   Subscription Billing – Next Renewal Date.
   Each user has a subscription date. You need to calculate their next billing date (every 30 days).

Solution:
"""
from pyspark.sql.functions import to_date, date_add

dff = sess.createDataFrame([
    ("2025-09-08",), ("2025-09-10",)
], ["subscription_date"])

# dff.show()

dff = dff.withColumn("next_billing_date", date_add("subscription_date", 30))
dff.show()

+-----------------+-----------------+
|subscription_date|next_billing_date|
+-----------------+-----------------+
|       2025-09-08|       2025-10-08|
|       2025-09-10|       2025-10-10|
+-----------------+-----------------+



In [42]:
"""
Use Case:
  Data Retention – Archiving Old Data

Scenario:
   Archive data older than 6 months.

Solution:
"""

from pyspark.sql.functions import add_months, current_date

cutoff_date = add_months(current_date(), -6)
archived_df = dff.filter(dff.subscription_date < cutoff_date)
archived_df.show()


+-----------------+-----------------+
|subscription_date|next_billing_date|
+-----------------+-----------------+
+-----------------+-----------------+



In [43]:
sess

In [4]:
data = [
    ("East", "ProductA", 100),
    ("East", "ProductB", 200),
    ("West", "ProductA", 150),
    ("West", "ProductB", 300),
    ("West", "ProductA", 250)
]
df = sess.createDataFrame(data, ["region", "product","sales"])
df.show()

+------+--------+-----+
|region| product|sales|
+------+--------+-----+
|  East|ProductA|  100|
|  East|ProductB|  200|
|  West|ProductA|  150|
|  West|ProductB|  300|
|  West|ProductA|  250|
+------+--------+-----+



In [49]:
df.groupBy("region").count().show()

+------+-----+
|region|count|
+------+-----+
|  East|    2|
|  West|    3|
+------+-----+



In [50]:
df.groupBy("region").sum("sales").show()

+------+----------+
|region|sum(sales)|
+------+----------+
|  East|       300|
|  West|       700|
+------+----------+



In [51]:
from pyspark.sql.functions import sum, avg, count, max, min, mean, countDistinct

In [56]:
df.groupBy("region", "product").agg(
    sum("sales").alias("total_sales"),
    avg("sales").alias("avg_sales"),
).show()


+------+--------+-----------+---------+
|region| product|total_sales|avg_sales|
+------+--------+-----------+---------+
|  East|ProductB|        200|    200.0|
|  East|ProductA|        100|    100.0|
|  West|ProductA|        400|    200.0|
|  West|ProductB|        300|    300.0|
+------+--------+-----------+---------+



In [53]:
df.groupBy("region").agg(
    countDistinct("product").alias("distinct_products"),
    max("sales").alias("max_sales"),
    min("sales").alias("min_sales"),
    avg("sales").alias("mean_sales")
).show()


+------+-----------------+---------+---------+------------------+
|region|distinct_products|max_sales|min_sales|        mean_sales|
+------+-----------------+---------+---------+------------------+
|  East|                2|      200|      100|             150.0|
|  West|                2|      300|      150|233.33333333333334|
+------+-----------------+---------+---------+------------------+



In [54]:
df.groupBy("region").agg(
    {"sales": "sum", "sales": "avg"}
).show()

+------+------------------+
|region|        avg(sales)|
+------+------------------+
|  East|             150.0|
|  West|233.33333333333334|
+------+------------------+



In [61]:
df.groupBy("region").sum("sales").orderBy(sum("sales"), ascending=False).show()

+------+----------+
|region|sum(sales)|
+------+----------+
|  West|       700|
|  East|       300|
+------+----------+



In [58]:

df.toPandas().tail(1)

Unnamed: 0,region,product,sales
4,West,ProductA,250


In [60]:

df.toPandas().shape

(5, 3)

In [63]:
df.count()

5

### window functions

In [12]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, dense_rank, row_number, col, to_date


In [4]:
data = [
    ("East", "ProductA", 100),
    ("East", "ProductB", 200),
    ("West", "ProductA", 150),
    ("West", "ProductB", 300),
    ("West", "ProductA", 250)
]
df = sess.createDataFrame(data, ["region", "product", "sales"])
df.show()


+------+--------+-----+
|region| product|sales|
+------+--------+-----+
|  East|ProductA|  100|
|  East|ProductB|  200|
|  West|ProductA|  150|
|  West|ProductB|  300|
|  West|ProductA|  250|
+------+--------+-----+



In [5]:
window_spec = Window.partitionBy("region").orderBy(col("sales").desc())

In [6]:
window_spec

<pyspark.sql.window.WindowSpec at 0x7deef409a870>

In [8]:
df.withColumn("rank", rank().over(window_spec)).show()   # rank

+------+--------+-----+----+
|region| product|sales|rank|
+------+--------+-----+----+
|  East|ProductB|  200|   1|
|  East|ProductA|  100|   2|
|  West|ProductB|  300|   1|
|  West|ProductA|  250|   2|
|  West|ProductA|  150|   3|
+------+--------+-----+----+



In [9]:
df.withColumn("dense_rank", dense_rank().over(window_spec)).show()   # dense rank

+------+--------+-----+----------+
|region| product|sales|dense_rank|
+------+--------+-----+----------+
|  East|ProductB|  200|         1|
|  East|ProductA|  100|         2|
|  West|ProductB|  300|         1|
|  West|ProductA|  250|         2|
|  West|ProductA|  150|         3|
+------+--------+-----+----------+



In [10]:
df.withColumn("row_num", row_number().over(window_spec)).show()   # row number

+------+--------+-----+-------+
|region| product|sales|row_num|
+------+--------+-----+-------+
|  East|ProductB|  200|      1|
|  East|ProductA|  100|      2|
|  West|ProductB|  300|      1|
|  West|ProductA|  250|      2|
|  West|ProductA|  150|      3|
+------+--------+-----+-------+



In [11]:
""" 1. Top N Products per Region

Scenario: Get the top 2 best-selling products in each region based on sales.

Use Case: E-commerce dashboards, sales reports.
"""
window_spec = Window.partitionBy("region").orderBy(col("sales").desc())

top_n_products = df.withColumn("row_num", row_number().over(window_spec)).filter(col("row_num") <= 2)

top_n_products.show()

+------+--------+-----+-------+
|region| product|sales|row_num|
+------+--------+-----+-------+
|  East|ProductB|  200|      1|
|  East|ProductA|  100|      2|
|  West|ProductB|  300|      1|
|  West|ProductA|  250|      2|
+------+--------+-----+-------+



In [13]:
""" 2. Customer Retention – Latest Purchase Per Customer

Scenario: You have multiple transactions per customer and want to find the latest transaction.

Use Case: CRM analytics, churn prediction, personalized marketing.
"""
transactions = [
    (1, "2025-09-01", 100),
    (1, "2025-09-05", 150),
    (2, "2025-09-02", 200),
    (2, "2025-09-06", 300)
]
df_txn = sess.createDataFrame(transactions, ["customer_id", "purchase_date", "amount"])

window_spec = Window.partitionBy("customer_id").orderBy(col("purchase_date").desc())

latest_transaction = df_txn.withColumn("rn", row_number().over(window_spec)) \
                   .filter(col("rn") == 1)

latest_transaction.show()


+-----------+-------------+------+---+
|customer_id|purchase_date|amount| rn|
+-----------+-------------+------+---+
|          1|   2025-09-05|   150|  1|
|          2|   2025-09-06|   300|  1|
+-----------+-------------+------+---+



In [15]:
"""3. Ranking Customers by Spending

Scenario: Rank customers within each region based on total spend.

Use Case: Loyalty programs, premium customer identification.
"""
from pyspark.sql.functions import sum as sum_

# aggregate
customer_spend = df.groupBy("region", "product").agg(sum_("sales").alias("total_sales"))

window_spec = Window.partitionBy("region").orderBy(col("total_sales").desc())

ranked_customers = customer_spend.withColumn("rank", rank().over(window_spec))
ranked_customers.show()


+------+--------+-----------+----+
|region| product|total_sales|rank|
+------+--------+-----------+----+
|  East|ProductB|        200|   1|
|  East|ProductA|        100|   2|
|  West|ProductA|        400|   1|
|  West|ProductB|        300|   2|
+------+--------+-----------+----+



In [17]:
df.show()

+------+--------+-----+
|region| product|sales|
+------+--------+-----+
|  East|ProductA|  100|
|  East|ProductB|  200|
|  West|ProductA|  150|
|  West|ProductB|  300|
|  West|ProductA|  250|
+------+--------+-----+



In [18]:
""" 4. Running Totals (Cumulative Sums)

Scenario: Calculate cumulative sales per region over time.

Use Case: Revenue growth trends, inventory planning.
"""
from pyspark.sql.functions import sum as sum_

window_spec = Window.partitionBy("region").orderBy("sales") \
                    .rowsBetween(Window.unboundedPreceding, Window.currentRow)

df_running = df.withColumn("cumulative_sales", sum_("sales").over(window_spec))
df_running.show()

+------+--------+-----+----------------+
|region| product|sales|cumulative_sales|
+------+--------+-----+----------------+
|  East|ProductA|  100|             100|
|  East|ProductB|  200|             300|
|  West|ProductA|  150|             150|
|  West|ProductA|  250|             400|
|  West|ProductB|  300|             700|
+------+--------+-----+----------------+



In [19]:
transactions = [
    (1, "2025-09-01", 100),
    (1, "2025-09-05", 150),
    (2, "2025-09-02", 200),
    (2, "2025-09-06", 300)
]
df_txn = sess.createDataFrame(transactions, ["customer_id", "purchase_date", "amount"])


In [20]:
""" 5. Identifying First & Last Events (Sessionization)

Scenario: Identify the first login and last login per user.

Use Case: User session analysis, fraud detection.
"""
from pyspark.sql.functions import first, last

window_spec = Window.partitionBy("customer_id").orderBy("purchase_date")

dff = df_txn.withColumn("first_txn", first("purchase_date").over(window_spec)) \
               .withColumn("last_txn", last("purchase_date").over(window_spec))

dff.show()

+-----------+-------------+------+----------+----------+
|customer_id|purchase_date|amount| first_txn|  last_txn|
+-----------+-------------+------+----------+----------+
|          1|   2025-09-01|   100|2025-09-01|2025-09-01|
|          1|   2025-09-05|   150|2025-09-01|2025-09-05|
|          2|   2025-09-02|   200|2025-09-02|2025-09-02|
|          2|   2025-09-06|   300|2025-09-02|2025-09-06|
+-----------+-------------+------+----------+----------+



In [21]:
data1 = [("A", 1), ("B", 2), ("C", 3)]
data2 = [("A", "Apple"), ("B", "Banana"), ("D", "Dates")]

df1 = sess.createDataFrame(data1, ["id","value"])
df2 = sess.createDataFrame(data2, ["id","name"])

In [26]:
df1.show(), df2.show()

+---+-----+
| id|value|
+---+-----+
|  A|    1|
|  B|    2|
|  C|    3|
+---+-----+

+---+------+
| id|  name|
+---+------+
|  A| Apple|
|  B|Banana|
|  D| Dates|
+---+------+



(None, None)

In [23]:
inner_join = df1.join(df2, on="id",how="inner")
inner_join.show()

+---+-----+------+
| id|value|  name|
+---+-----+------+
|  A|    1| Apple|
|  B|    2|Banana|
+---+-----+------+



In [24]:
left_join = df1.join(df2, on="id", how="left")
left_join.show()

+---+-----+------+
| id|value|  name|
+---+-----+------+
|  A|    1| Apple|
|  B|    2|Banana|
|  C|    3|  NULL|
+---+-----+------+



In [27]:
right_join = df1.join(df2, on="id", how="right")
right_join.show()

+---+-----+------+
| id|value|  name|
+---+-----+------+
|  A|    1| Apple|
|  B|    2|Banana|
|  D| NULL| Dates|
+---+-----+------+



In [28]:
full_outer = df1.join(df2, on="id", how="outer")
full_outer.show()

+---+-----+------+
| id|value|  name|
+---+-----+------+
|  A|    1| Apple|
|  B|    2|Banana|
|  C|    3|  NULL|
|  D| NULL| Dates|
+---+-----+------+



### Broadcast Joins in PySpark
A Broadcast Join is a special type of join in PySpark designed to speed up joins when one of the DataFrames is small enough to fit in memory on all worker nodes.

In [30]:
# result = df_large.join(broadcast(df_small), on="id", how="inner")

from pyspark.sql.functions import broadcast

spark = SparkSession.builder.appName("BroadcastJoinExample").getOrCreate()

data_large = [(i, f"Item_{i}") for i in range(1, 1000001)]
df_large = spark.createDataFrame(data_large, ["id", "name"])

df_large.show()

+---+-------+
| id|   name|
+---+-------+
|  1| Item_1|
|  2| Item_2|
|  3| Item_3|
|  4| Item_4|
|  5| Item_5|
|  6| Item_6|
|  7| Item_7|
|  8| Item_8|
|  9| Item_9|
| 10|Item_10|
| 11|Item_11|
| 12|Item_12|
| 13|Item_13|
| 14|Item_14|
| 15|Item_15|
| 16|Item_16|
| 17|Item_17|
| 18|Item_18|
| 19|Item_19|
| 20|Item_20|
+---+-------+
only showing top 20 rows



In [31]:
data_small = [(1, "Category_A"), (2, "Category_B"), (3, "Category_C")]
df_small = spark.createDataFrame(data_small, ["id", "category"])

# Broadcast join
joined_df = df_large.join(broadcast(df_small), on="id", how="inner")

In [32]:
joined_df.show()

+---+------+----------+
| id|  name|  category|
+---+------+----------+
|  1|Item_1|Category_A|
|  2|Item_2|Category_B|
|  3|Item_3|Category_C|
+---+------+----------+



In [33]:
# Performance Tips for Joins

# 1. Use Broadcast Joins for Small Tables
from pyspark.sql.functions import broadcast
df_large.join(broadcast(df_small), on="id", how="inner")


DataFrame[id: bigint, name: string, category: string]

In [34]:
# 2. Repartition on Join Keys -- If both DataFrames are large, repartition them on the join key to reduce shuffle skew.

df1 = df1.repartition(100, "id")
df2 = df2.repartition(100, "id")
joined_df = df1.join(df2, on="id", how="inner")

In [35]:
# 3. Avoid Skew in Join Keys
# 4. Cache or Persist Frequently Used Data

df.cache()
df.persist()

DataFrame[region: string, product: string, sales: bigint]

In [37]:
df.show()

+------+--------+-----+
|region| product|sales|
+------+--------+-----+
|  East|ProductA|  100|
|  East|ProductB|  200|
|  West|ProductA|  150|
|  West|ProductB|  300|
|  West|ProductA|  250|
+------+--------+-----+



In [39]:
# 5. Filter early
df.filter(df.region=='East').show()
df.select("product").distinct().show()

+------+--------+-----+
|region| product|sales|
+------+--------+-----+
|  East|ProductA|  100|
|  East|ProductB|  200|
+------+--------+-----+

+--------+
| product|
+--------+
|ProductB|
|ProductA|
+--------+



In [42]:
# 6. Prefer Equi-Joins over Cross Joins
# 7. Optimize Join Type
# 8. Adjust Spark Configurations for Large Joins

spark.conf.set("spark.sql.shuffle.partitions", 400) # default --> 200


In [44]:
df.show()

+------+--------+-----+
|region| product|sales|
+------+--------+-----+
|  East|ProductA|  100|
|  East|ProductB|  200|
|  West|ProductA|  150|
|  West|ProductB|  300|
|  West|ProductA|  250|
+------+--------+-----+



In [48]:
# 9. Use Bucketing for Repeated Joins
# For very large datasets joined frequently: Bucket both DataFrames by the same key and number of buckets.

df.write.bucketBy(100, "region").saveAsTable("bucketed_table2")

In [49]:
res = spark.sql("select * from bucketed_table2")
res.show()

+------+--------+-----+
|region| product|sales|
+------+--------+-----+
|  West|ProductA|  150|
|  West|ProductB|  300|
|  West|ProductA|  250|
|  East|ProductA|  100|
|  East|ProductB|  200|
+------+--------+-----+



In [50]:
# 10. Monitor Using Spark UI
# After performing joins, check Spark UI → SQL tab → DAG

### Error Handling & Debugging in PySpark


In [51]:
spark.sparkContext.setLogLevel("INFO")   # or "DEBUG"  #  ERROR, WARN, INFO, DEBUG

DataFrame[region: string, product: string, sales: bigint]

In [None]:
df.limit(100).show()

In [None]:
df.columns

In [None]:
# check null
df.filter(col("id").isNull()).show()

In [2]:
# changing log levels
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
sc.setLogLevel("ERROR")  # Options: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN


In [4]:
# custom logging
import logging
from pyspark.sql import SparkSession

# Configure logging
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO  # Set to DEBUG for more details
)
logger = logging.getLogger(__name__)

# Start Spark
spark = SparkSession.builder.appName("LoggingExample").getOrCreate()
logger.info("Spark session started")


In [6]:

df = spark.range(10)
logger.debug(f"Schema: {df.schema}")
logger.info(f"Row count: {df.count()}")


You can check logger messages by looking at the standard output and standard error streams of your Spark application. The verbosity is controlled by the log level.

Here's how you can set the log level and see the output:

In [7]:
# Set the log level (e.g., to INFO to see more messages)
spark.sparkContext.setLogLevel("INFO")

# You can then run your code, and messages at or above the INFO level will be printed.
# For example, running the count operation again will show INFO level messages related to job execution.
logger.info("Running count operation after setting log level to INFO")
df.count()

10