In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = (
    SparkSession.builder
    .appName("Dynamic Partition Pruning example")
    .master("local[*]")
    .config("spark.driver.memory" , "10g")
    .getOrCreate()
)

In [3]:
spark

In [4]:
# Reading the Csv files

listening_activity = spark.read.csv("data/partition_raw/Spotify_Listening_Activity.csv", header =True, inferSchema =True)
songs = spark.read.csv("data/partition_raw/Spotify_songs.csv", header = True, inferSchema = True)

In [5]:
listening_activity.printSchema() # Printing the schema for listening_activity dataframe

root
 |-- activity_id: integer (nullable = true)
 |-- song_id: integer (nullable = true)
 |-- listen_date: timestamp (nullable = true)
 |-- listen_duration: integer (nullable = true)



In [6]:
songs.printSchema() # Printing the schema for songs dataframe

root
 |-- song_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist_id: integer (nullable = true)
 |-- release_date: timestamp (nullable = true)



In [7]:
from pyspark.sql.functions import to_date


listening_activity = (
    listening_activity
    .withColumnRenamed("listen_date","listen_time")
    .withColumn("listen_date", to_date("listen_time","yyyy-MM-dd HH:mm:ss.SSSSSS"))
    
)

listening_activity.printSchema()

root
 |-- activity_id: integer (nullable = true)
 |-- song_id: integer (nullable = true)
 |-- listen_time: timestamp (nullable = true)
 |-- listen_duration: integer (nullable = true)
 |-- listen_date: date (nullable = true)



In [8]:
listening_activity.show()

+-----------+-------+--------------------+---------------+-----------+
|activity_id|song_id|         listen_time|listen_duration|listen_date|
+-----------+-------+--------------------+---------------+-----------+
|          1|     12|2023-06-27 10:15:...|             69| 2023-06-27|
|          2|     44|2023-06-27 10:15:...|            300| 2023-06-27|
|          3|     75|2023-06-27 10:15:...|             73| 2023-06-27|
|          4|     48|2023-06-27 10:15:...|            105| 2023-06-27|
|          5|     10|2023-06-27 10:15:...|            229| 2023-06-27|
|          6|     82|2023-06-27 10:15:...|             35| 2023-06-27|
|          7|     64|2023-06-27 10:15:...|            249| 2023-06-27|
|          8|     96|2023-06-27 10:15:...|            211| 2023-06-27|
|          9|     52|2023-06-27 10:15:...|             99| 2023-06-27|
|         10|     21|2023-06-27 10:15:...|            181| 2023-06-27|
|         11|      4|2023-06-27 10:15:...|            175| 2023-06-27|
|     

In [9]:
(
    listening_activity
    .write
    .partitionBy("listen_date")
    .mode("overwrite")
    .parquet("data/partiton_data/listening_activity/")
)

                                                                                

In [10]:
# Lets try to repartition

(
    listening_activity
    .repartition(3)
    .write
    .partitionBy("listen_date")
    .mode("overwrite")
    .parquet("date/partition_data/listening_activity_rp/")
)

                                                                                

In [11]:
result_df = spark.read.parquet("date/partition_data/listening_activity_rp/")

                                                                                

In [12]:
result_df.printSchema()

root
 |-- activity_id: integer (nullable = true)
 |-- song_id: integer (nullable = true)
 |-- listen_time: timestamp (nullable = true)
 |-- listen_duration: integer (nullable = true)
 |-- listen_date: date (nullable = true)



In [13]:
result_df.rdd.getNumPartitions()

20