In [1]:
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("hw_pyspark_env_demo")
         .master("local[*]")
         .config("spark.sql.shuffle.partitions", "8")
         .config("spark.driver.memory", "4g")
         .getOrCreate())
sc = spark.sparkContext

print("Spark version:", sc.version)
print("Master:", sc.master)
print("Shuffle partitions:", spark.conf.get("spark.sql.shuffle.partitions"))
print("Driver memory:", spark.conf.get("spark.driver.memory"))

spark.createDataFrame([(1,"test")], ["id","msg"]).show()


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/01 23:05:24 WARN Utils: Your hostname, Chloes-MacBook-Pro.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.97 instead (on interface en0)
25/11/01 23:05:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/01 23:05:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark version: 4.0.1
Master: local[*]
Shuffle partitions: 8
Driver memory: 4g


                                                                                

+---+----+
| id| msg|
+---+----+
|  1|test|
+---+----+



In [4]:
# Getting dataset from URL
!mkdir -p data

TRACKS_URL = "https://huggingface.co/datasets/maharshipandya/spotify-tracks-dataset/resolve/main/dataset.csv?download=true"

!wget -q -O data/spotify_tracks.csv "$TRACKS_URL"
!ls -lh data

total 40736
-rw-r--r--  1 chloe  staff   283B Oct 14 14:02 howtodownload.md
-rw-r--r--  1 chloe  staff    19M Nov  1 23:12 spotify_tracks.csv


In [14]:
# Cleaning commas in dataset
!grep -vi '".*,.*"' data/spotify_tracks.csv > data/clean_spotify_tracks.csv

In [15]:
# Reading dataset into "tracks"
tracks = (spark.read.option("header", True).option("inferSchema", True)
          .csv("data/clean_spotify_tracks.csv")
          .select("track_id", "artists", "album_name", "track_name", "popularity", "duration_ms", "explicit", "danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo", "time_signature", "track_genre"))

print("Tracks schema:"); tracks.printSchema()
tracks.show(5, truncate=False)

Tracks schema:
root
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- explicit: boolean (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: integer (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: integer (nullable = true)
 |-- track_genre: string (nullable = true)

+----------------------+----------------------+------------------------------------------------------+--------------------------+----------+-----------+

In [11]:
tracks.describe()

DataFrame[summary: string, track_id: string, artists: string, album_name: string, track_name: string, popularity: string, duration_ms: string, explicit: string, danceability: string, energy: string, key: string, loudness: string, mode: string, speechiness: string, acousticness: string, instrumentalness: string, liveness: string, valence: string, tempo: string, time_signature: string, track_genre: string]

In [18]:
# Most Frequent Danceability
import pyspark.sql.functions as F
freq_danc = (tracks.groupBy("danceability")
               .agg(F.count("*").alias("freq"))
               .orderBy(F.desc("freq"))
               .limit(50))
freq_danc.show(50, truncate=False)

+------------+----+
|danceability|freq|
+------------+----+
|0.647       |406 |
|0.609       |331 |
|0.579       |330 |
|0.685       |324 |
|0.602       |316 |
|0.689       |305 |
|0.524       |303 |
|0.598       |294 |
|0.586       |294 |
|0.626       |291 |
|0.631       |290 |
|0.582       |289 |
|0.56        |289 |
|0.607       |287 |
|0.534       |282 |
|0.576       |281 |
|0.545       |281 |
|0.532       |274 |
|0.593       |273 |
|0.627       |272 |
|0.588       |272 |
|0.568       |272 |
|0.687       |270 |
|0.714       |270 |
|0.795       |268 |
|0.671       |268 |
|0.573       |267 |
|0.569       |266 |
|0.639       |266 |
|0.623       |264 |
|0.535       |263 |
|0.653       |261 |
|0.616       |260 |
|0.603       |259 |
|0.497       |259 |
|0.503       |258 |
|0.661       |257 |
|0.596       |257 |
|0.533       |256 |
|0.637       |255 |
|0.679       |255 |
|0.521       |255 |
|0.705       |254 |
|0.546       |253 |
|0.629       |253 |
|0.543       |252 |
|0.601       |252 |
