In [None]:
from pyspark.sql import SparkSession

# Configure a local SparkSession for this notebook. getOrCreate() hands back
# an already-running session if one exists instead of starting a second one.
session_builder = SparkSession.builder.appName("hw_pyspark_env_demo").master("local[*]")
session_builder = session_builder.config("spark.sql.shuffle.partitions", "8")
session_builder = session_builder.config("spark.driver.memory", "4g")
spark = session_builder.getOrCreate()
sc = spark.sparkContext

# Echo the effective settings so the environment is visible in the cell output.
print("Spark version:", sc.version)
print("Master:", sc.master)
print("Shuffle partitions:", spark.conf.get("spark.sql.shuffle.partitions"))
print("Driver memory:", spark.conf.get("spark.driver.memory"))

# Smoke test: build a one-row DataFrame and print it to prove the session works.
smoke_test_df = spark.createDataFrame([(1, "test")], schema=["id", "msg"])
smoke_test_df.show()


In [None]:
# Download the Spotify tracks CSV into ./data.
# `mkdir -p` creates the directory only when it is missing, so re-running this cell is safe.
!mkdir -p data

# Source URL kept in a Python variable; IPython substitutes it for the
# `$TRACKS_URL` reference inside the shell command below.
TRACKS_URL = "https://huggingface.co/datasets/maharshipandya/spotify-tracks-dataset/resolve/main/dataset.csv?download=true"

# -q silences wget's progress/log output; -O writes to a fixed path, replacing any earlier download.
# NOTE(review): with -q a failed download is not reported in this cell -- the
# file size in the `ls` listing below is the only visible check. Confirm it is non-zero.
!wget -q -O data/spotify_tracks.csv "$TRACKS_URL"
# Long listing with human-readable sizes to confirm the file was written.
!ls -lh data

In [None]:
# Filter the raw CSV into data/clean_spotify_tracks.csv.
# `grep -v` keeps only the lines that do NOT match the pattern; the pattern
# matches any line containing a double quote, then a comma, then another
# double quote. Such lines are dropped entirely -- the commas themselves are
# not edited. `-i` (ignore case) has no effect because the pattern has no letters.
# NOTE(review): this discards every matching row, which shrinks the dataset used
# by all later cells. Spark's CSV reader presumably parses quoted fields with
# embedded commas on its own (default quote character is '"') -- confirm whether
# this filter is actually needed before relying on the downstream statistics.
!grep -vi '".*,.*"' data/spotify_tracks.csv > data/clean_spotify_tracks.csv

In [None]:
# Load the filtered CSV into the `tracks` DataFrame.
# The first line is treated as the header and column types are inferred from
# the data; only the columns listed below are kept, in this order.
TRACK_COLUMNS = [
    "track_id", "artists", "album_name", "track_name",
    "popularity", "duration_ms", "explicit",
    "danceability", "energy", "key", "loudness", "mode",
    "speechiness", "acousticness", "instrumentalness", "liveness",
    "valence", "tempo", "time_signature", "track_genre",
]

tracks = spark.read.csv(
    "data/clean_spotify_tracks.csv",
    header=True,
    inferSchema=True,
).select(*TRACK_COLUMNS)

# Show the inferred schema, then the first five rows without truncating values.
print("Tracks schema:")
tracks.printSchema()
tracks.show(5, truncate=False)

In [None]:
# Summary statistics (count, mean, stddev, min, max) for the columns of `tracks`.
# describe() only builds a new DataFrame; without an action the cell would just
# display the DataFrame's schema repr, so call show() to compute and print the table.
tracks.describe().show()

In [None]:
# Most Frequent Danceability
import pyspark.sql.functions as F
# Count the tracks sharing each distinct danceability value, then keep the
# 50 values with the highest counts (most frequent first).
danceability_counts = tracks.groupBy("danceability").agg(F.count("*").alias("freq"))
freq_danc = danceability_counts.orderBy(F.col("freq").desc()).limit(50)

# Print all 50 rows without truncating cell contents.
freq_danc.show(50, truncate=False)

In [None]:
import pyspark.sql.functions as F

# Per-genre summary: number of tracks plus the mean of selected numeric
# columns, limited to the 20 genres with the highest mean popularity.
averaged_columns = ("popularity", "danceability", "energy", "duration_ms")
genre_aggregates = [F.count("*").alias("track_count")]
genre_aggregates += [
    F.avg(column_name).alias(f"avg_{column_name}")
    for column_name in averaged_columns
]

genre_stats = (
    tracks
    .groupBy("track_genre")
    .agg(*genre_aggregates)
    .orderBy(F.col("avg_popularity").desc())
    .limit(20)
)

print("Top 20 Genres by Average Popularity:")
genre_stats.show(20, truncate=False)