In [1]:
# Importing necessary libraries for Spark session, DataFrame operations, and ML modeling
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer

In [2]:
# Create the Spark session for this notebook (or reuse one that already exists)
spark = (
    SparkSession.builder
    .appName("MusicMoodClassification")
    .getOrCreate()
)

In [3]:
# Read the songs CSV: the first row is the header, column types are inferred
file_path = "data_moods.csv"
data = spark.read.csv(
    path=file_path,
    header=True,
    inferSchema=True,
)
# Preview the first five rows
data.show(n=5)

+--------------------+--------------------+--------------+--------------------+-------------------+----------+------+-------------------+-------------------+------------------+--------------------+-------------------+------------------+--------+-----------+------------------+---+--------------+---------+
|                name|               album|        artist|                  id|       release_date|popularity|length|       danceability|       acousticness|            energy|    instrumentalness|           liveness|           valence|loudness|speechiness|             tempo|key|time_signature|     mood|
+--------------------+--------------------+--------------+--------------------+-------------------+----------+------+-------------------+-------------------+------------------+--------------------+-------------------+------------------+--------+-----------+------------------+---+--------------+---------+
|                1999|                1999|        Prince|2H7PHVdQ3mXqEHXcv...|198

In [4]:
# Data cleaning and preprocessing:
# replace nulls with 0 and discard the "mood" column that came with the file
# (a "mood" column is re-created by the feature-engineering cell below).
data = data.na.fill(0).drop("mood")

# Summary statistics for a few of the numeric audio features
summary_columns = ["danceability", "energy", "valence", "loudness"]
data.select(*summary_columns).describe().show()

+-------+-------------------+------------------+-------------------+-------------------+
|summary|       danceability|            energy|            valence|           loudness|
+-------+-------------------+------------------+-------------------+-------------------+
|  count|                686|               686|                686|                686|
|   mean| 0.5005527696793004| 0.507693469387755|0.34243760932944606|-11.531020408163274|
| stddev|0.15895460348923499|0.3264900658301197| 0.2523038620731955|  7.468628623052022|
|    min|             0.0789|           0.00129|             0.0353|            -42.018|
|    max|              0.941|             0.994|              0.977|              1.342|
+-------+-------------------+------------------+-------------------+-------------------+



In [5]:
# Feature engineering: derive a rule-based "mood" label from the audio features.
# The rules are evaluated in order and the first matching one wins; songs that
# match none of them are labelled "Calm".
# NOTE(review): the label is a deterministic function of columns that are also
# used as model inputs further down, so a high model accuracy is expected —
# confirm this is intended.
is_happy = (col("valence") > 0.6) & (col("danceability") > 0.6)
is_sad = (col("valence") < 0.4) & (col("acousticness") > 0.5)
is_energetic = (col("energy") > 0.7) & (col("tempo") > 120)

mood_label = (
    when(is_happy, "Happy")
    .when(is_sad, "Sad")
    .when(is_energetic, "Energetic")
    .otherwise("Calm")
)
data = data.withColumn("mood", mood_label)
data.show(10)

+--------------------+--------------------+--------------------+--------------------+-------------------+----------+------+-------------------+-------------------+------------------+--------------------+-------------------+------------------+--------+-------------------+------------------+---+--------------+---------+
|                name|               album|              artist|                  id|       release_date|popularity|length|       danceability|       acousticness|            energy|    instrumentalness|           liveness|           valence|loudness|        speechiness|             tempo|key|time_signature|     mood|
+--------------------+--------------------+--------------------+--------------------+-------------------+----------+------+-------------------+-------------------+------------------+--------------------+-------------------+------------------+--------+-------------------+------------------+---+--------------+---------+
|                1999|                19

In [6]:
# Categorical encoding and feature assembling
#
# Drop any outputs left over from a previous run of this cell so that it can be
# re-executed safely: both StringIndexer and VectorAssembler refuse to write to
# an output column that already exists. DataFrame.drop ignores column names
# that are not present, so no membership check is needed.
data = data.drop("mood_index", "features")

# Encode the string "mood" label as a numeric index for the classifier.
indexer = StringIndexer(inputCol="mood", outputCol="mood_index")
data = indexer.fit(data).transform(data)

# Pack the selected audio features into the single vector column used as
# the model's input.
feature_columns = ["danceability", "energy", "valence", "acousticness", "tempo", "loudness"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data)

In [7]:
# Building and evaluating a Random Forest classifier model
SEED = 42  # fixed seed so the split, the forest, and the accuracy are reproducible

# 80/20 train/test split
train_data, test_data = data.randomSplit([0.8, 0.2], seed=SEED)

rf = RandomForestClassifier(labelCol="mood_index", featuresCol="features", seed=SEED)
model = rf.fit(train_data)

# Score the held-out split once and reuse the result for both the preview
# and the metric.
predictions = model.transform(test_data)
predictions.select("mood", "mood_index", "prediction").show()

# Accuracy = fraction of test rows whose predicted index equals the label index
evaluator = MulticlassClassificationEvaluator(
    labelCol="mood_index",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

+---------+----------+----------+
|     mood|mood_index|prediction|
+---------+----------+----------+
|      Sad|       0.0|       0.0|
|     Calm|       1.0|       1.0|
|Energetic|       2.0|       2.0|
|     Calm|       1.0|       1.0|
|    Happy|       3.0|       3.0|
|Energetic|       2.0|       2.0|
|     Calm|       1.0|       1.0|
|Energetic|       2.0|       2.0|
|Energetic|       2.0|       2.0|
|      Sad|       0.0|       0.0|
|      Sad|       0.0|       0.0|
|      Sad|       0.0|       0.0|
|     Calm|       1.0|       1.0|
|     Calm|       1.0|       1.0|
|Energetic|       2.0|       2.0|
|      Sad|       0.0|       0.0|
|Energetic|       2.0|       2.0|
|     Calm|       1.0|       1.0|
|      Sad|       0.0|       0.0|
|      Sad|       0.0|       0.0|
+---------+----------+----------+
only showing top 20 rows

Accuracy: 0.9705882352941176


In [8]:
# Mood-based song recommender
def recommend_songs(input_mood, top_n=5):
    """Return up to ``top_n`` songs labelled ``input_mood``, most popular first.

    Reads the notebook-level ``data`` DataFrame.

    Args:
        input_mood: Value compared for equality against the ``mood`` column.
        top_n: Maximum number of rows to return (default 5).

    Returns:
        A DataFrame with the matching rows ordered by ``popularity`` descending.
    """
    mood_matches = data.filter(col("mood") == input_mood)
    by_popularity = mood_matches.orderBy(col("popularity").desc())
    return by_popularity.limit(top_n)


# Example: the most popular songs labelled "Happy"
happy_songs = recommend_songs("Happy")
happy_songs.show()

+--------------------+--------------------+-----------------+--------------------+-------------------+----------+------+------------------+------------------+-------------------+--------------------+--------+-------+------------------+-----------+-------+---+--------------+-----+----------+--------------------+
|                name|               album|           artist|                  id|       release_date|popularity|length|      danceability|      acousticness|             energy|    instrumentalness|liveness|valence|          loudness|speechiness|  tempo|key|time_signature| mood|mood_index|            features|
+--------------------+--------------------+-----------------+--------------------+-------------------+----------+------+------------------+------------------+-------------------+--------------------+--------+-------+------------------+-----------+-------+---+--------------+-----+----------+--------------------+
|     Pumped Up Kicks|             Torches|Foster The People|

In [9]:
# Stop the SparkSession at the end of the notebook.
# NOTE(review): DataFrames created above (e.g. `data`) presumably cannot be used
# after this call — re-run the session-creation cell before re-running earlier cells.
spark.stop()