<a href="https://colab.research.google.com/github/shawngobrzy/spotify_recommendation_system/blob/main/CS%20624%20-%20Recommendation%20System%20-%20Shawn%20Bryant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[Spotify Dataset](https://www.kaggle.com/datasets/joebeachcapital/30000-spotify-songs)

In [None]:
!pip install pyspark



In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset is read from there.
drive.mount(mountpoint='/content/gdrive')

# Path of the songs CSV inside the mounted drive (consumed by the load cell).
file = r'/content/gdrive/My Drive/spotify_songs.csv'

Mounted at /content/gdrive


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

# Load the Dataset

In [None]:
# Reuse the active Spark session if one exists, otherwise start one.
spark = SparkSession.builder.getOrCreate()

# Read the comma-separated file: the first row holds the column names and
# column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True, sep=',')
df = csv_reader.csv(file)

# Preview the first rows of the raw frame.
df.show()

+--------------------+--------------------+----------------+----------------+--------------------+--------------------+------------------------+-------------+--------------------+--------------+-----------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+
|            track_id|          track_name|    track_artist|track_popularity|      track_album_id|    track_album_name|track_album_release_date|playlist_name|         playlist_id|playlist_genre|playlist_subgenre|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|duration_ms|
+--------------------+--------------------+----------------+----------------+--------------------+--------------------+------------------------+-------------+--------------------+--------------+-----------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+
|6f807x0im

# Check for Missing Values

In [None]:
# Count the nulls in every column: each null flag is cast to 0/1 and summed.
# `sum` here is pyspark.sql.functions.sum from the import cell, not the builtin.
null_counts = [
    sum(col(name).isNull().cast('int')).alias(name)
    for name in df.columns
]
df.select(*null_counts).show()

+--------+----------+------------+----------------+--------------+----------------+------------------------+-------------+-----------+--------------+-----------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+-----------+
|track_id|track_name|track_artist|track_popularity|track_album_id|track_album_name|track_album_release_date|playlist_name|playlist_id|playlist_genre|playlist_subgenre|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|duration_ms|
+--------+----------+------------+----------------+--------------+----------------+------------------------+-------------+-----------+--------------+-----------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+-----------+
|       0|         0|           0|               0|             0|               0|                       0|            0|          0|             0|  

In [None]:
# Album and playlist metadata columns that are removed before modelling.
unused_columns = [
    'track_album_id',
    'track_album_name',
    'track_album_release_date',
    'playlist_name',
    'playlist_id',
    'playlist_subgenre',
]
df = df.drop(*unused_columns)

In [None]:
# 70/30 random split; the fixed seed keeps the partition repeatable.
splits = df.randomSplit(weights=[0.7, 0.3], seed=42)
train_df, test_df = splits

# Show the row count of each partition as the cell output.
(train_df.count(), test_df.count())

(23076, 9757)

In [None]:
# Summary statistics for every column except the last one in the frame.
all_but_last = df.columns[:-1]
df.select(*all_but_last).describe().show()

+-------+--------------------+--------------------+------------------+------------------+--------------------+-------------------+--------------------+-----------------+------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+
|summary|            track_id|          track_name|      track_artist|  track_popularity|      playlist_genre|       danceability|              energy|              key|          loudness|                mode|        speechiness|       acousticness|   instrumentalness|           liveness|           valence|             tempo|
+-------+--------------------+--------------------+------------------+------------------+--------------------+-------------------+--------------------+-----------------+------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+
|  count|       

In [None]:
from pyspark.sql.types import DoubleType

# Model inputs, listed by name instead of by position (`columns[-11:]`) so the
# feature set cannot shift silently if the drop list or CSV layout changes.
# These are the same eleven columns, in the same order, that the slice selected.
# NOTE(review): `danceability` sits immediately before `energy` and was not
# covered by the positional slice, so it is left out here too — confirm whether
# that omission is intentional before adding it.
FEATURE_COLUMNS = [
    'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
]

# Cast these four columns to double in both splits so train and test share the
# same column types when assembled.
# NOTE(review): presumably schema inference did not type them as double — verify.
for column in ['energy', 'key', 'loudness', 'mode']:
    train_df = train_df.withColumn(column, train_df[column].cast(DoubleType()))
    test_df = test_df.withColumn(column, test_df[column].cast(DoubleType()))

# Pack the feature columns into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=FEATURE_COLUMNS, outputCol='features')
train_vector_df = assembler.transform(train_df)
train_vector_df.show(5)

+--------------------+-------------+---------------+----------------+--------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------------------+
|            track_id|   track_name|   track_artist|track_popularity|playlist_genre|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|duration_ms|            features|
+--------------------+-------------+---------------+----------------+--------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------------------+
|0017A6SJgTbfQVU2E...|     Pangarap|Barbie's Cradle|              41|          rock|       0.682| 0.401|2.0| -10.068| 1.0|     0.0236|       0.279|          0.0117|  0.0887|  0.566| 97.091|   235440.0|[0.401,2.0,-10.06...|
|004s3t0ONYlzxII9P...| I Feel Alive|  Steady Rollin|              28|          rock|       0.303|  0.88|9.0|

In [None]:
# Scale each feature to unit standard deviation without centering
# (withStd=True / withMean=False are the StandardScaler defaults, spelled out).
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_features',
    withStd=True,
    withMean=False,
)

# The scaling statistics come from the training vectors only.
scaler_model = scaler.fit(train_vector_df)
scaled_df = scaler_model.transform(train_vector_df)

# Peek at the first few scaled vectors.
scaled_df.select('scaled_features').show(5)

+--------------------+
|     scaled_features|
+--------------------+
|[2.21116831048559...|
|[4.85243918510553...|
|[4.62084549672549...|
|[4.40579421465832...|
|[3.27539644994623...|
+--------------------+
only showing top 5 rows



In [None]:
from pyspark.ml.feature import StringIndexer

# Encode the genre string as a numeric label column for the classifier.
indexer = StringIndexer(inputCol='playlist_genre', outputCol='playlist_genre_index')

# Fit the indexer ONCE, on the training data, and reuse that fitted model for
# the test split. Fitting a second indexer on the test split would derive its
# own genre -> index mapping, so the same genre could receive different codes
# in train and test and the test labels would not match what the model learned.
indexer_model = indexer.fit(scaled_df)
indexed_train_df = indexer_model.transform(scaled_df)
indexed_test_df = indexer_model.transform(test_df)

# Random forest over the scaled feature vectors; the seed keeps training repeatable.
randomforest = RandomForestClassifier(
    seed = 42, featuresCol='scaled_features',
    labelCol='playlist_genre_index', impurity="entropy",
    numTrees=15)
model = randomforest.fit(indexed_train_df)

In [None]:
# Run the test split through the same assembler and the scaler that was
# fitted on the training vectors, so both splits share one feature scale.
test_vector_df = assembler.transform(test_df)
scaled_test_df = scaler_model.transform(test_vector_df)

# Preview the first rows with their raw and scaled feature vectors.
scaled_test_df.show(n=5)

+--------------------+--------------------+------------------+----------------+--------------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------------------+--------------------+
|            track_id|          track_name|      track_artist|track_popularity|playlist_genre|danceability|energy| key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|duration_ms|            features|     scaled_features|
+--------------------+--------------------+------------------+----------------+--------------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------------------+--------------------+
|008MceT31RotUANsK...|         Liquid Blue|The.madpix.project|              24|           pop|       0.659| 0.794|10.0|  -5.644| 0.0|      0.054|     7.61E-4|           0.132|   0.322|  0.852|128.041|   228565.0|[0.794,10.0,-5.64...|[4

In [None]:
# Score the held-out test split instead of the rows the forest was trained on;
# predictions on training data overstate how well the model generalises.
# The genre -> index mapping is learned from the training frame only, so the
# test labels carry the same numeric codes the classifier was fitted with.
genre_index_model = indexer.fit(scaled_df)
held_out_df = genre_index_model.transform(scaled_test_df)

# The variable keeps its existing spelling because the evaluation cell reads it.
reccomendation = model.transform(held_out_df)
reccomendation.show(5)

+--------------------+-------------+---------------+----------------+--------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|            track_id|   track_name|   track_artist|track_popularity|playlist_genre|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|duration_ms|            features|     scaled_features|playlist_genre_index|       rawPrediction|         probability|prediction|
+--------------------+-------------+---------------+----------------+--------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|0017A6SJgTbfQVU2E...|     Pangarap|Barbie's C

In [None]:
# Accuracy = share of rows whose predicted genre index equals the true index.
evaluator = MulticlassClassificationEvaluator(
    labelCol='playlist_genre_index',
    predictionCol='prediction',
    metricName='accuracy',
)

# Evaluate the prediction frame and report the result as a percentage.
accuracy = evaluator.evaluate(reccomendation)
print(f'Accuracy: {accuracy*100:.2f}%')

Accuracy: 46.30%
