In [None]:
# Import and set pyspark environment
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar xf spark-3.5.1-bin-hadoop3.tgz

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.1-bin-hadoop3"

!pip install -q findspark
import findspark
findspark.init()

In [None]:
# Attach Google Drive to the Colab filesystem so the project CSVs can be read
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Obtain the SparkSession shared by every cell below
from pyspark.sql import SparkSession

session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [None]:
# Load the four project CSV files from Drive into Spark DataFrames.
# Each file is read with its header row as column names and inferred column types.
csv_options = dict(header=True, inferSchema=True)

tracks = spark.read.csv('/content/drive/MyDrive/Big_Data_Project/tracks_ml.csv', **csv_options)
genres = spark.read.csv('/content/drive/MyDrive/Big_Data_Project/genres_clean.csv', **csv_options)
features = spark.read.csv('/content/drive/MyDrive/Big_Data_Project/features_clean.csv', **csv_options)
echonest = spark.read.csv('/content/drive/MyDrive/Big_Data_Project/echonest_clean.csv', **csv_options)

# Genres classification based on 8 features presented in echonest_clean.csv

Note: Some cells take a long time to run. It is recommended to just read the saved results, or to run only the cells that build and preprocess the dataframes and the cells that directly compute the accuracy of a model with pre-tuned hyper-parameters. In other words, try to avoid running the cells that perform cross-validation.

In [None]:
# Set train dataset and test dataset
# Join the echonest features with the track metadata, then split the rows on
# the predefined 'set split' column.
tracks_echonest = echonest.join(tracks, on='track_id')
train = tracks_echonest.filter(tracks_echonest['set split'] == "training")
test = tracks_echonest.filter(tracks_echonest['set split'] == "test")

# Keep columns 1-8 (the model inputs) plus column 14 (the label column; the
# preprocessing cell reads it as 'track genre_top') and drop rows with nulls.
# The column list is computed once from the joined frame and applied to both
# splits, so train and test are guaranteed to be selected identically
# (filter() does not change the schema).
selected_cols = tracks_echonest.columns[1:9] + tracks_echonest.columns[14:15]
total_train = train.select(selected_cols).dropna()
total_test = test.select(selected_cols).dropna()

In [None]:
# Turn the selected columns into ML-ready frames: a "features" vector column
# plus a numeric "label" column
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# The first 8 selected columns are packed into a single vector column
feature_cols = total_train.columns[:8]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train_assembled, test_assembled = (
    assembler.transform(frame) for frame in (total_train, total_test)
)

# Fit the genre -> index mapping on the training rows only, then apply it to both splits
genre_indexer = StringIndexer(inputCol="track genre_top", outputCol="label")
label_indexer = genre_indexer.fit(train_assembled)
train_indexed, test_indexed = (
    label_indexer.transform(frame) for frame in (train_assembled, test_assembled)
)

In [None]:
# Cross-validation for Random Forest
# 5-fold grid search over maxDepth x maxBins x minInstancesPerNode, selected by
# accuracy; the best model is then scored on the held-out test set.
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42)
param_grid_rf = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, list(range(8, 11)))
    .addGrid(rf.maxBins, list(range(30, 35)))
    .addGrid(rf.minInstancesPerNode, list(range(8, 13)))
    .build()
)
evaluator_rf = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
# seed fixes the random fold assignment. Without it the folds differ between
# sessions, so the selected hyper-parameters (hard-coded in the next cell)
# could not be reproduced.
cross_val_rf = CrossValidator(estimator=rf, estimatorParamMaps=param_grid_rf, evaluator=evaluator_rf, numFolds=5, seed=42)
cv_model_rf = cross_val_rf.fit(train_indexed)
best_model_rf = cv_model_rf.bestModel
predictions_rf = best_model_rf.transform(test_indexed)
accuracy_rf = evaluator_rf.evaluate(predictions_rf)
print("Accuracy of Random Forest: {:.2%}".format(accuracy_rf))

Accuracy of Random Forest: 58.07%


In [None]:
# Fit a Random Forest with the hyper-parameters reported as best by the
# cross-validation cell above (no grid search) and print its test accuracy
from pyspark.ml.classification import RandomForestClassifier

rf_tuned_params = dict(maxDepth=10, maxBins=31, minInstancesPerNode=10)
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42, **rf_tuned_params)
rf_model = rf.fit(train_indexed)
predictions = rf_model.transform(test_indexed)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy of Random Forest: {:.2%}".format(accuracy))

Accuracy of Random Forest: 58.07%


In [None]:
# Cross-validation for Decision Tree
# 5-fold grid search over maxDepth x maxBins x minInstancesPerNode, selected by
# accuracy; the best model is then scored on the held-out test set.
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", seed=42)
param_grid_dt = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, list(range(5, 10)))
    .addGrid(dt.maxBins, list(range(30, 35)))
    .addGrid(dt.minInstancesPerNode, list(range(8, 13)))
    .build()
)
evaluator_dt = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
# seed fixes the random fold assignment. Without it the folds differ between
# sessions, so the selected hyper-parameters (hard-coded in the next cell)
# could not be reproduced.
cross_val_dt = CrossValidator(estimator=dt, estimatorParamMaps=param_grid_dt, evaluator=evaluator_dt, numFolds=5, seed=42)
cv_model_dt = cross_val_dt.fit(train_indexed)
best_model_dt = cv_model_dt.bestModel
predictions_dt = best_model_dt.transform(test_indexed)
accuracy_dt = evaluator_dt.evaluate(predictions_dt)
print("Accuracy of Decision Tree: {:.2%}".format(accuracy_dt))

Accuracy of Decision Tree: 57.48%


In [None]:
# Fit a Decision Tree with the hyper-parameters reported as best by the
# cross-validation cell above (no grid search) and print its test accuracy
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
    seed=42,
    maxDepth=7,
    maxBins=34,
    minInstancesPerNode=10,
)
dt_model = dt.fit(train_indexed)
predictions = dt_model.transform(test_indexed)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy of Decision Tree: {:.2%}".format(accuracy))

Accuracy of Decision Tree: 57.48%


In [None]:
# Cross-validation for Multilayer Perceptron
# 5-fold search over three hidden-layer widths (32, 36, 40), selected by
# accuracy; the best model is then scored on the held-out test set.
# NOTE(review): the output layer size (12) is hard-coded and presumably equals
# the number of distinct genre labels in this dataset -- confirm.
from pyspark.ml.classification import MultilayerPerceptronClassifier

mlp = MultilayerPerceptronClassifier(labelCol="label", featuresCol="features", seed=42, maxIter=2000)
param_grid_mlp = (
    ParamGridBuilder()
    .addGrid(mlp.layers, [[8, 32, 12], [8, 36, 12], [8, 40, 12]])
    .build()
)
evaluator_mlp = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
# seed fixes the random fold assignment. Without it the folds differ between
# sessions, so the selected layer structure (hard-coded in the next cell)
# could not be reproduced.
cross_val_mlp = CrossValidator(estimator=mlp, estimatorParamMaps=param_grid_mlp, evaluator=evaluator_mlp, numFolds=5, seed=42)
cv_model_mlp = cross_val_mlp.fit(train_indexed)
best_model_mlp = cv_model_mlp.bestModel
predictions_mlp = best_model_mlp.transform(test_indexed)
accuracy_mlp = evaluator_mlp.evaluate(predictions_mlp)
print("Accuracy of Multilayer Perceptron: {:.2%}".format(accuracy_mlp))

Accuracy of Multilayer Perceptron: 53.71%


In [None]:
# Fit a Multilayer Perceptron with the layer structure reported as best by the
# cross-validation cell above (no grid search) and print its test accuracy
from pyspark.ml.classification import MultilayerPerceptronClassifier

layers = [8, 36, 12]  # layer sizes, input layer first and output layer last
mlp = MultilayerPerceptronClassifier(
    labelCol="label",
    featuresCol="features",
    layers=layers,
    seed=42,
    maxIter=2000,
)
mlp_model = mlp.fit(train_indexed)
predictions = mlp_model.transform(test_indexed)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy of Multilayer Perceptron: {:.2%}".format(accuracy))

Accuracy of Multilayer Perceptron: 53.71%


In [None]:
# Cross-validation for Logistic Regression
# 5-fold grid search over regParam x tol for a multinomial model, selected by
# accuracy; the best model is then scored on the held-out test set.
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="label", featuresCol="features", family="multinomial")
param_grid_lr = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2])
    .addGrid(lr.tol, [1e-6, 1e-5, 1e-4, 1e-3, 1e-2])
    .build()
)
evaluator_lr = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
# seed fixes the random fold assignment. Without it the folds differ between
# sessions, so the selected hyper-parameters (hard-coded in the next cell)
# could not be reproduced.
cross_val_lr = CrossValidator(estimator=lr, estimatorParamMaps=param_grid_lr, evaluator=evaluator_lr, numFolds=5, seed=42)
cv_model_lr = cross_val_lr.fit(train_indexed)
best_model_lr = cv_model_lr.bestModel
predictions_lr = best_model_lr.transform(test_indexed)
accuracy_lr = evaluator_lr.evaluate(predictions_lr)
print("Accuracy of Logistic Regression: {:.2%}".format(accuracy_lr))

Accuracy of Logistic Regression: 52.77%


In [None]:
# Fit a multinomial Logistic Regression with the hyper-parameters reported as
# best by the cross-validation cell above (no grid search) and print its test accuracy
from pyspark.ml.classification import LogisticRegression

lr_tuned_params = dict(regParam=1e-6, tol=1e-4)
lr = LogisticRegression(labelCol="label", featuresCol="features", family="multinomial", **lr_tuned_params)
lr_model = lr.fit(train_indexed)
predictions = lr_model.transform(test_indexed)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy of Logistic Regression: {:.2%}".format(accuracy))

Accuracy of Logistic Regression: 52.77%


# Genres classification based on 140 features presented in features_clean.csv

Note: Some cells take a long time to run. It is recommended to just read the saved results, or to run only the cells that build and preprocess the dataframes and the cells that directly compute the accuracy of a model with pre-tuned hyper-parameters. In other words, try to avoid running the cells that perform cross-validation.

In [None]:
# Set train dataset and test dataset
# Join the audio features with the track metadata, then keep the rows whose
# 'set subset' is "medium" and split them on the predefined 'set split' column.
tracks_features = features.join(tracks, on='track_id')
train = tracks_features.filter((tracks_features['set split'] == "training") & (tracks_features['set subset'] == "medium"))
test = tracks_features.filter((tracks_features['set split'] == "test") & (tracks_features['set subset'] == "medium"))

# Keep columns 1-140 (the model inputs) plus column 146 (the label column; the
# preprocessing cell reads it as 'track genre_top') and drop rows with nulls.
# The column list is computed once from the joined frame and applied to both
# splits, so train and test are guaranteed to be selected identically
# (filter() does not change the schema).
selected_cols = tracks_features.columns[1:141] + tracks_features.columns[146:147]
total_train = train.select(selected_cols).dropna()
total_test = test.select(selected_cols).dropna()

In [None]:
# Turn the selected columns into ML-ready frames: a "features" vector column
# plus a numeric "label" column
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# The first 140 selected columns are packed into a single vector column
feature_cols = total_train.columns[:140]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train_assembled, test_assembled = (
    assembler.transform(frame) for frame in (total_train, total_test)
)

# Fit the genre -> index mapping on the training rows only, then apply it to both splits
genre_indexer = StringIndexer(inputCol="track genre_top", outputCol="label")
label_indexer = genre_indexer.fit(train_assembled)
train_indexed, test_indexed = (
    label_indexer.transform(frame) for frame in (train_assembled, test_assembled)
)

In [None]:
# Cross-validation for Random Forest
# 5-fold grid search over maxDepth x maxBins x minInstancesPerNode, selected by
# accuracy; the best model is then scored on the held-out test set.
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42)
param_grid_rf = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, list(range(8, 11)))
    .addGrid(rf.maxBins, list(range(30, 35)))
    .addGrid(rf.minInstancesPerNode, list(range(8, 13)))
    .build()
)
evaluator_rf = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
# seed fixes the random fold assignment. Without it the folds differ between
# sessions, so the selected hyper-parameters (hard-coded in the next cell)
# could not be reproduced.
cross_val_rf = CrossValidator(estimator=rf, estimatorParamMaps=param_grid_rf, evaluator=evaluator_rf, numFolds=5, seed=42)
cv_model_rf = cross_val_rf.fit(train_indexed)
best_model_rf = cv_model_rf.bestModel
predictions_rf = best_model_rf.transform(test_indexed)
accuracy_rf = evaluator_rf.evaluate(predictions_rf)
print("Accuracy of Random Forest: {:.2%}".format(accuracy_rf))

Accuracy of Random Forest: 63.34%


In [None]:
# Fit a Random Forest with the hyper-parameters reported as best by the
# cross-validation cell above (no grid search) and print its test accuracy
from pyspark.ml.classification import RandomForestClassifier

rf_tuned_params = dict(maxDepth=10, maxBins=31, minInstancesPerNode=10)
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42, **rf_tuned_params)
rf_model = rf.fit(train_indexed)
predictions = rf_model.transform(test_indexed)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy of Random Forest: {:.2%}".format(accuracy))

Accuracy of Random Forest: 63.34%


In [None]:
# Cross-validation for Decision Tree
# 5-fold grid search over maxDepth x maxBins x minInstancesPerNode, selected by
# accuracy; the best model is then scored on the held-out test set.
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", seed=42)
param_grid_dt = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, list(range(6, 11)))
    .addGrid(dt.maxBins, list(range(29, 33)))
    .addGrid(dt.minInstancesPerNode, list(range(8, 13)))
    .build()
)
evaluator_dt = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
# seed fixes the random fold assignment. Without it the folds differ between
# sessions, so the selected hyper-parameters (hard-coded in the next cell)
# could not be reproduced.
cross_val_dt = CrossValidator(estimator=dt, estimatorParamMaps=param_grid_dt, evaluator=evaluator_dt, numFolds=5, seed=42)
cv_model_dt = cross_val_dt.fit(train_indexed)
best_model_dt = cv_model_dt.bestModel
predictions_dt = best_model_dt.transform(test_indexed)
accuracy_dt = evaluator_dt.evaluate(predictions_dt)
print("Accuracy of Decision Tree: {:.2%}".format(accuracy_dt))

Accuracy of Decision Tree: 59.84%


In [None]:
# Fit a Decision Tree with the hyper-parameters reported as best by the
# cross-validation cell above (no grid search) and print its test accuracy
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
    seed=42,
    maxDepth=7,
    maxBins=31,
    minInstancesPerNode=8,
)
dt_model = dt.fit(train_indexed)
predictions = dt_model.transform(test_indexed)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy of Decision Tree: {:.2%}".format(accuracy))

Accuracy of Decision Tree: 59.84%


In [None]:
# Cross-validation for Multilayer Perceptron
# 3-fold search over three hidden-layer widths (100, 150, 200), selected by
# accuracy; the best model is then scored on the held-out test set.
# NOTE(review): the output layer size (16) is hard-coded and presumably equals
# the number of distinct genre labels in this dataset -- confirm.
from pyspark.ml.classification import MultilayerPerceptronClassifier

mlp = MultilayerPerceptronClassifier(labelCol="label", featuresCol="features", seed=42, maxIter=2000)
param_grid_mlp = (
    ParamGridBuilder()
    .addGrid(mlp.layers, [[140, 100, 16], [140, 150, 16], [140, 200, 16]])
    .build()
)
evaluator_mlp = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
# seed fixes the random fold assignment. Without it the folds differ between
# sessions, so the selected layer structure (hard-coded in the next cell)
# could not be reproduced.
cross_val_mlp = CrossValidator(estimator=mlp, estimatorParamMaps=param_grid_mlp, evaluator=evaluator_mlp, numFolds=3, seed=42)
cv_model_mlp = cross_val_mlp.fit(train_indexed)
best_model_mlp = cv_model_mlp.bestModel
predictions_mlp = best_model_mlp.transform(test_indexed)
accuracy_mlp = evaluator_mlp.evaluate(predictions_mlp)
print("Accuracy of Multilayer Perceptron: {:.2%}".format(accuracy_mlp))

Accuracy of Multilayer Perceptron: 65.71%


In [None]:
# Fit a Multilayer Perceptron with the layer structure reported as best by the
# cross-validation cell above (no grid search) and print its test accuracy
from pyspark.ml.classification import MultilayerPerceptronClassifier

layers = [140, 100, 16]  # layer sizes, input layer first and output layer last
mlp = MultilayerPerceptronClassifier(
    labelCol="label",
    featuresCol="features",
    layers=layers,
    seed=42,
    maxIter=2000,
)
mlp_model = mlp.fit(train_indexed)
predictions = mlp_model.transform(test_indexed)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy of Multilayer Perceptron: {:.2%}".format(accuracy))

Accuracy of Multilayer Perceptron: 65.71%


In [None]:
# Cross-validation for Logistic Regression
# 5-fold grid search over regParam x tol for a multinomial model, selected by
# accuracy; the best model is then scored on the held-out test set.
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="label", featuresCol="features", family="multinomial")
param_grid_lr = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2])
    .addGrid(lr.tol, [1e-6, 1e-5, 1e-4, 1e-3, 1e-2])
    .build()
)
evaluator_lr = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
# seed fixes the random fold assignment. Without it the folds differ between
# sessions, so the selected hyper-parameters (hard-coded in the next cell)
# could not be reproduced.
cross_val_lr = CrossValidator(estimator=lr, estimatorParamMaps=param_grid_lr, evaluator=evaluator_lr, numFolds=5, seed=42)
cv_model_lr = cross_val_lr.fit(train_indexed)
best_model_lr = cv_model_lr.bestModel
predictions_lr = best_model_lr.transform(test_indexed)
accuracy_lr = evaluator_lr.evaluate(predictions_lr)
print("Accuracy of Logistic Regression: {:.2%}".format(accuracy_lr))

Accuracy of Logistic Regression: 69.15%


In [None]:
# Fit a multinomial Logistic Regression with the hyper-parameters reported as
# best by the cross-validation cell above (no grid search) and print its test accuracy
from pyspark.ml.classification import LogisticRegression

lr_tuned_params = dict(regParam=1e-3, tol=1e-3)
lr = LogisticRegression(labelCol="label", featuresCol="features", family="multinomial", **lr_tuned_params)
lr_model = lr.fit(train_indexed)
predictions = lr_model.transform(test_indexed)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy of Logistic Regression: {:.2%}".format(accuracy))

Accuracy of Logistic Regression: 69.15%
