In [1]:
# Import required packages/libraries:
import os
import sys
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import isnan, when, count, col

# Dependencies for Regression Algorithms:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.regression import IsotonicRegression

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
5,application_1606590675617_0006,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# Define sql context:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Read the csv file(stored in DBFS) as a Spark dataframe:
df = spark.read.format("csv").option("inferSchema", "true").option("header","true").load("s3://projectmlldsongs/processed_songs/*.csv")

# Cache the dataframe across all workers:
df.cache()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[_c0: int, artist_familiarity: double, artist_hotttnesss: double, artist_id: string, artist_location: string, artist_mbtags: string, artist_mbtags_count: string, artist_name: string, artist_terms: string, artist_terms_freq: string, artist_terms_weight: string, danceability: string, duration: string, end_of_fade_in: string, energy: string, key: string, key_confidence: string, loudness: string, mode: string, mode_confidence: string, release: string, segments_confidence: string, segments_loudness_max: string, segments_loudness_max_time: string, segments_pitches: string, segments_timbre: string, similar_artists: string, song_hotttnesss: string, song_id: string, start_of_fade_out: string, tempo: string, time_signature: string, time_signature_confidence: string, title: string, track_id: string, year: string]

In [4]:
print(df.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

72594

In [16]:
# Usue Regex to filter numeric entries in target column i.e. 'song_hotttnesss':
expr = "(^[+-]?([0-9]*[.])?[0-9]+)"
df = (df.filter(df.song_hotttnesss.rlike(expr)))
df = df.filter(df.song_hotttnesss > 0.0)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
# Check datatypes of each column:
print(df.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

65269

In [7]:
# Seperate and typecast features having numeric entries:
numeric_columns=['artist_familiarity',
                 'artist_hotttnesss',
                 'duration',
                 'end_of_fade_in',
                 'energy',
                 'mode',
                 'key',
                 'key_confidence',
                 'loudness',
                 'song_hotttnesss',
                 'tempo',
                 'time_signature',
                 'time_signature_confidence',
                 'year']

for c in numeric_columns:
    df=df.withColumn(c,df[c].cast(DoubleType()))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
# Drop rows with null/NA values and duplicates:
df = df.na.drop()
df=df.drop_duplicates()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
# Select columns for the regression model:
select_df = df.select([ 'artist_familiarity','duration','loudness','key_confidence','key','end_of_fade_in',
                       'time_signature_confidence','tempo','mode','song_hotttnesss'])

# Split dataset into train and validation sets:
train_df,test_df_model =select_df.randomSplit([0.8, 0.2])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [20]:
##Linear Regression Model

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [21]:
# Convert Rows of dataframe to Dense Vectors:
assembler = VectorAssembler(
    inputCols=['artist_familiarity','duration','loudness','key_confidence','key','end_of_fade_in',
               'time_signature_confidence','tempo','mode'],
    outputCol='features')

# Create linear regression object
lr = LinearRegression(labelCol='song_hotttnesss', featuresCol='features')

# Create a pipeline to sequentially perform operations on the training set:
pipeline = Pipeline(stages=[assembler, lr])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
# Perform 4-fold cross validation on the train dataset:
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.fitIntercept, [False, True])\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(metricName="rmse",labelCol="song_hotttnesss",predictionCol="prediction"),
                          numFolds=4)

#Train the model:
cvModel = crossval.fit(train_df)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [23]:
# Make predictions on test set
predictions = cvModel.transform(test_df_model)

# Select example rows to display.
predictions.select("prediction", "song_hotttnesss", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="song_hotttnesss", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)

# Print RMSE:
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+-------------------+--------------------+
|         prediction|    song_hotttnesss|            features|
+-------------------+-------------------+--------------------+
| 0.2200421231909089| 0.5272093808715524|[0.21091583790963...|
|0.26372389275325736|0.12084676057822673|[0.29610306078279...|
| 0.2970989265779518| 0.3112961491753868|[0.36477358487734...|
|0.30556764166452155|0.29230590501606524|[0.38027364457028...|
|0.31371388576997944| 0.3131914863826405|[0.39127254956838...|
+-------------------+-------------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 0.0619845

In [27]:
# Create GBT regression object:
xgb = GBTRegressor(featuresCol="features",labelCol="song_hotttnesss",maxIter =10)

# Chain indexer and forest in a Pipeline. Use same assembler object as before:
pipeline = Pipeline(stages=[assembler, xgb])

# Train model.  This also runs the indexer.
model_xgb = pipeline.fit(train_df)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
# Evaluate the model on test dataset:
predictions_xgb = model_xgb.transform(test_df_model)

# Select example rows to display.
predictions_xgb.select("prediction", "song_hotttnesss", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator_xgb = RegressionEvaluator(
    labelCol="song_hotttnesss", predictionCol="prediction", metricName="rmse")
rmse = evaluator_xgb.evaluate(predictions_xgb)

# Print RMSE:
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+-------------------+--------------------+
|         prediction|    song_hotttnesss|            features|
+-------------------+-------------------+--------------------+
| 0.2755429101233915| 0.5272093808715524|[0.21091583790963...|
| 0.2720092114946163|0.12084676057822673|[0.29610306078279...|
| 0.2761282606180411| 0.3112961491753868|[0.36477358487734...|
| 0.3118809439265052|0.29230590501606524|[0.38027364457028...|
|0.30198080163197283| 0.3131914863826405|[0.39127254956838...|
+-------------------+-------------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 0.0581387

In [24]:
# Create Isotonic Regression object:
isoreg = IsotonicRegression(featuresCol="features",labelCol="song_hotttnesss")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[assembler, isoreg])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
# Perform 4-fold cross validation:
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.fitIntercept, [False, True])\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(metricName ='rmse',labelCol="song_hotttnesss"),
                          numFolds=4)

# Train the model:
model_isoreg = crossval.fit(train_df)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [26]:
# Evaluate the model on test dataset:
predictions_isoreg = model_isoreg.transform(test_df_model)

# Select example rows to display.
predictions_isoreg.select("prediction", "song_hotttnesss", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator_isoreg = RegressionEvaluator(
    labelCol="song_hotttnesss", predictionCol="prediction", metricName="rmse")
rmse = evaluator_isoreg.evaluate(predictions_isoreg)

# Print RMSE
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+-------------------+--------------------+
|         prediction|    song_hotttnesss|            features|
+-------------------+-------------------+--------------------+
|0.27857062195707394| 0.5272093808715524|[0.21091583790963...|
|0.27857062195707394|0.12084676057822673|[0.29610306078279...|
|0.29529149659301146| 0.3112961491753868|[0.36477358487734...|
| 0.3006039247232535|0.29230590501606524|[0.38027364457028...|
| 0.3006039247232535| 0.3131914863826405|[0.39127254956838...|
+-------------------+-------------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 0.0575058