In [0]:
# Import required packages/libraries:
import os
import sys
import pyspark.ml
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import isnan, when, count, col

# Dependencies for Regression Algorithms:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.regression import IsotonicRegression

In [0]:
# Build the legacy SQLContext entry point on top of the SparkContext `sc`
# (`sc` is not defined in this notebook; presumably injected by the runtime):
from pyspark.sql import SQLContext
sqlContext = SQLContext(sparkContext=sc)

In [0]:
# Load the song CSV stored in DBFS into a Spark DataFrame. The header row
# supplies the column names and Spark infers each column's type from the data:
df = (
    spark.read
    .format("csv")
    .options(inferSchema="true", header="true")
    .load("dbfs:/tmp/song_small.csv")
)

# Mark the DataFrame for caching so later actions can reuse it:
df.cache()

In [0]:
# Use a regex to keep only rows whose target column ('song_hotttnesss') starts
# with a numeric value: optional sign, optional "digits + dot", then digits.
# The pattern is anchored at the start only, so trailing characters are allowed.
expr = "(^[+-]?([0-9]*[.])?[0-9]+)"
df = df.filter(col("song_hotttnesss").rlike(expr))

In [0]:
# Check the datatype of each column: `df.dtypes` lists every column name with
# its type, and `display` renders that list as a table in the notebook.
display(df.dtypes)

_1,_2
artist_familiarity,double
artist_hotttnesss,double
artist_id,string
artist_location,string
artist_mbtags,string
artist_mbtags_count,string
artist_name,string
artist_terms,string
artist_terms_freq,string
artist_terms_weight,string


In [0]:
# Separate the features that hold numeric entries and typecast them to double:
numeric_columns = [
    'artist_familiarity', 'artist_hotttnesss', 'duration', 'end_of_fade_in',
    'energy', 'mode', 'key', 'key_confidence', 'loudness', 'song_hotttnesss',
    'tempo', 'time_signature', 'time_signature_confidence', 'year',
]

# Re-bind `df` once per column; each pass replaces the column of the same name
# with its double-typed version.
for column_name in numeric_columns:
    df = df.withColumn(column_name, col(column_name).cast("double"))

In [0]:
# Discard every row that has a null/NA value in any column, then remove
# duplicate rows:
df = df.dropna().dropDuplicates()

In [0]:
# Select the columns used by the regression models: nine feature columns plus
# the 'song_hotttnesss' target.
select_df = df.select([
    'artist_familiarity', 'duration', 'loudness', 'key_confidence', 'key',
    'end_of_fade_in', 'time_signature_confidence', 'tempo', 'mode',
    'song_hotttnesss',
])

# Split the dataset into a training set (weight 0.8) and a held-out test set
# (weight 0.2). A fixed seed is passed so the random split -- and therefore the
# RMSE values reported further down -- can be reproduced between runs instead
# of changing on every execution.
train_df, test_df_model = select_df.randomSplit([0.8, 0.2], seed=42)

## Linear Regression Model

In [0]:
# Combine the nine predictor columns into a single vector column 'features':
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'artist_familiarity', 'duration', 'loudness',
        'key_confidence', 'key', 'end_of_fade_in',
        'time_signature_confidence', 'tempo', 'mode',
    ],
)

# Linear regression estimator that fits 'song_hotttnesss' from 'features':
lr = LinearRegression(featuresCol='features', labelCol='song_hotttnesss')

# Pipeline that runs the assembler first and the regressor second:
pipeline = Pipeline(stages=[assembler, lr])

In [0]:
# Perform 4-fold cross validation on the train dataset.
# Hyper-parameter grid for the linear regression stage:
# 2 regParam x 2 fitIntercept x 3 elasticNetParam = 12 candidates.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.fitIntercept, [False, True])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# Candidates are ranked by RMSE between 'prediction' and 'song_hotttnesss'.
# A fixed seed makes the assignment of rows to folds repeatable, so the
# selected model does not change from one run of the notebook to the next.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(
        metricName="rmse",
        labelCol="song_hotttnesss",
        predictionCol="prediction",
    ),
    numFolds=4,
    seed=42,
)

# Train the model:
cvModel = crossval.fit(train_df)

In [0]:
# Score the held-out test set with the cross-validated model:
predictions = cvModel.transform(test_df_model)

# Preview five rows: prediction, true label and feature vector.
predictions.select("prediction", "song_hotttnesss", "features").show(n=5)

# Compare predictions with the true label and compute the test error (RMSE):
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="song_hotttnesss",
)
rmse = evaluator.evaluate(predictions)

# Print RMSE:
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

## Gradient Boosted Tree Regression Model

In [0]:
# Gradient-boosted tree regressor (Spark ML GBTRegressor) with maxIter set to 10:
xgb = GBTRegressor(
    maxIter=10,
    labelCol="song_hotttnesss",
    featuresCol="features",
)

# Chain the same assembler object as before with the GBT regressor in a Pipeline:
pipeline = Pipeline(stages=[assembler, xgb])

# Train the model on the training set; fitting the pipeline also runs the assembler.
model_xgb = pipeline.fit(train_df)

In [0]:
# Score the held-out test set with the fitted GBT pipeline:
predictions_xgb = model_xgb.transform(test_df_model)

# Preview five rows: prediction, true label and feature vector.
predictions_xgb.select("prediction", "song_hotttnesss", "features").show(n=5)

# Compare predictions with the true label and compute the test error (RMSE):
evaluator_xgb = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="song_hotttnesss",
)
rmse = evaluator_xgb.evaluate(predictions_xgb)

# Print RMSE:
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

## Isotonic Regression

In [0]:
# Isotonic regression estimator that reads 'features' and fits 'song_hotttnesss':
isoreg = IsotonicRegression(
    labelCol="song_hotttnesss",
    featuresCol="features",
)

# Chain the assembler and the isotonic regressor in a Pipeline:
pipeline = Pipeline(stages=[assembler, isoreg])

In [0]:
# Perform 4-fold cross validation for the isotonic regression pipeline.
#
# The grid must be built from parameters of `isoreg`, the estimator that is
# actually a stage of `pipeline`. The earlier grid used `lr.regParam`,
# `lr.fitIntercept` and `lr.elasticNetParam`; those belong to the
# LinearRegression object, which is not part of this pipeline, so every
# candidate trained the same model and nothing was tuned.
#
# Tuned here:
#   * isotonic     -- increasing (True) vs. decreasing (False) fit
#   * featureIndex -- which entry of the 'features' vector the model is fitted on
#                     (one candidate per assembler input column)
paramGrid = (
    ParamGridBuilder()
    .addGrid(isoreg.isotonic, [True, False])
    .addGrid(isoreg.featureIndex, list(range(len(assembler.getInputCols()))))
    .build()
)

# Candidates are ranked by RMSE against 'song_hotttnesss'. A fixed seed makes
# the assignment of rows to folds repeatable between runs.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(metricName='rmse', labelCol="song_hotttnesss"),
    numFolds=4,
    seed=42,
)

# Train the model:
model_isoreg = crossval.fit(train_df)

In [0]:
# Score the held-out test set with the cross-validated isotonic model:
predictions_isoreg = model_isoreg.transform(test_df_model)

# Preview five rows: prediction, true label and feature vector.
predictions_isoreg.select("prediction", "song_hotttnesss", "features").show(n=5)

# Compare predictions with the true label and compute the test error (RMSE):
evaluator_isoreg = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="song_hotttnesss",
)
rmse = evaluator_isoreg.evaluate(predictions_isoreg)

# Print RMSE
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)