In [0]:
# import h5py
# import tables
import os
import sys
import pyspark.ml
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import PCA

In [0]:
from pyspark.sql import SQLContext
# Wrap the SparkContext `sc` in a SQLContext; a later cell reads a CSV through `sqlContext.read`.
# NOTE(review): `sc` is not defined anywhere in this notebook — presumably injected by the hosting runtime; confirm.
sqlContext = SQLContext(sc)

In [0]:
# Copy the uploaded archive to file:/tmp/file.zip, which the later `unzip /tmp/file.zip` shell cell reads.
dbutils.fs.cp("/FileStore/shared_uploads/rohanb@andrew.cmu.edu/song_small.zip", "file:/tmp/file.zip")

In [0]:
# Load the uploaded CSV through the SQLContext reader, treating the first row as a
# header and letting Spark infer the column types.
df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load('/FileStore/shared_uploads/rohanb@andrew.cmu.edu/song_small.zip.csv')
)

In [0]:
%sh
# Try to unzip the uploaded archive straight from its upload path.
# NOTE(review): a later cell unzips the copy at /tmp/file.zip instead — confirm whether this cell is still needed.
unzip /FileStore/shared_uploads/rohanb@andrew.cmu.edu/song_small.zip

In [0]:
%sh
# Copy `rfc_model` from the shell's working directory into /dbfs/FileStore/tables.
# NOTE(review): nothing in the visible notebook creates `rfc_model` — confirm where it comes from.
cp rfc_model /dbfs/FileStore/tables

In [0]:
%sh
# Extract the archive that an earlier cell copied to /tmp/file.zip; with no -d option
# the files land in the shell's current working directory.
unzip /tmp/file.zip

In [0]:
# Print the shell's current working directory.
# NOTE(review): presumably used to locate the extracted CSV for the move in the next cell — confirm.
!pwd

In [0]:
# Move the extracted CSV from the driver's local directory to dbfs:/tmp, the path the next cell loads from.
dbutils.fs.mv("file:/databricks/driver/song_small.csv", "dbfs:/tmp/song_small.csv")  

In [0]:
# Read the song CSV from DBFS: first row is the header, column types are inferred.
df = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("header","true")
    .load("dbfs:/tmp/song_small.csv")
)

In [0]:
# Mark the freshly loaded DataFrame for caching.
# Note: later cells rebind `df` to new DataFrames derived from this one.
df.cache()

In [0]:
# Keep only rows whose song_hotttnesss value begins with an optionally signed
# integer or decimal number. The pattern is anchored at the start only, so any
# trailing characters after the number are still accepted.
expr = "(^[+-]?([0-9]*[.])?[0-9]+)"
hotness_looks_numeric = df["song_hotttnesss"].rlike(expr)
df = df.filter(hotness_looks_numeric)

In [0]:
from pyspark.sql.types import DoubleType
# test_df = test_df_2.withColumn("song_hotttnesss", test_df_2["song_hotttnesss"].cast(DoubleType()))

In [0]:
# Render the (column name, type) pairs of the current DataFrame as a table.
display(df.dtypes)

_1,_2
artist_familiarity,double
artist_hotttnesss,double
artist_id,string
artist_location,string
artist_mbtags,string
artist_mbtags_count,string
artist_name,string
artist_terms,string
artist_terms_freq,string
artist_terms_weight,string


In [0]:
# Columns that the next cell casts to double. Order is preserved from the
# original list; entries are merely grouped per line for readability.
numeric_columns = [
    "artist_familiarity", "artist_hotttnesss",
    "duration", "end_of_fade_in",
    "energy", "mode",
    "key", "key_confidence",
    "loudness", "song_hotttnesss",
    "tempo",
    "time_signature", "time_signature_confidence",
    "year",
]

In [0]:
# Replace every column named in `numeric_columns` with its double-typed version
# (same column name, so downstream cells see the same schema layout).
double_type = DoubleType()
for column_name in numeric_columns:
    df = df.withColumn(column_name, df[column_name].cast(double_type))

In [0]:
# Correlation between artist hotness and song hotness before null/duplicate
# cleanup; as the last expression, the value is shown as the cell output.
df.corr("artist_hotttnesss", "song_hotttnesss")
# df = df.filter(df.artist_hotttnesss > 0)

In [0]:
from pyspark.sql.functions import isnan, when, count, col

In [0]:
# Remove rows that contain a null in any column, then remove duplicate rows.
df = df.dropna().dropDuplicates()
# Re-check the artist/song hotness correlation on the cleaned data.
df.corr("artist_hotttnesss", "song_hotttnesss")

In [0]:
# Keep only the label (song_hotttnesss) and the five feature columns the models below use.
select_df = df.select(['loudness', 'song_hotttnesss', 'mode', 'key', 'artist_familiarity', 'tempo'])

# 75% / 25% train/test split. A fixed seed is passed so that re-running the
# notebook on the same input produces the same split; without it every run
# trains and evaluates on different rows and the reported metrics drift.
train_df, test_df_model = select_df.randomSplit([0.75, 0.25], seed=42)

Linear Regression Model

In [0]:
from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

# Stage 1: pack the five predictor columns into a single vector column, "features2".
lr_feature_columns = ["loudness", 'key', 'mode', 'artist_familiarity', 'tempo']
assembler = VectorAssembler(inputCols=lr_feature_columns, outputCol="features2")

# Stage 2: linear regression predicting song_hotttnesss from the assembled vector.
lr = LinearRegression(featuresCol='features2', labelCol='song_hotttnesss')

# The pipeline is only defined here; the direct fit below is left disabled.
pipeline = Pipeline(stages=[assembler, lr])

# model = pipeline.fit(train_df)

Validation and Hyperparameter tuning

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Hyperparameter grid for the linear-regression stage:
# 2 regParam values x 2 fitIntercept values x 3 elasticNetParam values = 12 combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.fitIntercept, [False, True])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# Each combination is scored by RMSE of "prediction" against the song_hotttnesss label.
cv_evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="song_hotttnesss", predictionCol="prediction")

# 4-fold cross-validation over the whole pipeline (assembler + regression).
# The seed fixes how rows are assigned to folds, so the selected parameter
# combination does not change from run to run on the same training data.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=cv_evaluator,
                          numFolds=4,
                          seed=42)

In [0]:
# Run the cross-validated grid search on the training split; the result is used
# by the following cell to score the held-out split.
cvModel = crossval.fit(train_df)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out split with the cross-validated model.
predictions = cvModel.transform(test_df_model)

# Preview a handful of predictions next to the true label and the feature vector.
preview_columns = ["prediction", "song_hotttnesss", "features2"]
predictions.select(*preview_columns).show(5)

# Root-mean-squared error of the predictions against the true label.
evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="song_hotttnesss", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

In [0]:
# `model` was never assigned in this notebook (the direct `pipeline.fit` call is
# commented out), so these prints raised NameError on a fresh kernel. Bind it to
# the pipeline model that cross-validation selected and refit.
model = cvModel.bestModel

# stages[1] is the fitted linear-regression stage (stage 0 is the assembler).
# Its `summary` describes the fit on the training data, not the test split.
lr_summary = model.stages[1].summary
print(f" The RMSE is {lr_summary.rootMeanSquaredError}")
print(f" The r2 value is {lr_summary.r2}")

In [0]:
# Print summary statistics for every column of the training split.
train_df.describe().show()

## Gradient Boosted Tree Regression Model

In [0]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
# Despite its name, this stage is a VectorAssembler: it packs the five predictor
# columns into one vector column, "features_out", for the tree model.
featureIndexer = VectorAssembler(
    inputCols=['artist_familiarity', 'loudness', 'key', 'mode', 'tempo'],
    outputCol="features_out",
)


In [0]:
# Gradient-boosted tree regressor (Spark ML's GBTRegressor) predicting
# song_hotttnesss from "features_out", limited to 10 boosting iterations.
xgb = GBTRegressor(labelCol="song_hotttnesss", featuresCol="features_out", maxIter=10)

# Chain the assembler and the regressor. Note that this rebinds `pipeline`,
# which previously referred to the linear-regression pipeline.
gbt_stages = [featureIndexer, xgb]
pipeline = Pipeline(stages=gbt_stages)

# Fit both stages on the training split.
model_xgb = pipeline.fit(train_df)

In [0]:
# Score the held-out split with the fitted gradient-boosted tree pipeline.
predictions_xgb = model_xgb.transform(test_df_model)

# Preview a handful of predictions next to the true label and the feature vector.
gbt_preview_columns = ["prediction", "song_hotttnesss", "features_out"]
predictions_xgb.select(*gbt_preview_columns).show(5)

# Root-mean-squared error of the predictions; this overwrites the `rmse`
# value computed earlier for the linear-regression model.
evaluator_xgb = RegressionEvaluator(
    metricName="rmse", labelCol="song_hotttnesss", predictionCol="prediction")
rmse = evaluator_xgb.evaluate(predictions_xgb)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# Second pipeline stage: the fitted tree-ensemble model itself.
gbtModel = model_xgb.stages[1]