In [None]:
# Import required packages/libraries:
import os
import sys
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import isnan, when, count, col

# Dependencies for Regression Algorithms:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import PCA
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:
# Build a SQLContext on top of the active SparkContext.
# NOTE(review): `sc` is not defined anywhere in this notebook — presumably it is
# injected by the PySpark kernel; confirm before running on a plain Python kernel.
from pyspark.sql import SQLContext
sqlContext = SQLContext(sparkContext=sc)

In [None]:
# Load the train and test datasets from S3 (Parquet, one `type=<split>` directory each).
# NOTE(review): `spark` is not created in this notebook — presumably provided by the kernel.
train_df, test_df = (
    spark.read.parquet(split_path)
    for split_path in (
        "s3://projectmlldsongs/songs_data_split/type=train/",
        "s3://projectmlldsongs/songs_data_split/type=test/",
    )
)

In [None]:
## Linear Regression Model

In [None]:
# Input columns that the assembler merges into a single 'features' vector column
# (order here is the order of the entries in the resulting vector):
feature_columns = [
    'artist_familiarity', 'duration', 'loudness', 'key_confidence', 'key',
    'end_of_fade_in', 'time_signature_confidence', 'tempo', 'mode',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Linear regression estimator: reads the assembled 'features' column and
# is fitted against the 'song_hotttnesss' label column.
lr = LinearRegression(featuresCol='features', labelCol='song_hotttnesss')

# Chain both stages so that fitting the pipeline first assembles the features
# and then fits the regression on them:
pipeline = Pipeline(stages=[assembler, lr])

In [None]:
# Hyperparameter search with 4-fold cross validation -- CURRENTLY DISABLED.
# The parameter grid and CrossValidator below are commented out and kept for
# reference only; no cross validation is performed by this cell.
#paramGrid = ParamGridBuilder()\
#    .addGrid(lr.regParam, [0.1, 0.01]) \
#    .addGrid(lr.fitIntercept, [False, True])\
#    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
#    .build()

#crossval = CrossValidator(estimator=pipeline,
#                          estimatorParamMaps=paramGrid,
#                          evaluator=RegressionEvaluator(metricName="rmse",labelCol="song_hotttnesss",predictionCol="prediction"),
#                          numFolds=4)


# Train the model: a single fit of the pipeline on the training set.
lrmodel = pipeline.fit(dataset=train_df)


In [None]:
# Score the test set with the fitted pipeline model:
predictions = lrmodel.transform(test_df)

# Preview the first 5 rows: predicted value, true label and feature vector.
preview = predictions.select("prediction", "song_hotttnesss", "features")
preview.show(5)

# Compare predictions with the true label using root mean squared error:
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="song_hotttnesss",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)

# Report the test-set RMSE:
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


In [None]:
# Save the trained pipeline model to S3.
# A plain `save()` raises an error when the target path already exists, which
# makes this cell fail on every re-run of the notebook. Going through the
# writer with overwrite() keeps the cell re-runnable.
# NOTE: this replaces any model previously stored at this path.
lrmodel.write().overwrite().save("s3://projectmlldsongs/saved_models/linear_regression/")

In [None]:
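# PCA snippet: to be included later.
# NOTE(review): the output column must not reuse the input column name (Spark
# rejects an output column that already exists), so a distinct name is used here;
# the regression's featuresCol would then need to point at 'pca_features'.

#pca = PCA(k=2, inputCol="features", outputCol="pca_features")

# Add the pca object to the Pipeline stages (between the assembler and the regression)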