In [1]:
#load Spark
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext()
ss = SparkSession.builder.appName("project1").getOrCreate()

In [163]:
import pandas as pd
import boto3
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, avg
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator


In [152]:
df1 = pd.read_csv("s3://502projectsong/cleaned-subset.csv")
#df1.head(5)

In [168]:
df = ss.read.format("csv").option("inferSchema", 
           True).option("header", True).load("s3://502projectsong/cleaned-subset.csv")

In [169]:
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
df = df.drop('_c0', 'analysis_sample_rate', 'artist_id', 'artist_location', 'artist_mbid', 'audio_md5', 'artist_7digitalid', 'release_7digitalid',
             'artist_name', 'artist_playmeid', 'track_id', 'title', 'song_id', 'release')

In [170]:
#df.show(5)
df=df.filter(df.song_hotttnesss.isNotNull())

In [171]:
inputs = [i for i in df.schema.names if i != 'song_hotttnesss']
# def fill_with_mean(df, exclude=set()): 
#     stats = df.agg(*(
#         avg(c).alias(c) for c in df.columns if c not in exclude
#     ))
#     return df.na.fill(stats.first().asDict())

# df = fill_with_mean(df, inputs)
# df.count()

In [172]:
df = df.dropna(how='any')

In [173]:
vectorAssembler = VectorAssembler(inputCols =inputs, outputCol = 'features')
v_df = vectorAssembler.transform(df)
v_df = v_df.select(['features', 'song_hotttnesss'])
v_df.show(3)

+--------------------+------------------+
|            features|   song_hotttnesss|
+--------------------+------------------+
|[0.63642364495006...|0.4051157216913865|
|[0.83996276815103...|0.6665278462297023|
|[0.43541581815506...|0.4952936212921635|
+--------------------+------------------+
only showing top 3 rows



In [174]:
Scalerizer=StandardScaler().setInputCol("features").setOutputCol("Scaled_features")
v_df = Scalerizer.fit(v_df).transform(v_df)
v_df.show(3)

+--------------------+------------------+--------------------+
|            features|   song_hotttnesss|     Scaled_features|
+--------------------+------------------+--------------------+
|[0.63642364495006...|0.4051157216913865|[5.19269722294872...|
|[0.83996276815103...|0.6665278462297023|[6.85341025300907...|
|[0.43541581815506...|0.4952936212921635|[3.55263750443960...|
+--------------------+------------------+--------------------+
only showing top 3 rows



In [175]:
splits = v_df.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

In [176]:
lr = LinearRegression(featuresCol = 'Scaled_features', labelCol='song_hotttnesss', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
Intercept: 0.4321155394512207


In [177]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 0.217330
r2: -0.000000


In [178]:
train_df.describe().show()


+-------+------------------+
|summary|   song_hotttnesss|
+-------+------------------+
|  count|               850|
|   mean|0.4321155394512207|
| stddev|0.2174577580108071|
|    min|               0.0|
|    max|               1.0|
+-------+------------------+



In [179]:
lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction","song_hotttnesss","Scaled_features").show(5)
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="song_hotttnesss",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

+------------------+-------------------+--------------------+
|        prediction|    song_hotttnesss|     Scaled_features|
+------------------+-------------------+--------------------+
|0.4321155394512207|0.21204540548371908|[2.18320427077388...|
|0.4321155394512207|                0.0|[2.70782201816297...|
|0.4321155394512207| 0.2538347361322313|[2.70782201816297...|
|0.4321155394512207|                0.0|[3.06129022648031...|
|0.4321155394512207|                0.0|[3.25426025899887...|
+------------------+-------------------+--------------------+
only showing top 5 rows

R Squared (R2) on test data = -0.00264938


In [180]:
test_result = lr_model.evaluate(test_df)
print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)

Root Mean Squared Error (RMSE) on test data = 0.224892


In [181]:
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()

numIterations: 1
objectiveHistory: [0.5000000000000002]
+--------------------+
|           residuals|
+--------------------+
| 0.04123100165756688|
| 0.13113135982943314|
|-0.22007013396750164|
|-0.16516035317568223|
| -0.1662544902405706|
|-0.11254955256835575|
|-0.21703522094199276|
| -0.4321155394512207|
| -0.4321155394512207|
| -0.4321155394512207|
|-0.11254955256835575|
| -0.4321155394512207|
|-0.17828080331898943|
|-0.22007013396750164|
|-0.02699981775983...|
| -0.4321155394512207|
|  0.1591758339365862|
|-0.17177729688379023|
| 0.06961054229423691|
|  0.0305276431404497|
+--------------------+
only showing top 20 rows



In [183]:
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'song_hotttnesss')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(
    labelCol="song_hotttnesss", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


Root Mean Squared Error (RMSE) on test data = 0.214044


In [184]:
dt_model.featureImportances


SparseVector(40, {0: 0.4537, 1: 0.2062, 2: 0.0138, 3: 0.0648, 4: 0.0195, 6: 0.0505, 11: 0.0101, 15: 0.008, 18: 0.01, 19: 0.0631, 21: 0.0382, 33: 0.0159, 38: 0.0462})