In [1]:
#load Spark
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext()
ss = SparkSession.builder.appName("project1").getOrCreate()

In [53]:
import pandas as pd
import boto3
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.sql.functions import *
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [32]:
df1 = pd.read_csv("s3://502projectsong/cleaned-subset.csv")
#df1.head(5)

In [33]:
df = ss.read.format("csv").option("inferSchema", 
           True).option("header", True).load("s3://502projectsong/cleaned-subset.csv")

In [34]:
df = df.drop('_c0', 'analysis_sample_rate', 'artist_id', 'artist_location', 'artist_mbid', 'audio_md5', 'artist_7digitalid', 'release_7digitalid',
             'artist_name', 'artist_playmeid', 'track_id', 'title', 'song_id', 'release')

In [35]:
#df.show(5)
df=df.filter(df.song_hotttnesss.isNotNull())
df = df.withColumn('song_hotttnesss',when(df.song_hotttnesss <= 0.5, 0).otherwise(1))

In [37]:
inputs = [i for i in df.schema.names if i != 'song_hotttnesss']
# def fill_with_mean(df, exclude=set()): 
#     stats = df.agg(*(
#         avg(c).alias(c) for c in df.columns if c not in exclude
#     ))
#     return df.na.fill(stats.first().asDict())

# df = fill_with_mean(df, inputs)
# df.count()

In [38]:
df = df.dropna(how='any')

In [39]:
vectorAssembler = VectorAssembler(inputCols =inputs, outputCol = 'features')
v_df = vectorAssembler.transform(df)
v_df = v_df.select(['features', 'song_hotttnesss'])
v_df.show(3)

+--------------------+---------------+
|            features|song_hotttnesss|
+--------------------+---------------+
|[0.63642364495006...|              0|
|[0.83996276815103...|              1|
|[0.43541581815506...|              0|
+--------------------+---------------+
only showing top 3 rows



In [40]:
Scalerizer=StandardScaler().setInputCol("features").setOutputCol("Scaled_features")
v_df = Scalerizer.fit(v_df).transform(v_df)
v_df.show(3)

+--------------------+---------------+--------------------+
|            features|song_hotttnesss|     Scaled_features|
+--------------------+---------------+--------------------+
|[0.63642364495006...|              0|[5.19269722294872...|
|[0.83996276815103...|              1|[6.85341025300907...|
|[0.43541581815506...|              0|[3.55263750443960...|
+--------------------+---------------+--------------------+
only showing top 3 rows



In [41]:
splits = v_df.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

In [47]:
#lr = LinearRegression(featuresCol = 'Scaled_features', labelCol='song_hotttnesss', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr = LogisticRegression(featuresCol = 'Scaled_features', labelCol='song_hotttnesss', maxIter=10, regParam=0.3, elasticNetParam=0.8)

lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: (40,[],[])
Intercept: -0.45363166637630403


In [49]:
trainingSummary = lr_model.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

objectiveHistory:
0.6680713226000747
+---+---+
|FPR|TPR|
+---+---+
|0.0|0.0|
|1.0|1.0|
|1.0|1.0|
+---+---+

areaUnderROC: 0.5


In [50]:
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Accuracy: 0.6115023474178404
FPR: 0.6115023474178404
TPR: 0.6115023474178404
F-measure: 0.4640826263724615
Precision: 0.37393512089752917
Recall: 0.6115023474178404


In [51]:
lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction","song_hotttnesss","Scaled_features").show(5)

+----------+---------------+--------------------+
|prediction|song_hotttnesss|     Scaled_features|
+----------+---------------+--------------------+
|       0.0|              0|[0.34960685551937...|
|       0.0|              1|[1.25329583464309...|
|       0.0|              0|[2.94231306058092...|
|       0.0|              0|[3.02045759901343...|
|       0.0|              0|[3.06129022648031...|
+----------+---------------+--------------------+
only showing top 5 rows



In [55]:
#test_result = lr_model.evaluate(test_df)
#print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)

evaluator = BinaryClassificationEvaluator()
print('Test Area Under ROC', evaluator.evaluate(lr_predictions))


IllegalArgumentException: 'Field "label" does not exist.\nAvailable fields: features, song_hotttnesss, Scaled_features, rawPrediction, probability, prediction'

In [181]:
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()

numIterations: 1
objectiveHistory: [0.5000000000000002]
+--------------------+
|           residuals|
+--------------------+
| 0.04123100165756688|
| 0.13113135982943314|
|-0.22007013396750164|
|-0.16516035317568223|
| -0.1662544902405706|
|-0.11254955256835575|
|-0.21703522094199276|
| -0.4321155394512207|
| -0.4321155394512207|
| -0.4321155394512207|
|-0.11254955256835575|
| -0.4321155394512207|
|-0.17828080331898943|
|-0.22007013396750164|
|-0.02699981775983...|
| -0.4321155394512207|
|  0.1591758339365862|
|-0.17177729688379023|
| 0.06961054229423691|
|  0.0305276431404497|
+--------------------+
only showing top 20 rows



In [183]:
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'song_hotttnesss')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(
    labelCol="song_hotttnesss", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


Root Mean Squared Error (RMSE) on test data = 0.214044


In [184]:
dt_model.featureImportances


SparseVector(40, {0: 0.4537, 1: 0.2062, 2: 0.0138, 3: 0.0648, 4: 0.0195, 6: 0.0505, 11: 0.0101, 15: 0.008, 18: 0.01, 19: 0.0631, 21: 0.0382, 33: 0.0159, 38: 0.0462})