### 1. Start SparkSession

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
     .appName("Test SparkSession") \
     .getOrCreate()

In [3]:
spark.version

'2.4.4'

### 2. Load Data and Convert

In [4]:
df_in=spark.read.load("s3://lusun-bucket1/cleaned-subset.csv",format="csv",header="true")

In [5]:
df_in.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- analysis_sample_rate: string (nullable = true)
 |-- artist_7digitalid: string (nullable = true)
 |-- artist_familiarity: string (nullable = true)
 |-- artist_hotttnesss: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: string (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: string (nullable = true)
 |-- artist_mbid: string (nullable = true)
 |-- artist_mbtags: string (nullable = true)
 |-- artist_mbtags_count: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- artist_playmeid: string (nullable = true)
 |-- artist_terms: string (nullable = true)
 |-- artist_terms_freq: string (nullable = true)
 |-- artist_terms_weight: string (nullable = true)
 |-- audio_md5: string (nullable = true)
 |-- bars_confidence: string (nullable = true)
 |-- bars_start: string (nullable = true)
 |-- beats_confidence: string (nullable = true)
 |-- beats_start: string (

In [89]:
from pyspark.sql.types import FloatType,IntegerType
from pyspark.sql.functions import when

#convert field type
float_list=['duration','tempo','loudness','energy','danceability']

for item in float_list:
    df_in = df_in.withColumn(item, df_in[item].cast(FloatType()))
    
df_in = df_in.withColumn("year", df_in["year"].cast(IntegerType()))

df_in = df_in.withColumn("period", \
              when(df_in["year"] <1990, 0).otherwise(1))

### 3. Simple Task
Year Recognition: using features of songs to predict the corresponding year.

In [95]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler,StandardScaler
from pyspark.ml import Pipeline

df_in.select('period').show()

assembler = VectorAssembler(
    inputCols=float_list,
    outputCol="features")

+------+
|period|
+------+
|     1|
|     0|
|     1|
|     0|
|     1|
|     1|
|     1|
|     1|
|     0|
|     1|
|     1|
|     1|
|     1|
|     1|
|     0|
|     0|
|     1|
|     1|
|     1|
|     1|
+------+
only showing top 20 rows



In [91]:
train, test = df_in.randomSplit([0.9, 0.1], seed=20)
p1=Pipeline(stages=index_list+[assembler])
training_data=p1.fit(train).transform(train)

In [92]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="period", featuresCol="features", numTrees=10)
pipeline=Pipeline(stages=[rf])
model = pipeline.fit(training_data)

In [93]:
test_data=p1.fit(test).transform(test)
predictions = model.transform(test_data)

In [94]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="period", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy: ",accuracy)

Accuracy:  0.9024630541871921
