### 1. Start Spark Session

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
     .appName("Test SparkSession") \
     .getOrCreate()

In [3]:
spark.version

'2.4.4'

### 2. Handle Data

In [4]:
df_in=spark.read.load("s3://lusun-bucket1/clean_big_subset.csv",format="csv",header="true")
df_in.na.drop()

DataFrame[_c0: string, analysis_sample_rate: string, artist_familiarity: string, artist_hotttnesss: string, artist_id: string, artist_latitude: string, artist_location: string, artist_longitude: string, artist_name: string, artist_terms: string, artist_terms_freq: string, bars_confidence: string, bars_start: string, beats_confidence: string, beats_start: string, danceability: string, duration: string, end_of_fade_in: string, energy: string, key: string, key_confidence: string, loudness: string, mode: string, mode_confidence: string, release: string, sections_confidence: string, sections_start: string, segments_confidence: string, segments_loudness_max: string, segments_loudness_max_time: string, segments_loudness_start: string, segments_pitches: string, segments_start: string, segments_timbre: string, similar_artists: string, song_hotttnesss: string, song_id: string, start_of_fade_out: string, tatums_confidence: string, tatums_start: string, tempo: string, time_signature: string, time_

In [5]:
df_in.take(1)

[Row(_c0='0', analysis_sample_rate='22050', artist_familiarity='0.647933622', artist_hotttnesss='0.482022827', artist_id='AR4PQ891187FB5CA9F', artist_latitude='40.76099', artist_location='East Orange, NJ', artist_longitude='-74.20991', artist_name='Dionne Warwick', artist_terms="['brill building pop', 'quiet storm', 'ballad', 'easy listening', 'motown', 'disco', 'soul jazz', 'smooth jazz', 'soul', 'jazz', 'soft rock', 'uk garage', 'chill-out', 'german pop', 'salsa', 'r&b', 'chanson', 'rock', 'pop', 'blues-rock', 'vocal jazz', 'funk', 'oldies', 'pop rock', 'downtempo', 'hip hop', 'classic rock', 'united states', 'germany', 'adult contemporary', 'folk rock', 'vocal', 'soundtrack', 'blues', 'female vocalist', 'electronic', 'new wave', 'urban', 'reggae', 'singer-songwriter', 'swing', '60s', 'female', 'american', '80s', '90s', 'ambient']", artist_terms_freq='0.790175611', bars_confidence='0.098915094', bars_start='109.3454292', beats_confidence='0.555841121', beats_start='110.1107134', danc

In [6]:
from pyspark.sql.types import FloatType,IntegerType
from pyspark.sql.functions import when

#convert field type
float_list=['duration','tempo','loudness','energy','danceability','song_hotttnesss','artist_hotttnesss','artist_familiarity','beats_confidence'
           ,'sections_confidence','segments_confidence']

for item in float_list:
    df_in = df_in.withColumn(item, df_in[item].cast(FloatType()))
    
df_in = df_in.withColumn("year", df_in["year"].cast(IntegerType()))

df_in = df_in.withColumn("period", \
              when(df_in["year"] <1960, 0).when(df_in["year"] <1970, 1).when(df_in["year"] <1980, 2)\
                         .when(df_in["year"] <1990, 3).when(df_in["year"] <2000, 4).when(df_in["year"] <2010, 5).when(df_in["year"] <2020, 6).otherwise(7))

### 3.Fit Model

In [15]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler,StandardScaler
from pyspark.ml import Pipeline

df_in.select('period').show()

assembler = VectorAssembler(
    inputCols=float_list,
    outputCol="features",handleInvalid='skip')

+------+
|period|
+------+
|     3|
|     5|
|     4|
|     4|
|     0|
|     5|
|     0|
|     0|
|     4|
|     1|
|     0|
|     5|
|     4|
|     5|
|     4|
|     5|
|     0|
|     4|
|     4|
|     4|
+------+
only showing top 20 rows



In [16]:
train, test = df_in.randomSplit([0.9, 0.1], seed=20)
p1=Pipeline(stages=[assembler])
training_data=p1.fit(train).transform(train)

In [17]:
from pyspark.ml.classification import RandomForestClassifier,GBTClassifier
rf = RandomForestClassifier(labelCol="period", featuresCol="features", numTrees=20)
pipeline=Pipeline(stages=[rf])
model = pipeline.fit(training_data)

In [18]:
test_data=p1.fit(test).transform(test)
predictions = model.transform(test_data)

In [19]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="period", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy: ",accuracy)

Accuracy:  0.5509411413205856
