In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=229a242877c6706c8d7374fe0e41dcae66343554033ac55232da7f2b2bca8d0d
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Start Spark session
spark = SparkSession.builder.appName('SpotifyPopularityPrediction').getOrCreate()

# Configuration: everything a reader might want to tune lives here.
INPUT_PATH = 'gs://bigdata_spotify/preprocessed_spotify.csv'
OUTPUT_PATH = 'gs://bigdata_spotify/output/predictions.csv'  # Spark writes a directory of part files here
SPLIT_SEED = 42  # fixed seed so the train/test split is repeatable between runs

# Load data
df = spark.read.csv(INPUT_PATH, header=True, inferSchema=True)

# Selecting features and target variable
feature_cols = ['danceability', 'energy', 'valence', 'tempo', 'acousticness']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
# NOTE(review): VectorAssembler raises on null feature values by default;
# this assumes the "preprocessed" input has none -- confirm upstream.
data = assembler.transform(df)

# Select columns that we need
final_data = data.select('features', 'popularity')

# Split data into training and testing set (seeded for reproducibility)
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Define the model
lr = LinearRegression(featuresCol='features', labelCol='popularity')

# Fit the model
lr_model = lr.fit(train_data)

# Predict on the test set
predictions = lr_model.transform(test_data)

# Show some predictions
predictions.select('prediction', 'popularity').show()

# Save predictions to Cloud Storage.
# The CSV writer cannot serialize the assembled 'features' vector column, so
# only the scalar columns are written. 'overwrite' keeps the cell re-runnable:
# the default save mode fails when the output path already exists.
(
    predictions
    .select('prediction', 'popularity')
    .write
    .mode('overwrite')
    .csv(OUTPUT_PATH, header=True)
)
