## Generating a dummy training dataset

In [0]:
# track_ids = spark.read.csv('s3://souvik-spark-streaming/music_data/track_ids/', inferSchema=True, header=True)

# track_ids = track_ids.select("track_id").distinct().rdd.flatMap(lambda x: x).collect()

# import random

# def generate_user_data():
#     return {
#         'user_id': random.randint(0,5000),
#         'track_id': random.choice(track_ids),
#         'like': random.randint(0,1)
#     }

# user_data = [generate_user_data() for _ in range(10000)]

# from pyspark.sql.types import *
# # Step 3: Create schema
# schema = StructType([
#     StructField("user_id", IntegerType(), False),
#     StructField("track_id", IntegerType(), False),
#     StructField("like", IntegerType(), False)
# ])

# # Step 4: Create DataFrame
# user_df = spark.createDataFrame(user_data, schema)

# # Show sample data
# user_df.show(10)

In [0]:
# # user_data = spark.read.csv("s3a://souvik-spark-streaming/music_data/user_data/", inferSchema=True, header=True)
# # from pyspark.sql.functions import col

# # Aliases for self-join
# df1 = user_df.alias("a")
# df2 = user_df.alias("b")

# # Self-join on track_id and like, keeping only pairs whose user_ids differ
# joined = df1.join(df2, (col("a.track_id") == col("b.track_id")) & (col("a.like") == col("b.like")) & (col("a.user_id") != col("b.user_id")))

# # Optional: select the fields you want
# joined.select("a.user_id", "b.user_id", "a.track_id", "a.like").display()


## Training the ALS model for collaborative filtering

In [0]:
%scala
import org.apache.spark.ml.feature._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.recommendation.ALS

// Read the AWS credentials from the Databricks secret scope "aws" so that they
// never appear in the notebook source.
val aws_access_key = dbutils.secrets.get(scope="aws", key="aws-access-key-id")
val aws_secret_key = dbutils.secrets.get(scope="aws", key="aws-secret-access-key")

// Hand the credentials and endpoint to the S3A connector through the Hadoop configuration.
spark.sparkContext.hadoopConfiguration.set("fs.s3a.access.key", aws_access_key)
spark.sparkContext.hadoopConfiguration.set("fs.s3a.secret.key", aws_secret_key)
spark.sparkContext.hadoopConfiguration.set("fs.s3a.endpoint", "s3.amazonaws.com")

// Historical user events (header row present, column types inferred).
val old_user_events = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("s3a://souvik-spark-streaming/music_data/user_data/")

// Newly collected user events, read with the same options from a different bucket.
val new_user_events = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("s3a://souvik-dev-stage/new_user_data/")

// Combine both sources by column NAME. A plain `union` matches columns by position,
// so a different header order in either source would silently mix up
// user_id / track_id / like.
val combined_user_events = old_user_events.unionByName(new_user_events)

// An earlier configuration used rank = 5 and regParam = 0.1 with otherwise identical settings.
val als = new ALS()
  .setUserCol("user_id")
  .setItemCol("track_id")
  .setRatingCol("like")
  .setImplicitPrefs(true)        // Treat 'like' (1 / 0) as implicit feedback rather than explicit ratings
  .setAlpha(40.0)                // Confidence weight applied to the implicit feedback
  .setRank(10)                   // Number of latent factors
  .setMaxIter(10)                // Number of ALS iterations
  .setRegParam(0.05)             // Regularization strength
  .setNonnegative(true)          // Constrain the learned factors to be non-negative
  .setColdStartStrategy("drop")  // Drop rows whose prediction would be NaN (unseen users/items)

// Fit on the combined events. The name `ALSModel` is referenced by the following cells.
val ALSModel = als.fit(combined_user_events)

In [0]:
%scala
// Persist the fitted ALS model to S3, replacing any previously saved copy.
// Optional sanity check before saving:
//   val user_recommendations = ALSModel.recommendForAllUsers(5)
//   user_recommendations.show(false)
ALSModel
  .write
  .overwrite()
  .save("s3a://souvik-dev-stage/ALSModel")


In [0]:
%scala
// Top-5 track recommendations for every user known to the ALS model.
val user_recommendations =
  ALSModel.recommendForAllUsers(5)

// Render the recommendations in the notebook, sorted by user id.
display(
  user_recommendations.orderBy(col("user_id"))
)

## Creating the content-based filtering model

In [0]:
%scala
// Load the track metadata CSV (header row present, column types inferred).
val tracks = spark.read
  .option("inferSchema", "true")
  .option("header", "true")
  .csv("s3a://souvik-spark-streaming/music_data/tcc_ceds_music.csv")

// Rename `_c0` to `track_id` (presumably the CSV's unnamed index column -- confirm against the file).
val tracks_new = tracks.withColumnRenamed("_c0", "track_id")

// Columns that are left out of the concatenated text.
val excludeCols = Set("track_id", "track_name", "lyrics", "len", "age")

// Every remaining column, as a Column reference.
val colsToConcat = tracks_new.columns
  .filter(name => !excludeCols.contains(name))
  .map(name => col(name))

// Join the remaining columns into one space-separated "text" column.
val tracks_final = tracks_new.withColumn("text", concat_ws(" ", colsToConcat: _*))

In [0]:
%scala
import org.apache.spark.ml.feature._
import org.apache.spark.ml.Pipeline

// Stage 1: split the concatenated "text" column into tokens.
val tokenizer = new Tokenizer()
  .setInputCol("text")
  .setOutputCol("tokens")

// Stage 2: hash the tokens into a 1000-dimensional term-frequency vector.
val hashingTF = new HashingTF()
  .setNumFeatures(1000)
  .setInputCol("tokens")
  .setOutputCol("rawFeatures")

// Stage 3: rescale the term frequencies by inverse document frequency.
val idf = new IDF()
  .setInputCol("rawFeatures")
  .setOutputCol("features")

// Chain the three stages, fit them on the track text, then featurize the same tracks.
val tfidfPipeline = new Pipeline()
  .setStages(Array(tokenizer, hashingTF, idf))

val tfidfModel = tfidfPipeline.fit(tracks_final)

val itemsFeaturized = tfidfModel.transform(tracks_final)

In [0]:
%scala
// Persist the featurized tracks as Parquet, replacing any previous output.
// The former `.option("header", "true")` was dropped: "header" is a CSV option
// and Parquet stores its schema in the file itself.
itemsFeaturized.write
    .mode("overwrite")
    .parquet("s3a://souvik-dev-stage/contentModel/")

In [0]:
%scala
import org.apache.spark.ml.linalg.{Vector, Vectors}
import breeze.linalg.{DenseVector => BDV, norm}
import org.apache.spark.sql.functions._

// Positive interactions only. `user_events` was never defined in this notebook;
// the events DataFrame built by the training cell is `combined_user_events`.
// Rows with like = 0 are excluded so that a user's profile reflects tracks they
// actually liked, as the names user_likes / likedItems imply.
val user_likes = combined_user_events
  .filter($"like" === 1)
  .select("user_id", "track_id")

// Attach each liked track's TF-IDF feature vector.
val likedItems = user_likes.join(itemsFeaturized, "track_id")

// Element-wise mean of a list of feature vectors, returned as a dense vector.
// `reduce` requires a non-empty list; each group produced by the groupBy below
// contains at least one row.
val avgVectorUDF = udf((vectors: Seq[Vector]) => {
  val breezeVecs = vectors.map(v => BDV(v.toArray))
  val sumVec = breezeVecs.reduce(_ + _)
  val avgVec = sumVec / breezeVecs.length.toDouble
  Vectors.dense(avgVec.toArray)
})

// One row per user holding the feature vectors of all tracks they liked.
val likedVectors = likedItems
  .groupBy("user_id")
  .agg(collect_list($"features").as("featureList"))

// User profile = average feature vector of the user's liked tracks.
val userProfile = likedVectors
  .withColumn("userFeatures", avgVectorUDF($"featureList"))
  .select("user_id", "userFeatures")

// Cosine similarity between two vectors; defined as 0.0 when either vector has zero norm.
val cosineSimilarity = udf { (vec1: Vector, vec2: Vector) =>
  val v1 = BDV(vec1.toArray)
  val v2 = BDV(vec2.toArray)
  val dot = v1.dot(v2)
  val normProduct = norm(v1) * norm(v2)
  if (normProduct == 0.0) 0.0 else dot / normProduct
}

// Score every (track, user) pair. This is a full cross join, so the row count is
// (#tracks x #users with a profile).
val cbRecs = itemsFeaturized
  .crossJoin(userProfile)
  .withColumn("score", cosineSimilarity($"features", $"userFeatures"))
  .select("user_id", "track_id", "track_name", "score")
  .orderBy($"user_id", $"score".desc)


In [0]:
%scala
// Preview the content-based scores. `scored` is not defined in this notebook;
// the scored DataFrame produced by the previous cell is `cbRecs`.
cbRecs.show(10)

In [0]:
%scala
// Flatten the ALS top-10 recommendations into one row per (user, track).
// The fitted model is named `ALSModel` (Scala identifiers are case-sensitive), and
// because ALS was configured with userCol = "user_id" and itemCol = "track_id",
// those are the column / struct-field names in the recommendation output.
val alsRecs = ALSModel.recommendForAllUsers(10)
  .withColumn("rec", explode($"recommendations"))
  .select($"user_id", $"rec.track_id".as("track_id"), $"rec.rating".as("als_score"))

val alpha = 0.7  // weight for ALS; (1 - alpha) for CB

// Blend the collaborative and content-based scores.
// - cbRecs exposes its similarity as `score`; it is renamed to `cb_score` for the blend.
// - The outer join keeps pairs scored by only one model; the missing score is filled with 0.
// - Track names come from the track catalogue (`tracks_new`) so that pairs present only
//   in the ALS output also get a name.
// NOTE(review): als_score and cb_score are not rescaled to a common range here -- confirm
// that a plain weighted sum is the intended blend.
val hybrid = alsRecs
  .join(
    cbRecs.select($"user_id", $"track_id", $"score".as("cb_score")),
    Seq("user_id", "track_id"),
    "outer"
  )
  .na.fill(0.0, Seq("als_score", "cb_score"))
  .withColumn("hybrid_score", $"als_score" * alpha + $"cb_score" * (1 - alpha))
  .join(tracks_new.select("track_id", "track_name"), "track_id")
  .orderBy($"user_id", $"hybrid_score".desc)
