In [7]:
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType
from math import sqrt

In [8]:
def computeCosineSimilarity(spark,data):
    pairScores = data \
        .withColumn("xx",func.col("rating1")*func.col("rating1")) \
        .withColumn("yy",func.col("rating2")*func.col("rating2")) \
        .withColumn("xy",func.col("rating1")*func.col("rating2")) 
    
    calculateSimilarity = pairScores \
        .groupBy("movie1","movie2") \
        .agg(\
            func.sum(func.col("xy")).alias("numerator"), \
            (func.sqrt(func.sum(func.col("xx")))* func.sqrt(func.sum(func.col("yy")))).alias("denominator"),\
            func.count(func.col("xy")).alias("numPairs")
        )
    
    result = calculateSimilarity \
        .withColumn("score", \
           func.when(func.col("denominator")!= 0, func.col("numerator") / func.col("denominator"))\
           .otherwise(0)  
        ).select("movie1","movie2","score","numPairs")
    return result

In [9]:
def getMovieName(movieNames,movieId):
    result = movieNames.filter(func.col("movieID")== movieId) \
        .select("movieTitle").collect()[0]
    
    return result[0]


In [28]:
spark = SparkSession.builder.appName("MovieSimilarities").master("local[*]").getOrCreate()

22/12/14 15:08:57 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/minhthu/bigdata/spark-3.3.0-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/minhthu/bigdata/spark-3.3.0-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/minhthu/bigdata/spark-3.3.0-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


In [11]:
movieNamesSchema = StructType([\
                              StructField("movieID", IntegerType(),True), \
                              StructField("movieTitle",StringType(),True) \
                              ])

In [15]:
moviesSchema = StructType([\
                           StructField("userID", IntegerType(),True), \
                           StructField("movieID",IntegerType(),True), \
                           StructField("rating", IntegerType(),True), \
                           StructField("timestamp",LongType(),True)])

In [16]:
movieNames = spark.read \
    .option("sep","|") \
    .option("charset","ISO-8859-1") \
    .schema(movieNamesSchema) \
    .csv("/home/minhthu/Downloads/ml-100k/u.item")

In [17]:
movies = spark.read \
    .option("sep","\t") \
    .schema(moviesSchema) \
    .csv("/home/minhthu/Downloads/ml-100k/u.data")

In [18]:
ratings = movies.select("userId","movieId","rating")

In [22]:
moviePairs = ratings.alias("ratings1") \
    .join(ratings.alias("ratings2"),(func.col("ratings1.userId") == func.col("ratings2.userId"))
         &(func.col("ratings1.movieId") < func.col("ratings2.movieId"))) \
    .select(func.col("ratings1.movieId").alias("movie1"), \
           func.col("ratings2.movieId").alias("movie2"), \
           func.col("ratings1.rating").alias("rating1"), \
           func.col("ratings2.rating").alias("rating2"))

In [23]:
moviePairSimilarities = computeCosineSimilarity(spark,moviePairs).cache()

In [27]:
if (len(sys.argv) > 1):
    
    scoreThreshold = 0.97
    coOccurenceThreshold = 50.0
    
    movieID = int(sys.argv[1])
    
#     filteredResults = moviePairSimilarities.filter(lambda((pair,sim)):(pair[0] == movieID or pair[1] == movieID) and sim[0] > scoreThreshold and sim[1] > coOccurenceThreshold)
    
    filteredResults = moviePairSimilarities.filter( \
        ((func.col("movie1") == movieID) | (func.col("movie2") == movieID)) & \
            (func.col("score") > scoreThreshold) & (func.col("numPairs") > coOccurenceThreshold))
    
    
    results = filteredResults.sort(func.col("score").desc()).take(10)
    
    print ("Top 10 similar movies for" + getMovieName(movieNames,movieID))
    
    for result in results:
        similarMovieID = result.movie1
        if(similarMovieID == movieID):
            similarMovieID = result.movie2
            
        print(getMovieName(movieNames,similarMovieID) + "\tscore: " \
              + str(result.score) + "\tstrength: " + str(result.numPairs))

ValueError: invalid literal for int() with base 10: '-f'