In [1]:
import findspark

In [2]:
# Must run before `import pyspark` below: findspark locates the local Spark
# installation so that its bundled PySpark package becomes importable.
findspark.init()

In [3]:
# Sanity check: display the Spark home directory that findspark resolved.
findspark.find()

'C:\\mystuff\\gatech\\project\\spark-2.4.1-bin-hadoop2.7'

In [4]:
import pyspark

In [5]:
from pyspark import SparkContext

In [6]:
from pyspark import SparkConf

In [7]:
from pyspark.sql import SparkSession

In [8]:
# Local Spark configuration: run on every available core of this machine.
conf = pyspark.SparkConf().setAppName('appName').setMaster('local[*]')
# NOTE(review): memory settings presumably only take effect when the JVM is
# first launched; restart the kernel after changing them -- confirm.
conf.set("spark.driver.memory","2g")
conf.set("spark.executor.memory","2g")
# getOrCreate() reuses a context that is already running, so re-executing this
# cell no longer fails with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(conf=conf)
# The builder attaches the session to the active context instead of wrapping
# it in a fresh SparkSession object on every run.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [9]:
# Locations of the MovieLens CSV exports on the local filesystem.
movies_csv_path = "file:///C:/mystuff/courses/recommendation_engine/data/ml-latest-small/movies.csv"
ratings_csv_path = "file:///C:/mystuff/courses/recommendation_engine/data/ml-latest-small/ratings.csv"

# Both files carry a header row; without schema inference every column is
# loaded as a string.
movies_raw_data = (
    spark.read
    .option("header", 'true')
    .csv(movies_csv_path)
)
ratings_raw_data = (
    spark.read
    .option("header", 'true')
    .csv(ratings_csv_path)
)

In [10]:
# Preview the first five rows of the raw movies table (movieId, title, genres).
# Caching of the raw frame is intentionally left disabled.
movies_raw_data.show(5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [13]:
# print(genres_vals)
# Genre vocabulary: a genre's position in this list is its slot in the
# multi-hot genre vector. Slot 0 ('unknown') is the fallback bucket for any
# label that is not listed here.
# The label is spelled 'Film-Noir' (hyphenated) to match the MovieLens genre
# strings; the previous 'FilmNoir' spelling could never match, so film-noir
# titles were silently counted as 'unknown'.
genres = ['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
# Map genre name -> vector index.
genres_dict = {genre: idx for idx, genre in enumerate(genres)}
# genres_dict

In [14]:
def fetchGenVals(m_genres):
    """Encode a pipe-separated genre string as a multi-hot vector.

    Parameters
    ----------
    m_genres : str or None
        Genre labels joined by '|', e.g. ``'Action|Adventure'``.

    Returns
    -------
    list of int
        One 0/1 flag per entry of the module-level ``genres_dict``; position
        ``genres_dict[label]`` is set for every recognised label. Index 0 is
        set when a label is not recognised or when the input is ``None``.
    """
    # Size the vector from the vocabulary instead of a hard-coded 19 so the
    # two can never drift apart.
    genres_vals = [0] * len(genres_dict)
    if m_genres is None:
        # A missing genre field carries no usable label: flag slot 0.
        genres_vals[0] = 1
        return genres_vals
    for label in m_genres.split('|'):
        # Unrecognised labels fall back to slot 0.
        genres_vals[genres_dict.get(label, 0)] = 1
    return genres_vals

In [15]:
# Quick check of the encoder: only the Action and Adventure slots should be set.
print(fetchGenVals('Action|Adventure'))

[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [16]:
def _append_genre_vector(row):
    """Return (id, title, genres, genre vector) for one raw movies row."""
    movie_id, title, genre_text = row[0], row[1], row[2]
    return (movie_id, title, genre_text, fetchGenVals(genre_text))


# Add the encoded genre vector as a fourth column next to the raw fields.
movies_columns = ['movie_id','title','genres','genre_vals']
movies_data = movies_raw_data.rdd.map(_append_genre_vector).toDF(movies_columns)

In [17]:
# Preview the movies table with its new genre_vals vector column.
movies_data.show(5)

+--------+--------------------+--------------------+--------------------+
|movie_id|               title|              genres|          genre_vals|
+--------+--------------------+--------------------+--------------------+
|       1|    Toy Story (1995)|Adventure|Animati...|[0, 0, 1, 1, 1, 1...|
|       2|      Jumanji (1995)|Adventure|Childre...|[0, 0, 1, 0, 1, 0...|
|       3|Grumpier Old Men ...|      Comedy|Romance|[0, 0, 0, 0, 0, 1...|
|       4|Waiting to Exhale...|Comedy|Drama|Romance|[0, 0, 0, 0, 0, 1...|
|       5|Father of the Bri...|              Comedy|[0, 0, 0, 0, 0, 1...|
+--------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [18]:
import math
from scipy.spatial import distance

def cosine_genre(genre1,genre2):
    return str(distance.cosine(genre1,genre2))


In [19]:
# Keep only movieId and rating, and turn the rating text into a number so it
# can be averaged.
rating_as_double = ratings_raw_data['rating'].cast('double')
ratings_raw_data = (
    ratings_raw_data
    .drop('timestamp', 'userId')
    .withColumn('rating', rating_as_double)
)

In [20]:
# Mean rating per movie; Spark names the resulting column 'avg(rating)'.
ratings_by_movie = ratings_raw_data.groupBy('movieId')
ratings_avg_data = ratings_by_movie.agg({'rating': 'avg'})

In [21]:
# Keep only the movies whose average rating is strictly above 3.8.
is_well_rated = ratings_avg_data['avg(rating)'] > 3.8
ratings_data = ratings_avg_data.filter(is_well_rated)

In [22]:
# Preview the movies that passed the average-rating threshold.
ratings_data.show(5)

+-------+------------------+
|movieId|       avg(rating)|
+-------+------------------+
|    296| 4.197068403908795|
|   1090| 3.984126984126984|
| 115713|3.9107142857142856|
|  48738|             3.975|
| 121007|               4.0|
+-------+------------------+
only showing top 5 rows



In [23]:
# Attach the average rating to each movie. The default (inner) join keeps only
# movies that are present in ratings_data.
same_movie = movies_data.movie_id == ratings_data.movieId
movies_rating_data = (
    movies_data
    .join(ratings_data, same_movie)
    .drop("movieId")
    .withColumnRenamed('avg(rating)', 'rating')
)

In [24]:
# Preview the joined table: movie fields, genre vector and average rating.
movies_rating_data.show(5)

+--------+-----------------+--------------------+--------------------+------------------+
|movie_id|            title|              genres|          genre_vals|            rating|
+--------+-----------------+--------------------+--------------------+------------------+
|       1| Toy Story (1995)|Adventure|Animati...|[0, 0, 1, 1, 1, 1...|3.9209302325581397|
|       6|      Heat (1995)|Action|Crime|Thri...|[0, 1, 0, 0, 0, 0...| 3.946078431372549|
|      14|     Nixon (1995)|               Drama|[0, 0, 0, 0, 0, 0...|3.8333333333333335|
|      16|    Casino (1995)|         Crime|Drama|[0, 0, 0, 0, 0, 0...| 3.926829268292683|
|      28|Persuasion (1995)|       Drama|Romance|[0, 0, 0, 0, 0, 0...|4.2272727272727275|
+--------+-----------------+--------------------+--------------------+------------------+
only showing top 5 rows



In [25]:
# Id of the movie for which similar titles are looked up. It is defined once
# here so the two queries below cannot drift apart.
TARGET_MOVIE_ID = 1

# Expose the joined table to Spark SQL.
movies_rating_data.createOrReplaceTempView("movies_rating_data")
# m11: the single reference movie; m22: every other movie (the candidates).
m11 = spark.sql("select * from movies_rating_data where movie_id = {}".format(TARGET_MOVIE_ID))
m22 = spark.sql("select * from movies_rating_data where movie_id != {}".format(TARGET_MOVIE_ID))

In [26]:
# Register both frames as temp views so the pairing query can reference them
# by name in SQL.
m11.createOrReplaceTempView("m11")
m22.createOrReplaceTempView("m22")

In [27]:
# Pair every row of m11 with every row of m22 whose movie_id differs.
# The result has eight columns, and the names movie_id, title, genre_vals and
# rating each appear twice (once from m1, once from m2), so the columns can
# only be told apart by position: 0-3 come from m1 and 4-7 from m2.
combined_data_ds = spark.sql("select m1.movie_id,m1.title,m1.genre_vals,m1.rating,m2.movie_id,m2.title,m2.genre_vals,m2.rating from m11 m1, m22 m2 where m1.movie_id != m2.movie_id")
# Optional tuning, currently disabled:
# combined_data_ds.cache()
# combined_data_ds = combined_data_ds.coalesce(8)

In [28]:
# Preview the movie pairs (left four columns from m11, right four from m22).
combined_data_ds.show(5)

+--------+----------------+--------------------+------------------+--------+--------------------+--------------------+------------------+
|movie_id|           title|          genre_vals|            rating|movie_id|               title|          genre_vals|            rating|
+--------+----------------+--------------------+------------------+--------+--------------------+--------------------+------------------+
|       1|Toy Story (1995)|[0, 0, 1, 1, 1, 1...|3.9209302325581397|       6|         Heat (1995)|[0, 1, 0, 0, 0, 0...| 3.946078431372549|
|       1|Toy Story (1995)|[0, 0, 1, 1, 1, 1...|3.9209302325581397|      14|        Nixon (1995)|[0, 0, 0, 0, 0, 0...|3.8333333333333335|
|       1|Toy Story (1995)|[0, 0, 1, 1, 1, 1...|3.9209302325581397|      16|       Casino (1995)|[0, 0, 0, 0, 0, 0...| 3.926829268292683|
|       1|Toy Story (1995)|[0, 0, 1, 1, 1, 1...|3.9209302325581397|      28|   Persuasion (1995)|[0, 0, 0, 0, 0, 0...|4.2272727272727275|
|       1|Toy Story (1995)|[0, 0, 

In [29]:
def _score_pair(pair):
    """Score one movie pair taken from combined_data_ds.

    Returns (id1, title1, id2, title2, genre distance, rating gap), where the
    genre distance is a float and the rating gap is the absolute difference
    between the two average ratings.
    """
    # Columns 0-3 describe the first movie and 4-7 the second; positional
    # access is required because the column names are duplicated.
    id1, title1, vec1, rating1, id2, title2, vec2, rating2 = pair
    # cosine_genre returns text. Convert it to float so that 'similarity'
    # becomes a numeric column: sorting a string column is lexicographic and
    # would misplace values written in scientific notation (e.g. '1.1e-16').
    genre_distance = float(cosine_genre(vec1, vec2))
    return (id1, title1, id2, title2, genre_distance, abs(rating1 - rating2))


# NOTE: despite their names, 'similarity' and 'rating_sim' are both distances:
# smaller values mean the two movies are more alike.
result = combined_data_ds.rdd.map(_score_pair).toDF(['id1','movie1','id2','movie2','similarity','rating_sim'])

In [31]:
# Preview the scored pairs: genre distance and absolute rating gap per pair.
result.show(5)

+---+----------------+---+--------------------+----------+--------------------+
|id1|          movie1|id2|              movie2|similarity|          rating_sim|
+---+----------------+---+--------------------+----------+--------------------+
|  1|Toy Story (1995)|  6|         Heat (1995)|       1.0| 0.02514819881440955|
|  1|Toy Story (1995)| 14|        Nixon (1995)|       1.0| 0.08759689922480618|
|  1|Toy Story (1995)| 16|       Casino (1995)|       1.0|0.005899035734543201|
|  1|Toy Story (1995)| 28|   Persuasion (1995)|       1.0| 0.30634249471458785|
|  1|Toy Story (1995)| 29|City of Lost Chil...|       0.6| 0.09222766217870282|
+---+----------------+---+--------------------+----------+--------------------+
only showing top 5 rows



In [32]:
# 'similarity' holds a cosine distance, so the closest genre matches are the
# rows with the smallest values. Keep those below 0.1 and rank them by genre
# distance, breaking ties by rating gap (ascending is the default order).
closest_matches = result.where(result['similarity'] < 0.1)
closest_matches.orderBy('similarity', 'rating_sim').show(10)

+---+----------------+------+--------------------+-------------------+--------------------+
|id1|          movie1|   id2|              movie2|         similarity|          rating_sim|
+---+----------------+------+--------------------+-------------------+--------------------+
|  1|Toy Story (1995)|  4886|Monsters, Inc. (2...|                0.0|0.049718111346018556|
|  1|Toy Story (1995)|  3114|  Toy Story 2 (1999)|                0.0| 0.06010549029009837|
|  1|Toy Story (1995)| 91355|Asterix and the V...|                0.0|  1.0790697674418603|
|  1|Toy Story (1995)|108932|The Lego Movie (2...|0.08712907082472321|0.049962490622655675|
|  1|Toy Story (1995)|  4306|        Shrek (2001)|0.08712907082472321| 0.05328317373461022|
|  1|Toy Story (1995)| 92348|Puss in Boots (Na...|0.08712907082472321| 0.07906976744186034|
|  1|Toy Story (1995)|134853|   Inside Out (2015)|0.08712907082472321| 0.10697674418604652|
|  1|Toy Story (1995)| 78499|  Toy Story 3 (2010)|0.08712907082472321|  0.188160