In [1]:
import os
import sys
from pyspark.sql import SparkSession

os.environ["HADOOP_HOME"] = "C:\\Users\\SkJain\\Downloads\\Compressed\\winutils-master\\hadoop-3.2.2"
os.environ["PYSPARK_PYTHON"] = "python"
sys.path.append('C:\\Users\\SkJain\\Downloads\\Compressed\\winutils-master\\hadoop-3.2.2\\bin')

In [2]:
spark = SparkSession. \
    builder. \
    config("spark.ui.port", "0"). \
    enableHiveSupport(). \
    appName('course2'). \
    master('local'). \
    getOrCreate()

In [3]:
# from pyspark.sql.functions import *

In [4]:
movie_ratings_by_users_path = './datasets/ml-100k/u.data'
movie_details_path  = './datasets/ml-100k/u.item'
users_details_path = './datasets/ml-100k/u.user'

# details of columns and all present here: https://files.grouplens.org/datasets/movielens/ml-100k-README.txt

In [5]:
# spark.read.csv?

## Reading first dataset
    - file is separated by tab instead of ','
    - providing headers (since they're not already in the file)
    - inferring schema

In [6]:
movie_ratings_header = ['userid', 'itemid', 'rating', 'unix_timestamp']
movie_ratings_data_schema = spark.\
                    read.\
                    option('sep', '\t'). \
                    csv(movie_ratings_by_users_path, header=False, inferSchema=True).schema

In [7]:
for col_index,curr_col in zip(range(0, len(movie_ratings_data_schema)),movie_ratings_data_schema):
    curr_col.name = movie_ratings_header[col_index]

In [8]:
movie_ratings_data_schema

StructType(List(StructField(userid,IntegerType,true),StructField(itemid,IntegerType,true),StructField(rating,IntegerType,true),StructField(unix_timestamp,IntegerType,true)))

In [9]:
movie_ratings_data = spark.\
                    read.\
                    option('sep', '\t'). \
                    csv(movie_ratings_by_users_path, header=False, schema = movie_ratings_data_schema)

In [10]:
movie_ratings_data.show(5)

+------+------+------+--------------+
|userid|itemid|rating|unix_timestamp|
+------+------+------+--------------+
|   196|   242|     3|     881250949|
|   186|   302|     3|     891717742|
|    22|   377|     1|     878887116|
|   244|    51|     2|     880606923|
|   166|   346|     1|     886397596|
+------+------+------+--------------+
only showing top 5 rows



## Reading the second datatset
    - separated by '|'
    - headers not present in file (copied from documentation, link in above tab)

In [11]:
movie_details_headers = '''movie id | movie title | release date | video release date |
              IMDb URL | unknown | Action | Adventure | Animation |
              Children's | Comedy | Crime | Documentary | Drama | Fantasy |
              Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
              Thriller | War | Western |'''.replace('\n', '').replace(' ', '').split('|')

In [12]:
print(movie_details_headers)

['movieid', 'movietitle', 'releasedate', 'videoreleasedate', 'IMDbURL', 'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', '']


In [13]:
movie_details_data_schema = spark.\
                    read.\
                    option('sep', '|'). \
                    csv(movie_details_path, inferSchema=True).schema

In [14]:
for col_index,curr_col in zip(range(0, len(movie_details_data_schema)),movie_details_data_schema):
    curr_col.name = movie_details_headers[col_index]

In [15]:
movie_details_data_schema

StructType(List(StructField(movieid,IntegerType,true),StructField(movietitle,StringType,true),StructField(releasedate,StringType,true),StructField(videoreleasedate,StringType,true),StructField(IMDbURL,StringType,true),StructField(unknown,IntegerType,true),StructField(Action,IntegerType,true),StructField(Adventure,IntegerType,true),StructField(Animation,IntegerType,true),StructField(Children's,IntegerType,true),StructField(Comedy,IntegerType,true),StructField(Crime,IntegerType,true),StructField(Documentary,IntegerType,true),StructField(Drama,IntegerType,true),StructField(Fantasy,IntegerType,true),StructField(Film-Noir,IntegerType,true),StructField(Horror,IntegerType,true),StructField(Musical,IntegerType,true),StructField(Mystery,IntegerType,true),StructField(Romance,IntegerType,true),StructField(Sci-Fi,IntegerType,true),StructField(Thriller,IntegerType,true),StructField(War,IntegerType,true),StructField(Western,IntegerType,true)))

In [16]:
movie_details_data = spark.\
                    read.\
                    option('sep', '|'). \
                    csv(movie_details_path, schema=movie_details_data_schema)

In [17]:
movie_details_data.show(1)

+-------+----------------+-----------+----------------+--------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+
|movieid|      movietitle|releasedate|videoreleasedate|             IMDbURL|unknown|Action|Adventure|Animation|Children's|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|Musical|Mystery|Romance|Sci-Fi|Thriller|War|Western|
+-------+----------------+-----------+----------------+--------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+
|      1|Toy Story (1995)|01-Jan-1995|            null|http://us.imdb.co...|      0|     0|        0|        1|         1|     1|    0|          0|    0|      0|        0|     0|      0|      0|      0|     0|       0|  0|      0|
+-------+----------------+-----------+----------------+--------------------+

## Reading the third dataset

In [18]:
user_details_headers = ['userid', 'age', 'gender', 'occupation', 'zip code']
user_details_data_schema = spark.\
                    read.\
                    option('sep', '|'). \
                    csv(users_details_path, inferSchema=True).schema

In [19]:
for col_index,curr_col in zip(range(0, len(user_details_data_schema)),user_details_data_schema):
    curr_col.name = user_details_headers[col_index]

In [20]:
user_details_data = spark.\
                    read.\
                    option('sep', '|'). \
                    csv(users_details_path, schema=user_details_data_schema)

In [21]:
user_details_data.show(2)

+------+---+------+----------+--------+
|userid|age|gender|occupation|zip code|
+------+---+------+----------+--------+
|     1| 24|     M|technician|   85711|
|     2| 53|     F|     other|   94043|
+------+---+------+----------+--------+
only showing top 2 rows



## EDA and cleaning of rating data
- created new column with unix timestamp converted to actual timestamp
- actual timestamp type casted fro string to timestamp data type

In [22]:
movie_ratings_data.show(2)

+------+------+------+--------------+
|userid|itemid|rating|unix_timestamp|
+------+------+------+--------------+
|   196|   242|     3|     881250949|
|   186|   302|     3|     891717742|
+------+------+------+--------------+
only showing top 2 rows



In [23]:
# conerting unix timestamp to actual time
from pyspark.sql.functions import from_unixtime
# from_unixtime?

In [24]:
from pyspark.sql.functions import col
movie_ratings_data = movie_ratings_data. \
                    withColumn('actualTime', from_unixtime(col('unix_timestamp')))

In [25]:
movie_ratings_data

DataFrame[userid: int, itemid: int, rating: int, unix_timestamp: int, actualTime: string]

In [26]:
#converting actual time from String to timestamp
from pyspark.sql.functions import to_timestamp

#to see how to use, also give spark doc link to see how to give date format
# to_timestamp?

In [27]:
movie_ratings_data = movie_ratings_data. \
                    withColumn('actualTime', to_timestamp(col('actualTime'), 'yyyy-MM-dd HH:mm:ss'))

In [28]:
movie_ratings_data

DataFrame[userid: int, itemid: int, rating: int, unix_timestamp: int, actualTime: timestamp]

In [29]:
movie_ratings_data.show(2)

+------+------+------+--------------+-------------------+
|userid|itemid|rating|unix_timestamp|         actualTime|
+------+------+------+--------------+-------------------+
|   196|   242|     3|     881250949|1997-12-04 21:25:49|
|   186|   302|     3|     891717742|1998-04-05 00:52:22|
+------+------+------+--------------+-------------------+
only showing top 2 rows



In [30]:
print('total records in dataset: ',movie_ratings_data.count())
print('total unique users:',movie_ratings_data.select('userid').groupby('userid').count().count())
print('total unique movies:',movie_ratings_data.select('itemid').groupby('itemid').count().count())
print('top 2 users that gave most ratings:')
movie_ratings_data.select('userid').groupby('userid').count().sort(col('count').desc()).show(2)

print('top 2 movies that got most ratings:')
movie_ratings_data.select('itemid').groupby('itemid').count().sort(col('count').desc()).show(2)

total records in dataset:  100000
total unique users: 943
total unique movies: 1682
top 2 users that gave most ratings:
+------+-----+
|userid|count|
+------+-----+
|   405|  737|
|   655|  685|
+------+-----+
only showing top 2 rows

top 2 movies that got most ratings:
+------+-----+
|itemid|count|
+------+-----+
|    50|  583|
|   258|  509|
+------+-----+
only showing top 2 rows



## EDA and cleaning movie data

In [31]:
movie_details_data

DataFrame[movieid: int, movietitle: string, releasedate: string, videoreleasedate: string, IMDbURL: string, unknown: int, Action: int, Adventure: int, Animation: int, Children's: int, Comedy: int, Crime: int, Documentary: int, Drama: int, Fantasy: int, Film-Noir: int, Horror: int, Musical: int, Mystery: int, Romance: int, Sci-Fi: int, Thriller: int, War: int, Western: int]

In [32]:
print('total records ', movie_details_data.count())
print('videoreleasedate not  null: ',movie_details_data.filter(col('videoreleasedate').isNotNull()).count())
print('videoreleasedate null: ',movie_details_data.filter(col('videoreleasedate').isNull()).count())

total records  1682
videoreleasedate not  null:  0
videoreleasedate null:  1682


In [33]:
#since all values are null
movie_details_data = movie_details_data.drop('videoreleasedate')

In [34]:
from pyspark.sql.functions import to_date
# to_date?

In [35]:
#casting release date to date type from string
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
movie_details_data = movie_details_data.withColumn('releasedate', to_date('releasedate', 'dd-MMM-yyyy'))

In [36]:
movie_details_data

DataFrame[movieid: int, movietitle: string, releasedate: date, IMDbURL: string, unknown: int, Action: int, Adventure: int, Animation: int, Children's: int, Comedy: int, Crime: int, Documentary: int, Drama: int, Fantasy: int, Film-Noir: int, Horror: int, Musical: int, Mystery: int, Romance: int, Sci-Fi: int, Thriller: int, War: int, Western: int]

In [37]:
from pyspark.sql.functions import regexp_extract, split
# regexp_extract?
# split?

In [38]:
# Signature: regexp_extract(str, pattern, idx)
movie_details_data.\
    select('IMDbURL').\
    withColumn('leadingUrl',split(col('IMDbURL'), '\\?')[0]).\
    withColumn('searchQuery', split(col('IMDbURL'), '\\?')[1]).\
    show(7, truncate=False)

+------------------------------------------------------------+--------------------------------+-----------------------------------+
|IMDbURL                                                     |leadingUrl                      |searchQuery                        |
+------------------------------------------------------------+--------------------------------+-----------------------------------+
|http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)       |http://us.imdb.com/M/title-exact|Toy%20Story%20(1995)               |
|http://us.imdb.com/M/title-exact?GoldenEye%20(1995)         |http://us.imdb.com/M/title-exact|GoldenEye%20(1995)                 |
|http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)      |http://us.imdb.com/M/title-exact|Four%20Rooms%20(1995)              |
|http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)      |http://us.imdb.com/M/title-exact|Get%20Shorty%20(1995)              |
|http://us.imdb.com/M/title-exact?Copycat%20(1995)           |http://us.imdb

## Questions

### How many movies are 5 star rated, 4 star rated ... etc

In [39]:
movie_ratings_data.show(5)

+------+------+------+--------------+-------------------+
|userid|itemid|rating|unix_timestamp|         actualTime|
+------+------+------+--------------+-------------------+
|   196|   242|     3|     881250949|1997-12-04 21:25:49|
|   186|   302|     3|     891717742|1998-04-05 00:52:22|
|    22|   377|     1|     878887116|1997-11-07 12:48:36|
|   244|    51|     2|     880606923|1997-11-27 10:32:03|
|   166|   346|     1|     886397596|1998-02-02 11:03:16|
+------+------+------+--------------+-------------------+
only showing top 5 rows



In [40]:
from pyspark.sql.functions import count
movie_ratings_data.groupby('rating').agg(count('itemid').alias('movies_count')).sort(col('rating').desc()).show()

+------+------------+
|rating|movies_count|
+------+------------+
|     5|       21201|
|     4|       34174|
|     3|       27145|
|     2|       11370|
|     1|        6110|
+------+------------+



### Finding the most popular movie

In [41]:
movie_details_data.select('movieid', 'movietitle').show(5)

+-------+-----------------+
|movieid|       movietitle|
+-------+-----------------+
|      1| Toy Story (1995)|
|      2| GoldenEye (1995)|
|      3|Four Rooms (1995)|
|      4|Get Shorty (1995)|
|      5|   Copycat (1995)|
+-------+-----------------+
only showing top 5 rows



In [42]:
movie_ratings_data.\
    select('itemid'). \
    groupby('itemid'). \
    count(). \
    join(movie_details_data.select('movieid', 'movietitle'),
         movie_ratings_data['itemid'] == movie_details_data['movieid'],
         how='left'
        ). \
    drop('movieid'). \
    sort(col('count').desc()). \
    show(20, truncate=False)

+------+-----+--------------------------------+
|itemid|count|movietitle                      |
+------+-----+--------------------------------+
|50    |583  |Star Wars (1977)                |
|258   |509  |Contact (1997)                  |
|100   |508  |Fargo (1996)                    |
|181   |507  |Return of the Jedi (1983)       |
|294   |485  |Liar Liar (1997)                |
|286   |481  |English Patient, The (1996)     |
|288   |478  |Scream (1996)                   |
|1     |452  |Toy Story (1995)                |
|300   |431  |Air Force One (1997)            |
|121   |429  |Independence Day (ID4) (1996)   |
|174   |420  |Raiders of the Lost Ark (1981)  |
|127   |413  |Godfather, The (1972)           |
|56    |394  |Pulp Fiction (1994)             |
|7     |392  |Twelve Monkeys (1995)           |
|98    |390  |Silence of the Lambs, The (1991)|
|237   |384  |Jerry Maguire (1996)            |
|117   |378  |Rock, The (1996)                |
|172   |367  |Empire Strikes Back, The (

#### Solution for above problem but instead of using join we will broadcast the table which we joined (with movies name), since it is kind of a lookup tables and will be used a lot
- what is broadcast variable explained in notes

In [43]:
movie_name_list = movie_details_data.select('movieid', 'movietitle').collect()
movie_name_dict = {str(movie['movieid']): movie['movietitle'] for movie in movie_name_list}
movies_name_lookup =  spark.sparkContext.broadcast(movie_name_dict)

In [44]:
# didn't work with boradcast

### Find Similar movies based on user ratings

In [45]:
import sys, time
from math import sqrt

In [46]:
scoreThreshold = 0.97
coOccurenceThreshold = 50
movieId = 50
simstrengthCut = 200

In [47]:
# filter duplicate records from ratings data
movie_ratings_data.count()

100000

In [48]:
movie_ratings_data.distinct().count()

100000

In [49]:
# Step 1: join all movie ratings with other movie ratings
# if we simply join then there will be ot of duplicates since if a record 'a' is joined with record 'b', then record 'b' will also join will record 'a'
# also record 'a' will be joined with record 'a' itslef (which is completely useless, since we'll not find movie similarity with itself)
# solution: we can join with case when 'movieid a' < 'movieid b' ... so a==a case will be eliminated and record 'b' will not join record 'a' since a<b

In [50]:
movie_ratings_data.show(2)

+------+------+------+--------------+-------------------+
|userid|itemid|rating|unix_timestamp|         actualTime|
+------+------+------+--------------+-------------------+
|   196|   242|     3|     881250949|1997-12-04 21:25:49|
|   186|   302|     3|     891717742|1998-04-05 00:52:22|
+------+------+------+--------------+-------------------+
only showing top 2 rows



In [51]:
movie_ratings_joined = movie_ratings_data.alias('mrd1').join(
    movie_ratings_data.alias('mrd2'),
    col('mrd1.itemid') < col('mrd2.itemid'),
    how = 'inner'
)
movie_ratings_joined.show(5)

+------+------+------+--------------+-------------------+------+------+------+--------------+-------------------+
|userid|itemid|rating|unix_timestamp|         actualTime|userid|itemid|rating|unix_timestamp|         actualTime|
+------+------+------+--------------+-------------------+------+------+------+--------------+-------------------+
|   196|   242|     3|     881250949|1997-12-04 21:25:49|   186|   302|     3|     891717742|1998-04-05 00:52:22|
|   196|   242|     3|     881250949|1997-12-04 21:25:49|    22|   377|     1|     878887116|1997-11-07 12:48:36|
|   196|   242|     3|     881250949|1997-12-04 21:25:49|   166|   346|     1|     886397596|1998-02-02 11:03:16|
|   196|   242|     3|     881250949|1997-12-04 21:25:49|   298|   474|     4|     884182806|1998-01-07 19:50:06|
|   196|   242|     3|     881250949|1997-12-04 21:25:49|   115|   265|     2|     881171488|1997-12-03 23:21:28|
+------+------+------+--------------+-------------------+------+------+------+----------

In [142]:
from pyspark.sql.functions import concat, lit

movie_pairs = movie_ratings_joined. \
drop('unix_timestamp', 'mrd2.unix_timestamp'). \
withColumn('movieid_pairs', concat('mrd1.itemid', lit('-'), 'mrd2.itemid')). \
withColumn('rating_pairs', concat('mrd1.rating', lit('-'), 'mrd2.rating'))

movie_pairs.show(5)

+------+------+------+-------------------+------+------+------+-------------------+-------------+------------+
|userid|itemid|rating|         actualTime|userid|itemid|rating|         actualTime|movieid_pairs|rating_pairs|
+------+------+------+-------------------+------+------+------+-------------------+-------------+------------+
|   196|   242|     3|1997-12-04 21:25:49|   186|   302|     3|1998-04-05 00:52:22|      242-302|         3-3|
|   196|   242|     3|1997-12-04 21:25:49|    22|   377|     1|1997-11-07 12:48:36|      242-377|         3-1|
|   196|   242|     3|1997-12-04 21:25:49|   166|   346|     1|1998-02-02 11:03:16|      242-346|         3-1|
|   196|   242|     3|1997-12-04 21:25:49|   298|   474|     4|1998-01-07 19:50:06|      242-474|         3-4|
|   196|   242|     3|1997-12-04 21:25:49|   115|   265|     2|1997-12-03 23:21:28|      242-265|         3-2|
+------+------+------+-------------------+------+------+------+-------------------+-------------+------------+
o

In [143]:
movie_pairs

DataFrame[userid: int, itemid: int, rating: int, actualTime: timestamp, userid: int, itemid: int, rating: int, actualTime: timestamp, movieid_pairs: string, rating_pairs: string]

In [144]:
movie_pairs. \
count()

4991596405

In [145]:
movie_pairs_filtered = movie_pairs.filter((col('mrd1.itemid')==50) | (col('mrd2.itemid')==50))

In [54]:
# movie_pairs_filtered. \
# select('movieid_pairs', 'rating_pairs'). \
# groupBy('movieid_pairs'). \
# count()

DataFrame[movieid_pairs: string, count: bigint]

In [146]:
from pyspark.sql.functions import collect_list
movie_rating_pairs_by_id = movie_pairs_filtered. \
select('movieid_pairs', 'rating_pairs'). \
groupBy('movieid_pairs'). \
agg(collect_list('rating_pairs').alias('rating_pairs_list'))

In [147]:
movie_rating_pairs_by_id.count()

1681

In [148]:
movie_rating_pairs_by_id.show()

+-------------+--------------------+
|movieid_pairs|   rating_pairs_list|
+-------------+--------------------+
|       50-110|[4-3, 4-1, 4-3, 4...|
|      50-1532|[4-2, 4-2, 4-2, 4...|
|       50-238|[5-1, 5-4, 5-2, 5...|
|       50-542|[4-2, 2-2, 4-2, 2...|
|       50-942|[4-3, 4-4, 4-3, 4...|
|       50-956|[5-4, 3-4, 5-1, 3...|
|      50-1115|[5-5, 5-3, 5-3, 5...|
|      50-1237|[4-3, 5-3, 4-3, 5...|
|       50-129|[5-4, 4-5, 5-3, 4...|
|      50-1678|[3-1, 5-1, 4-1, 5...|
|       50-277|[5-5, 3-4, 5-4, 3...|
|       50-349|[5-3, 5-2, 5-3, 5...|
|       50-620|[5-4, 4-4, 5-3, 4...|
|       50-840|[4-2, 5-2, 4-3, 5...|
|       50-921|[3-5, 1-5, 3-5, 1...|
|       50-933|[5-3, 5-4, 5-1, 5...|
|      50-1081|[4-3, 5-1, 4-3, 5...|
|      50-1364|[5-1, 2-1, 5-1, 5...|
|      50-1460|[5-3, 2-3, 4-3, 4...|
|       50-209|[5-3, 5-5, 5-5, 5...|
+-------------+--------------------+
only showing top 20 rows



In [149]:
def findCosineSimilarity(ratings_pairs):
    nominator = 0
    denom1 = 0
    denom2 = 0

    for rp in ratings_pairs:
        r1, r2 = rp.split('-')
        r1 = int(r1.strip())
        r2 = int(r2.strip())
        
#         return str(r1)+'-'+str(r2)
    
        nominator += (r1*r2)
        denom1 += (r1*r1)
        denom2 +=(r2*r2)
        
    denom = sqrt(denom1)*sqrt(denom2)
    similarity = 0
    if denom!=0:
        similarity = nominator/(denom)
    return similarity#, len(ratings_pairs)

In [150]:
from pyspark.sql.functions import udf
cosSimUdf = udf(lambda rating_pair: findCosineSimilarity(rating_pair))

In [151]:
movie_similarity = movie_rating_pairs_by_id. \
withColumn('similarityValue', cosSimUdf(col('rating_pairs_list')))

In [152]:
#because we don't want to recompute it again and again everytime we call for 2 different movies
movie_similarity.cache()

DataFrame[movieid_pairs: string, rating_pairs_list: array<string>, similarityValue: string]

In [None]:
movie_similarity.count()

In [None]:
movie_similarity. \
filter(size(col('rating_pairs_list'))>50). \
count()

In [None]:
from pyspark.sql.functions import size, split
movie_similarity. \
show(50)

# withColumn('numOfRatings', size(col('rating_pairs_list'))). \
# filter(col('numOfRatings')>coOccurenceThreshold). \
# filter(col('similarityValue')>scoreThreshold). \
# drop('rating_pairs_list'). \
# sort(col('similarityValue').desc()). \
# withColumn('movieMatched', split('movieid_pairs', ',')[1].cast('int')). \
# join(movie_details_data.select('movieid', 'movietitle'),
#          col('movieMatched') == movie_details_data['movieid'],
#          how='left'). \
# drop('movieMatched', 'movieid'). \
# show(50, truncate=False)