# Setup

In [1]:
pip install kaggle

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pyspark
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session running on a local master, with Hive support
# enabled, the metastore URI / warehouse directory set, and the S3A options applied.
spark = (
    SparkSession.builder
    .master("local")
    .appName("jupyter-spark")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.warehouse.dir", "/users/hive/warehouse")
    .config("spark.hadoop.fs.s3a.fast.upload", True)
    .config("spark.hadoop.fs.s3a.path.style.access", True)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .enableHiveSupport()
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and only log ERROR and above.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

23/06/24 23:52:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Alternative download route via the `opendatasets` package, left disabled;
# the notebook downloads the dataset through the `kaggle` client instead.
# import opendatasets as od
# od.download("https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset")

In [4]:
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/home/jovyan/.kaggle’: File exists


In [5]:
# List what is currently in ~/.kaggle before using the kaggle client
# (a kaggle.json file is expected to be present here).
!ls ~/.kaggle

kaggle.json


In [6]:
import os
import kaggle
# The archive is large and is unzipped into ./movielens; re-downloading it on
# every run makes "Restart & Run All" slow. Only fetch it when at least one of
# the CSV files read later in this notebook is missing.
_expected_csvs = (
    "movie.csv",
    "rating.csv",
    "link.csv",
    "tag.csv",
    "genome_tags.csv",
    "genome_scores.csv",
)
if not all(os.path.isfile(os.path.join('movielens', csv_name)) for csv_name in _expected_csvs):
    kaggle.api.dataset_download_files('grouplens/movielens-20m-dataset', path='movielens', unzip=True)



# Reading Data Frames

In [7]:
from pyspark.sql.functions import split, col
from pyspark.sql.functions import to_timestamp
from pyspark.sql import functions as f
def _load_csv(csv_path):
    """Read the CSV at `csv_path` into a DataFrame, using its header row and inferring column types."""
    reader = spark.read.option("inferSchema", "true").option("header", "true")
    return reader.csv(csv_path)


# Movies: keep id and title, and split the pipe-delimited genres string into an array.
movie = _load_csv("./movielens/movie.csv").select(
    "movieId",
    "title",
    split(col("genres"), "\\|").alias("genres"),
)

# Ratings: replace the timestamp column with its to_timestamp() conversion.
rating = _load_csv("./movielens/rating.csv").withColumn(
    "timestamp", to_timestamp("timestamp")
)

link = _load_csv("./movielens/link.csv")
tag = _load_csv("./movielens/tag.csv")

genome_tags = _load_csv("./movielens/genome_tags.csv")
genome_score = _load_csv("./movielens/genome_scores.csv")

                                                                                

## Creating the DataFrame for the algorithm

In [8]:
# Per-movie rating summary: the mean rating and the number of ratings per movieId.
movie_rating = (
    rating
    .groupBy("movieId")
    .agg(
        f.mean("rating").alias("avg_rating"),
        f.count("movieId").alias("number_of_votes"),
    )
)

In [9]:
# Attach each genome tag's text to its per-movie relevance score by joining on tagId.
_same_tag = genome_score["tagId"] == genome_tags["tagId"]
relevance_scores = genome_score.join(genome_tags, _same_tag).select(
    genome_score.movieId,
    genome_score.tagId,
    genome_score.relevance,
    genome_tags.tag,
)

In [10]:
# Preview the first five joined rows.
relevance_scores.show(n=5)

+-------+-----+---------+------------+
|movieId|tagId|relevance|         tag|
+-------+-----+---------+------------+
|      1|    1|    0.025|         007|
|      1|    2|    0.025|007 (series)|
|      1|    3|  0.05775|18th century|
|      1|    4|  0.09675|       1920s|
|      1|    5|  0.14675|       1930s|
+-------+-----+---------+------------+
only showing top 5 rows



In [11]:
# Combine movie metadata with its rating summary and the rows of `tag`,
# matching on movieId; the result has one row per matching tag row.
_has_rating = movie["movieId"] == movie_rating["movieId"]
_has_tag = movie["movieId"] == tag["movieId"]
recommendation_df = (
    movie
    .join(movie_rating, _has_rating)
    .join(tag, _has_tag)
    .select(
        movie.movieId,
        movie.title,
        movie.genres,
        movie_rating.avg_rating,
        movie_rating.number_of_votes,
        tag.tag,
    )
)

In [12]:
# Preview 20 rows without truncating long cell values.
recommendation_df.show(n=20, truncate=False)

[Stage 16:>                                                         (0 + 1) / 1]

+-------+--------------------------------+--------+------------------+---------------+---------------------------+
|movieId|title                           |genres  |avg_rating        |number_of_votes|tag                        |
+-------+--------------------------------+--------+------------------+---------------+---------------------------+
|148    |Awfully Big Adventure, An (1995)|[Drama] |2.8893557422969187|357            |Nudity (Topless - Notable) |
|148    |Awfully Big Adventure, An (1995)|[Drama] |2.8893557422969187|357            |nudity (topless)           |
|471    |Hudsucker Proxy, The (1994)     |[Comedy]|3.6641817536386228|11268          |funny                      |
|471    |Hudsucker Proxy, The (1994)     |[Comedy]|3.6641817536386228|11268          |Capra-esque                |
|471    |Hudsucker Proxy, The (1994)     |[Comedy]|3.6641817536386228|11268          |1950s                      |
|471    |Hudsucker Proxy, The (1994)     |[Comedy]|3.6641817536386228|11268     

                                                                                