# Setup

In [1]:
pip install kaggle

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pyspark
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session with Hive support enabled, pointing at
# the thrift metastore and carrying the S3A filesystem settings.
spark = (
    SparkSession.builder
    .master("local")
    .appName("jupyter-spark")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.warehouse.dir", "/users/hive/warehouse")
    .config("spark.hadoop.fs.s3a.fast.upload", True)
    .config("spark.hadoop.fs.s3a.path.style.access", True)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .enableHiveSupport()
    .getOrCreate()
)

# Keep a handle on the SparkContext and only log messages at ERROR level.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

23/06/26 02:19:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# import opendatasets as od
# od.download("https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset")

In [3]:
!mkdir ~/.kaggle

In [5]:
# List the Kaggle config directory to confirm the API token file (kaggle.json) is present.
!ls ~/.kaggle

kaggle.json


In [2]:
import os
import kaggle
# Download and unzip the MovieLens 20M dataset only when the CSV files read
# later in the notebook are not already on disk, so re-running the notebook
# does not fetch the whole archive again.
_movielens_dir = 'movielens'
_movielens_files = [
    'movie.csv',
    'rating.csv',
    'link.csv',
    'tag.csv',
    'genome_tags.csv',
    'genome_scores.csv',
]
if not all(os.path.isfile(os.path.join(_movielens_dir, name)) for name in _movielens_files):
    kaggle.api.dataset_download_files('grouplens/movielens-20m-dataset', path=_movielens_dir, unzip=True)

ModuleNotFoundError: No module named 'kaggle'

# Reading Data Frames

In [2]:
from pyspark.sql.functions import split, col, explode, to_timestamp, concat_ws

from pyspark.sql import functions as f
def _read_csv(path):
    """Load a CSV that has a header row, letting Spark infer the column types."""
    reader = spark.read.option("inferSchema", "true").option("header", "true")
    return reader.csv(path)


# Movies: turn the pipe-delimited genres string into an array column.
movie = _read_csv("./movielens/movie.csv").select(
    "movieId",
    "title",
    split(col("genres"), "\\|").alias("genres"),
)

# Ratings: pass the timestamp column through to_timestamp.
rating = _read_csv("./movielens/rating.csv").withColumn(
    "timestamp", to_timestamp("timestamp")
)

link = _read_csv("./movielens/link.csv")
tag = _read_csv("./movielens/tag.csv")

genome_tags = _read_csv("./movielens/genome_tags.csv")
genome_score = _read_csv("./movielens/genome_scores.csv")

                                                                                

## Creating the DataFrame for the algorithm

In [3]:
# Per-movie rating summary: the mean rating and the number of ratings received.
movie_rating = (
    rating
    .groupBy("movieId")
    .agg(
        f.mean("rating").alias("avg_rating"),
        f.count("movieId").alias("number_of_votes"),
    )
)

In [4]:
# Attach the tag text to each genome score row by joining on the shared tagId.
relevance_scores = (
    genome_score
    .join(genome_tags, genome_score["tagId"] == genome_tags["tagId"])
    .select(
        genome_score["movieId"],
        genome_score["tagId"],
        genome_score["relevance"],
        genome_tags["tag"],
    )
)

In [5]:
# recommendation_df = movie.join(movie_rating, movie["movieId"] == movie_rating["movieId"])\
#                     .join(tag, movie["movieId"] == tag["movieId"])\
#                     .select(movie.movieId, movie.title,explode(movie.genres).alias("genres"), movie_rating.avg_rating,
#                             movie_rating.number_of_votes, tag.tag)\
#                     .groupBy("movieId").agg(f.collect_list("tag").alias("tags"), 
#                                          f.collect_set("genres").alias("genres"))\
#                     .select("movieId", concat_ws(" ", "tags").alias("tags"), concat_ws(" ","genres").alias("genres"))

In [5]:
# One row per movie, with its distinct tags and genres flattened into a single
# space-separated text column. Exploding genres after the tag join repeats each
# tag once per genre; collect_set removes those duplicates again. avg_rating
# and number_of_votes are selected but not carried past the aggregation.
recommendation_df = (
    movie
    .join(movie_rating, movie["movieId"] == movie_rating["movieId"])
    .join(tag, movie["movieId"] == tag["movieId"])
    .select(
        movie["movieId"],
        movie["title"],
        explode(movie["genres"]).alias("genres"),
        movie_rating["avg_rating"],
        movie_rating["number_of_votes"],
        tag["tag"],
    )
    .groupBy("movieId")
    .agg(
        f.collect_set("tag").alias("tags"),
        f.collect_set("genres").alias("genres"),
    )
    .select("movieId", concat_ws(" ", "tags", "genres").alias("tags_genres"))
)

# Feature Engineering

In [6]:
import requests
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, VectorAssembler, IDF, Normalizer

# Fetch the stop-word list. The timeout keeps the cell from hanging on an
# unresponsive server, and raise_for_status() stops an HTTP error page from
# being split into bogus "stop words".
stop_words_response = requests.get(
    'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words',
    timeout=30,
)
stop_words_response.raise_for_status()
stop_words = stop_words_response.text.split()

# Stage 1: tokenize the combined tags/genres text.
tags_genres_tokenizer = RegexTokenizer().setInputCol('tags_genres').setOutputCol('token_tags_genres')

# Stage 2: drop the downloaded stop words, matching case-insensitively.
remove_stop_words = StopWordsRemover().setStopWords(stop_words)\
                        .setCaseSensitive(False).setInputCol("token_tags_genres").setOutputCol("filtered_tags_genres")

# Stage 3: term counts over the filtered tokens.
count_vectorizer = CountVectorizer().setInputCol("filtered_tags_genres").setOutputCol("tf_tags_genres")

# Stage 4: reweight the term counts by inverse document frequency.
idf = IDF().setInputCol('tf_tags_genres').setOutputCol('tfidf_tags_genres')

# Stage 5: normalize the TF-IDF vectors into the final "features" column.
normalizer = Normalizer(inputCol="tfidf_tags_genres", outputCol="features")

In [7]:
# 70/30 random split with a fixed seed, then the feature pipeline in stage order.
df_training, df_test = recommendation_df.randomSplit([0.7, 0.3], seed=0)
fe_pipe = Pipeline(
    stages=[
        tags_genres_tokenizer,
        remove_stop_words,
        count_vectorizer,
        idf,
        normalizer,
    ]
)

In [None]:
from pyspark.ml.functions import vector_to_array
# Fit the feature pipeline on the training split, transform that same split,
# and collect every row's "features" value back to the driver as a Python list.
features = (
    fe_pipe
    .fit(df_training)
    .transform(df_training)
    .select("features")
    .rdd
    .flatMap(lambda row: row)
    .collect()
)
# array = [row.features.toArray().tolist() for row in array]

In [None]:
# Convert each collected feature vector to an array via its toArray() method.
x = [vector.toArray() for vector in features]

In [None]:
# Turn every array in x into a plain Python list.
y = [list(values) for values in x]