<a href="https://colab.research.google.com/github/rossrco/experiments/blob/recomm_markdown/collaborative_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 72kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 39.6MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=12288bb33d7966d2fc5c18eb4f13342d08947707a574f8192ff718b7925cdea0
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import functions as F
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RankingEvaluator

# Create (or reuse, if one already exists) a Spark session running locally;
# the 'local[*]' master runs Spark in-process using all available cores.
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [None]:
def download_dataset():
  """Download the MovieLens 100k archive and extract it into the working dir.

  Fetches ``ml-100k.zip`` from GroupLens, stores it as ``movielens.zip``,
  extracts every member of the archive into the current directory, and
  prints the raw bytes of ``ml-100k/u.info`` as a short dataset summary.
  """
  from urllib.request import urlretrieve
  import zipfile

  print('Downloading movielens data...')

  url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
  dest_file = 'movielens.zip'

  urlretrieve(url, dest_file)
  # The context manager guarantees the archive handle is closed, even if
  # extraction or the read below raises.
  with zipfile.ZipFile(dest_file, 'r') as zip_ref:
    zip_ref.extractall()
    print('Done. Dataset contains:')
    print(zip_ref.read('ml-100k/u.info'))


def read_ratings():
  """Load the MovieLens ratings file into a Spark DataFrame.

  Reads the tab-separated, header-less ``ml-100k/u.data`` with an explicit
  schema, converts the ``unix_timestamp`` column to a timestamp, prints the
  number of ingested rows, and returns the resulting DataFrame.
  """
  print('Reading the ratings file...')

  # (column name, Spark type, nullable) for each field of the file, in order.
  column_specs = [
      ('user_id', T.DoubleType(), False),
      ('movie_id', T.DoubleType(), True),
      ('rating', T.DoubleType(), True),
      ('unix_timestamp', T.LongType(), True),
  ]
  ratings_schema = T.StructType(
      [T.StructField(name, dtype, nullable)
       for name, dtype, nullable in column_specs])

  raw_ratings = spark.read.load('ml-100k/u.data', format='csv', sep='\t',
                                header='false', schema=ratings_schema)
  # Replace the epoch-seconds column with a proper timestamp column.
  ratings = raw_ratings.withColumn(
      'unix_timestamp', F.to_timestamp(F.col('unix_timestamp')))

  print(f'Ingested {ratings.count()} ratings.')
  return ratings

In [None]:
# Fetch and unpack the dataset, then load the ratings into a Spark DataFrame.
download_dataset()

ratings = read_ratings()

Downloading movielens data...
Done. Dataset contains:
b'943 users\n1682 items\n100000 ratings\n'
Reading the ratings file...
Ingested 100000 ratings.


In [None]:
# Preview the first rows of the ratings DataFrame.
ratings.show()

+-------+--------+------+-------------------+
|user_id|movie_id|rating|     unix_timestamp|
+-------+--------+------+-------------------+
|  196.0|   242.0|   3.0|1997-12-04 15:55:49|
|  186.0|   302.0|   3.0|1998-04-04 19:22:22|
|   22.0|   377.0|   1.0|1997-11-07 07:18:36|
|  244.0|    51.0|   2.0|1997-11-27 05:02:03|
|  166.0|   346.0|   1.0|1998-02-02 05:33:16|
|  298.0|   474.0|   4.0|1998-01-07 14:20:06|
|  115.0|   265.0|   2.0|1997-12-03 17:51:28|
|  253.0|   465.0|   5.0|1998-04-03 18:34:27|
|  305.0|   451.0|   3.0|1998-02-01 09:20:17|
|    6.0|    86.0|   3.0|1997-12-31 21:16:53|
|   62.0|   257.0|   2.0|1997-11-12 22:07:14|
|  286.0|  1014.0|   5.0|1997-11-17 15:38:45|
|  200.0|   222.0|   5.0|1997-10-05 09:05:40|
|  210.0|    40.0|   3.0|1998-03-27 21:59:54|
|  224.0|    29.0|   3.0|1998-02-21 23:40:57|
|  303.0|   785.0|   3.0|1997-11-14 05:28:38|
|  122.0|   387.0|   5.0|1997-11-11 17:47:39|
|  194.0|   274.0|   2.0|1997-11-14 20:36:34|
|  291.0|  1042.0|   4.0|1997-09-2

In [None]:
# Randomly split the ratings with 0.8 / 0.2 weights into train and test;
# the fixed seed keeps the split repeatable.
train, test = ratings.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# Fit an ALS collaborative-filtering model on the training ratings.
# An explicit seed pins the random initialisation of the factors so the
# recommendations and metrics computed later are repeatable between runs.
model = ALS(userCol='user_id',
            itemCol='movie_id',
            ratingCol='rating',
            seed=42).fit(train)

In [None]:
# Number of movies to recommend per user.
k = 3

# Top-k recommendations for the users that appear in the test split. The
# recommended movie ids are also extracted into their own array column and
# cast to doubles, matching the double-typed movie_id of the ratings schema.
test_recomm = (
    model
    .recommendForUserSubset(dataset=test, numItems=k)
    .withColumn(
        'recommended_movies',
        F.col('recommendations').movie_id.cast(T.ArrayType(T.DoubleType())))
)
test_recomm.show(truncate=False)

+-------+---------------------------------------------------------+------------------------+
|user_id|recommendations                                          |recommended_movies      |
+-------+---------------------------------------------------------+------------------------+
|471    |[{862, 5.117396}, {680, 4.810842}, {349, 4.665866}]      |[862.0, 680.0, 349.0]   |
|463    |[{1240, 4.529101}, {1449, 4.3274264}, {1104, 4.1321487}] |[1240.0, 1449.0, 1104.0]|
|833    |[{1368, 5.0700154}, {1597, 4.704284}, {320, 4.494964}]   |[1368.0, 1597.0, 320.0] |
|496    |[{695, 4.4890313}, {1449, 4.4207177}, {1022, 4.399894}]  |[695.0, 1449.0, 1022.0] |
|148    |[{1129, 5.558813}, {1084, 5.403372}, {1410, 5.206335}]   |[1129.0, 1084.0, 1410.0]|
|540    |[{1449, 5.127593}, {1398, 4.7999997}, {1643, 4.693313}]  |[1449.0, 1398.0, 1643.0]|
|392    |[{1643, 5.569833}, {1449, 5.0185084}, {1398, 5.0121846}] |[1643.0, 1449.0, 1398.0]|
|243    |[{1449, 4.6866884}, {1398, 4.4519615}, {1628, 4.286181}] |[14

In [None]:
# Collapse the test split to one row per user, collecting that user's
# movie ids, ratings, and {movie_id -> rating} maps into list columns.
test_pivot = (
    test
    .withColumn('id_ratings',
                F.create_map(F.col('movie_id'), F.col('rating')))
    .groupBy('user_id')
    .agg(
        F.collect_list(F.col('movie_id')).alias('movie_list'),
        F.collect_list(F.col('rating')).alias('ratings_list'),
        F.collect_list(F.col('id_ratings')).alias('id_ratings_list'),
    )
)

In [None]:
# Preview the per-user aggregates built from the test split.
test_pivot.show()

+-------+--------------------+--------------------+--------------------+
|user_id|          movie_list|        ratings_list|     id_ratings_list|
+-------+--------------------+--------------------+--------------------+
|  299.0|[1.0, 19.0, 20.0,...|[3.0, 1.0, 3.0, 3...|[{1.0 -> 3.0}, {1...|
|  305.0|[2.0, 13.0, 15.0,...|[2.0, 3.0, 1.0, 3...|[{2.0 -> 2.0}, {1...|
|  496.0|[17.0, 88.0, 99.0...|[3.0, 1.0, 3.0, 5...|[{17.0 -> 3.0}, {...|
|  558.0|             [508.0]|               [5.0]|    [{508.0 -> 5.0}]|
|  596.0|[288.0, 289.0, 30...|[4.0, 3.0, 4.0, 3.0]|[{288.0 -> 4.0}, ...|
|  692.0|[1.0, 25.0, 168.0...|[4.0, 4.0, 2.0, 4...|[{1.0 -> 4.0}, {2...|
|  769.0|[237.0, 685.0, 74...|     [3.0, 3.0, 2.0]|[{237.0 -> 3.0}, ...|
|  934.0|[65.0, 88.0, 121....|[4.0, 4.0, 3.0, 3...|[{65.0 -> 4.0}, {...|
|  147.0|[258.0, 313.0, 93...|     [4.0, 4.0, 3.0]|[{258.0 -> 4.0}, ...|
|  170.0|[300.0, 304.0, 32...|[5.0, 4.0, 5.0, 3...|[{300.0 -> 5.0}, ...|
|  184.0|[44.0, 47.0, 51.0...|[4.0, 4.0, 4.0, 4...|

In [None]:
# Pair each user's recommendations with the movies they rated in the test
# split; the inner join keeps only users present on both sides.
eval_set = test_recomm.join(test_pivot, on='user_id', how='inner')

In [None]:
# Preview the joined evaluation set (recommendations next to ground truth).
eval_set.show()

+-------+--------------------+--------------------+--------------------+--------------------+--------------------+
|user_id|     recommendations|  recommended_movies|          movie_list|        ratings_list|     id_ratings_list|
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    471|[{862, 5.117396},...|[862.0, 680.0, 34...|[1.0, 95.0, 393.0...|[4.0, 4.0, 5.0, 5...|[{1.0 -> 4.0}, {9...|
|    463|[{1240, 4.529101}...|[1240.0, 1449.0, ...|[24.0, 50.0, 117....|[3.0, 4.0, 3.0, 5...|[{24.0 -> 3.0}, {...|
|    833|[{1368, 5.0700154...|[1368.0, 1597.0, ...|[11.0, 22.0, 47.0...|[5.0, 3.0, 5.0, 3...|[{11.0 -> 5.0}, {...|
|    496|[{695, 4.4890313}...|[695.0, 1449.0, 1...|[17.0, 88.0, 99.0...|[3.0, 1.0, 3.0, 5...|[{17.0 -> 3.0}, {...|
|    148|[{1129, 5.558813}...|[1129.0, 1084.0, ...|[8.0, 56.0, 71.0,...|[4.0, 5.0, 5.0, 5...|[{8.0 -> 4.0}, {5...|
|    540|[{1449, 5.127593}...|[1449.0, 1398.0, ...|[13.0, 147.0, 222...|[4.0, 3.

In [None]:
# Score the top-k recommendations with Precision@K. The `k` argument is only
# honoured by the "...AtK" metrics; with 'meanAveragePrecision' it was
# silently ignored, so the chosen cutoff had no effect on the score.
# NOTE: `eval` shadows the Python builtin; the name is kept because the
# evaluation cell further down calls `eval.evaluate(...)`.
eval = RankingEvaluator(predictionCol='recommended_movies',
                        labelCol='movie_list',
                        metricName='precisionAtK',
                        k=k)

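Precision @ K: $p(k)=\frac{1}{M}\sum_{i=0}^{M-1}\frac{1}{k}\sum_{j=0}^{\min(Q_i, k)-1}rel_{D_i}(R_i(j))$

Here $M$ is the number of users, $R_i$ is the ordered list of $Q_i$ items recommended to user $i$, $D_i$ is the ground-truth set of relevant items for that user, and $rel_{D}(r)$ is 1 if $r \in D$ and 0 otherwise.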

In [None]:
# Compute the ranking metric configured on the evaluator over the joined
# evaluation set.
eval.evaluate(eval_set)

0.0014198488526438542