In [1]:
from recommender import Recommender
from pyspark.sql import functions as F
import numpy as np

In [2]:
# Load restaurant reviews (columns: user, item, rating).
reviews_df = spark.read.parquet('../data/ratings_ugt10_igt10')

# Randomly split the reviews into train (75%) and test (25%) datasets.
# A fixed seed keeps the split -- and every number derived from it in the
# cells below -- stable across kernel restarts for the same input data.
SPLIT_SEED = 42
train_df, test_df = reviews_df.randomSplit(weights=[0.75, 0.25], seed=SPLIT_SEED)

# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
train_df.printSchema()

root
 |-- user: integer (nullable = true)
 |-- item: integer (nullable = true)
 |-- rating: byte (nullable = true)

None


In [3]:
# Configure the project-local Recommender estimator. The column names match
# the schema of train_df; the remaining keyword arguments are hyperparameters
# passed straight through to Recommender (their exact semantics are defined in
# the recommender module, not in this notebook).
estimator = Recommender(
    useALS=True,
    useBias=True,
    lambda_1=7,
    lambda_2=12,
    userCol='user',
    itemCol='item',
    ratingCol='rating',
    rank=76,
    regParam=0.7,
    maxIter=15,
    nonnegative=True
)
model = estimator.fit(train_df)

# Score both splits with the fitted model; transform() returns the input rows
# with an added `prediction` column.
train_predictions_df = model.transform(train_df)
test_predictions_df = model.transform(test_df)

# printSchema() prints by itself and returns None, so no print() wrapper.
test_predictions_df.printSchema()

root
 |-- user: integer (nullable = true)
 |-- item: integer (nullable = true)
 |-- rating: byte (nullable = true)
 |-- prediction: double (nullable = true)

None


In [4]:
# Expose both prediction frames to Spark SQL. createOrReplaceTempView is the
# supported replacement for the deprecated registerTempTable and keeps the
# cell safe to re-run.
train_predictions_df.createOrReplaceTempView('train_predictions_df')
test_predictions_df.createOrReplaceTempView('test_predictions_df')

# One query template shared by the train and test lookups, so the two cannot
# drift apart.
_RANKED_PREDICTIONS_SQL = '''
select
    user,
    item,
    rating,
    prediction,
    row_number() over (
        partition by user
        order by prediction desc
    ) as pred_row_num,
    row_number() over (
        partition by user
        order by rating desc
    ) as actual_row_num
from {table}
where user = {user}
order by pred_row_num
'''


def _ranked_predictions(table, user):
    """Return one user's rows from a registered predictions view, ranked.

    Args:
        table: name of a temp view with user, item, rating and prediction
            columns.
        user: integer user id to select.

    Returns:
        A DataFrame with the view's columns plus ``pred_row_num`` (1 = highest
        prediction) and ``actual_row_num`` (1 = highest rating), ordered by
        ``pred_row_num``. Rows with equal ratings receive consecutive
        ``actual_row_num`` values in an unspecified order.
    """
    # int() guarantees that only a numeric literal is spliced into the SQL.
    return spark.sql(_RANKED_PREDICTIONS_SQL.format(table=table, user=int(user)))


df1 = _ranked_predictions('train_predictions_df', 3000)
df2 = _ranked_predictions('test_predictions_df', 3000)

# show() prints the table and returns None, so it is not wrapped in print().
df1.show(100)
df2.show(100)

+----+----+------+------------------+------------+--------------+
|user|item|rating|        prediction|pred_row_num|actual_row_num|
+----+----+------+------------------+------------+--------------+
|3000|  11|     4|3.0949144066650085|           1|             7|
|3000|1173|     3|3.0374854226477215|           2|            18|
|3000| 565|     1| 2.991747683650382|           3|            34|
|3000|1408|     5| 2.981715002977369|           4|             1|
|3000|1277|     4|2.9781366370583475|           5|             8|
|3000| 460|     3|2.9762744504833005|           6|            19|
|3000|  81|     3| 2.919607556222398|           7|            20|
|3000| 358|     4| 2.903306228200397|           8|             9|
|3000|1159|     5| 2.893977628653226|           9|             2|
|3000| 155|     5|2.8430654364442827|          10|             3|
|3000|  84|     4| 2.837763715041344|          11|            10|
|3000| 474|     4| 2.800672962872026|          12|            11|
|3000| 445

In [5]:
# User whose ratings are used to simulate a "new user" in the cells below.
user_id = 3000


def _ratings_for_user(predictions_df, user):
    """Select a single user's rows from a predictions DataFrame.

    Args:
        predictions_df: DataFrame with user, item, rating and prediction
            columns.
        user: user id to keep.

    Returns:
        A DataFrame with columns user, item, rating and orig_prediction, where
        orig_prediction is the input's ``prediction`` column renamed.
    """
    return (
        predictions_df
        .filter(F.col('user') == user)
        .select(
            'user',
            'item',
            'rating',
            F.col('prediction').alias('orig_prediction'),
        )
    )


# Built directly from the DataFrames rather than through string-formatted SQL,
# so this cell does not depend on temp views registered in an earlier cell.
new_user_df = _ratings_for_user(train_predictions_df, user_id)
new_user_validate_df = _ratings_for_user(test_predictions_df, user_id)

# show() prints the table and returns None, so it is not wrapped in print().
new_user_df.show(100)
new_user_validate_df.show(100)

+----+----+------+------------------+
|user|item|rating|   orig_prediction|
+----+----+------+------------------+
|3000| 155|     5|2.8430654364442827|
|3000|2294|     2| 2.564530123006623|
|3000|  81|     3| 2.919607556222398|
|3000|1019|     4|  2.48681511022155|
|3000| 192|     5|2.1337592354476573|
|3000| 460|     3|2.9762744504833005|
|3000|1207|     3| 2.713594495004676|
|3000|1300|     1| 2.494846757203237|
|3000| 297|     3|  2.20036781489066|
|3000|  16|     4|2.4705527150363586|
|3000| 474|     4| 2.800672962872026|
|3000|2740|     2|2.3692679768431013|
|3000|1002|     1|1.7181786352916024|
|3000| 358|     4| 2.903306228200397|
|3000| 819|     1|1.3909120061981275|
|3000|1159|     5| 2.893977628653226|
|3000| 445|     4|   2.7289693045342|
|3000|1779|     3| 2.452615929873703|
|3000| 930|     1|2.4709433543477344|
|3000|  84|     4| 2.837763715041344|
|3000| 565|     1| 2.991747683650382|
|3000| 361|     4|2.7284773281319072|
|3000| 952|     2|2.0782028550354994|
|3000| 261| 

In [6]:
# Pull out the item H matrix
item_factors_df = model.itemFactors

# Factor vector the fitted model holds for the selected user.
user_factors_df = model.userFactors.filter(F.col('id') == user_id)
user_factor_rows = user_factors_df.collect()
if not user_factor_rows:
    # Fail with a clear message instead of a bare IndexError when the user has
    # no factors in the fitted model.
    raise ValueError('no user factors found for user {}'.format(user_id))
user_factors = np.array(user_factor_rows[0]['features'])
print(len(user_factors))
print(user_factors)

# Keep only the factors of items this user rated in the training split; each
# joined row carries the item's factor vector together with the user's rating.
filtered_item_factors_df = item_factors_df.join(new_user_df, F.col('id') == new_user_df['item'])

# show() prints the table and returns None, so it is not wrapped in print().
filtered_item_factors_df.show(100)

76
[ 0.1623283   0.19345151  0.22007252  0.22779582  0.22266911  0.33563021
  0.28995195  0.22185205  0.21927908  0.23160799  0.28811061  0.16876173
  0.28022727  0.18573907  0.17375977  0.2938095   0.24375667  0.19470195
  0.2568163   0.283968    0.22474007  0.20255344  0.1972983   0.20657775
  0.24283178  0.23534305  0.23879254  0.28207177  0.17877565  0.21761256
  0.23177049  0.29462013  0.18488821  0.17377879  0.23153818  0.17143112
  0.18297844  0.22511117  0.23511869  0.26382938  0.2140002   0.22547124
  0.30830786  0.26909924  0.15832277  0.21443108  0.1754979   0.26347065
  0.21452574  0.20542628  0.26338708  0.2898446   0.26029924  0.20103346
  0.21371995  0.20121582  0.22617234  0.29525763  0.18978471  0.2893244
  0.18242313  0.17901675  0.28173032  0.24256478  0.21559869  0.2507695
  0.24078988  0.2373161   0.26078814  0.14995316  0.24594522  0.27095541
  0.22067171  0.25132895  0.17968152  0.1838176 ]
+----+--------------------+----+----+------+------------------+
|  id|   

In [7]:
# Bring the joined rows to the driver once, then split them into two
# row-aligned NumPy arrays: one factor vector per rated item, and the rating
# the user gave that item.
joined_rows = filtered_item_factors_df.collect()
filtered_item_factors = np.array([joined['features'] for joined in joined_rows])
item_ratings = np.array([joined['rating'] for joined in joined_rows])

# Report shape and contents of each array, factors first.
for collected in (filtered_item_factors, item_ratings):
    print(collected.shape)
    print(collected)

(38, 76)
[[ 0.16679585  0.198374    0.22608361 ...,  0.25760639  0.18432626
   0.18859579]
 [ 0.16684169  0.19941826  0.22625808 ...,  0.25925428  0.18511671
   0.18933754]
 [ 0.16584307  0.1975833   0.22483103 ...,  0.25668088  0.18352941
   0.18775788]
 ..., 
 [ 0.1668524   0.19886562  0.22620821 ...,  0.2583704   0.18470617
   0.18895623]
 [ 0.16906574  0.20131761  0.22918814 ...,  0.26150069  0.18701707
   0.19133283]
 [ 0.16984051  0.20285223  0.23030975 ...,  0.26367012  0.18833375
   0.19263792]]
(38,)
[5 2 3 4 5 3 3 1 3 4 4 2 1 4 1 5 4 3 1 4 1 4 2 3 3 3 5 3 5 4 4 3 3 4 5 2 4
 3]


In [8]:
# Approximate the user's factor vector as the rating-weighted average of the
# factor vectors of the items the user rated.
rating_total = sum(item_ratings)
weighted_factor_sum = np.dot(item_ratings, filtered_item_factors)
new_user_factors = weighted_factor_sum / rating_total
print(rating_total, item_ratings.mean())

# Element-wise ratio between the reconstructed and the model-fitted factors;
# a mean close to 1 means the two vectors have a similar overall scale.
factor_ratio = new_user_factors / user_factors
print(factor_ratio.mean())
# Scratch notes carried over from earlier runs (format appears to be
# "<n ratings> * <mean> ~ <product> # user, sum, avg" -- TODO confirm). The
# user 3000 figures differ from the output recorded for this cell, so they
# were presumably taken against a different random split.
# 35 * 3.5 ~ 122.6 # user 3000, sum(ratings) = 120, avg(ratings) = 3.4285714
# 40 * 3.5 ~ 142.4 # user 3001, sum(ratings) = 143, avg(ratings) = 3.575
# 33 * 4.2 ~ 138.3 # user 3002, sum(ratings) = 144, avg(ratings) = 4.3636
print(user_factors.shape)
print(user_factors)
print(new_user_factors.shape)
print(new_user_factors)
print(factor_ratio)

123 3.23684210526
1.0260484568
(76,)
[ 0.1623283   0.19345151  0.22007252  0.22779582  0.22266911  0.33563021
  0.28995195  0.22185205  0.21927908  0.23160799  0.28811061  0.16876173
  0.28022727  0.18573907  0.17375977  0.2938095   0.24375667  0.19470195
  0.2568163   0.283968    0.22474007  0.20255344  0.1972983   0.20657775
  0.24283178  0.23534305  0.23879254  0.28207177  0.17877565  0.21761256
  0.23177049  0.29462013  0.18488821  0.17377879  0.23153818  0.17143112
  0.18297844  0.22511117  0.23511869  0.26382938  0.2140002   0.22547124
  0.30830786  0.26909924  0.15832277  0.21443108  0.1754979   0.26347065
  0.21452574  0.20542628  0.26338708  0.2898446   0.26029924  0.20103346
  0.21371995  0.20121582  0.22617234  0.29525763  0.18978471  0.2893244
  0.18242313  0.17901675  0.28173032  0.24256478  0.21559869  0.2507695
  0.24078988  0.2373161   0.26078814  0.14995316  0.24594522  0.27095541
  0.22067171  0.25132895  0.17968152  0.1838176 ]
(76,)
[ 0.16652874  0.19852938  0.22577

In [9]:
# make predictions for "new user"
# Collect every item's factor vector and id on the driver; both arrays are
# built from the same row list, so they stay index-aligned.
item_factor_rows = item_factors_df.collect()
item_factors = np.array([factor_row['features'] for factor_row in item_factor_rows])
item_ids = np.array([factor_row['id'] for factor_row in item_factor_rows])
print(item_factors.shape)
print(item_ids.shape)

# One score per item: dot product of the reconstructed user factors with each
# item's factor vector.
new_predictions = np.dot(new_user_factors, item_factors.T)
print(new_predictions.shape)
print(new_predictions)

(5054, 76)
(5054,)
(5054,)
[ 4.312205    4.32212275  4.3079107  ...,  5.8246084   4.1878941
  4.45842376]


In [10]:
# Pair every item id with its freshly computed score and hand the pairs to
# Spark as a two-column (item, prediction) DataFrame.
prediction_pairs = list(zip(item_ids.tolist(), new_predictions.tolist()))
new_prediction_df = spark.createDataFrame(prediction_pairs, ['item', 'prediction'])

In [11]:
# Sanity check: new_prediction_df is built with one row per entry of item_ids,
# so this count should equal the number of item factor rows collected above.
new_prediction_df.count()

5054

In [12]:
# Expose the frames to Spark SQL. createOrReplaceTempView is the supported
# replacement for the deprecated registerTempTable and keeps the cell safe to
# re-run.
new_prediction_df.createOrReplaceTempView('new_prediction_df')
new_user_df.createOrReplaceTempView('new_user_df')
new_user_validate_df.createOrReplaceTempView('new_user_validate_df')

# One query template shared by the train and validation comparisons, so the
# two cannot drift apart.
_COMPARE_RANKINGS_SQL = '''
select
    n.item, n.user, n.rating, n.orig_prediction, p.prediction,
    row_number() over (
        partition by n.user
        order by n.rating desc
    ) as actual_row_num,
    row_number() over (
        partition by n.user
        order by n.orig_prediction desc
    ) as orig_row_num,
    row_number() over (
        partition by n.user
        order by p.prediction desc
    ) as new_row_num
from {ratings_table} n
join new_prediction_df p on n.item = p.item
order by new_row_num
'''


def _compare_rankings(ratings_table):
    """Join a user's rated items with the reconstructed predictions and rank them.

    Args:
        ratings_table: name of a temp view with item, user, rating and
            orig_prediction columns.

    Returns:
        A DataFrame with one row per item present in both ``ratings_table``
        and the ``new_prediction_df`` view, carrying three per-user ranks:
        ``actual_row_num`` (by rating), ``orig_row_num`` (by orig_prediction)
        and ``new_row_num`` (by the new prediction), each descending, and
        ordered by ``new_row_num``.
    """
    return spark.sql(_COMPARE_RANKINGS_SQL.format(ratings_table=ratings_table))


compare_df = _compare_rankings('new_user_df')
compare_validate_df = _compare_rankings('new_user_validate_df')

# show() prints the table and returns None, so it is not wrapped in print().
compare_df.show(100)
compare_validate_df.show(100)

+----+----+------+------------------+------------------+--------------+------------+-----------+
|item|user|rating|   orig_prediction|        prediction|actual_row_num|orig_row_num|new_row_num|
+----+----+------+------------------+------------------+--------------+------------+-----------+
|1138|3000|     3|2.6032174262773022| 4.413060347142474|            24|          20|          1|
|1300|3000|     1| 2.494846757203237| 4.405559510597968|            38|          22|          2|
| 358|3000|     4| 2.903306228200397|  4.39684888317721|            15|           8|          3|
|1277|3000|     4|2.9781366370583475|4.3854734372046265|             8|           5|          4|
| 460|3000|     3|2.9762744504833005| 4.367604764344131|            21|           6|          5|
|1173|3000|     3|3.0374854226477215|  4.35455615173769|            26|           2|          6|
| 361|3000|     4|2.7284773281319072| 4.342957032966872|            10|          14|          7|
| 297|3000|     3|  2.20036781