In [1]:
from recommender import Recommender
from pyspark.sql import functions as F
import numpy as np

In [2]:
# Load restaurant reviews
reviews_df = spark.read.parquet('../data/ratings_ugt10_igt10')

# Randomly split data into train and test datasets.
# A fixed seed is passed so that re-running the notebook does not silently
# produce a different train/test partition (and different downstream numbers).
SPLIT_SEED = 42
train_df, test_df = reviews_df.randomSplit(weights=[0.75, 0.25], seed=SPLIT_SEED)

# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
train_df.printSchema()

root
 |-- user: integer (nullable = true)
 |-- item: integer (nullable = true)
 |-- rating: byte (nullable = true)

None


In [3]:
# Configure the project-local Recommender estimator. The column names match
# the schema of the reviews data (user / item / rating).
# NOTE(review): the meaning of useALS / useBias / lambda_1 / lambda_2 is
# defined in the `recommender` module, not in this notebook; confirm there.
estimator = Recommender(
    useALS=True,
    useBias=True,
    lambda_1=7,
    lambda_2=12,
    userCol='user',
    itemCol='item',
    ratingCol='rating',
    rank=76,
    regParam=0.7,
    maxIter=15,
    nonnegative=False
)

# Fit on the training split only, then score both splits with the same model.
model = estimator.fit(train_df)

train_predictions_df = model.transform(train_df)
test_predictions_df = model.transform(test_df)

# printSchema() prints by itself and returns None; calling it directly avoids
# the extra "None" line that print(df.printSchema()) produces.
test_predictions_df.printSchema()

Fit done in 69.23029148101341 seconds
Transform done in 0.2065514309797436 seconds
Transform done in 0.26547738403314725 seconds
root
 |-- user: integer (nullable = true)
 |-- item: integer (nullable = true)
 |-- rating: byte (nullable = true)
 |-- prediction: double (nullable = true)

None


In [4]:
# Expose both prediction frames to Spark SQL. createOrReplaceTempView is the
# non-deprecated replacement for registerTempTable and can be re-run safely.
train_predictions_df.createOrReplaceTempView('train_predictions_df')
test_predictions_df.createOrReplaceTempView('test_predictions_df')


def rank_user_predictions(table_name, user=3000):
    """Return one user's rows from a predictions view with two rank columns.

    Parameters
    ----------
    table_name : str
        Name of a registered temp view that has the columns
        user, item, rating and prediction.
    user : int
        User id to filter on (defaults to 3000, the user inspected here).

    Returns
    -------
    DataFrame
        Columns user, item, rating, prediction, pred_row_num (row number when
        ordered by prediction, descending) and actual_row_num (row number when
        ordered by rating, descending), sorted by pred_row_num.
    """
    # int() makes sure only a numeric literal is interpolated into the SQL.
    return spark.sql(
'''
select
    user,
    item,
    rating,
    prediction,
    row_number() over (
        partition by user
        order by prediction desc
    ) as pred_row_num,
    row_number() over (
        partition by user
        order by rating desc
    ) as actual_row_num
from {table}
where user = {user}
order by pred_row_num
'''.format(table=table_name, user=int(user))
    )


# Same query against the train and the test predictions.
df1 = rank_user_predictions('train_predictions_df')
df2 = rank_user_predictions('test_predictions_df')

# show() prints the table and returns None, so it is not wrapped in print().
df1.show(100)
df2.show(100)

+----+----+------+------------------+------------+--------------+
|user|item|rating|        prediction|pred_row_num|actual_row_num|
+----+----+------+------------------+------------+--------------+
|3000| 480|     4| 4.156959697988615|           1|            14|
|3000|1408|     5|3.9188534476927734|           2|             4|
|3000|1159|     5| 3.891992777999069|           3|             3|
|3000| 460|     3|3.8918789298133114|           4|            18|
|3000|  84|     4|3.8664549280181117|           5|            12|
|3000|1277|     4|3.8565775172204755|           6|            16|
|3000| 155|     5| 3.849672341016376|           7|             1|
|3000| 358|     4|3.8364246954375174|           8|            10|
|3000| 474|     4|3.8229278745707402|           9|             8|
|3000| 445|     4|3.7888557441245867|          10|            11|
|3000| 261|     3| 3.732590468515996|          11|            20|
|3000|  83|     4| 3.715063090416448|          12|            15|
|3000| 361

In [6]:
# User whose ratings are treated as coming from a "new" user below.
user_id = 3000


def select_user_predictions(table_name, user):
    """Return one user's rows from a predictions view.

    Parameters
    ----------
    table_name : str
        Name of a registered temp view with user, item, rating and
        prediction columns.
    user : int
        User id to filter on.

    Returns
    -------
    DataFrame
        Columns user, item, rating and orig_prediction (the view's
        `prediction` column under a new name).
    """
    # int() makes sure only a numeric literal is interpolated into the SQL.
    return spark.sql(
'''
select
    user,
    item,
    rating,
    prediction as orig_prediction
from {table}
where user = {user}
'''.format(table=table_name, user=int(user))
    )


# Rows from the training predictions are used to derive the new user's
# factors; rows from the test predictions are kept aside for validation.
new_user_df = select_user_predictions('train_predictions_df', user_id)
new_user_validate_df = select_user_predictions('test_predictions_df', user_id)

# show() prints the table and returns None, so it is not wrapped in print().
new_user_df.show(100)
new_user_validate_df.show(100)

+----+----+------+------------------+
|user|item|rating|   orig_prediction|
+----+----+------+------------------+
|3000| 155|     5| 3.849672341016376|
|3000| 115|     3|3.5671699145522853|
|3000| 192|     5|3.3236665570569057|
|3000| 460|     3|3.8918789298133114|
|3000|1300|     1|3.6326871941913663|
|3000|1399|     4|3.4864800113879246|
|3000| 474|     4|3.8229278745707402|
|3000| 920|     4|3.6668854801646726|
|3000|2740|     2|3.6934577708616523|
|3000|1002|     1| 3.137779689127302|
|3000| 358|     4|3.8364246954375174|
|3000| 819|     1| 2.813132299544523|
|3000|1159|     5| 3.891992777999069|
|3000| 445|     4|3.7888557441245867|
|3000|1779|     3| 3.608781551857008|
|3000| 930|     1|3.6044224141105414|
|3000|  84|     4|3.8664549280181117|
|3000| 361|     4|3.6976815655068678|
|3000| 261|     3| 3.732590468515996|
|3000| 480|     4| 4.156959697988615|
|3000| 712|     3| 3.492542564480853|
|3000|1408|     5|3.9188534476927734|
|3000|  83|     4| 3.715063090416448|
|3000| 546| 

In [7]:
# Pull out the item H matrix
item_factors_df = model.itemFactors

# Look up the fitted latent factor vector of the selected user.
user_factors_df = model.userFactors.filter(F.col('id') == user_id)
user_factor_row = user_factors_df.first()
if user_factor_row is None:
    # first() returns None on an empty frame; fail with a clear message
    # instead of the IndexError that collect()[0] would raise.
    raise ValueError('No latent factors found for user {}'.format(user_id))
user_factors = np.array(user_factor_row['features'])
print(len(user_factors))
print(user_factors)

# Keep only the factor rows of items this user rated in the training data,
# carrying the user's rating columns along through the join.
filtered_item_factors_df = item_factors_df.join(new_user_df, F.col('id') == new_user_df['item'])

# show() prints the table and returns None, so it is not wrapped in print().
filtered_item_factors_df.show(100)

76
[ -1.20576360e-07  -8.76581154e-08  -1.51162027e-07   8.71703321e-07
   3.21286763e-07  -1.28828788e-07   1.39126257e-06  -3.77913238e-07
  -1.86078751e-07  -3.04741349e-07   1.25926715e-06  -6.70592840e-07
  -6.93869708e-07  -1.22939127e-06  -1.31133660e-07   2.85535236e-07
   1.78593507e-06  -7.31320483e-07  -5.98695806e-07   4.97694259e-07
  -1.38382450e-06  -2.93457258e-08   6.43191470e-07  -1.55329599e-06
  -2.27806254e-06   1.41107012e-06   2.47590356e-06   2.75408775e-06
   5.62041180e-07   3.36179227e-07  -2.85554425e-08   1.52786797e-07
   2.53999502e-07   3.30409762e-06   2.90849073e-07   3.56460475e-07
   5.93028858e-07   5.03058629e-07  -7.70375550e-07   9.61990281e-07
  -2.66904266e-07  -1.69946088e-06  -9.13650638e-07   2.11928423e-07
   8.26959990e-07   1.06914842e-06   9.22470008e-07  -2.17247589e-06
   1.42489625e-06   6.28112161e-07  -7.53763274e-08   7.00499015e-07
   5.99065800e-07   4.28337074e-07  -1.07303583e-06  -2.10397548e-06
  -2.36752771e-06   8.93263774e

In [14]:
# Model-level frames needed to strip the baseline out of the raw ratings.
rating_stats_df = model.rating_stats_df
item_bias_df = model.item_bias_df

# Residual rating: raw rating minus the average rating minus the item bias.
residual_rating = F.col('rating') - F.col('avg_rating') - F.col('item_bias')

# Attach the rating statistics to every row (cross join) and the per-item
# bias columns (join on item).
with_baselines_df = (
    filtered_item_factors_df
    .crossJoin(rating_stats_df)
    .join(item_bias_df, on='item')
)

# Preserve the raw rating as `orig_rating` before `rating` is overwritten
# with the residual.
filtered_item_factors_df2 = (
    with_baselines_df
    .withColumn('orig_rating', F.col('rating'))
    .withColumn('rating', residual_rating)
)

# Columns to inspect ('stddev_rating' is currently excluded from the display).
display_columns = [
    'item', 'user', 'rating', 'orig_prediction',
    'avg_rating',
    'item_bias', 'avg_diffs_item_rating',
    'stderr_diffs_item_rating', 'stddev_diffs_item_rating',
    'count_item_rating', 'orig_rating',
]
filtered_item_factors_df2.select(*display_columns).show(100, truncate=False)

+----+----+-------------------+------------------+------------------+----------------------+----------------------+------------------------+------------------------+-----------------+-----------+
|item|user|rating             |orig_prediction   |avg_rating        |item_bias             |avg_diffs_item_rating |stderr_diffs_item_rating|stddev_diffs_item_rating|count_item_rating|orig_rating|
+----+----+-------------------+------------------+------------------+----------------------+----------------------+------------------------+------------------------+-----------------+-----------+
|155 |3000|1.0765790671313908 |3.849672341016376 |3.8081435881372085|0.11527734473140061   |0.15244754486771733   |0.32244150160575175     |0.957187309111377       |609              |5          |
|115 |3000|-0.6409185064717741|3.5671699145522853|3.8081435881372085|-0.16722508166543446  |-0.2549520987755066   |0.5246044207984703      |1.0420334869674042      |235              |3          |
|192 |3000|1.6025848

In [20]:
# Bring the (small) per-user frame to the driver once, then split it into a
# factor matrix (one row per rated item) and the matching rating vector.
collected_rows = filtered_item_factors_df2.collect()
filtered_item_factors = np.array([r['features'] for r in collected_rows])
item_ratings = np.array([r['rating'] for r in collected_rows])

# Report shape and contents of each array, factors first, then ratings.
for arr in (filtered_item_factors, item_ratings):
    print(arr.shape)
    print(arr)

(37, 76)
[[ 0.17072918  0.19680859  0.21965314 ...,  0.25315386  0.18137982
   0.19664668]
 [ 0.17004225  0.19632128  0.21882892 ...,  0.25262594  0.18086621
   0.19610749]
 [ 0.1702452   0.19617768  0.21901591 ...,  0.25231725  0.18081307
   0.19602866]
 ..., 
 [ 0.17055109  0.19631332  0.21936715 ...,  0.25242195  0.18098387
   0.19620164]
 [ 0.17201108  0.19791038  0.22122876 ...,  0.25444826  0.18247357
   0.1978122 ]
 [ 0.17415997  0.20037992  0.22399253 ...,  0.25762111  0.18475056
   0.20028046]]
(37,)
[ 5.36298233  5.45245724  6.74634579  3.97958128  4.20604889  2.37075211
  4.6622507   5.53515798  5.40685973  5.11824118  5.32137605  3.44466695
  3.09691901  5.01685809  3.50894662  6.00216361  5.11175627  4.49832273
  2.36172007  5.05172291  5.22307164  6.07871711  3.7280378   4.17301462
  4.58152686  3.87842346  6.2052313   4.47357016  5.90032127  5.23544264
  6.58630592  4.53662521  5.24434425  6.64344013  3.45982981  5.01527288
  4.40924559]


In [21]:
# Estimate the "new user" factor vector as the rating-weighted average of the
# factor vectors of the items the user rated.
rating_total = item_ratings.sum()
if np.isclose(rating_total, 0.0):
    # `rating` is a residual (raw rating minus average minus item bias), so
    # the weights can cancel out; dividing by ~0 would silently yield inf/nan.
    raise ValueError(
        'Sum of ratings is (close to) zero; cannot normalise the weighted '
        'average of item factors'
    )
new_user_factors = np.dot(item_ratings, filtered_item_factors) / rating_total
print(rating_total, item_ratings.mean())

# Average element-wise ratio between the estimated and the fitted factors.
print((new_user_factors / user_factors).mean())
# 35 * 3.5 ~ 122.6 # user 3000, sum(ratings) = 120, avg(ratings) = 3.4285714
# 40 * 3.5 ~ 142.4 # user 3001, sum(ratings) = 143, avg(ratings) = 3.575
# 33 * 4.2 ~ 138.3 # user 3002, sum(ratings) = 144, avg(ratings) = 4.3636
print(user_factors.shape)
print(user_factors)
print(new_user_factors.shape)
print(new_user_factors)
print(new_user_factors / user_factors)

177.627550163 4.80074459901
1.01670848543
(76,)
[ 0.16789205  0.19315493  0.21592818  0.22271433  0.22237928  0.3477492
  0.28929842  0.23317149  0.22982211  0.23681533  0.28274027  0.17150716
  0.27634874  0.19176431  0.18317878  0.30227867  0.24703541  0.19815147
  0.26319936  0.29353014  0.2363289   0.2156568   0.19762208  0.22113441
  0.24244733  0.24345616  0.23763084  0.27925789  0.16919948  0.22582635
  0.23148493  0.28613418  0.18429343  0.18181464  0.22680299  0.18386897
  0.18166584  0.23261586  0.23669659  0.26836577  0.21740308  0.22983143
  0.30741614  0.27158031  0.15876412  0.22398892  0.17326188  0.25610662
  0.21324217  0.20605992  0.25590393  0.28742862  0.26515552  0.20188807
  0.216563    0.2077571   0.22658667  0.30082989  0.19148758  0.2963239
  0.18150458  0.18148848  0.28001389  0.2405971   0.21947584  0.25990134
  0.24066807  0.24068025  0.26468825  0.15339871  0.24988359  0.27242273
  0.21852316  0.24832866  0.17809258  0.19306172]
(76,)
[ 0.17069177  0.196387

In [22]:
# make predictions for "new user"
# Collect every item's factor row to the driver once and split it into a
# factor matrix and a parallel array of item ids.
item_factor_rows = item_factors_df.collect()
item_factors = np.array([r['features'] for r in item_factor_rows])
item_ids = np.array([r['id'] for r in item_factor_rows])
print(item_factors.shape)
print(item_ids.shape)

# One score per item: dot product of the new user's factors with each item's
# factor vector.
item_factors_transposed = item_factors.T
new_predictions = np.dot(new_user_factors, item_factors_transposed)
print(new_predictions.shape)
print(new_predictions)

(5055, 76)
(5055,)
(5055,)
[ 4.31537276  4.33377624  4.31210298 ...,  4.56491196  4.18962354
  4.29849381]


In [23]:
# Pair each item id with its score (plain Python values via tolist()) and
# turn the pairs into a two-column Spark DataFrame.
item_prediction_pairs = zip(item_ids.tolist(), new_predictions.tolist())
new_prediction_df = spark.createDataFrame(item_prediction_pairs, schema=['item', 'prediction'])

In [24]:
# Sanity check: number of rows in the new prediction frame (one per item
# id / score pair that was zipped together).
new_prediction_df.count()

5055

In [26]:
# Turn the raw factor scores back into ratings by adding the baseline terms
# that were subtracted when the residual ratings were built.
#
# The average rating is taken from `rating_stats_df` (which carries the
# `avg_rating` column): the previously referenced `avg_rating_df` is not
# defined in any cell of this notebook and raises NameError on a fresh kernel.
# Only `avg_rating` is selected so the cross join adds a single column.
#
# NOTE(review): the constant 5.0 offset has no visible counterpart where the
# residual ratings are computed -- confirm whether it is still intended.
new_predicted_rating_df = (
    new_prediction_df
    .crossJoin(rating_stats_df.select('avg_rating'))
    .join(item_bias_df, on='item')
    .withColumn(
        'prediction',
        F.col('prediction')
        + F.col('avg_rating')
        + F.col('item_bias')
        - 5.0
    )
)

In [29]:
# Expose the frames to Spark SQL. createOrReplaceTempView is the
# non-deprecated replacement for registerTempTable and can be re-run safely.
new_predicted_rating_df.createOrReplaceTempView('new_predicted_rating_df')
new_user_df.createOrReplaceTempView('new_user_df')
new_user_validate_df.createOrReplaceTempView('new_user_validate_df')


def compare_predictions(ratings_view):
    """Join a user's ratings with the new predictions and rank them.

    Parameters
    ----------
    ratings_view : str
        Name of a registered temp view with item, user, rating and
        orig_prediction columns. Only trusted, hard-coded view names are
        expected here, since the name is interpolated into the SQL text.

    Returns
    -------
    DataFrame
        One row per item present in both `ratings_view` and
        `new_predicted_rating_df`, with the original and the new prediction,
        their difference (`diff` = new - original) and three row numbers per
        user: by rating, by original prediction and by new prediction (all
        descending). Sorted by the new-prediction row number.
    """
    return spark.sql(
'''
select
    n.item, n.user, n.rating, n.orig_prediction, p.prediction, p.prediction - n.orig_prediction as diff,
    row_number() over (
        partition by n.user
        order by n.rating desc
    ) as actual_row_num,
    row_number() over (
        partition by n.user
        order by n.orig_prediction desc
    ) as orig_row_num,
    row_number() over (
        partition by n.user
        order by p.prediction desc
    ) as new_row_num
from {ratings_view} n
join new_predicted_rating_df p on n.item = p.item
order by new_row_num
'''.format(ratings_view=ratings_view)
    )


# Same comparison on the ratings used to derive the new user's factors and on
# the held-out ratings.
compare_df = compare_predictions('new_user_df')
compare_validate_df = compare_predictions('new_user_validate_df')

# show() prints the table and returns None, so it is not wrapped in print().
compare_df.show(100)
compare_validate_df.show(100)

+----+----+------+------------------+------------------+-------------------+--------------+------------+-----------+
|item|user|rating|   orig_prediction|        prediction|               diff|actual_row_num|orig_row_num|new_row_num|
+----+----+------+------------------+------------------+-------------------+--------------+------------+-----------+
| 480|3000|     4|3.5927897257681742|  3.81553432228821|0.22274459652003564|             8|           1|          1|
|1173|3000|     3| 3.235255175185257| 3.456985877071812|0.22173070188655508|            22|           2|          2|
|1408|3000|     5| 3.224184429480438| 3.446095890419633|0.22191146093919478|             1|           3|          3|
| 358|3000|     4| 3.159180354462533| 3.381953848089708| 0.2227734936271748|             9|           4|          4|
| 460|3000|     3|3.1488988573980254| 3.370878155473447|0.22197929807542138|            23|           5|          5|
|1277|3000|     4| 3.130123051173655|3.3523843730220673| 0.22226

In [33]:
# Number of ratings each item received across the full reviews data.
ratings_per_item_df = reviews_df.groupBy('item').count()
rating_count = F.col('count')

# Per-item discount factor 1 - 1/sqrt(n), where n is the item's rating count.
discount_factor_df = ratings_per_item_df.select(
    F.col('item'),
    rating_count.alias('num_ratings'),
    (1 - (1 / F.sqrt(rating_count))).alias('discount_factor'),
)

discount_factor_df.show(20)

+----+-----------+------------------+
|item|num_ratings|   discount_factor|
+----+-----------+------------------+
| 496|        273|0.9394772467331197|
| 148|        705|0.9623378211422645|
|1645|        148|0.9178005063473214|
|1959|         83|0.8902357400103097|
| 463|        361|0.9473684210526316|
| 833|        212|0.9313197180256555|
| 471|        381| 0.948768448042144|
|1342|        202|0.9296402455269708|
|1238|        154| 0.919417703597462|
|1829|        101|0.9004962809790011|
|1088|        161|0.9211889593760899|
|2366|         36|0.8333333333333334|
|2659|         40| 0.841886116991581|
|1591|         80|0.8881966011250105|
|1580|         77| 0.886039423540362|
|2866|         47|0.8541350085021054|
|2122|         66|0.8769085090206673|
|2142|         81|0.8888888888888888|
|3794|         27|0.8075499102701247|
|3997|         18|0.7642977396044841|
+----+-----------+------------------+
only showing top 20 rows

