In [22]:
from recommender import Recommender
from pyspark.ml.evaluation import RegressionEvaluator
from eval_model import TopQuantileEvaluator, NDCGEvaluator
from pyspark.sql import functions as F

In [26]:
# Ranking-oriented metrics provided by the project's eval_model module.
ndcg_evaluator = NDCGEvaluator()
quant_evaluator = TopQuantileEvaluator()

# Spark's built-in regression metric: RMSE between the observed "rating"
# column and the model's "prediction" column.
rmse_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)

In [5]:
# Load restaurant reviews (schema shown below: user, item, rating)
reviews_df = spark.read.parquet('../data/ratings_ugt10_igt10')

# Randomly split data into train (75%) and test (25%) datasets.
# The seed is fixed so that re-running the notebook reproduces the same split.
train_df, test_df = reviews_df.randomSplit(weights=[0.75, 0.25], seed=42)

# printSchema() writes the schema itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
train_df.printSchema()

root
 |-- user: integer (nullable = true)
 |-- item: integer (nullable = true)
 |-- rating: byte (nullable = true)

None


In [6]:
# Configure the project's Recommender estimator on the user/item/rating columns.
# NOTE(review): lambda_1 / lambda_2 are presumably the bias regularisation
# strengths and rank/regParam/maxIter/nonnegative the ALS settings -- confirm
# against the Recommender implementation.
estimator = Recommender(
    useALS=True,
    useBias=True,
    lambda_1=10,
    lambda_2=15,
    userCol='user',
    itemCol='item',
    ratingCol='rating',
    rank=64,
    regParam=1,
    maxIter=10,
    nonnegative=True
)

# Fit on the training split only, then score the held-out test split.
model = estimator.fit(train_df)

predictions_df = model.transform(test_df)

# printSchema() prints the schema and returns None; wrapping it in print()
# would add a spurious "None" line to the output.
predictions_df.printSchema()

root
 |-- user: integer (nullable = true)
 |-- item: integer (nullable = true)
 |-- rating: byte (nullable = true)
 |-- prediction: double (nullable = true)

None


In [27]:
# RMSE and top-quantile metrics are switched off here; uncomment to report them.
# print('rmse: ', rmse_evaluator.evaluate(predictions_df))
# print('quant: ', quant_evaluator.evaluate(predictions_df))

# Score the training split first, then the held-out predictions.
train_ndcg = ndcg_evaluator.evaluate(model.transform(train_df))
print('train ndcg: ', train_ndcg)

test_ndcg = ndcg_evaluator.evaluate(predictions_df)
print('test ndcg: ', test_ndcg)

train ndcg:  0.043035552284963585
test ndcg:  0.03527629154321843


In [21]:
# Peek at the first 40 scored test rows as a list of Row objects.
predictions_df.take(40)

[Row(user=148, item=1238, rating=4, prediction=3.0721671951491594),
 Row(user=148, item=737, rating=3, prediction=2.727083029997713),
 Row(user=148, item=2027, rating=4, prediction=3.2679619443505388),
 Row(user=148, item=321, rating=4, prediction=2.9963406890569564),
 Row(user=148, item=1160, rating=4, prediction=2.9338471537292614),
 Row(user=148, item=155, rating=4, prediction=3.169726367238459),
 Row(user=148, item=368, rating=3, prediction=3.236486560454802),
 Row(user=148, item=939, rating=3, prediction=2.1408803235644607),
 Row(user=148, item=784, rating=4, prediction=2.914118032422511),
 Row(user=148, item=2678, rating=4, prediction=2.9181141687548333),
 Row(user=148, item=192, rating=4, prediction=2.4363321150391783),
 Row(user=148, item=973, rating=3, prediction=2.0869361684254004),
 Row(user=148, item=3673, rating=4, prediction=2.884033727332037),
 Row(user=148, item=1295, rating=3, prediction=2.2001822043387502),
 Row(user=148, item=1197, rating=3, prediction=2.659467214113

In [22]:
# Number of scored test rows per user, for the first 10 users Spark returns.
(
    predictions_df
    .groupBy('user')
    .count()
    .take(10)
)

[Row(user=148, count=47),
 Row(user=463, count=25),
 Row(user=471, count=30),
 Row(user=496, count=24),
 Row(user=833, count=22),
 Row(user=1088, count=24),
 Row(user=1238, count=22),
 Row(user=1342, count=22),
 Row(user=1580, count=12),
 Row(user=1591, count=10)]

In [9]:
# Expose the test-set predictions to Spark SQL as the view "predictions_df".
# createOrReplaceTempView is the replacement for registerTempTable, which has
# been deprecated since Spark 2.0; it is also safe to re-run.
predictions_df.createOrReplaceTempView("predictions_df")

# Per-user NDCG over each user's complete list of scored items:
#   dcg  = sum(rating / log(1 + rank when sorted by prediction, descending))
#   idcg = sum(rating / log(1 + rank when sorted by rating, descending))
# The same logarithm is used in numerator and denominator, so its base
# cancels out of the ratio.
df2 = spark.sql(
'''
    select
        user,
        sum(dcg) / sum(idcg) as ndcg
    from (
        select
            user,
            rating / log(1 +
                row_number() OVER (
                    PARTITION BY user
                    ORDER BY prediction DESC
                )
            ) as dcg,
            rating / log(1 +
                row_number() OVER (
                    PARTITION BY user
                    ORDER BY rating DESC
                )
            ) as idcg
        from predictions_df
    ) x
    group by user
'''
)
df2.show(10)

+----+------------------+
|user|              ndcg|
+----+------------------+
| 148|0.9845238447729834|
| 463|0.9541466776932044|
| 471|0.9793995534308778|
| 496|0.9260704730859793|
| 833|0.9559826391220427|
|1088|0.9838440054122111|
|1238|0.9704361098941021|
|1342|0.9284520936553502|
|1580|0.9545193681635723|
|1591|0.9682687304631428|
+----+------------------+
only showing top 10 rows



In [29]:
# Row-level breakdown of the DCG / IDCG terms for user 500, read from the
# "predictions_df" temp view (which must already be registered).
# For every scored row the query returns:
#   dcg           - rating / log(1 + rank of the row when sorted by prediction)
#   idcg          - rating / log(1 + rank of the row when sorted by rating)
#   pred_row_num  - the prediction-based rank itself
#   ideal_row_num - the rating-based rank itself
# Rows are returned in predicted order (order by pred_row_num).
df3 = spark.sql(
'''
select
    user,
    item,
    rating,
    prediction,
    rating / log(1 + 
        row_number() OVER (
            PARTITION BY user
            ORDER BY prediction DESC
        )
    ) as dcg,
    rating / log(1 + 
        row_number() OVER (
                PARTITION BY user
                ORDER BY rating DESC
            )
    ) as idcg,
    row_number() OVER (
        PARTITION BY user
        ORDER BY prediction DESC
    ) as pred_row_num,
    row_number() OVER (
        PARTITION BY user
        ORDER BY rating DESC
    ) as ideal_row_num
from predictions_df
where user = 500
order by pred_row_num
'''
)
# Show up to 100 rows of the breakdown.
df3.show(100)

+----+----+------+------------------+-------------------+-------------------+------------+-------------+
|user|item|rating|        prediction|                dcg|               idcg|pred_row_num|ideal_row_num|
+----+----+------+------------------+-------------------+-------------------+------------+-------------+
| 500|3356|     3| 3.366303765944828|  4.328085122666891| 1.0014246020860023|           1|           19|
| 500|2064|     4|3.3008995942101453| 3.6409569065073493| 5.7707801635558535|           2|            1|
| 500|2290|     4| 3.284119200868174| 2.8853900817779268| 3.6409569065073493|           3|            2|
| 500| 523|     3| 3.132518469365383| 1.8640048036788355| 0.9853762162591532|           4|           20|
| 500|4246|     4|3.1215813143038034|  2.232442506204989| 2.8853900817779268|           5|            3|
| 500| 710|     3| 3.121093621442391| 1.5416950271092522| 0.9705463594460175|           6|           21|
| 500|2470|     4|3.1130178015461194| 1.923593387851951

In [27]:
# Toy frame: a single user with ten items whose predictions are all 3.8, so the
# prediction-based ranking consists entirely of ties.
# It is bound to its own Python name so that the held-out `test_df` split from
# randomSplit is not overwritten (later cells still read that split).
tied_predictions_df = spark.createDataFrame(
    [
        (1, 1, 1, 3.8), (1, 2, 3, 3.8), (1, 3, 1, 3.8), (1, 4, 1, 3.8), (1, 5, 5, 3.8),
        (1, 6, 4, 3.8), (1, 7, 5, 3.8), (1, 8, 5, 3.8), (1, 9, 5, 3.8), (1, 10, 5, 3.8),
    ],
    ['user', 'item', 'rating', 'prediction'],
)

# The SQL view keeps the name "test_df" because the query in the following
# cell selects from it. createOrReplaceTempView replaces the deprecated
# registerTempTable.
tied_predictions_df.createOrReplaceTempView("test_df")

In [80]:
# Same per-row DCG / IDCG breakdown as the user-500 query, but read from the
# "test_df" temp view, with no user filter and no ORDER BY (so the display
# order of the rows is whatever Spark returns). Note this rebinds `df3`.
# NOTE(review): when predictions are tied, the query does not specify how
# row_number() orders the tied rows -- presumably this cell exists to inspect
# exactly that; confirm.
df3 = spark.sql(
'''
select
    user,
    item,
    rating,
    prediction,
    rating / log(1 + 
        row_number() OVER (
            PARTITION BY user
            ORDER BY prediction DESC
        )
    ) as dcg,
    rating / log(1 + 
        row_number() OVER (
                PARTITION BY user
                ORDER BY rating DESC
            )
    ) as idcg,
    row_number() OVER (
        PARTITION BY user
        ORDER BY prediction DESC
    ) as pred_row_num,
    row_number() OVER (
        PARTITION BY user
        ORDER BY rating DESC
    ) as ideal_row_num
from test_df
'''
)
# Show up to 100 rows of the breakdown.
df3.show(100)

+----+----+------+----------+------------------+-------------------+------------+-------------+
|user|item|rating|prediction|               dcg|               idcg|pred_row_num|ideal_row_num|
+----+----+------+----------+------------------+-------------------+------------+-------------+
|   1|   5|     5|       3.8|2.7905531327562363|  7.213475204444817|           5|            1|
|   1|   7|     5|       3.8|2.4044917348149393|  4.551196133134186|           7|            2|
|   1|   8|     5|       3.8| 2.275598066567093| 3.6067376022224087|           8|            3|
|   1|   9|     5|       3.8|2.1714724095162588| 3.1066746727980594|           9|            4|
|   1|  10|     5|       3.8|2.0851619571212314| 2.7905531327562363|          10|            5|
|   1|   6|     4|       3.8| 2.055593369479003|  2.055593369479003|           6|            6|
|   1|   2|     3|       3.8| 2.730717679880512| 1.4426950408889636|           2|            7|
|   1|   1|     1|       3.8|1.442695040

In [86]:
# One-row frame holding the mean rating of the training split.
avg_rating_df = (
    train_df
    .agg(
        F.avg('rating').alias('avg_rating')
    )
)

# Random baseline for the training split: global mean rating plus standard
# normal noise. The cross join attaches the single avg_rating value to every
# row. The noise is seeded so the baseline can be reproduced on re-run.
train_predict_df = (
    train_df
    .crossJoin(avg_rating_df)
    .withColumn(
        'prediction',
        F.col('avg_rating') + F.randn(seed=42)
    )
    .select(
        'user',
        'item',
        'rating',
        'prediction'
    )
)

train_predict_df.show()

+----+----+------+------------------+
|user|item|rating|        prediction|
+----+----+------+------------------+
|   0|  18|     4| 4.280782035296607|
|   0|  32|     4| 5.018341557424282|
|   0|  34|     3| 2.570663987101499|
|   0|  35|     5|3.2542284823357686|
|   0|  36|     3|4.3557659046109345|
|   0|  50|     5| 2.454146205000773|
|   0|  70|     4|4.3427167225744485|
|   0|  74|     5| 4.800987100868459|
|   0|  77|     3|3.3895095081727287|
|   0|  78|     5|  3.18404813090901|
|   0| 106|     4|2.6156342222657676|
|   0| 134|     3| 3.638965216621915|
|   0| 149|     5| 4.211780638415755|
|   0| 188|     3|4.1336025647745185|
|   0| 198|     5|3.8907372803031803|
|   0| 222|     5| 3.460641084078236|
|   0| 243|     4| 4.283961448071681|
|   0| 266|     4|2.9538884389306554|
|   0| 320|     3|3.6182716708000773|
|   0| 335|     4|3.0812508718878164|
+----+----+------+------------------+
only showing top 20 rows



In [83]:
# Expose the random training baseline to Spark SQL as "train_predict_df".
# createOrReplaceTempView replaces registerTempTable (deprecated since Spark 2.0).
train_predict_df.createOrReplaceTempView("train_predict_df")

In [85]:
# Row-level DCG / IDCG breakdown of the random training baseline for user 148,
# read from the "train_predict_df" temp view:
#   dcg           - rating / log(1 + rank of the row when sorted by prediction)
#   idcg          - rating / log(1 + rank of the row when sorted by rating)
#   pred_row_num  - the prediction-based rank itself
#   ideal_row_num - the rating-based rank itself
# Rows are explicitly sorted by pred_row_num so the display order is defined,
# matching the user-500 breakdown query earlier in the notebook.
df4 = spark.sql(
'''
select
    user,
    item,
    rating,
    prediction,
    rating / log(1 +
        row_number() OVER (
            PARTITION BY user
            ORDER BY prediction DESC
        )
    ) as dcg,
    rating / log(1 +
        row_number() OVER (
            PARTITION BY user
            ORDER BY rating DESC
        )
    ) as idcg,
    row_number() OVER (
        PARTITION BY user
        ORDER BY prediction DESC
    ) as pred_row_num,
    row_number() OVER (
        PARTITION BY user
        ORDER BY rating DESC
    ) as ideal_row_num
from train_predict_df
where user = 148
order by pred_row_num
'''
)
df4.show(200)

+----+----+------+------------------+-------------------+-------------------+------------+-------------+
|user|item|rating|        prediction|                dcg|               idcg|pred_row_num|ideal_row_num|
+----+----+------+------------------+-------------------+-------------------+------------+-------------+
| 148| 715|     4| 6.608469249106496| 5.7707801635558535| 1.1250656562509134|           1|           34|
| 148| 275|     4| 6.118769042313135| 3.6409569065073493| 0.9654525532371003|           2|           62|
| 148|1870|     4| 6.073451644268585| 2.8853900817779268| 0.8933882273643878|           3|           87|
| 148|2422|     5| 5.634066541732915| 3.1066746727980594| 1.5732899022206244|           4|           23|
| 148| 356|     4| 5.529620592513689|  2.232442506204989| 1.0389212086890862|           5|           46|
| 148| 621|     2|  5.44705480354615| 1.0277966847395015|0.40650564100781805|           6|          136|
| 148|1640|     4| 5.445461766206521| 1.923593387851951

In [91]:
# Random baseline for the held-out split: the training-set mean rating
# (avg_rating_df, a one-row frame) plus standard normal noise.
# Expects `test_df` to be the held-out split with user/item/rating columns.
# The noise is seeded so the baseline can be reproduced on re-run.
test_predict_df = (
    test_df
    .crossJoin(avg_rating_df)
    .withColumn(
        'prediction',
        F.col('avg_rating') + F.randn(seed=43)
    )
    .select(
        'user',
        'item',
        'rating',
        'prediction'
    )
)

test_predict_df.show()

+----+----+------+------------------+
|user|item|rating|        prediction|
+----+----+------+------------------+
|   0|  22|     4|1.8911752080484638|
|   0|  43|     5| 3.370885260535769|
|   0|  62|     4|3.2417762445307794|
|   0|  98|     5| 4.126976777697115|
|   0| 116|     5|2.2516901666721294|
|   0| 136|     2|2.0765545230093405|
|   0| 146|     3|2.2838983406814464|
|   0| 157|     4| 4.410504827571022|
|   0| 161|     4| 4.053211823962677|
|   0| 190|     4| 2.533742600285059|
|   0| 217|     3|2.3829648692915297|
|   0| 230|     4| 4.068444024932465|
|   0| 235|     4| 4.753016074614321|
|   0| 236|     4|3.3227744238954475|
|   0| 294|     2| 4.565544934825032|
|   0| 355|     5| 5.008540112467532|
|   0| 408|     3| 4.018441851852969|
|   0| 457|     4| 4.223090003740084|
|   0| 464|     4|  4.22063100777659|
|   0| 588|     4|2.6427804527497223|
+----+----+------+------------------+
only showing top 20 rows



In [92]:
# Expose the random test baseline to Spark SQL as "test_predict_df".
# createOrReplaceTempView replaces registerTempTable (deprecated since Spark 2.0).
test_predict_df.createOrReplaceTempView("test_predict_df")

In [95]:
# Row-level DCG / IDCG breakdown of the random test baseline for user 148,
# read from the "test_predict_df" temp view, restricted to the ten
# highest-predicted items.
# The ORDER BY is required: LIMIT without it would keep an arbitrary ten rows
# rather than the top ten by predicted rank.
df5 = spark.sql(
'''
select
    user,
    item,
    rating,
    prediction,
    rating / log(1 +
        row_number() OVER (
            PARTITION BY user
            ORDER BY prediction DESC
        )
    ) as dcg,
    rating / log(1 +
        row_number() OVER (
            PARTITION BY user
            ORDER BY rating DESC
        )
    ) as idcg,
    row_number() OVER (
        PARTITION BY user
        ORDER BY prediction DESC
    ) as pred_row_num,
    row_number() OVER (
        PARTITION BY user
        ORDER BY rating DESC
    ) as ideal_row_num
from test_predict_df
where user = 148
order by pred_row_num
limit 10
'''
)
df5.show(200)

+----+----+------+------------------+------------------+------------------+------------+-------------+
|user|item|rating|        prediction|               dcg|              idcg|pred_row_num|ideal_row_num|
+----+----+------+------------------+------------------+------------------+------------+-------------+
| 148|1945|     3| 5.683571195502512| 4.328085122666891|0.8247227332274807|           1|           37|
| 148| 368|     3|5.5319662160671585| 2.730717679880512|0.8026391592571619|           2|           41|
| 148|1029|     4| 5.263113853906219|2.8853900817779268|1.2586319217764994|           3|           23|
| 148|1183|     3| 5.093022498588467|1.8640048036788355|0.8371659398268709|           4|           35|
| 148|  25|     5|5.0367229909193405|2.7905531327562363|2.4044917348149393|           5|            7|
| 148|  62|     4| 5.027888197968194| 2.055593369479003| 1.275715955615204|           6|           22|
| 148|1342|     3| 5.026672578649384|1.4426950408889636|0.865617024533378

In [24]:
# Evaluate the random baselines with the same NDCG evaluator used for the
# model: training baseline first, then the held-out baseline.
random_train_ndcg = ndcg_evaluator.evaluate(train_predict_df)
print('random train ndcg: ', random_train_ndcg)

random_test_ndcg = ndcg_evaluator.evaluate(test_predict_df)
print('random test ndcg: ', random_test_ndcg)

NameError: name 'train_predict_df' is not defined

In [21]:
# Truncated (top-10) NDCG computed in SQL over the "predictions_df" temp view.
#   p: per user, DCG over the 10 rows with the highest prediction
#      (sum of rating / log(1 + pred_row_num) where pred_row_num <= 10)
#   a: per user, IDCG over the 10 rows with the highest rating
#      (sum of rating / log(1 + actual_row_num) where actual_row_num <= 10)
# The outer select joins p and a on user and returns 1 - avg(dcg / idcg);
# note the column is aliased `ndcg` even though the value is one minus the
# mean per-user NDCG.
# NOTE(review): presumably this reproduces what NDCGEvaluator reports -- confirm
# against eval_model.
df6 = spark.sql(
'''
select 1 - avg(p.dcg / a.idcg) as ndcg
from (
    select
        x.user,
        sum(x.rating / log(1 + x.pred_row_num)) as dcg
    from (
        select
            user,
            rating,
            row_number() OVER (
                PARTITION BY user
                ORDER BY prediction DESC
            ) as pred_row_num
        from predictions_df
    ) x 
    where x.pred_row_num <= 10
    group by x.user
) p
join (
    select
        x.user,
        sum(x.rating / log(1 + x.actual_row_num)) as idcg
    from (
        select
            user,
            rating,
            row_number() OVER (
                PARTITION BY user
                ORDER BY rating DESC
            ) as actual_row_num
        from predictions_df
    ) x 
    where x.actual_row_num <= 10
    group by x.user
) a on a.user = p.user
''')

# The query yields a single row with a single column; pull out that scalar.
df6.collect()[0][0]

0.04299683846813529