In [22]:
from recommender import Recommender
from pyspark.ml.evaluation import RegressionEvaluator
from eval_model import TopQuantileEvaluator, NDCGEvaluator
from pyspark.sql import functions as F

In [26]:
# Metric objects shared by the evaluation cells below.
# RMSE compares the `prediction` column against the `rating` label.
rmse_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)

# Project-local evaluators from eval_model, built with their default settings.
quant_evaluator = TopQuantileEvaluator()
ndcg_evaluator = NDCGEvaluator()

In [5]:
# Load restaurant reviews
reviews_df = spark.read.parquet('../data/ratings_ugt10_igt10')

# Randomly split data into train and test datasets (75% / 25%).
# A fixed seed makes the split repeatable for the same input data, so the
# metrics computed further down can be compared between runs.
train_df, test_df = reviews_df.randomSplit(weights=[0.75, 0.25], seed=42)

# printSchema() writes the schema to stdout itself and returns None, so it
# must not be wrapped in print() (that only adds a stray "None" line).
train_df.printSchema()

root
 |-- user: integer (nullable = true)
 |-- item: integer (nullable = true)
 |-- rating: byte (nullable = true)

None


In [35]:
# Fit the project-local Recommender on the training split.  The keyword
# arguments are passed through unchanged; see the recommender module for
# what each hyperparameter controls.
estimator = Recommender(
    useALS=True,
    useBias=True,
    lambda_1=10,
    lambda_2=15,
    userCol='user',
    itemCol='item',
    ratingCol='rating',
    rank=64,
    regParam=1,
    maxIter=10,
    nonnegative=True
)
model = estimator.fit(train_df)

# Score both splits so training and held-out metrics can be compared.
train_predictions_df = model.transform(train_df)
predictions_df = model.transform(test_df)

# printSchema() prints the schema itself and returns None; wrapping it in
# print() only emitted an extra "None" line.
predictions_df.printSchema()

root
 |-- user: integer (nullable = true)
 |-- item: integer (nullable = true)
 |-- rating: byte (nullable = true)
 |-- prediction: double (nullable = true)

None


In [37]:
# RMSE and top-quantile scoring are kept for reference but switched off:
# print('rmse: ', rmse_evaluator.evaluate(predictions_df))
# print('quant: ', quant_evaluator.evaluate(predictions_df))

# Report the NDCG evaluator's score on the rows the model was fitted on and
# on the held-out rows, in that order.
for label, scored_df in (
    ('train ndcg: ', train_predictions_df),
    ('test ndcg: ', predictions_df),
):
    print(label, ndcg_evaluator.evaluate(scored_df))

train ndcg:  0.043035552284963474
test ndcg:  0.03527629154321843


In [21]:
# Peek at the first 40 scored test rows (returned as a list of Row objects).
predictions_df.take(40)

[Row(user=148, item=1238, rating=4, prediction=3.0721671951491594),
 Row(user=148, item=737, rating=3, prediction=2.727083029997713),
 Row(user=148, item=2027, rating=4, prediction=3.2679619443505388),
 Row(user=148, item=321, rating=4, prediction=2.9963406890569564),
 Row(user=148, item=1160, rating=4, prediction=2.9338471537292614),
 Row(user=148, item=155, rating=4, prediction=3.169726367238459),
 Row(user=148, item=368, rating=3, prediction=3.236486560454802),
 Row(user=148, item=939, rating=3, prediction=2.1408803235644607),
 Row(user=148, item=784, rating=4, prediction=2.914118032422511),
 Row(user=148, item=2678, rating=4, prediction=2.9181141687548333),
 Row(user=148, item=192, rating=4, prediction=2.4363321150391783),
 Row(user=148, item=973, rating=3, prediction=2.0869361684254004),
 Row(user=148, item=3673, rating=4, prediction=2.884033727332037),
 Row(user=148, item=1295, rating=3, prediction=2.2001822043387502),
 Row(user=148, item=1197, rating=3, prediction=2.659467214113

In [22]:
# Number of held-out ratings per user; show the first 10 groups.
ratings_per_user_df = predictions_df.groupBy('user').count()
ratings_per_user_df.head(10)

[Row(user=148, count=47),
 Row(user=463, count=25),
 Row(user=471, count=30),
 Row(user=496, count=24),
 Row(user=833, count=22),
 Row(user=1088, count=24),
 Row(user=1238, count=22),
 Row(user=1342, count=22),
 Row(user=1580, count=12),
 Row(user=1591, count=10)]

In [38]:
# Expose the scored test rows to Spark SQL under the name `predictions_df`.
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0; it also makes re-running this cell safe.
predictions_df.createOrReplaceTempView("predictions_df")

# Per-user NDCG over each user's full list of held-out ratings:
#   dcg  = sum(rating / log(1 + rank by prediction))
#   idcg = sum(rating / log(1 + rank by rating))
# log() is the natural log here; the base cancels in the dcg / idcg ratio.
df2 = spark.sql(
'''

    select
        user,
        sum(dcg) / sum(idcg) as ndcg
    from (
        select
            user,
            rating / log(1 + 
                row_number() OVER (
                    PARTITION BY user
                    ORDER BY prediction DESC
                )
            ) as dcg,
            rating / log(1 + 
                row_number() OVER (
                        PARTITION BY user
                        ORDER BY rating DESC
                    )
            ) as idcg
        from predictions_df
    ) x
    group by user

'''
)
df2.show(10)

+----+------------------+
|user|              ndcg|
+----+------------------+
| 148| 0.984523844772983|
| 463|0.9541466776932048|
| 471|0.9793995534308781|
| 496|0.9260704730859796|
| 833| 0.955982639122043|
|1088|0.9838440054122111|
|1238|0.9704361098941022|
|1342|0.9284520936553509|
|1580|0.9545193681635726|
|1591|0.9682687304631423|
+----+------------------+
only showing top 10 rows



In [43]:
# Row-level breakdown of the NDCG terms for a single user (496): each held-out
# rating with its DCG term, its IDCG term, and the two ranks they are based on,
# listed in predicted order.
user_496_ranking_sql = '''
select
    user,
    item,
    rating,
    prediction,
    rating / log(1 + 
        row_number() OVER (
            PARTITION BY user
            ORDER BY prediction DESC
        )
    ) as dcg,
    rating / log(1 + 
        row_number() OVER (
                PARTITION BY user
                ORDER BY rating DESC
            )
    ) as idcg,
    row_number() OVER (
        PARTITION BY user
        ORDER BY prediction DESC
    ) as pred_row_num,
    row_number() OVER (
        PARTITION BY user
        ORDER BY rating DESC
    ) as ideal_row_num
from predictions_df
where user = 496
order by pred_row_num
'''
df3 = spark.sql(user_496_ranking_sql)
df3.show(100)

+----+----+------+------------------+------------------+------------------+------------+-------------+
|user|item|rating|        prediction|               dcg|              idcg|pred_row_num|ideal_row_num|
+----+----+------+------------------+------------------+------------------+------------+-------------+
| 496| 523|     3|2.8998835116373876| 4.328085122666891|1.0379287687835808|           1|           17|
| 496| 239|     4|2.7324467879128322|3.6409569065073493|1.9235933878519513|           2|            7|
| 496|1459|     5|2.6713102176594115|3.6067376022224087|3.6067376022224087|           3|            3|
| 496|  52|     4| 2.666420410800084|2.4853397382384474|2.4853397382384474|           4|            4|
| 496|  68|     3|2.6417538596585164|1.6743318796537419|0.9705463594460175|           5|           21|
| 496| 191|     3|2.6143576624590734|1.5416950271092522|1.0820212806667227|           6|           15|
| 496|1189|     3| 2.613882856208164|1.4426950408889636|1.058868371594283

In [27]:
test_df = spark.createDataFrame([
    (1,1,1,3.8), (1,2,3,3.8), (1,3,1,3.8), (1,4,1,3.8), (1,5,5,3.8),
    (1,6,4,3.8), (1,7,5,3.8), (1,8,5,3.8), (1,9,5,3.8), (1,10,5,3.8),
],['user','item','rating', 'prediction'])

In [28]:
test_df.registerTempTable("test_df")

In [80]:
df3 = spark.sql(
'''
select
    user,
    item,
    rating,
    prediction,
    rating / log(1 + 
        row_number() OVER (
            PARTITION BY user
            ORDER BY prediction DESC
        )
    ) as dcg,
    rating / log(1 + 
        row_number() OVER (
                PARTITION BY user
                ORDER BY rating DESC
            )
    ) as idcg,
    row_number() OVER (
        PARTITION BY user
        ORDER BY prediction DESC
    ) as pred_row_num,
    row_number() OVER (
        PARTITION BY user
        ORDER BY rating DESC
    ) as ideal_row_num
from test_df
'''
)
df3.show(100)

+----+----+------+----------+------------------+-------------------+------------+-------------+
|user|item|rating|prediction|               dcg|               idcg|pred_row_num|ideal_row_num|
+----+----+------+----------+------------------+-------------------+------------+-------------+
|   1|   5|     5|       3.8|2.7905531327562363|  7.213475204444817|           5|            1|
|   1|   7|     5|       3.8|2.4044917348149393|  4.551196133134186|           7|            2|
|   1|   8|     5|       3.8| 2.275598066567093| 3.6067376022224087|           8|            3|
|   1|   9|     5|       3.8|2.1714724095162588| 3.1066746727980594|           9|            4|
|   1|  10|     5|       3.8|2.0851619571212314| 2.7905531327562363|          10|            5|
|   1|   6|     4|       3.8| 2.055593369479003|  2.055593369479003|           6|            6|
|   1|   2|     3|       3.8| 2.730717679880512| 1.4426950408889636|           2|            7|
|   1|   1|     1|       3.8|1.442695040

In [44]:
# Random baseline for the training split: every row is "predicted" as the
# global mean training rating plus standard-normal noise.

# Single-row frame holding the mean rating of the training split.
avg_rating_df = train_df.agg(F.avg('rating').alias('avg_rating'))

train_predict_df = (
    train_df
    # avg_rating_df has exactly one row, so the cross join just attaches
    # the mean to every training row.
    .crossJoin(avg_rating_df)
    # Seed the noise so the baseline scores are repeatable between runs.
    .withColumn('prediction', F.col('avg_rating') + F.randn(seed=42))
    .select('user', 'item', 'rating', 'prediction')
)

train_predict_df.show()

+----+----+------+------------------+
|user|item|rating|        prediction|
+----+----+------+------------------+
|   0|  18|     4| 2.845251225832589|
|   0|  22|     4|3.7780841942350936|
|   0|  32|     4|3.5015955248484967|
|   0|  34|     3| 2.539410315736408|
|   0|  35|     5| 2.989612054697336|
|   0|  36|     3|2.9835207766459804|
|   0|  50|     5| 4.751427589064557|
|   0|  62|     4|4.3471063407585255|
|   0|  74|     5| 4.236181802869641|
|   0|  77|     3|2.9774304554658917|
|   0|  78|     5| 4.032247795671046|
|   0|  98|     5| 3.053165060021244|
|   0| 106|     4| 5.444505505285591|
|   0| 116|     5|3.7787341474863587|
|   0| 134|     3| 2.606263050505312|
|   0| 136|     2| 4.068880120395524|
|   0| 146|     3| 4.606829789836579|
|   0| 149|     5| 4.107445297497457|
|   0| 157|     4|3.4871064842359045|
|   0| 161|     4|  5.51766677232951|
+----+----+------+------------------+
only showing top 20 rows



In [45]:
# Expose the random-baseline training predictions to Spark SQL.
# createOrReplaceTempView replaces the deprecated registerTempTable.
train_predict_df.createOrReplaceTempView("train_predict_df")

In [46]:
# Per-row DCG / IDCG breakdown for user 148 under the random baseline
# (training split), together with the predicted and ideal ranks.
user_148_train_sql = '''
select
    user,
    item,
    rating,
    prediction,
    rating / log(1 + 
        row_number() OVER (
            PARTITION BY user
            ORDER BY prediction DESC
        )
    ) as dcg,
    rating / log(1 + 
        row_number() OVER (
                PARTITION BY user
                ORDER BY rating DESC
            )
    ) as idcg,
    row_number() OVER (
        PARTITION BY user
        ORDER BY prediction DESC
    ) as pred_row_num,
    row_number() OVER (
        PARTITION BY user
        ORDER BY rating DESC
    ) as ideal_row_num
from train_predict_df
where user = 148
'''
df4 = spark.sql(user_148_train_sql)
df4.show(200)

+----+----+------+------------------+-------------------+-------------------+------------+-------------+
|user|item|rating|        prediction|                dcg|               idcg|pred_row_num|ideal_row_num|
+----+----+------+------------------+-------------------+-------------------+------------+-------------+
| 148|1009|     4| 6.043736832080144| 5.7707801635558535| 1.1439986700106974|           1|           32|
| 148| 820|     3| 5.747410210612879|  2.730717679880512| 0.6700411705232908|           2|           87|
| 148|  46|     4| 5.714872401741406| 2.8853900817779268| 1.0277966847395015|           3|           48|
| 148| 210|     4| 5.695302923644237| 2.4853397382384474|  0.926464623598867|           4|           74|
| 148| 236|     5| 5.629216907061604| 2.7905531327562363| 1.7298812813059679|           5|           17|
| 148|  23|     1| 5.626099531972864| 0.5138983423697507|0.20544281918875867|           6|          129|
| 148|2529|     3| 5.370609111866672| 1.442695040888963

In [47]:
# Random baseline for the test split: the global mean *training* rating plus
# standard-normal noise, mirroring how train_predict_df is built.
test_predict_df = (
    test_df
    # avg_rating_df is a single-row frame, so this attaches the mean to
    # every test row.
    .crossJoin(avg_rating_df)
    # Seed the noise so the baseline scores are repeatable between runs.
    .withColumn('prediction', F.col('avg_rating') + F.randn(seed=43))
    .select('user', 'item', 'rating', 'prediction')
)

test_predict_df.show()

+----+----+------+------------------+
|user|item|rating|        prediction|
+----+----+------+------------------+
|   0|  43|     5| 3.947406293572324|
|   0|  70|     4| 3.984101854285321|
|   0| 190|     4|3.8266331837719907|
|   0| 198|     5| 4.169422714751705|
|   0| 230|     4| 5.659353184495515|
|   0| 350|     5|3.9391224853297873|
|   0| 366|     3| 4.967832435353692|
|   0| 403|     2|4.8343555473009925|
|   0| 408|     3|2.5793599087928234|
|   0| 434|     4| 4.263477718785684|
|   0| 588|     4|  5.50305505239085|
|   0| 831|     4|3.1994677650259304|
|   0| 836|     4| 4.751692706438475|
|   0| 873|     4|  4.22214242246994|
|   0| 935|     2|  2.65093223378976|
|   0| 968|     2| 2.969307810704048|
|   0|1214|     4|   4.2903001340233|
|   0|1410|     2| 4.266772021599628|
|   0|1870|     4| 5.745538082044348|
|   0|1872|     4|1.9280815070081077|
+----+----+------+------------------+
only showing top 20 rows



In [48]:
# Expose the random-baseline test predictions to Spark SQL.
# createOrReplaceTempView replaces the deprecated registerTempTable.
test_predict_df.createOrReplaceTempView("test_predict_df")

In [49]:
# Per-row DCG / IDCG breakdown for user 148 under the random baseline
# (test split).  The query's LIMIT has no ORDER BY, so which 10 rows come
# back is not guaranteed.
user_148_test_sql = '''
select
    user,
    item,
    rating,
    prediction,
    rating / log(1 + 
        row_number() OVER (
            PARTITION BY user
            ORDER BY prediction DESC
        )
    ) as dcg,
    rating / log(1 + 
        row_number() OVER (
                PARTITION BY user
                ORDER BY rating DESC
            )
    ) as idcg,
    row_number() OVER (
        PARTITION BY user
        ORDER BY prediction DESC
    ) as pred_row_num,
    row_number() OVER (
        PARTITION BY user
        ORDER BY rating DESC
    ) as ideal_row_num
from test_predict_df
where user = 148
limit 10
'''
df5 = spark.sql(user_148_test_sql)
df5.show(200)

+----+----+------+------------------+------------------+------------------+------------+-------------+
|user|item|rating|        prediction|               dcg|              idcg|pred_row_num|ideal_row_num|
+----+----+------+------------------+------------------+------------------+------------+-------------+
| 148| 172|     5|  4.94477052299465|3.1066746727980594| 7.213475204444817|           4|            1|
| 148|2805|     5|4.6683549089478635|2.0851619571212314| 4.551196133134186|          10|            2|
| 148|2226|     5| 4.589545345464705| 2.012148021909223|3.6067376022224087|          11|            3|
| 148|   4|     5| 4.520624541846984|1.9493562262564004|3.1066746727980594|          12|            4|
| 148| 511|     5|  4.49319650153454| 1.894615908449756|2.7905531327562363|          13|            5|
| 148| 935|     5| 4.454960668087284|1.8033688011112043|2.5694917118487535|          15|            6|
| 148|1113|     5| 4.101132611689691|1.6175772657433625|2.404491734814939

In [51]:
# Top-10 variant computed over the model's test predictions.
# For every user, DCG sums rating / log(1 + rank) over the 10 highest
# predictions and IDCG does the same over the 10 highest ratings.
# Note the selected value is 1 - avg(dcg / idcg) even though the column is
# aliased `ndcg`, so lower is better.
top10_ndcg_sql = '''
select 1 - avg(p.dcg / a.idcg) as ndcg
from (
    select
        x.user,
        sum(x.rating / log(1 + x.pred_row_num)) as dcg
    from (
        select
            user,
            rating,
            row_number() OVER (
                PARTITION BY user
                ORDER BY prediction DESC
            ) as pred_row_num
        from predictions_df
    ) x 
    where x.pred_row_num <= 10
    group by x.user
) p
join (
    select
        x.user,
        sum(x.rating / log(1 + x.actual_row_num)) as idcg
    from (
        select
            user,
            rating,
            row_number() OVER (
                PARTITION BY user
                ORDER BY rating DESC
            ) as actual_row_num
        from predictions_df
    ) x 
    where x.actual_row_num <= 10
    group by x.user
) a on a.user = p.user
'''
df6 = spark.sql(top10_ndcg_sql)

# The aggregate yields one row with one column; pull out that scalar.
df6.collect()[0][0]

0.06682208677101731

In [63]:
# test top N ndcg implementation
def eval_ndcg(df, top_n=5):
    """Return ``1 - mean(NDCG@top_n)`` over the users in ``df``.

    For each user the rows are ranked twice -- by ``prediction`` and by
    ``rating``, both descending -- and only the first ``top_n`` rows of each
    ranking are kept.  DCG is ``sum(rating / log(1 + rank))`` over the
    prediction ranking and IDCG is the same sum over the rating ranking.
    The natural log is used; the base cancels in the DCG / IDCG ratio.

    Side effect: ``df`` is registered as the temp view ``df`` on the global
    ``spark`` session, replacing any view already registered under that name.

    Args:
        df: Spark DataFrame with ``user``, ``rating`` and ``prediction``
            columns.
        top_n: cut-off rank, a positive integer.  Defaults to 5, the value
            that used to be hard-coded in the query.

    Returns:
        The single value produced by the query.  Lower is better: 0 means
        DCG equals IDCG for every user.

    Raises:
        ValueError: if ``top_n`` is not a positive integer.
    """
    # Coerce to int before interpolating so nothing but a number can ever be
    # spliced into the SQL text.
    cutoff = int(top_n)
    if cutoff != top_n or cutoff < 1:
        raise ValueError(
            'top_n must be a positive integer, got {!r}'.format(top_n))

    # createOrReplaceTempView replaces the deprecated registerTempTable.
    df.createOrReplaceTempView("df")

    score_df = spark.sql(
    '''
    select 1 - avg(p.dcg / a.idcg) as ndcg
    from (
        select
            x.user,
            sum(x.rating / log(1 + x.pred_row_num)) as dcg
        from (
            select
                user,
                rating,
                row_number() OVER (
                    PARTITION BY user
                    ORDER BY prediction DESC
                ) as pred_row_num
            from df
        ) x 
        where x.pred_row_num <= {top_n}
        group by x.user
    ) p
    join (
        select
            x.user,
            sum(x.rating / log(1 + x.actual_row_num)) as idcg
        from (
            select
                user,
                rating,
                row_number() OVER (
                    PARTITION BY user
                    ORDER BY rating DESC
                ) as actual_row_num
            from df
        ) x 
        where x.actual_row_num <= {top_n}
        group by x.user
    ) a on a.user = p.user
    '''.format(top_n=cutoff)
    )

    # The aggregate yields exactly one row with one column.
    return score_df.collect()[0][0]

In [64]:
# Score the random baseline with the project evaluator and with the SQL
# implementation defined in this notebook.
print('random train ndcg: ', ndcg_evaluator.evaluate(train_predict_df))
print('random test ndcg: ', ndcg_evaluator.evaluate(test_predict_df))
# As called here, eval_ndcg truncates each ranking at 5 rows, so the labels
# say ndcg_5 (they previously claimed ndcg_10).
print('random train ndcg_5: ', eval_ndcg(train_predict_df))
print('random test ndcg_5: ', eval_ndcg(test_predict_df))

random train ndcg:  0.07128057275355115
random test ndcg:  0.054524266291082535
random train ndcg_10:  0.1935072521480279
random test ndcg_10:  0.09208170274153493
