In [43]:
from recommender import Recommender
from pyspark.ml.evaluation import RegressionEvaluator
from eval_model import TopQuantileEvaluator, NDCGEvaluator, NDCG10Evaluator
from pyspark.sql import functions as F

In [44]:
# Metric objects reused by the scoring cells further down.
# RMSE is computed between the "rating" label column and the "prediction" column.
rmse_evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

# Project-local evaluators imported from eval_model, constructed with default arguments.
quant_evaluator = TopQuantileEvaluator()
ndcg_evaluator = NDCGEvaluator()
ndcg10_evaluator = NDCG10Evaluator()

In [40]:
# Load restaurant reviews
reviews_df = spark.read.parquet('../data/ratings_ugt10_igt10')

# Randomly split data into train (75%) and test (25%) datasets.
# The fixed seed keeps the split -- and every metric derived from it --
# stable across kernel restarts instead of changing on each run.
train_df, test_df = reviews_df.randomSplit(weights=[0.75, 0.25], seed=42)

# printSchema() writes the schema to stdout itself and returns None,
# so wrapping it in print() only adds a stray "None" line to the output.
train_df.printSchema()

root
 |-- user: integer (nullable = true)
 |-- item: integer (nullable = true)
 |-- rating: byte (nullable = true)

None


In [53]:
# Configure the project-local Recommender; the hyperparameters below are
# passed through unchanged (their meaning is defined in the recommender module).
estimator = Recommender(
    useALS=True,
    useBias=True,
    lambda_1=7,
    lambda_2=12,
    userCol='user',
    itemCol='item',
    ratingCol='rating',
    rank=76,
    regParam=0.7,
    maxIter=15,
    nonnegative=True
)
model = estimator.fit(train_df)

# Score both splits so train and test metrics can be compared.
train_predictions_df = model.transform(train_df)
predictions_df = model.transform(test_df)

# printSchema() prints by itself and returns None; do not wrap it in print().
predictions_df.printSchema()

# Expose both prediction frames to spark.sql() under the same names.
# createOrReplaceTempView replaces the deprecated registerTempTable and is
# safe to re-run because it overwrites an existing view of the same name.
train_predictions_df.createOrReplaceTempView("train_predictions_df")
predictions_df.createOrReplaceTempView("predictions_df")

root
 |-- user: integer (nullable = true)
 |-- item: integer (nullable = true)
 |-- rating: byte (nullable = true)
 |-- prediction: double (nullable = true)

None


In [54]:
# RMSE and top-quantile scoring are currently switched off:
# print('rmse: ', rmse_evaluator.evaluate(predictions_df))
# print('quant: ', quant_evaluator.evaluate(predictions_df))

# Score the train and test predictions with both NDCG evaluators,
# printing one labelled result per line in the order listed.
for caption, evaluator, scored_df in (
    ('train ndcg: ', ndcg_evaluator, train_predictions_df),
    ('test ndcg: ', ndcg_evaluator, predictions_df),
    ('train ndcg10: ', ndcg10_evaluator, train_predictions_df),
    ('test ndcg10: ', ndcg10_evaluator, predictions_df),
):
    print(caption, evaluator.evaluate(scored_df))

train ndcg:  0.042821942740194796
test ndcg:  0.03582585416572548
train ndcg10:  0.08353896455358423
test ndcg10:  0.043665789170521885


In [46]:
# Pull the first 40 scored test rows to eyeball predictions next to the true ratings.
predictions_df.head(n=40)

[Row(user=148, item=1238, rating=4, prediction=3.3150130676480636),
 Row(user=148, item=623, rating=3, prediction=3.4296339710792374),
 Row(user=148, item=588, rating=5, prediction=3.2206168163144557),
 Row(user=148, item=3062, rating=4, prediction=2.774133942817688),
 Row(user=148, item=321, rating=4, prediction=3.20498255146401),
 Row(user=148, item=642, rating=4, prediction=3.0863614245593887),
 Row(user=148, item=939, rating=3, prediction=2.311904546767229),
 Row(user=148, item=210, rating=4, prediction=3.3664048779724496),
 Row(user=148, item=236, rating=5, prediction=3.7750855149844114),
 Row(user=148, item=1085, rating=4, prediction=2.887221045362643),
 Row(user=148, item=140, rating=5, prediction=3.652437360852982),
 Row(user=148, item=1311, rating=4, prediction=3.165601377220902),
 Row(user=148, item=715, rating=4, prediction=3.2322611294828576),
 Row(user=148, item=992, rating=5, prediction=3.5874774410188017),
 Row(user=148, item=699, rating=5, prediction=3.396021077389607),

In [48]:
# The ten users with the most rows in the test predictions, largest count first.
(
    predictions_df
    .groupBy('user')
    .count()
    .orderBy('count', ascending=False)
    .head(10)
)

[Row(user=0, count=236),
 Row(user=1, count=179),
 Row(user=2, count=168),
 Row(user=3, count=142),
 Row(user=4, count=139),
 Row(user=12, count=114),
 Row(user=6, count=113),
 Row(user=7, count=113),
 Row(user=8, count=104),
 Row(user=10, count=98)]

In [49]:
# Per-user NDCG over ALL of a user's test rows.
# Each row contributes rating / log2(1 + rank): ranked by prediction for DCG,
# ranked by rating for IDCG; the two sums are divided per user.
df2a = spark.sql(
'''
    select
        user,
        sum(dcg) / sum(idcg) as ndcg
    from (
        select
            user,
            rating / log(2, 1 + 
                row_number() OVER (
                    PARTITION BY user
                    ORDER BY prediction DESC
                )
            ) as dcg,
            rating / log(2, 1 + 
                row_number() OVER (
                        PARTITION BY user
                        ORDER BY rating DESC
                    )
            ) as idcg
        from predictions_df
    ) x
    group by user

'''
)


# Per-user NDCG restricted to the top 10 positions.
# p: DCG over the 10 rows with the highest prediction.
# a: IDCG over the 10 rows with the highest rating.
df2b = spark.sql(
'''
    select 
        p.user,
        p.dcg / a.idcg as ndcg10
    from (
        select
            x.user,
            sum(x.rating / log(2, 1 + x.pred_row_num)) as dcg
        from (
            select
                user,
                rating,
                row_number() OVER (
                    PARTITION BY user
                    ORDER BY prediction DESC
                ) as pred_row_num
            from predictions_df
        ) x 
        where x.pred_row_num <= 10
        group by x.user
    ) p
    join (
        select
            x.user,
            sum(x.rating / log(2, 1 + x.actual_row_num)) as idcg
        from (
            select
                user,
                rating,
                row_number() OVER (
                    PARTITION BY user
                    ORDER BY rating DESC
                ) as actual_row_num
            from predictions_df
        ) x 
        where x.actual_row_num <= 10
        group by x.user
    ) a on a.user = p.user
'''
)

# show() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line after each table.
df2a.show(10)
df2b.show(10)

+----+------------------+
|user|              ndcg|
+----+------------------+
| 148|0.9728784174019554|
| 463|0.9722430018359393|
| 471|0.9643870276112828|
| 496|0.9487168757638186|
| 833|0.9757471111120347|
|1088|0.9818244946677487|
|1238|0.9837089947755142|
|1342|0.8730689413392722|
|1580|0.9182753535720931|
|1591|0.9623163943935841|
+----+------------------+
only showing top 10 rows

None
+----+------------------+
|user|            ndcg10|
+----+------------------+
| 148|0.8552441410049546|
| 463|0.8998231837904668|
| 471|0.8904629554697352|
| 496|0.9022594525255184|
| 833|0.9424659505448238|
|1088|0.9586566196543211|
|1238|0.9281860059485352|
|1342| 0.740790364549004|
|1580|0.9182753535720931|
|1591|0.9158825646429439|
+----+------------------+
only showing top 10 rows

None


In [55]:
# Row-level NDCG breakdown for a single user (user = 1088) of the test predictions.
# For every row of that user the query emits:
#   dcg           - rating / log2(1 + rank when rows are sorted by prediction DESC)
#   idcg          - rating / log2(1 + rank when rows are sorted by rating DESC)
#   pred_row_num  - the prediction-based rank
#   ideal_row_num - the rating-based rank
# Rows come back in predicted-rank order; up to 100 of them are displayed.
df3 = spark.sql(
'''
select
    user,
    item,
    rating,
    prediction,
    rating / log(2, 1 + 
        row_number() OVER (
            PARTITION BY user
            ORDER BY prediction DESC
        )
    ) as dcg,
    rating / log(2, 1 + 
        row_number() OVER (
                PARTITION BY user
                ORDER BY rating DESC
            )
    ) as idcg,
    row_number() OVER (
        PARTITION BY user
        ORDER BY prediction DESC
    ) as pred_row_num,
    row_number() OVER (
        PARTITION BY user
        ORDER BY rating DESC
    ) as ideal_row_num
from predictions_df
where user = 1088
order by pred_row_num
'''
)
df3.show(100)

+----+----+------+------------------+-------------------+-------------------+------------+-------------+
|user|item|rating|        prediction|                dcg|               idcg|pred_row_num|ideal_row_num|
+----+----+------+------------------+-------------------+-------------------+------------+-------------+
|1088| 658|     5|3.5237720644899895|                5.0|                5.0|           1|            1|
|1088| 399|     4|3.3947158620042472| 2.5237190142858297|                2.0|           2|            3|
|1088| 873|     5|3.3131666746353954|                2.5|  3.154648767857287|           3|            2|
|1088| 150|     4|3.2764287410539943| 1.7227062322935722| 1.7227062322935722|           4|            4|
|1088|   1|     3| 3.244822264374948| 1.1605584217036249| 0.7879486051115807|           5|           13|
|1088|  58|     4|3.1331424303603477| 1.4248287484320887| 1.5474112289381665|           6|            5|
|1088|  33|     4|3.0242493870936222| 1.333333333333333

In [31]:
# Toy fixture: one user, ten items, varied ratings, and the SAME prediction (3.8)
# for every row, i.e. a predictor that expresses no preference between items.
# Tuple layout matches the column list: (user, item, rating, prediction).
toy_df = spark.createDataFrame([
    (1,1,1,3.8), (1,2,3,3.8), (1,3,1,3.8), (1,4,1,3.8), (1,5,5,3.8),
    (1,6,4,3.8), (1,7,5,3.8), (1,8,5,3.8), (1,9,5,3.8), (1,10,5,3.8),
],['user','item','rating', 'prediction'])

# Make the fixture queryable from spark.sql(). createOrReplaceTempView replaces
# the deprecated registerTempTable and can be re-run without error.
toy_df.createOrReplaceTempView("toy_df")

In [33]:
# Row-level NDCG breakdown over the toy_df view (every prediction is 3.8).
# Emits, per row: dcg and idcg (rating / log2(1 + rank), ranked by prediction DESC
# and by rating DESC respectively) plus the two ranks themselves.
# NOTE(review): with all predictions tied, ORDER BY prediction DESC does not define
# an order, so row_number() assigns pred_row_num arbitrarily among the tied rows.
# If Spark happens to reuse the rating ordering, dcg equals idcg and a constant
# predictor looks perfect -- confirm how the evaluators break ties.
# This rebinds df3, replacing any earlier value of that name.
df3 = spark.sql(
'''
select
    user,
    item,
    rating,
    prediction,
    rating / log(2, 1 + 
        row_number() OVER (
            PARTITION BY user
            ORDER BY prediction DESC
        )
    ) as dcg,
    rating / log(2, 1 + 
        row_number() OVER (
                PARTITION BY user
                ORDER BY rating DESC
            )
    ) as idcg,
    row_number() OVER (
        PARTITION BY user
        ORDER BY prediction DESC
    ) as pred_row_num,
    row_number() OVER (
        PARTITION BY user
        ORDER BY rating DESC
    ) as ideal_row_num
from toy_df
'''
)
df3.show(100)

+----+----+------+----------+-------------------+-------------------+------------+-------------+
|user|item|rating|prediction|                dcg|               idcg|pred_row_num|ideal_row_num|
+----+----+------+----------+-------------------+-------------------+------------+-------------+
|   1|   5|     5|       3.8|                5.0|                5.0|           1|            1|
|   1|   7|     5|       3.8|  3.154648767857287|  3.154648767857287|           2|            2|
|   1|   8|     5|       3.8|                2.5|                2.5|           3|            3|
|   1|   9|     5|       3.8| 2.1533827903669653| 2.1533827903669653|           4|            4|
|   1|  10|     5|       3.8|  1.934264036172708|  1.934264036172708|           5|            5|
|   1|   6|     4|       3.8| 1.4248287484320887| 1.4248287484320887|           6|            6|
|   1|   2|     3|       3.8|                1.0|                1.0|           7|            7|
|   1|   1|     1|       3.8| 

In [19]:
# One-row frame holding the mean training rating.
avg_rating_df = (
    train_df
    .agg(
        F.avg('rating').alias('avg_rating')
    )
)

# Random baseline for the training split: prediction = mean rating + N(0, 1) noise.
# The cross join attaches the single avg_rating value to every row.
# randn() is seeded so the baseline scores do not change on every re-run.
train_predict_df = (
    train_df
    .crossJoin(avg_rating_df)
    .withColumn(
        'prediction',
        F.col('avg_rating') + F.randn(seed=42)
    )
    .select(
        'user',
        'item',
        'rating',
        'prediction'
    )
)

# createOrReplaceTempView replaces the deprecated registerTempTable and is re-runnable.
train_predict_df.createOrReplaceTempView("train_predict_df")

train_predict_df.show()

+----+----+------+------------------+
|user|item|rating|        prediction|
+----+----+------+------------------+
|   0|  18|     4| 4.205746043301845|
|   0|  22|     4|2.7445256104127926|
|   0|  32|     4| 4.983120720496773|
|   0|  34|     3|1.9849535130973968|
|   0|  35|     5|3.1966399651712267|
|   0|  36|     3| 3.048048898588427|
|   0|  50|     5|  3.90836757442086|
|   0|  62|     4|3.8165881878911465|
|   0|  70|     4| 4.047189576890129|
|   0|  74|     5|2.6096747504398348|
|   0|  78|     5|5.5969979073330896|
|   0|  98|     5| 5.153708334731422|
|   0| 116|     5| 3.998787456599543|
|   0| 134|     3|3.4128474009978653|
|   0| 136|     2|3.3225291797482783|
|   0| 149|     5|3.0166765191445477|
|   0| 161|     4|5.0159909166836005|
|   0| 198|     5|2.5904302401800092|
|   0| 217|     3|2.8331438417314114|
|   0| 222|     5|3.2404804322849694|
+----+----+------+------------------+
only showing top 20 rows



In [None]:
# Row-level NDCG breakdown for user 148 over the train_predict_df view.
# Per row: dcg / idcg are rating / log2(1 + rank), ranked by prediction DESC and by
# rating DESC respectively; pred_row_num / ideal_row_num are those two ranks.
# Rows are returned in predicted-rank order and the first 20 are displayed.
df4 = spark.sql(
'''
select
    user,
    item,
    rating,
    prediction,
    rating / log(2, 1 + 
        row_number() OVER (
            PARTITION BY user
            ORDER BY prediction DESC
        )
    ) as dcg,
    rating / log(2, 1 + 
        row_number() OVER (
                PARTITION BY user
                ORDER BY rating DESC
            )
    ) as idcg,
    row_number() OVER (
        PARTITION BY user
        ORDER BY prediction DESC
    ) as pred_row_num,
    row_number() OVER (
        PARTITION BY user
        ORDER BY rating DESC
    ) as ideal_row_num
from train_predict_df
where user = 148
order by pred_row_num
'''
)
df4.show(20)

In [24]:
# Random baseline for the test split: prediction = avg_rating + N(0, 1) noise,
# where avg_rating is the single value carried by avg_rating_df.
# randn() is seeded so the baseline scores do not change on every re-run.
test_predict_df = (
    test_df
    .crossJoin(avg_rating_df)
    .withColumn(
        'prediction',
        F.col('avg_rating') + F.randn(seed=42)
    )
    .select(
        'user',
        'item',
        'rating',
        'prediction'
    )
)

# createOrReplaceTempView replaces the deprecated registerTempTable and is re-runnable.
test_predict_df.createOrReplaceTempView("test_predict_df")

test_predict_df.show()

+----+----+------+------------------+
|user|item|rating|        prediction|
+----+----+------+------------------+
|   0|  43|     5| 3.997778279805047|
|   0|  77|     3| 2.749095691273272|
|   0| 106|     4|  2.98862019165168|
|   0| 146|     3|2.7637004759381076|
|   0| 157|     4| 4.760402944730159|
|   0| 188|     3| 3.750539876730257|
|   0| 190|     4|3.9008672499591897|
|   0| 236|     4|6.6272952593902765|
|   0| 356|     5| 4.048894507629259|
|   0| 366|     3|2.8464701861769406|
|   0| 403|     2| 4.013454484871694|
|   0| 408|     3|3.4838903594693447|
|   0| 464|     4| 3.675176549501538|
|   0| 533|     4| 5.499633001805129|
|   0| 650|     3| 2.952754801739461|
|   0| 713|     4| 3.829402855534937|
|   0| 715|     4| 4.171312244671753|
|   0| 785|     3| 5.251754628391863|
|   0| 831|     4| 2.787791198773814|
|   0| 883|     4|2.7472032621280515|
+----+----+------+------------------+
only showing top 20 rows



In [27]:
# Row-level NDCG breakdown for user 148 over the test_predict_df view.
# Per row: dcg / idcg are rating / log2(1 + rank), ranked by prediction DESC and by
# rating DESC respectively; pred_row_num / ideal_row_num are those two ranks.
# NOTE: this query has no ORDER BY, so the order in which rows are displayed is
# whatever Spark returns and is not guaranteed.
df5 = spark.sql(
'''
select
    user,
    item,
    rating,
    prediction,
    rating / log(2, 1 + 
        row_number() OVER (
            PARTITION BY user
            ORDER BY prediction DESC
        )
    ) as dcg,
    rating / log(2, 1 + 
        row_number() OVER (
                PARTITION BY user
                ORDER BY rating DESC
            )
    ) as idcg,
    row_number() OVER (
        PARTITION BY user
        ORDER BY prediction DESC
    ) as pred_row_num,
    row_number() OVER (
        PARTITION BY user
        ORDER BY rating DESC
    ) as ideal_row_num
from test_predict_df
where user = 148
'''
)
df5.show(200)

+----+----+------+------------------+-------------------+-------------------+------------+-------------+
|user|item|rating|        prediction|                dcg|               idcg|pred_row_num|ideal_row_num|
+----+----+------+------------------+-------------------+-------------------+------------+-------------+
| 148| 140|     5|  5.53996767641031|  3.154648767857287|                5.0|           2|            1|
| 148|2422|     5| 5.140437539702793|  1.934264036172708|  3.154648767857287|           5|            2|
| 148| 229|     5| 4.253518892461914|               1.25|                2.5|          15|            3|
| 148|2226|     5| 4.128073243369231| 1.1568910657987959| 2.1533827903669653|          19|            4|
| 148| 511|     5| 3.987186466592252|  1.138351243484765|  1.934264036172708|          20|            5|
| 148| 488|     5| 3.843091267052496| 1.0637302677668157| 1.7810359355401109|          25|            6|
| 148| 137|     5|3.6272064574936125|  1.01897523545253

In [30]:
# Single scalar over the predictions_df view: 1 - mean over users of (DCG@10 / IDCG@10).
#   p: per-user DCG, summing rating / log2(1 + rank) over the 10 rows with the highest prediction.
#   a: per-user IDCG, the same sum over the 10 rows with the highest rating.
# NOTE: the result column is aliased "ndcg", but because of the leading "1 -" it is a
# loss-style value: it is 0 when DCG equals IDCG for every user.
df6 = spark.sql(
'''
select 1 - avg(p.dcg / a.idcg) as ndcg
from (
    select
        x.user,
        sum(x.rating / log(2, 1 + x.pred_row_num)) as dcg
    from (
        select
            user,
            rating,
            row_number() OVER (
                PARTITION BY user
                ORDER BY prediction DESC
            ) as pred_row_num
        from predictions_df
    ) x 
    where x.pred_row_num <= 10
    group by x.user
) p
join (
    select
        x.user,
        sum(x.rating / log(2, 1 + x.actual_row_num)) as idcg
    from (
        select
            user,
            rating,
            row_number() OVER (
                PARTITION BY user
                ORDER BY rating DESC
            ) as actual_row_num
        from predictions_df
    ) x 
    where x.actual_row_num <= 10
    group by x.user
) a on a.user = p.user
''')

# First column of the single result row; as the cell's last expression it is displayed.
df6.collect()[0][0]

0.043437563021476056

In [13]:
# test top N ndcg implementation
def eval_ndcg(df, k=10):
    """Return ``1 - mean per-user NDCG@k`` for a predictions DataFrame.

    For every user, DCG sums ``rating / log2(1 + rank)`` over the ``k`` rows
    with the highest ``prediction``; IDCG is the same sum over the ``k`` rows
    with the highest ``rating``. The per-user ratios ``DCG / IDCG`` are
    averaged and subtracted from 1, so the result is a loss-style value
    (0 when DCG equals IDCG for every user), not the raw NDCG.

    Parameters
    ----------
    df : DataFrame
        Must expose ``user``, ``rating`` and ``prediction`` columns.
    k : int, optional
        Number of top-ranked rows per user to score. Defaults to 10, which
        reproduces the original hard-coded cutoff.

    Returns
    -------
    The single value produced by the query (``None`` if Spark yields NULL,
    e.g. presumably for an input without rows).

    Raises
    ------
    ValueError
        If ``k`` is not a positive integer.

    Notes
    -----
    Relies on the notebook-level ``spark`` session. The input is exposed to
    SQL through a private temporary view that is dropped again before
    returning, so no view (previously a global table named ``df``) is left
    behind in the session catalog.
    """
    # Validate before touching Spark; k is interpolated into the SQL text below.
    if isinstance(k, bool) or not isinstance(k, int) or k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))

    view_name = "_eval_ndcg_input"
    # createOrReplaceTempView replaces the deprecated registerTempTable.
    df.createOrReplaceTempView(view_name)
    try:
        score_df = spark.sql(
        '''
        select 1 - avg(p.dcg / a.idcg) as ndcg
        from (
            select
                x.user,
                sum(x.rating / log(2, 1 + x.pred_row_num)) as dcg
            from (
                select
                    user,
                    rating,
                    row_number() OVER (
                        PARTITION BY user
                        ORDER BY prediction DESC
                    ) as pred_row_num
                from {view}
            ) x
            where x.pred_row_num <= {k}
            group by x.user
        ) p
        join (
            select
                x.user,
                sum(x.rating / log(2, 1 + x.actual_row_num)) as idcg
            from (
                select
                    user,
                    rating,
                    row_number() OVER (
                        PARTITION BY user
                        ORDER BY rating DESC
                    ) as actual_row_num
                from {view}
            ) x
            where x.actual_row_num <= {k}
            group by x.user
        ) a on a.user = p.user
        '''.format(view=view_name, k=k)
        )

        # collect() runs the query eagerly, so the view can be dropped afterwards.
        return score_df.collect()[0][0]
    finally:
        spark.catalog.dropTempView(view_name)

In [14]:
# Compare the packaged NDCG evaluator with the SQL-based eval_ndcg on the
# model's train and test predictions; one labelled score per line.
for caption, score_fn, scored_df in (
    ('train ndcg: ', ndcg_evaluator.evaluate, train_predictions_df),
    ('test ndcg: ', ndcg_evaluator.evaluate, predictions_df),
    ('train ndcg_10: ', eval_ndcg, train_predictions_df),
    ('test ndcg_10: ', eval_ndcg, predictions_df),
):
    print(caption, score_fn(scored_df))

train ndcg:  0.04283937609480404
test ndcg:  0.0357604036771193
train ndcg_10:  0.08370289899941341
test ndcg_10:  0.043437563021476056


In [15]:
# Same two metrics, applied to the random-baseline frames train_predict_df and
# test_predict_df (both must already be defined when this cell runs).
for caption, score_fn, baseline_df in (
    ('random train ndcg: ', ndcg_evaluator.evaluate, train_predict_df),
    ('random test ndcg: ', ndcg_evaluator.evaluate, test_predict_df),
    ('random train ndcg_10: ', eval_ndcg, train_predict_df),
    ('random test ndcg_10: ', eval_ndcg, test_predict_df),
):
    print(caption, score_fn(baseline_df))

NameError: name 'train_predict_df' is not defined