In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import *
import numpy as np
from itertools import permutations
from pyspark.mllib.recommendation import ALS

In [3]:
sc = SparkContext.getOrCreate()

In [4]:
small_ratings_raw_data = sc.textFile('/Users/gregcattell/PyProjects/data/PJ_data/ml-10M100K/ratings.csv')

small_ratings_data = small_ratings_raw_data.map(lambda line : line.split("::")).map(lambda tokens: (tokens[0],tokens[1],tokens[2]))\
.map(lambda x: (int(x[0]), int(x[1]), float(x[2])))

In [5]:
small_movies_raw_data = sc.textFile('/Users/gregcattell/PyProjects/data/PJ_data/ml-10M100K/movies.csv')

small_movies_data = small_movies_raw_data.map(lambda line: line.split(",")).map(lambda tokens: tokens[0])\
.map(lambda line: line.split("::")).map(lambda x: (int(x[0]), x[1]))

In [6]:
small_ratings_data.take(3)

[(1, 122, 5.0), (1, 185, 5.0), (1, 231, 5.0)]

In [7]:
small_movies_data.take(3)

[(1, 'Toy Story (1995)'),
 (2, 'Jumanji (1995)'),
 (3, 'Grumpier Old Men (1995)')]

In [8]:
pre_train_RDD, pre_valid_RDD, pre_test_RDD = small_ratings_data.randomSplit([6, 2, 2], seed=0)
validation_for_predict_RDD = pre_valid_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = pre_test_RDD.map(lambda x: (x[0], x[1]))

In [9]:
seed = 5
iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0
tolerance = 0.02

min_error = float('inf')
best_rank = -1
best_iteration = -1
for rank in ranks:
    model = ALS.train(pre_train_RDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
    predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = pre_valid_RDD.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MAE = rates_and_preds.map(lambda r: abs(r[1][0] - r[1][1])).mean()
    RMSE = np.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = RMSE
    err += 1
    print('For rank', rank,'the MAE is: ', MAE,'the RMSE is: ', RMSE) 
    if RMSE < min_error:
        min_error = RMSE
        best_rank = rank

print('The best model was trained with rank: ', best_rank)

For rank 4 the MAE is:  0.6477914583162161 the RMSE is:  0.8305049650186825
For rank 8 the MAE is:  0.639625696720242 the RMSE is:  0.819310378075049
For rank 12 the MAE is:  0.6364687838721681 the RMSE is:  0.8153714189001765
The best model was trained with rank:  12


In [10]:
training_RDD, test_RDD = small_ratings_data.randomSplit([8, 2], seed=0)
test_user_unwatch = test_RDD.map(lambda x: (x[0], x[1]))

In [11]:
model = ALS.train(training_RDD, best_rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
predictions = model.predictAll(test_user_unwatch).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_RDD.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MAE = rates_and_preds.map(lambda r: abs(r[1][0] - r[1][1])).mean()
RMSE = np.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

print('For testing data the MAR is', MAE)
print('For testing data the RMSE is', RMSE)

For testing data the MAR is 0.6349936503753815
For testing data the RMSE is 0.8125630451607231


### prediction for one user = 2

In [124]:
index = 2
movie_set = small_movies_data.map(lambda x: x[0])
u_w_new = training_RDD.filter(lambda x: x[0] == index).map(lambda x: x[1])
u_u_new = movie_set.subtract(u_w_new).map(lambda x: (index, x))

In [125]:
u_u_new.take(3)

[(2, 10), (2, 20), (2, 30)]

In [126]:
u_prediction = model.predictAll(u_u_new).map(lambda r: ((r[0], r[1]), r[2]))\
.map(lambda x: (x[0][0], (x[0][1], x[1])))\
.top(10, key = lambda x: x[1][1])

In [127]:
Top_Ten = sc.parallelize(u_prediction).map(lambda x: (x[0], x[1][0]))\
.groupByKey().map(lambda x: (x[0], list(x[1])))

In [128]:
testlist = test_RDD.map(lambda r: (r[0], (r[1], r[2]))).filter(lambda r: r[0] == index)\
.sortBy(lambda x: x[1][1], ascending = False).map(lambda x: (x[0], x[1][0])).groupByKey()\
.map(lambda line: (line[0], list(line[1])))

In [129]:
true = testlist.map(lambda r: (r[0], set(r[1])))
recom = Top_Ten.map(lambda r: (r[0], set(r[1])))

comparison_com = true.join(recom)\
.map(lambda x: (len(x[1][0] & x[1][1]), len(x[1][1]),len(x[1][0])))\
.map(lambda x: (x[0]/x[1], x[0]/x[2])).collect()

F_measure = 2 * precision *recall/ (precision + recall)

In [130]:
print("precision for", index, comparison_com[0][0])
print("recall for", index, comparison_com[0][1])
print("F_measure:", index, F_measure)

precision for 2 0.1
recall for 2 0.16666666666666666
F_measure: 2 0.7376004004114871


In [131]:
def nDCG(test, prediction):
    DCG =0
    IDCG = 0
    j=1
    for i in range(len(test)):
        if test[i] in prediction:
            DCG+=1/np.log2(1+i+1)
            IDCG+=1/np.log2(j+1)
            j+=1
    if IDCG!=0:
        return DCG/IDCG
    else: return 0

In [132]:
test = testlist.flatMap(lambda r: r[1]).collect()
predic = Top_Ten.flatMap(lambda r: r[1]).collect()
comparison_ndcg = nDCG(test, predic)

In [133]:
comparison_ndcg

0.5

In [134]:
def l2tuple(u, line):
    tup = [(x, u) for x in line]
    return tup
trueMList = testlist.flatMap(lambda x: l2tuple(x[0], x[1]))
predMList = Top_Ten.flatMap(lambda x: l2tuple(x[0], x[1]))

In [137]:
TM = trueMList.join(small_movies_data).map(lambda x: (x[1][0], x[0], x[1][1]))

In [140]:
TM.collect()

[(2, 110, 'Braveheart (1995)'),
 (2, 590, 'Dances with Wolves (1990)'),
 (2, 1210, 'Star Wars: Episode VI - Return of the Jedi (1983)'),
 (2, 802, 'Phenomenon (1996)'),
 (2, 736, 'Twister (1996)'),
 (2, 1356, 'Star Trek: First Contact (1996)')]

In [141]:
PM = predMList.join(small_movies_data).map(lambda x: (x[1][0], x[0], x[1][1]))

In [142]:
PM.collect()

[(2, 8120, '29th Street (1991)'),
 (2, 65133, 'Blackadder Back & Forth (1999)'),
 (2, 25975, 'Life of Oharu'),
 (2, 1196, 'Star Wars: Episode V - The Empire Strikes Back (1980)'),
 (2, 1210, 'Star Wars: Episode VI - Return of the Jedi (1983)'),
 (2,
  1198,
  'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)'),
 (2, 32657, 'Man Who Planted Trees'),
 (2, 1787, 'Paralyzing Fear: The Story of Polio in America'),
 (2, 4993, 'Lord of the Rings: The Fellowship of the Ring'),
 (2, 7153, 'Lord of the Rings: The Return of the King')]

### test loop

In [117]:
def nDCG(test, prediction):
    DCG =0
    IDCG = 0
    j=1
    for p in range(len(test)):
        if test[p] in prediction:
            DCG+=1/np.log2(1+p+1)
            IDCG+=1/np.log2(j+1)
            j+=1
    if IDCG!=0:
        return DCG/IDCG
    else: return 0

In [121]:
precision = 0
recall = 0
comparison_ndcg = 0
iterations = 100

for i in range(iterations):
    index = i+1
    movie_set = small_movies_data.map(lambda x: x[0])
    u_w_new = training_RDD.filter(lambda x: x[0] == index).map(lambda x: x[1])
    u_u_new = movie_set.subtract(u_w_new).map(lambda x: (index, x))

    u_prediction = model.predictAll(u_u_new).map(lambda r: ((r[0], r[1]), r[2]))\
    .map(lambda x: (x[0][0], (x[0][1], x[1])))\
    .top(10, key = lambda x: x[1][1])

    Top_Ten = sc.parallelize(u_prediction).map(lambda x: (x[0], x[1][0]))\
    .groupByKey().map(lambda x: (x[0], list(x[1])))

    testlist = test_RDD.map(lambda r: (r[0], (r[1], r[2]))).filter(lambda r: r[0] == index)\
    .sortBy(lambda x: x[1][1], ascending = False).map(lambda x: (x[0], x[1][0])).groupByKey()\
    .map(lambda line: (line[0], list(line[1])))

    true = testlist.map(lambda r: (r[0], set(r[1])))
    recom = Top_Ten.map(lambda r: (r[0], set(r[1])))

    comparison_com = true.join(recom)\
    .map(lambda x: (len(x[1][0] & x[1][1]), len(x[1][1]),len(x[1][0])))\
    .map(lambda x: (x[0]/x[1], x[0]/x[2])).collect()
    
    if len(comparison_com)==0:
        continue
    if len(comparison_com)!=0:
        precision +=comparison_com[0][0]
        recall +=comparison_com[0][1]
 
    print("precision for", i+1, comparison_com[0][0])
    print("recall for", i+1, comparison_com[0][1])

    test = testlist.flatMap(lambda r: r[1]).collect()
    predic = Top_Ten.flatMap(lambda r: r[1]).collect()
    comparison_ndcg += nDCG(test, predic)

precision for 1 0.0
recall for 1 0.0
precision for 2 0.1
recall for 2 0.16666666666666666
precision for 3 0.0
recall for 3 0.0
precision for 4 0.0
recall for 4 0.0
precision for 5 0.0
recall for 5 0.0
precision for 6 0.0
recall for 6 0.0
precision for 7 0.0
recall for 7 0.0
precision for 8 0.0
recall for 8 0.0
precision for 9 0.0
recall for 9 0.0
precision for 10 0.0
recall for 10 0.0
precision for 11 0.0
recall for 11 0.0
precision for 12 0.0
recall for 12 0.0
precision for 13 0.0
recall for 13 0.0
precision for 14 0.0
recall for 14 0.0
precision for 16 0.0
recall for 16 0.0
precision for 17 0.0
recall for 17 0.0
precision for 18 0.0
recall for 18 0.0
precision for 19 0.0
recall for 19 0.0
precision for 22 0.0
recall for 22 0.0
precision for 23 0.0
recall for 23 0.0
precision for 24 0.0
recall for 24 0.0
precision for 26 0.0
recall for 26 0.0
precision for 27 0.0
recall for 27 0.0
precision for 28 0.0
recall for 28 0.0
precision for 29 0.0
recall for 29 0.0
precision for 30 0.0
recall

In [122]:
precision_mean = precision/iterations
recall_mean = recall/iterations
F_measure = 2 * precision *recall/ (precision + recall)
ndcg_mean = comparison_ndcg/iterations

In [123]:
print("precision: ", precision_mean)
print("recall: ", recall_mean)
print("F-measure: ", F_measure)
print("nDCG: ", ndcg_mean)

precision:  0.008
recall:  0.006842307447855284
F-measure:  0.7376004004114871
nDCG:  0.04293432703646809
