In [1]:
import findspark
findspark.init()

from pyspark.mllib.recommendation import ALS, Rating
from pyspark import SparkContext, SQLContext
from pyspark import SparkConf

# Reuse the already-running SparkContext when this cell is executed again in a
# live kernel; constructing a second SparkContext in the same process raises
# an error. On a fresh kernel this creates a context with the same master
# ("local") and application name ("test") as before.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local").setAppName("test"))
# SQL entry point bound to that context; used below to build DataFrames.
sqlContext = SQLContext(sc)

In [2]:
# Locations of the three rating files (train / validation / test), written
# relative to the current directory.
# NOTE(review): the first two paths contain a doubled slash ("data//") while
# the third does not — presumably harmless, but confirm and make them uniform.
TRAIN_FILE = "./data//ratings-train.dat"
VALIDATION_FILE = "./data//ratings-validation.dat"
TEST_FILE = "./data/ratings-test.dat"

In [3]:
def prepare_data(data):
    """Convert a collection of comma-separated text lines into ``Rating`` records.

    ``data`` must expose ``.map``. Each line is split on ``,``; field 0 is
    read as an int, field 1 as an int and field 2 as a float, and the three
    values are passed to ``Rating`` in that order. Any further fields on the
    line are ignored.

    Returns whatever ``data.map`` returns for the per-line parser.
    """
    def _to_rating(line):
        # Split once, then convert the three leading fields.
        fields = line.split(',')
        return Rating(int(fields[0]), int(fields[1]), float(fields[2]))

    return data.map(_to_rating)

In [4]:
# Read the training file as text lines, then parse every line into a Rating
# with prepare_data.
ratings_train_text = sc.textFile(TRAIN_FILE)
ratings_train = prepare_data(ratings_train_text)

In [5]:
# Read the validation file as text lines, then parse every line into a Rating
# with prepare_data.
ratings_validation_text = sc.textFile(VALIDATION_FILE)
ratings_validation = prepare_data(ratings_validation_text)

In [6]:
# Read the test file as text lines.
ratings_test_text = sc.textFile(TEST_FILE)
# Parse the *test* lines. Passing ratings_validation_text here would make
# ratings_test a silent copy of the validation set.
ratings_test = prepare_data(ratings_test_text)

#### Calculate the global mean $\mu$ of all training ratings

In [7]:
# Mean of the rating value (third field of each Rating) over the training set.
global_mean = ratings_train.map(lambda rating: rating.rating).mean()

In [8]:
# Display the computed global mean rating.
global_mean

3.54363466666664

##### Calculate the item-specific bias. Following the paper we referenced, the bias of item $i$ is the sum of the differences between every rating of that item and the global mean, divided by the sum of a regularization parameter and the number of ratings of that item: $b_i = \frac{\sum_{u \in R(i)} (r_{ui} - \mu)}{\lambda + |R(i)|}$.


In [11]:
# Training ratings as a DataFrame with columns userId, movieId, ratings.
df = sqlContext.createDataFrame(
    ratings_train,
    schema=['userId', 'movieId', 'ratings'],
)

In [141]:
# Preview the first 20 rows of the training DataFrame.
df.take(20)

[Row(userId=36955, movieId=21, ratings=3.0),
 Row(userId=36955, movieId=47, ratings=5.0),
 Row(userId=36955, movieId=1079, ratings=3.0),
 Row(userId=35139, movieId=1, ratings=4.0),
 Row(userId=35139, movieId=10, ratings=4.0),
 Row(userId=35139, movieId=18, ratings=4.0),
 Row(userId=35139, movieId=19, ratings=4.0),
 Row(userId=35139, movieId=21, ratings=5.0),
 Row(userId=35139, movieId=31, ratings=5.0),
 Row(userId=35139, movieId=32, ratings=5.0),
 Row(userId=35139, movieId=39, ratings=5.0),
 Row(userId=35139, movieId=45, ratings=4.0),
 Row(userId=35139, movieId=47, ratings=5.0),
 Row(userId=35139, movieId=50, ratings=5.0),
 Row(userId=35139, movieId=52, ratings=4.0),
 Row(userId=35139, movieId=55, ratings=4.0),
 Row(userId=35139, movieId=58, ratings=5.0),
 Row(userId=35241, movieId=32, ratings=5.0),
 Row(userId=35435, movieId=10, ratings=3.0),
 Row(userId=35435, movieId=11, ratings=4.0)]

In [12]:
# Training ratings ordered by the movieId column.
df_orderByMovie = df.orderBy('movieId')

In [13]:
# Preview the first 5 rows after ordering by movieId.
df_orderByMovie.take(5)

[Row(userId=35139, movieId=1, ratings=4.0),
 Row(userId=36008, movieId=1, ratings=5.0),
 Row(userId=36202, movieId=1, ratings=4.0),
 Row(userId=36304, movieId=1, ratings=5.0),
 Row(userId=36398, movieId=1, ratings=5.0)]

In [29]:
# Number of training ratings per movie (columns: movieId, count).
movie_count = df_orderByMovie.groupBy('movieId').count()

In [60]:
# Number of rows in the per-movie count table, i.e. one per distinct movieId.
movie_count.count()

7185

In [32]:
# Per-movie sums; sum() is called without arguments, so the extra sum columns
# it produces are removed in the following cells.
sum_byMovie = df_orderByMovie.groupBy('movieId').sum()

In [45]:
# Remove the column at position 1 of sum_byMovie.
# presumably that column is sum(userId) — TODO confirm via sum_byMovie.columns
drop_column1 = sum_byMovie.drop(sum_byMovie[1])

In [46]:
# Remove the column now at position 1; the first() output below shows that
# only movieId and sum(ratings) remain afterwards.
final_drop = drop_column1.drop(drop_column1[1])

In [47]:
# Inspect one row to check which columns are left after the two drops.
final_drop.first()

Row(movieId=1, sum(ratings)=71453.5)

In [66]:
# Combine per-movie rating count and rating sum into one table keyed by movieId.
movie_sorted = movie_count.join(final_drop, on="movieId")

In [67]:
# Preview 5 rows of the joined count/sum table (not yet ordered by movieId).
movie_sorted.take(5)

[Row(movieId=31, count=3762, sum(ratings)=12587.0),
 Row(movieId=231, count=11945, sum(ratings)=34902.5),
 Row(movieId=431, count=3371, sum(ratings)=12301.0),
 Row(movieId=631, count=1100, sum(ratings)=3103.0),
 Row(movieId=831, count=130, sum(ratings)=464.0)]

In [68]:
# Same per-movie table, ordered by movieId.
new_movie_sorted = movie_sorted.orderBy('movieId')

In [69]:
# Preview the first 5 rows of the per-movie table after ordering by movieId.
new_movie_sorted.take(5)

[Row(movieId=1, count=17753, sum(ratings)=71453.5),
 Row(movieId=2, count=7961, sum(ratings)=26776.0),
 Row(movieId=3, count=6093, sum(ratings)=19760.5),
 Row(movieId=4, count=1626, sum(ratings)=4711.0),
 Row(movieId=5, count=5841, sum(ratings)=18490.5)]

In [99]:
# Regularization constant added to each movie's rating count in the
# denominator of the item bias.
ITEM_BIAS_REG = 25

# Item bias per movie: (sum of ratings - count * global_mean) / (ITEM_BIAS_REG + count).
# Row positions in new_movie_sorted: 0 = movieId, 1 = count, 2 = sum(ratings).
# The map goes through .rdd explicitly: DataFrame.map exists only on Spark 1.x
# (where it simply forwards to .rdd.map), whereas DataFrame.rdd is available on
# both 1.x and 2.x+, so this form runs unchanged on either.
item_bias = new_movie_sorted.rdd.map(
    lambda row: [row[0], (row[2] - row[1] * global_mean) / (ITEM_BIAS_REG + row[1])]
)

In [100]:
# Item biases as a DataFrame with columns movieId, item_bias.
new_item_bias = sqlContext.createDataFrame(
    item_bias,
    schema=['movieId', 'item_bias'],
)

In [101]:
# Preview the first 10 item-bias rows.
new_item_bias.take(10)

[Row(movieId=1, item_bias=0.4805576421794993),
 Row(movieId=2, item_bias=-0.17967387695130513),
 Row(movieId=3, item_bias=-0.2992589120627393),
 Row(movieId=4, item_bias=-0.63655358449422),
 Row(movieId=5, item_bias=-0.3763842632117019),
 Row(movieId=6, item_bias=0.24531800591436403),
 Row(movieId=7, item_bias=-0.09761621267645276),
 Row(movieId=8, item_bias=-0.34720415541919714),
 Row(movieId=9, item_bias=-0.49023928514053583),
 Row(movieId=10, item_bias=-0.08865719185621093)]

In [87]:
# Number of item-bias records; the output matches movie_count.count() above.
item_bias.count()

7185

In [75]:
# User-specific bias: start from the training ratings ordered by userId.
df_orderByUser = df.orderBy('userId')

In [102]:
# Attach each movie's item_bias to every training rating of that movie.
contain_itemBias = df_orderByUser.join(new_item_bias, on="movieId")

In [103]:
# Preview 10 rows of the ratings joined with their item bias.
contain_itemBias.take(10)

[Row(movieId=31, userId=13, ratings=2.0, item_bias=-0.19650214312117759),
 Row(movieId=31, userId=19, ratings=4.0, item_bias=-0.19650214312117759),
 Row(movieId=31, userId=89, ratings=4.0, item_bias=-0.19650214312117759),
 Row(movieId=31, userId=116, ratings=4.0, item_bias=-0.19650214312117759),
 Row(movieId=31, userId=182, ratings=3.0, item_bias=-0.19650214312117759),
 Row(movieId=31, userId=193, ratings=3.0, item_bias=-0.19650214312117759),
 Row(movieId=31, userId=203, ratings=4.0, item_bias=-0.19650214312117759),
 Row(movieId=31, userId=208, ratings=5.0, item_bias=-0.19650214312117759),
 Row(movieId=31, userId=223, ratings=5.0, item_bias=-0.19650214312117759),
 Row(movieId=31, userId=349, ratings=3.0, item_bias=-0.19650214312117759)]

In [106]:
# Ratings with item bias, ordered by userId.
sorted_byUser = contain_itemBias.orderBy('userId')

In [107]:
# Preview the first 10 rows after ordering by userId.
sorted_byUser.take(10)

[Row(movieId=231, userId=1, ratings=5.0, item_bias=-0.620402346978531),
 Row(movieId=466, userId=1, ratings=5.0, item_bias=-0.6062895858493179),
 Row(movieId=480, userId=1, ratings=5.0, item_bias=0.21433651876141116),
 Row(movieId=292, userId=1, ratings=5.0, item_bias=-0.05977773083298789),
 Row(movieId=316, userId=1, ratings=5.0, item_bias=-0.18991582589441186),
 Row(movieId=520, userId=1, ratings=5.0, item_bias=-0.5697394061384459),
 Row(movieId=122, userId=1, ratings=5.0, item_bias=-0.6393518769804023),
 Row(movieId=329, userId=1, ratings=5.0, item_bias=-0.16303700816343833),
 Row(movieId=539, userId=1, ratings=5.0, item_bias=0.09863419670758647),
 Row(movieId=355, userId=1, ratings=5.0, item_bias=-0.9120664840472149)]

In [108]:
# Per-rating residual used for the user bias: rating - global_mean - item_bias,
# paired with the userId.
# Row positions in sorted_byUser: 0 = movieId, 1 = userId, 2 = ratings, 3 = item_bias.
# The map goes through .rdd explicitly: DataFrame.map exists only on Spark 1.x
# (where it simply forwards to .rdd.map), whereas DataFrame.rdd is available on
# both 1.x and 2.x+.
subtraction = sorted_byUser.rdd.map(
    lambda row: [row[1], row[2] - global_mean - row[3]]
)

In [109]:
# Preview the first 10 [userId, residual] pairs.
subtraction.take(10)

[[1, 2.076767680311891],
 [1, 2.0626549191826777],
 [1, 1.2420288145719487],
 [1, 1.5161430641663478],
 [1, 1.6462811592277717],
 [1, 2.0261047394718057],
 [1, 2.0957172103137625],
 [1, 1.6194023414967982],
 [1, 1.3577311366257734],
 [1, 2.368431817380575]]

In [110]:
# Residuals as a DataFrame with columns userId, subtraction.
user_bias_part1 = sqlContext.createDataFrame(
    subtraction,
    schema=['userId', 'subtraction'],
)

In [111]:
# Per-user sums; sum() is called without arguments, so the result also carries
# a sum(userId) column next to sum(subtraction).
sum_byUser = user_bias_part1.groupBy('userId').sum()

In [112]:
# Preview 10 rows of the per-user sums.
sum_byUser.take(10)

[Row(userId=231, sum(userId)=5544, sum(subtraction)=4.943156344255966),
 Row(userId=631, sum(userId)=20192, sum(subtraction)=7.908224109144056),
 Row(userId=831, sum(userId)=83100, sum(subtraction)=64.85731076719358),
 Row(userId=1031, sum(userId)=21651, sum(subtraction)=18.112618328424194),
 Row(userId=1431, sum(userId)=231822, sum(subtraction)=-11.208059875726343),
 Row(userId=1631, sum(userId)=32620, sum(subtraction)=16.282610671923322),
 Row(userId=1831, sum(userId)=64085, sum(subtraction)=18.16192419338471),
 Row(userId=2031, sum(userId)=50775, sum(subtraction)=-2.2135119819671107),
 Row(userId=2231, sum(userId)=73623, sum(subtraction)=2.0335448169743335),
 Row(userId=2431, sum(userId)=82654, sum(subtraction)=-3.7197108062734827)]

In [117]:
# Number of residual rows (i.e. training ratings) per user.
sum_UserCollect = user_bias_part1.groupBy('userId').count()

In [119]:
# Per-user counts ordered by userId.
ordered_sum_UserCollect = sum_UserCollect.orderBy('userId')

In [113]:
# Remove the column at position 1 of sum_byUser. The preview of sum_byUser
# shows that position holds sum(userId), leaving userId and sum(subtraction).
drop_column2 = sum_byUser.drop(sum_byUser[1])

In [115]:
# Per-user residual sums ordered by userId.
final_drop2 = drop_column2.orderBy('userId')

In [116]:
# Preview the first 10 per-user residual sums.
final_drop2.take(10)

[Row(userId=1, sum(subtraction)=35.830251689909204),
 Row(userId=2, sum(subtraction)=-8.371114754373947),
 Row(userId=4, sum(subtraction)=21.89887419075155),
 Row(userId=5, sum(subtraction)=-1.7013075020805646),
 Row(userId=6, sum(subtraction)=11.452380076284523),
 Row(userId=7, sum(subtraction)=2.175563299262799),
 Row(userId=10, sum(subtraction)=5.182024569308008),
 Row(userId=11, sum(subtraction)=38.34927440015343),
 Row(userId=12, sum(subtraction)=4.231105047706301),
 Row(userId=13, sum(subtraction)=-6.294511936141549)]

In [120]:
# Combine each user's residual sum with that user's rating count.
user_bias_table = final_drop2.join(ordered_sum_UserCollect, on='userId')

In [121]:
# Preview 5 rows of the per-user sum/count table (not yet ordered by userId).
user_bias_table.take(5)

[Row(userId=231, sum(subtraction)=4.943156344255966, count=24),
 Row(userId=631, sum(subtraction)=7.908224109144056, count=32),
 Row(userId=831, sum(subtraction)=64.85731076719358, count=100),
 Row(userId=1031, sum(subtraction)=18.112618328424194, count=21),
 Row(userId=1431, sum(subtraction)=-11.208059875726343, count=162)]

In [122]:
# Per-user sum/count table ordered by userId.
ordered_userBiaTable = user_bias_table.orderBy('userId')

In [123]:
# Regularization constant added to each user's rating count in the
# denominator of the user bias.
USER_BIAS_REG = 10

# User bias per user: sum of residuals / (USER_BIAS_REG + count).
# Row positions in ordered_userBiaTable: 0 = userId, 1 = sum(subtraction), 2 = count.
# The map goes through .rdd explicitly: DataFrame.map exists only on Spark 1.x
# (where it simply forwards to .rdd.map), whereas DataFrame.rdd is available on
# both 1.x and 2.x+.
user_bias = ordered_userBiaTable.rdd.map(
    lambda row: [row[0], row[1] / (USER_BIAS_REG + row[2])]
)

In [124]:
# Preview the first 5 [userId, user bias] pairs.
user_bias.take(5)

[[1, 1.1196953653096626],
 [2, -0.2790371584791316],
 [4, 0.4562265456406573],
 [5, -0.017539252598768706],
 [6, 0.220238078390087]]

In [131]:
# User biases as a DataFrame with columns userId, user_bias.
user_specific_bias = sqlContext.createDataFrame(
    user_bias,
    schema=['userId', 'user_bias'],
)

In [132]:
# Preview the first 10 user-bias rows.
user_specific_bias.take(10)

[Row(userId=1, user_bias=1.1196953653096626),
 Row(userId=2, user_bias=-0.2790371584791316),
 Row(userId=4, user_bias=0.4562265456406573),
 Row(userId=5, user_bias=-0.017539252598768706),
 Row(userId=6, user_bias=0.220238078390087),
 Row(userId=7, user_bias=0.01828204453162016),
 Row(userId=10, user_bias=0.038962590746676753),
 Row(userId=11, user_bias=0.4916573641045312),
 Row(userId=12, user_bias=0.06611101637041095),
 Row(userId=13, user_bias=-0.04034943548808685)]

In [127]:
# Preview the first 10 training ratings ordered by userId.
df_orderByUser.take(10)

[Row(userId=1, movieId=588, ratings=5.0),
 Row(userId=1, movieId=231, ratings=5.0),
 Row(userId=1, movieId=316, ratings=5.0),
 Row(userId=1, movieId=329, ratings=5.0),
 Row(userId=1, movieId=292, ratings=5.0),
 Row(userId=1, movieId=185, ratings=5.0),
 Row(userId=1, movieId=356, ratings=5.0),
 Row(userId=1, movieId=480, ratings=5.0),
 Row(userId=1, movieId=364, ratings=5.0),
 Row(userId=1, movieId=589, ratings=5.0)]

In [133]:
# Attach each user's user_bias to every training rating by that user.
merge1 = df_orderByUser.join(user_specific_bias, on='userId')

In [134]:
# Preview 5 rows of the ratings joined with user_bias.
merge1.take(5)

[Row(userId=231, movieId=135, ratings=3.0, user_bias=0.14538695130164606),
 Row(userId=231, movieId=1721, ratings=2.0, user_bias=0.14538695130164606),
 Row(userId=231, movieId=1682, ratings=4.0, user_bias=0.14538695130164606),
 Row(userId=231, movieId=3552, ratings=5.0, user_bias=0.14538695130164606),
 Row(userId=231, movieId=4132, ratings=1.0, user_bias=0.14538695130164606)]

In [135]:
# Additionally attach each movie's item_bias, so every rating row carries both biases.
merge2 = merge1.join(new_item_bias, on='movieId')

In [136]:
# Preview 5 rows of the ratings joined with both user_bias and item_bias.
merge2.take(5)

[Row(movieId=31, userId=3631, ratings=3.0, user_bias=0.1723984230273186, item_bias=-0.19650214312117759),
 Row(movieId=31, userId=4031, ratings=4.0, user_bias=0.3052134317006871, item_bias=-0.19650214312117759),
 Row(movieId=31, userId=5431, ratings=3.0, user_bias=0.15800853430422251, item_bias=-0.19650214312117759),
 Row(movieId=31, userId=7831, ratings=3.0, user_bias=-0.36994032888141914, item_bias=-0.19650214312117759),
 Row(movieId=31, userId=8031, ratings=3.0, user_bias=-0.38041007979314, item_bias=-0.19650214312117759)]

In [137]:
# Bias-adjusted rating per (movieId, userId): ratings - user_bias - item_bias.
# Row positions in merge2: 0 = movieId, 1 = userId, 2 = ratings,
# 3 = user_bias, 4 = item_bias.
# NOTE(review): global_mean is not subtracted here, although it was subtracted
# when the user-bias residuals were computed — confirm this is intended.
# The map goes through .rdd explicitly: DataFrame.map exists only on Spark 1.x
# (where it simply forwards to .rdd.map), whereas DataFrame.rdd is available on
# both 1.x and 2.x+.
new_ratings_train = merge2.rdd.map(
    lambda row: [row[0], row[1], row[2] - row[3] - row[4]]
)

In [138]:
# Bias-adjusted ratings as a DataFrame with columns movieId, userId, new_ratings.
temp = sqlContext.createDataFrame(
    new_ratings_train,
    schema=['movieId', 'userId', 'new_ratings'],
)

In [139]:
# Bias-adjusted ratings ordered by userId.
final_new_ratings_train = temp.orderBy('userId')

In [140]:
# Preview the first 10 bias-adjusted ratings.
final_new_ratings_train.take(10)

[Row(movieId=231, userId=1, new_ratings=4.500706981668868),
 Row(movieId=466, userId=1, new_ratings=4.486594220539655),
 Row(movieId=480, userId=1, new_ratings=3.6659681159289264),
 Row(movieId=292, userId=1, new_ratings=3.9400823655233252),
 Row(movieId=316, userId=1, new_ratings=4.070220460584749),
 Row(movieId=520, userId=1, new_ratings=4.450044040828783),
 Row(movieId=122, userId=1, new_ratings=4.51965651167074),
 Row(movieId=329, userId=1, new_ratings=4.043341642853775),
 Row(movieId=539, userId=1, new_ratings=3.781670437982751),
 Row(movieId=355, userId=1, new_ratings=4.792371118737552)]

In [147]:
# From here on the same procedure as task 1 is applied.
# This statement pairs every original training row with its bias-adjusted
# rating by joining on (userId, movieId).
# NOTE(review): the earlier note said the data is first sorted by timestamp,
# but this statement performs no sort and none of the columns shown for df is
# a timestamp — confirm where (or whether) that ordering is meant to happen.
new_ratings_byTime = df.join(final_new_ratings_train, ['userId', 'movieId'])

In [148]:
# Preview 20 rows showing the original rating next to the bias-adjusted one.
new_ratings_byTime.take(20)

[Row(userId=2, movieId=1073, ratings=3.0, new_ratings=3.112761261592185),
 Row(userId=5, movieId=562, ratings=5.0, new_ratings=4.726646236437831),
 Row(userId=7, movieId=1288, ratings=4.0, new_ratings=3.387694559581777),
 Row(userId=13, movieId=266, ratings=3.0, new_ratings=3.0448259327020883),
 Row(userId=13, movieId=1466, ratings=4.0, new_ratings=3.7920329822601286),
 Row(userId=13, movieId=2866, ratings=3.0, new_ratings=2.958341527442084),
 Row(userId=17, movieId=1918, ratings=4.0, new_ratings=4.170568373948327),
 Row(userId=23, movieId=296, ratings=4.0, new_ratings=3.1428828214821207),
 Row(userId=26, movieId=185, ratings=4.0, new_ratings=3.991260832923356),
 Row(userId=30, movieId=637, ratings=4.0, new_ratings=3.8491857046397264),
 Row(userId=34, movieId=1089, ratings=3.0, new_ratings=2.964257901448468),
 Row(userId=34, movieId=2089, ratings=3.0, new_ratings=3.8177214235876673),
 Row(userId=34, movieId=2289, ratings=2.0, new_ratings=2.0908386418856337),
 Row(userId=34, movieId=288