In [1]:
import findspark
findspark.init()

from pyspark.mllib.recommendation import ALS, Rating
from pyspark import SparkContext, SQLContext
sc = SparkContext("local", "test")
sqlContext = SQLContext(sc)

In [2]:
TRAIN_FILE = "./data//ratings-train.dat/"
VALIDATION_FILE = "./data//ratings-validation.dat/"
TEST_FILE = "./data/ratings-test.dat/"

In [3]:
def prepare_data(data):
    return (
        data
        .map(lambda l: l.split('::'))
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    )  

In [50]:
# Load and parse the data
ratings_train_text = sc.textFile(TRAIN_FILE)
ratings_train = prepare_data(ratings_train_text)


In [5]:
ratings_validation_text = sc.textFile(VALIDATION_FILE)
ratings_validation = prepare_data(ratings_validation_text)

In [51]:
ratings_test_text = sc.textFile(TEST_FILE)
ratings_test = prepare_data(ratings_validation_text)


#### Calculate the general mean u for all ratings

In [9]:
global_mean = ratings_train.map(lambda r: (r[2])).mean()

In [10]:
global_mean

3.5436346666666556

##### calculate item-specific bias, according to the paper we referenced, for each item i, its bias is equal to the summation of difference between all ratings of to the same item and global mean and then the result is divided by the sum of a regulation parameter and the quantity of the ratings.


In [11]:
df = sqlContext.createDataFrame(ratings_train, ['userId', 'movieId', 'ratings'])

In [12]:
df_orderByMovie = df.orderBy(df.movieId)

In [14]:
movie_count = df_orderByMovie.groupBy(df_orderByMovie.movieId).count()

In [15]:
sum_byMovie = df_orderByMovie.groupBy(['movieId']).sum()

In [16]:
drop_column1 = sum_byMovie.drop(sum_byMovie[1])

In [17]:
final_drop = drop_column1.drop(drop_column1[1])

In [18]:
movie_sorted = movie_count.join(final_drop, "movieId")

In [19]:
new_movie_sorted = movie_sorted.orderBy(movie_sorted.movieId)

In [20]:
item_bias = new_movie_sorted.map(lambda r: [r[0], (r[2] - r[1]*global_mean)/(25+r[1])])

In [21]:
new_item_bias = sqlContext.createDataFrame(item_bias, ['movieId', 'item_bias'])

#### Caculate the user-specific bias

In [52]:
df_orderByUser = df.orderBy(df.userId)

In [23]:
contain_itemBias = df_orderByUser.join(new_item_bias, "movieId")

In [24]:
sorted_byUser = contain_itemBias.orderBy(['userId'])

In [25]:
subtraction = sorted_byUser.map(lambda r: [r[1], r[2] - global_mean - r[3]])

In [26]:
user_bias_part1 = sqlContext.createDataFrame(subtraction, ['userId', 'subtraction'])

In [27]:
sum_byUser = user_bias_part1.groupBy(['userId']).sum()

In [28]:
sum_UserCollect = user_bias_part1.groupBy(['userId']).count()

In [29]:
ordered_sum_UserCollect = sum_UserCollect.orderBy(sum_UserCollect.userId)

In [30]:
drop_column2 = sum_byUser.drop(sum_byUser[1])

In [31]:
final_drop2 = drop_column2.orderBy(drop_column2.userId)

In [32]:
user_bias_table = final_drop2.join(ordered_sum_UserCollect, 'userId')

In [33]:
ordered_userBiaTable = user_bias_table.orderBy(user_bias_table.userId)

In [34]:
user_bias = ordered_userBiaTable.map(lambda r: [r[0], r[1]/(10+r[2])])

In [35]:
user_specific_bias = sqlContext.createDataFrame(user_bias, ['userId', 'user_bias'])

In [36]:
merge1 = df_orderByUser.join(user_specific_bias, 'userId')

In [37]:
merge2 = merge1.join(new_item_bias, 'movieId')

In [38]:
new_ratings_train = merge2.map(lambda r: [r[0], r[1], r[2] - r[3] - r[4]])

In [39]:
temp = sqlContext.createDataFrame(new_ratings_train, ['movieId', 'userId', 'new_ratings'])

In [40]:
final_new_ratings_train = temp.orderBy(temp.userId)

In [140]:
final_new_ratings_train.take(10)

[Row(movieId=231, userId=1, new_ratings=4.500706981668868),
 Row(movieId=466, userId=1, new_ratings=4.486594220539655),
 Row(movieId=480, userId=1, new_ratings=3.6659681159289264),
 Row(movieId=292, userId=1, new_ratings=3.9400823655233252),
 Row(movieId=316, userId=1, new_ratings=4.070220460584749),
 Row(movieId=520, userId=1, new_ratings=4.450044040828783),
 Row(movieId=122, userId=1, new_ratings=4.51965651167074),
 Row(movieId=329, userId=1, new_ratings=4.043341642853775),
 Row(movieId=539, userId=1, new_ratings=3.781670437982751),
 Row(movieId=355, userId=1, new_ratings=4.792371118737552)]

In [42]:
#now, we perform the same procedure as task1
#first, we sort the data by timestamp. 
new_ratings_byTime = final_new_ratings_train.join(df, ['userId', 'movieId'])

In [43]:
#example of dataset
new_ratings_byTime.take(20)

[Row(userId=2, movieId=1073, new_ratings=3.1127612615922002, ratings=3.0),
 Row(userId=5, movieId=562, new_ratings=4.726646236437846, ratings=5.0),
 Row(userId=7, movieId=1288, new_ratings=3.3876945595817927, ratings=4.0),
 Row(userId=13, movieId=266, new_ratings=3.0448259327021043, ratings=3.0),
 Row(userId=13, movieId=1466, new_ratings=3.792032982260145, ratings=4.0),
 Row(userId=13, movieId=2866, new_ratings=2.9583415274420997, ratings=3.0),
 Row(userId=17, movieId=1918, new_ratings=4.170568373948343, ratings=4.0),
 Row(userId=23, movieId=296, new_ratings=3.1428828214821367, ratings=4.0),
 Row(userId=26, movieId=185, new_ratings=3.9912608329233716, ratings=4.0),
 Row(userId=30, movieId=637, new_ratings=3.8491857046397424, ratings=4.0),
 Row(userId=34, movieId=1089, new_ratings=2.9642579014484842, ratings=3.0),
 Row(userId=34, movieId=2089, new_ratings=3.817721423587683, ratings=3.0),
 Row(userId=34, movieId=2289, new_ratings=2.09083864188565, ratings=2.0),
 Row(userId=34, movieId=28

In [44]:
new_ratings_byTime = new_ratings_byTime.drop(new_ratings_byTime[3])

In [46]:
def prepare_validation(validation):
    return validation.map(lambda p: (p[0], p[1]))

In [47]:
# Evaluate the model on training data
def train_evaluate_als(train, test, rank, iterations_num, lambda_val):
    model = ALS.train(train, rank, iterations_num, lambda_val)
    predictions = model.predictAll(prepare_validation(test)).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = test.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    return MSE

In [48]:
mse = train_evaluate_als(new_ratings_byTime, ratings_test, 20, 10, 0.1)

In [49]:
mse

0.8462550647600592