In [1]:
import findspark
findspark.init()

from pyspark.mllib.recommendation import ALS, Rating
from pyspark import SparkContext, SQLContext
sc = SparkContext("local", "test")
sqlContext = SQLContext(sc)

In [2]:
TRAIN_FILE = "./data//ratings-train.dat/"
VALIDATION_FILE = "./data//ratings-validation.dat/"
TEST_FILE = "./data/ratings-test.dat/"

In [3]:
def prepare_data(data):
    return (
        data
        .map(lambda l: l.split('::'))
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    )  

In [4]:
# Load and parse the data
ratings_train_text = sc.textFile(TRAIN_FILE)
ratings_train = prepare_data(ratings_train_text)


In [5]:
ratings_validation_text = sc.textFile(VALIDATION_FILE)
ratings_validation = prepare_data(ratings_validation_text)

In [6]:
ratings_test_text = sc.textFile(TEST_FILE)
ratings_test = prepare_data(ratings_validation_text)


#### Calculate the general mean u for all ratings

In [15]:
global_mean = ratings_train.map(lambda r: (r[2])).mean()

In [16]:
global_mean

3.5436346666666556

##### calculate item-specific bias, according to the paper we referenced, for each item i, its bias is equal to the summation of difference between all ratings of to the same item and global mean and then the result is divided by the sum of a regulation parameter and the quantity of the ratings.


In [17]:
#convert training data to dataframe with attribute
df = sqlContext.createDataFrame(ratings_train, ['userId', 'movieId', 'ratings'])

In [18]:
#sort the data by movie
df_orderByMovie = df.orderBy(df.movieId)

In [19]:
#group the movie and count each movie
movie_count = df_orderByMovie.groupBy(df_orderByMovie.movieId).count()

In [20]:
#calculate the sum of the ratings of each movie
sum_byMovie = df_orderByMovie.groupBy(['movieId']).sum()

In [21]:
#drop some unrelated column
drop_column1 = sum_byMovie.drop(sum_byMovie[1])
final_drop = drop_column1.drop(drop_column1[1])

In [22]:
#join the sum of count and sum of rating for each movie
movie_sorted = movie_count.join(final_drop, "movieId")

In [23]:
#sorted the dataset by each movie
new_movie_sorted = movie_sorted.orderBy(movie_sorted.movieId)

In [24]:
#calculate item specific bias
item_bias = new_movie_sorted.map(lambda r: [r[0], (r[2] - r[1]*global_mean)/(25+r[1])])

In [25]:
new_item_bias = sqlContext.createDataFrame(item_bias, ['movieId', 'item_bias'])

#### Caculate the user-specific bias

In [26]:
#order the training set by user
df_orderByUser = df.orderBy(df.userId)

In [27]:
#join the item bias dataset to with the same movieId
contain_itemBias = df_orderByUser.join(new_item_bias, "movieId")

In [28]:
#sorted the dataset by user
sorted_byUser = contain_itemBias.orderBy(['userId'])

In [29]:
#calculate the numerical part of item specific bais
subtraction = sorted_byUser.map(lambda r: [r[1], r[2] - global_mean - r[3]])

In [30]:
user_bias_part1 = sqlContext.createDataFrame(subtraction, ['userId', 'subtraction'])

In [31]:
sum_byUser = user_bias_part1.groupBy(['userId']).sum()

In [32]:
#count the user 
sum_UserCollect = user_bias_part1.groupBy(['userId']).count()

In [33]:
#order the data set by user
ordered_sum_UserCollect = sum_UserCollect.orderBy(sum_UserCollect.userId)

In [34]:
drop_column2 = sum_byUser.drop(sum_byUser[1])

In [35]:
final_drop2 = drop_column2.orderBy(drop_column2.userId)

In [36]:
user_bias_table = final_drop2.join(ordered_sum_UserCollect, 'userId')

In [37]:
ordered_userBiaTable = user_bias_table.orderBy(user_bias_table.userId)

In [38]:
user_bias = ordered_userBiaTable.map(lambda r: [r[0], r[1]/(10+r[2])])

In [39]:
user_specific_bias = sqlContext.createDataFrame(user_bias, ['userId', 'user_bias'])

In [40]:
merge1 = df_orderByUser.join(user_specific_bias, 'userId')

In [41]:
merge2 = merge1.join(new_item_bias, 'movieId')

In [42]:
new_ratings_train = merge2.map(lambda r: [r[0], r[1], r[2] - r[3] - r[4]])

In [43]:
temp = sqlContext.createDataFrame(new_ratings_train, ['movieId', 'userId', 'new_ratings'])

In [44]:
final_new_ratings_train = temp.orderBy(temp.userId)

In [140]:
final_new_ratings_train.take(10)

[Row(movieId=231, userId=1, new_ratings=4.500706981668868),
 Row(movieId=466, userId=1, new_ratings=4.486594220539655),
 Row(movieId=480, userId=1, new_ratings=3.6659681159289264),
 Row(movieId=292, userId=1, new_ratings=3.9400823655233252),
 Row(movieId=316, userId=1, new_ratings=4.070220460584749),
 Row(movieId=520, userId=1, new_ratings=4.450044040828783),
 Row(movieId=122, userId=1, new_ratings=4.51965651167074),
 Row(movieId=329, userId=1, new_ratings=4.043341642853775),
 Row(movieId=539, userId=1, new_ratings=3.781670437982751),
 Row(movieId=355, userId=1, new_ratings=4.792371118737552)]

In [45]:
#now, we perform the same procedure as task1
#first, we sort the data by timestamp. 
new_ratings_byTime = final_new_ratings_train.join(df, ['userId', 'movieId'])

In [43]:
#example of dataset
new_ratings_byTime.take(20)

[Row(userId=2, movieId=1073, new_ratings=3.1127612615922002, ratings=3.0),
 Row(userId=5, movieId=562, new_ratings=4.726646236437846, ratings=5.0),
 Row(userId=7, movieId=1288, new_ratings=3.3876945595817927, ratings=4.0),
 Row(userId=13, movieId=266, new_ratings=3.0448259327021043, ratings=3.0),
 Row(userId=13, movieId=1466, new_ratings=3.792032982260145, ratings=4.0),
 Row(userId=13, movieId=2866, new_ratings=2.9583415274420997, ratings=3.0),
 Row(userId=17, movieId=1918, new_ratings=4.170568373948343, ratings=4.0),
 Row(userId=23, movieId=296, new_ratings=3.1428828214821367, ratings=4.0),
 Row(userId=26, movieId=185, new_ratings=3.9912608329233716, ratings=4.0),
 Row(userId=30, movieId=637, new_ratings=3.8491857046397424, ratings=4.0),
 Row(userId=34, movieId=1089, new_ratings=2.9642579014484842, ratings=3.0),
 Row(userId=34, movieId=2089, new_ratings=3.817721423587683, ratings=3.0),
 Row(userId=34, movieId=2289, new_ratings=2.09083864188565, ratings=2.0),
 Row(userId=34, movieId=28

In [46]:
new_ratings_byTime = new_ratings_byTime.drop(new_ratings_byTime[3])

In [16]:
def prepare_validation(validation):
    return validation.map(lambda p: (p[0], p[1]))

In [15]:
import math

In [12]:
# Evaluate the model on training data

def train_evaluate_als(train, validation, rank, iterations_num, lambda_val):
    model = ALS.train(train, rank, iterations_num, lambda_val)
    predictions = model.predictAll(prepare_validation(test)).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = test.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    return MSE

In [9]:
ranks = [10, 20, 30, 40, 50]
lambda_values = [0.01,0.1,1.0,10.0]
ITERATIONS = 10

In [13]:
def report_mse_results(rank, lambda_value, mse, rmse):
    print("Rank=%d, Lambda=%0.2f, MSE=%s, RMSE=%s" % (rank, lambda_value, mse))

In [14]:
def evaluate_parameters(train, validation, ranks, lambda_values):
    for r in ranks:
        for l in lambda_values:
            mse= train_evaluate_als(new_ratings_byTime, validation, r, ITERATIONS, l)
            report_mse_results(r, l, mse)

In [7]:
evaluate_parameters(new_ratings_byTime, ratings_validation, ranks, lambda_values)

Traceback (most recent call last):
  File "/Users/haiweisu/anaconda/lib/python2.7/site-packages/IPython/core/ultratb.py", line 1118, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/Users/haiweisu/anaconda/lib/python2.7/site-packages/IPython/core/ultratb.py", line 300, in wrapped
    return f(*args, **kwargs)
  File "/Users/haiweisu/anaconda/lib/python2.7/site-packages/IPython/core/ultratb.py", line 345, in _fixed_getinnerframes
    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
  File "/Users/haiweisu/anaconda/lib/python2.7/inspect.py", line 1049, in getinnerframes
    framelist.append((tb.tb_frame,) + getframeinfo(tb, context))
  File "/Users/haiweisu/anaconda/lib/python2.7/inspect.py", line 1009, in getframeinfo
    filename = getsourcefile(frame) or getfile(frame)
  File "/Users/haiweisu/anaconda/lib/python2.7/inspect.py", line 454, in getsourcefile
    if hasattr(getmodule(object, filename), '__loader

ERROR: Internal Python error in the inspect module.
Below is the traceback from this internal error.


Unfortunately, your original traceback can not be constructed.



IndexError: string index out of range