# Importing Datasets

In [1]:
import math
import os

from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS
from pyspark.sql import SQLContext,Row
from pyspark.sql import functions as F

In [2]:
# Reuse the SparkContext that the pyspark shell/kernel already started, or
# start one, so this cell does not depend on a pre-injected global `sc`.
sc = SparkContext.getOrCreate()
# SQLContext is the entry point used below to turn RDDs of Rows into DataFrames.
sqlContext = SQLContext(sc)

In [3]:
# Folder that contains the extracted "ml-100k" directory.
# Set the RECOMMENDATION_DATA_DIR environment variable to point somewhere else;
# the original hard-coded location is kept as the default.
dataDir = os.environ.get("RECOMMENDATION_DATA_DIR",
                         "/home/harshit/spark-1.6.0-bin-hadoop2.6/Recommendation")

# Each file is read line by line and split into a list of raw string fields.
# u.user and u.item are "|"-separated, u.data is tab-separated.
userData = sc.textFile(dataDir+"/ml-100k/u.user").map(lambda x : x.split("|"))
movieData = sc.textFile(dataDir+"/ml-100k/u.item").map(lambda x : x.split("|"))
ratingData = sc.textFile(dataDir+"/ml-100k/u.data").map(lambda x : x.split("\t"))

#%%

In [4]:
# Parse every u.data record into a typed Row:
# field 0 -> userID, field 1 -> movieID, field 2 -> rating, field 3 -> timestamp.
ratingRows = ratingData.map(
    lambda fields: Row(userID=int(fields[0]),
                       movieID=int(fields[1]),
                       rating=float(fields[2]),
                       timestamp=int(fields[3])))
# The DataFrame gets its own name; the intermediate RDD of Rows stays in ratingRows.
ratingDataDF = sqlContext.createDataFrame(ratingRows)

# Parse every u.user record into a Row. Only userID and age are converted to
# int; gender, occupation and zipcode stay as the raw strings from the file.
userRows = userData.map(
    lambda fields: Row(userID=int(fields[0]),
                       age=int(fields[1]),
                       gender=fields[2],
                       occupation=fields[3],
                       zipcode=fields[4]))
userDataDF = sqlContext.createDataFrame(userRows)

# Names of the genre indicator columns, in the order they appear in a u.item
# record; they occupy fields 5 through 23.
genreColumns = ["unknown", "action", "adventure", "animation", "childrens",
                "comedy", "crime", "documentary", "drama", "fantasy",
                "filmNoir", "horror", "musical", "mystery", "romance",
                "sciFi", "thriller", "war", "western"]


def movieRow(fields):
    """Build a Row from one split u.item record.

    fields -- list of raw string fields of a single u.item line.
    Returns a Row with an int movieID, four string descriptive columns and one
    int flag per entry of genreColumns.
    """
    columns = dict(movieID=int(fields[0]),
                   movieTitle=fields[1],
                   releaseDate=fields[2],
                   videoReleaseDate=fields[3],
                   IMDBurl=fields[4])
    # Genre flags start right after the five descriptive fields.
    for offset, genre in enumerate(genreColumns):
        columns[genre] = int(fields[5 + offset])
    return Row(**columns)


movieDataDF = sqlContext.createDataFrame(movieData.map(movieRow))

#%%


In [5]:
# Sanity check: number of user records that were loaded into the DataFrame.
userDataDF.count()

943

In [6]:
# Preview the user table to eyeball the parsed columns.
userDataDF.show()

+---+------+-------------+------+-------+
|age|gender|   occupation|userID|zipcode|
+---+------+-------------+------+-------+
| 24|     M|   technician|     1|  85711|
| 53|     F|        other|     2|  94043|
| 23|     M|       writer|     3|  32067|
| 24|     M|   technician|     4|  43537|
| 33|     F|        other|     5|  15213|
| 42|     M|    executive|     6|  98101|
| 57|     M|administrator|     7|  91344|
| 36|     M|administrator|     8|  05201|
| 29|     M|      student|     9|  01002|
| 53|     M|       lawyer|    10|  90703|
| 39|     F|        other|    11|  30329|
| 28|     F|        other|    12|  06405|
| 47|     M|     educator|    13|  29206|
| 45|     M|    scientist|    14|  55106|
| 49|     F|     educator|    15|  97301|
| 21|     M|entertainment|    16|  10309|
| 30|     M|   programmer|    17|  06355|
| 35|     F|        other|    18|  37212|
| 40|     M|    librarian|    19|  02138|
| 42|     F|    homemaker|    20|  95660|
+---+------+-------------+------+-------+

# Merging Datasets

In [8]:
# Attach the user attributes to every rating. Both inputs carry a userID
# column, so the copy coming from the user table is dropped after the join.
sameUser = ratingDataDF.userID == userDataDF.userID
data = ratingDataDF.join(userDataDF, sameUser, 'inner').drop(userDataDF.userID)

In [9]:
# Preview the ratings joined with the user attributes.
data.show()

+-------+------+---------+------+---+------+----------+-------+
|movieID|rating|timestamp|userID|age|gender|occupation|zipcode|
+-------+------+---------+------+---+------+----------+-------+
|    886|   2.0|881547877|    31| 24|     M|    artist|  10003|
|    484|   5.0|881548030|    31| 24|     M|    artist|  10003|
|    682|   2.0|881547834|    31| 24|     M|    artist|  10003|
|    302|   4.0|881547719|    31| 24|     M|    artist|  10003|
|    135|   4.0|881548030|    31| 24|     M|    artist|  10003|
|    705|   5.0|881548110|    31| 24|     M|    artist|  10003|
|    504|   5.0|881548110|    31| 24|     M|    artist|  10003|
|    498|   4.0|881548111|    31| 24|     M|    artist|  10003|
|    493|   5.0|881548110|    31| 24|     M|    artist|  10003|
|    321|   4.0|881547746|    31| 24|     M|    artist|  10003|
|    514|   5.0|881548030|    31| 24|     M|    artist|  10003|
|    124|   4.0|881548110|    31| 24|     M|    artist|  10003|
|     79|   2.0|881548082|    31| 24|   

In [10]:
# Attach the movie attributes to every (rating, user) row and drop the
# duplicate movieID column coming from the movie table.
# NOTE: this rebinds `data` to a join of itself, so the cell is not idempotent;
# re-run the rating/user join cell before running this one again.
sameMovie = data.movieID == movieDataDF.movieID
data = data.join(movieDataDF, sameMovie, "inner").drop(movieDataDF.movieID)

In [11]:
# Column order of the merged table; the ALS cells below read fields from these rows.
data.columns

['movieID',
 'rating',
 'timestamp',
 'userID',
 'age',
 'gender',
 'occupation',
 'zipcode',
 'IMDBurl',
 'action',
 'adventure',
 'animation',
 'childrens',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'fantasy',
 'filmNoir',
 'horror',
 'movieTitle',
 'musical',
 'mystery',
 'releaseDate',
 'romance',
 'sciFi',
 'thriller',
 'unknown',
 'videoReleaseDate',
 'war',
 'western']

# ALS (Alternating Least Squares) Recommendation

In [12]:
# Switch from the DataFrame to its underlying RDD of Rows for the split and the
# mllib ALS calls below.
RDD_data = data.rdd

### Splitting data

In [13]:
# Split into train / validation / test with relative weights 6:2:2.
# The fixed seed keeps the split reproducible between runs; a plain int is used
# instead of the Python-2-only long literal `0L`.
train,validation,test = RDD_data.randomSplit([6,2,2],seed=0)

In [14]:
# Peek at a few validation rows to confirm the fields carried over from the merge.
validation.take(3)

[Row(movieID=31, rating=5.0, timestamp=884131157, userID=435, age=24, gender=u'M', occupation=u'engineer', zipcode=u'60007', IMDBurl=u'http://us.imdb.com/M/title-exact?Crimson%20Tide%20(1995)', action=0, adventure=0, animation=0, childrens=0, comedy=0, crime=0, documentary=0, drama=1, fantasy=0, filmNoir=0, horror=0, movieTitle=u'Crimson Tide (1995)', musical=0, mystery=0, releaseDate=u'01-Jan-1995', romance=0, sciFi=0, thriller=1, unknown=0, videoReleaseDate=u'', war=1, western=0),
 Row(movieID=31, rating=3.0, timestamp=890687473, userID=41, age=33, gender=u'M', occupation=u'engineer', zipcode=u'80525', IMDBurl=u'http://us.imdb.com/M/title-exact?Crimson%20Tide%20(1995)', action=0, adventure=0, animation=0, childrens=0, comedy=0, crime=0, documentary=0, drama=1, fantasy=0, filmNoir=0, horror=0, movieTitle=u'Crimson Tide (1995)', musical=0, mystery=0, releaseDate=u'01-Jan-1995', romance=0, sciFi=0, thriller=1, unknown=0, videoReleaseDate=u'', war=1, western=0),
 Row(movieID=31, rating=4

In [15]:
# Reduce the wide rows to what ALS needs: (movieID, userID, rating) triples for
# training and (movieID, userID) pairs to request predictions for.
# Fields are read by name rather than by position so the tuples do not depend on
# the column order produced by the joins.
# NOTE(review): ALS.train documents its input as (user, product, rating); here the
# movie id is passed first. The evaluation cells key the true ratings on
# (movieID, userID) in the same order, so keep this order unless those cells are
# changed together with this one.
train_RDD = train.map(lambda row: (row.movieID, row.userID, row.rating))
validation_for_predict_RDD = validation.map(lambda row: (row.movieID, row.userID))
test_for_predict_RDD = test.map(lambda row: (row.movieID, row.userID))

In [16]:
# Check the shape of the pairs that will be passed to predictAll.
validation_for_predict_RDD.take(3)

[(31, 435), (31, 41), (31, 851)]

In [17]:
# Rank search: train one ALS model per candidate rank on the training split and
# keep the rank with the lowest RMSE on the validation split.
# `ALS` and `math` are imported in the notebook's import cell.

# Training hyper-parameters shared by every candidate model.
seed = 5
iterations = 2
regularization_parameter = 0.1
ranks = [4, 8, 12]
errors = []              # validation RMSE per entry of `ranks`, in the same order
tolerance = 0.02         # not read in this cell

min_error = float('inf')
best_rank = -1
best_iteration = -1      # not read in this cell

# True validation ratings keyed like the predictions; this does not depend on
# the rank, so it is built once outside the loop.
validation_ratings = validation.map(lambda r: ((int(r[0]), int(r[3])), float(r[1])))

for rank in ranks:
    print("Iteration starts...")
    model = ALS.train(train_RDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
    # Key each prediction by its id pair so it can be joined with the true rating.
    predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validation_ratings.join(predictions)
    # Root mean squared error over the pairs present on both sides of the join.
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors.append(error)
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank

print('The best model was trained with rank %s' % best_rank)

Iteration starts...
For rank 4 the RMSE is 1.03349017039
Iteration starts...
For rank 8 the RMSE is 1.02156565372
Iteration starts...
For rank 12 the RMSE is 0.98847546646
The best model was trained with rank 12


In [18]:
# Inspect a few predictions; at this point `predictions` holds the output of
# the last model trained in the rank loop.
predictions.take(3)

[((1100, 201), 2.6758243061720797),
 ((1100, 405), 1.652081205197201),
 ((100, 1), 3.766693784394797)]

In [20]:
# Inspect a few joined records: ((id, id), (true rating, predicted rating)).
rates_and_preds.take(3)

[((492, 474), (4.0, 4.103014935235372)),
 ((414, 488), (2.0, 3.2609309990147572)),
 ((255, 839), (3.0, 3.083994191313232))]

In [21]:
# Retrain with the rank that scored best on validation, then measure the RMSE
# on the held-out test split.
model = ALS.train(train_RDD, best_rank, seed=seed, iterations=iterations,
                  lambda_=regularization_parameter)
# Key predictions and true ratings by the same id pair so they can be joined.
predictions = model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test.map(lambda r: ((int(r[0]), int(r[3])), float(r[1]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

# Parenthesised form prints the same text under Python 2 and also parses under Python 3.
print('For testing data the RMSE is %s' % error)

For testing data the RMSE is 0.986661588982
