In [1]:
from pyspark.sql import SQLContext,Row
from pyspark.sql import functions as F
from pyspark import SparkContext

sqlContext = SQLContext(sc)

dataDir = "/home/rsk/Documents/RecommenderProject"

userData = sc.textFile(dataDir+"/ml-100k/u.user").map(lambda x : x.split("|"))
movieData = sc.textFile(dataDir+"/ml-100k/u.item").map(lambda x : x.split("|"))
ratingData = sc.textFile(dataDir+"/ml-100k/u.data").map(lambda x : x.split("\t"))

In [2]:
ratingDataDF = ratingData.map(lambda x : Row(userID = int(x[0]),
                        movieID = int(x[1]),
                        rating=float(x[2]),
                        timestamp = int(x[3])))
ratingDataDF = sqlContext.createDataFrame(ratingDataDF)

userDataDF = userData.map(lambda x : Row(userID=int(x[0]),
                                        age = int(x[1]),
                                        gender = x[2],
                                        occupation = x[3],
                                        zipcode = x[4]))
userDataDF = sqlContext.createDataFrame(userDataDF)

movieDataDF = movieData.map(lambda x : Row(movieID = int(x[0]),
                                            movieTitle = x[1],
                                            releaseDate = x[2],
                                            videoReleaseDate = x[3],
                                            IMDBurl = x[4],
                                            unknown= int(x[5]),
                                            action = int(x[6]),
                                            adventure = int(x[7]),
                                            animation = int(x[8]),
                                            childrens = int(x[9]),
                                            comedy = int(x[10]),
                                             crime = int(x[11]),
                                             documentary = int(x[12]),
                                             drama = int(x[13]),
                                             fantasy = int(x[14]),
                                             filmNoir = int(x[15]),
                                             horror = int(x[16]),
                                             musical = int(x[17]),
                                             mystery = int(x[18]),
                                             romance = int(x[19]),
                                             sciFi = int(x[20]),
                                             thriller = int(x[21]),
                                             war = int(x[22]),
                                             western = int(x[23])))
movieDataDF = sqlContext.createDataFrame(movieDataDF)

In [5]:
# timestamp
def extract_datetime(ts):
    import datetime
    return datetime.datetime.fromtimestamp(ts)

####
newdataDF = ratingDataDF.map(lambda x: Row(x[0],x[1],x[2],x[3],extract_datetime(x[2]).day,
                                           extract_datetime(x[2]).month,extract_datetime(x[2]).year,
                                           extract_datetime(x[2]).hour,extract_datetime(x[2]).minute,
                                           extract_datetime(x[2]).second)).toDF()
ratingDataDF = newdataDF.selectExpr("_1 as movieID","_2 as rating","_3 as timestamp","_4 as userID",
                                        "_5 as date","_6 as month","_7 as year",
                                        "_8 as hour","_9 as minute","_10 as second")

In [7]:
newdataDF.show(3)

+---+---+---------+---+---+---+----+---+---+---+
| _1| _2|       _3| _4| _5| _6|  _7| _8| _9|_10|
+---+---+---------+---+---+---+----+---+---+---+
|242|3.0|881250949|196|  4| 12|1997| 21| 25| 49|
|302|3.0|891717742|186|  5|  4|1998|  0| 52| 22|
|377|1.0|878887116| 22|  7| 11|1997| 12| 48| 36|
+---+---+---------+---+---+---+----+---+---+---+
only showing top 3 rows



In [8]:
ratingDataDF.show(3)

+-------+------+---------+------+----+-----+----+----+------+------+
|movieID|rating|timestamp|userID|date|month|year|hour|minute|second|
+-------+------+---------+------+----+-----+----+----+------+------+
|    242|   3.0|881250949|   196|   4|   12|1997|  21|    25|    49|
|    302|   3.0|891717742|   186|   5|    4|1998|   0|    52|    22|
|    377|   1.0|878887116|    22|   7|   11|1997|  12|    48|    36|
+-------+------+---------+------+----+-----+----+----+------+------+
only showing top 3 rows



### Merging Datasets

In [9]:
data = ratingDataDF.join(userDataDF, ratingDataDF.userID==userDataDF.userID, 'inner').drop(userDataDF.userID)

In [10]:
data = data.join(movieDataDF,data.movieID==movieDataDF.movieID,"inner").drop(movieDataDF.movieID)

In [12]:
data.count()

100000

In [14]:
train,test = data.rdd.randomSplit([8,2],seed=45)

In [16]:
test.count()

20109

In [15]:
train.count()

79891

In [237]:
totalRDD  = data.rdd.map(lambda x : (x[0],x[3],x[1]))
trainRDD = train.map(lambda x : (x[0],x[3],x[1]))
testRDD = test.map(lambda x : (x[0],x[3]))

In [20]:
from pyspark.mllib.recommendation import ALS
import math

seed = 5L
iterations = 2
regularization_parameter = 0.1
ranks = [18,15,12]
errors=[0,0,0]

err= 0
tolerance = 0.02

min_error = float('inf')
best_rank = -1
best_iteration = -1


In [238]:
model = ALS.train(totalRDD, rank = 5, seed = seed, iterations = iterations, lambda_ = regularization_parameter)

In [239]:
user_features = model.userFeatures()

In [158]:
import numpy as np

def euclidDist( x,y ):
    return np.sqrt(np.sum(np.subtract(x,y)**2))

In [284]:
def getRecommendations_user(userID,user_features , neighborCount=5,neighborMovieCount=10,moviesToRecommend=10):
    
    feature = user_features.filter( lambda x : x[0]==userID ).take(1)[0][1]
    
    user_features2 = user_features.map(lambda x : (x[0], euclidDist(x[1],feature) )).sortBy(lambda x : x[1])
    neighborList = user_features2.map(lambda x : x[0]).take(neighborCount)
    
    
    
    movieList=[]
    for i in range(len(neighborList)):
        data = ratingDataDF.filter(ratingDataDF['userID']== neighborList[i]).sort("rating",ascending = False).select("movieID")
        newData = data.map(lambda x : x[0]).take(neighborMovieCount)
        movieList += newData
    
    movieList =  sc.parallelize(movieList).map(lambda x : (x,1)).reduceByKey(lambda x,y : x+y).sortBy(lambda x : -x[1]).collect()
    
    recommendList=[]
    for i in range(moviesToRecommend):
        recommendList.append(movieDataDF.filter(movieDataDF['movieID']==movieList[i][0]).select('movieTitle').map(lambda x : x[0]).collect()[0])
    return recommendList
    

In [285]:
a=getRecommendations_user(100,user_features)
a

[100, 463, 517, 492, 505]


[u'Titanic (1997)',
 u'Contact (1997)',
 u'Emma (1996)',
 u'Godfather, The (1972)',
 u'Apostle, The (1997)',
 u'Replacement Killers, The (1998)',
 u"It's a Wonderful Life (1946)",
 u'Killing Fields, The (1984)',
 u'Pulp Fiction (1994)',
 u'Dunston Checks In (1996)']

In [187]:
a=ratingDataDF.filter(ratingDataDF['userID']==196).sort("rating",ascending=False).select("movieID")

In [None]:
def userPreferences(userID, moviesToRecommend):
    
    ratingDataDF.filter(ratingDataDF['userID']==userID).sort("rating")