In [49]:
from __future__ import print_function

import sys
import itertools
from math import sqrt
from operator import add
from os.path import join, isfile, dirname
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType

In [50]:
import pandas as pd

In [51]:
conf = SparkConf().setAppName("app_collaborative")
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint/')
sqlContext = SQLContext(sc)

In [53]:
data_cols = ['userid','movieid','rating','timestamp']
item_cols = ['movieid','movietitle','release date',
'video release date','IMDb URL','unknown','Action',
'Adventure','Animation','Childrens','Comedy','Crime',
'Documentary','Drama','Fantasy','Film-Noir','Horror',
'Musical','Mystery','Romance ','Sci-Fi','Thriller',
'War' ,'Western']
user_cols = ['userid','age','gender','occupation',
'zip code']

users = pd.read_csv('/home/bella/ml-100k/u.user', sep='|',
names=user_cols, encoding='latin-1')

item = pd.read_csv('/home/bella/ml-100k/u.item', sep='|',
names=item_cols, encoding='latin-1')
data = pd.read_csv('/home/bella/ml-100k/u.data', sep='\t',
names=data_cols, encoding='latin-1')

dataset = pd.merge(pd.merge(item, data),users)
df = dataset[['userid','movieid','rating']]
dataframe =sqlContext.createDataFrame(df)

In [54]:
rdd = dataframe.rdd

In [55]:
rdd.collect()

[Row(userid=308, movieid=1, rating=4),
 Row(userid=308, movieid=4, rating=5),
 Row(userid=308, movieid=5, rating=4),
 Row(userid=308, movieid=7, rating=4),
 Row(userid=308, movieid=8, rating=5),
 Row(userid=308, movieid=9, rating=4),
 Row(userid=308, movieid=11, rating=5),
 Row(userid=308, movieid=12, rating=5),
 Row(userid=308, movieid=15, rating=3),
 Row(userid=308, movieid=17, rating=4),
 Row(userid=308, movieid=19, rating=3),
 Row(userid=308, movieid=21, rating=3),
 Row(userid=308, movieid=22, rating=4),
 Row(userid=308, movieid=23, rating=5),
 Row(userid=308, movieid=24, rating=4),
 Row(userid=308, movieid=25, rating=4),
 Row(userid=308, movieid=28, rating=3),
 Row(userid=308, movieid=30, rating=4),
 Row(userid=308, movieid=31, rating=3),
 Row(userid=308, movieid=32, rating=5),
 Row(userid=308, movieid=42, rating=4),
 Row(userid=308, movieid=44, rating=4),
 Row(userid=308, movieid=45, rating=4),
 Row(userid=308, movieid=47, rating=4),
 Row(userid=308, movieid=48, rating=4),
 Row(u

In [56]:
rank = 10
numIterations = 10
model = ALS.train(rdd, rank, numIterations)


In [57]:
# Evaluate the model on training data
testdata = rdd.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = rdd.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

Mean Squared Error = 0.482866938217


In [66]:
predictions.take(10)

[((195, 1084), 3.2436152153253905),
 ((58, 1084), 4.027535289101067),
 ((541, 1084), 4.131719831741616),
 ((470, 1084), 3.488885567524587),
 ((682, 1084), 3.130233794880074),
 ((74, 1084), 2.8759141200364375),
 ((316, 1084), 4.174448796575171),
 ((601, 1084), 4.621236848009172),
 ((821, 1084), 4.939130047610788),
 ((330, 1084), 4.2138049218214935)]

In [67]:
ratesAndPreds.take(10)

[((621, 577), (3, 2.5526201383766853)),
 ((877, 727), (4, 3.074980703210063)),
 ((109, 365), (4, 3.2157246022485095)),
 ((254, 622), (4, 3.3793629108665004)),
 ((720, 286), (5, 3.9980674249100767)),
 ((42, 294), (4, 3.920498237306166)),
 ((812, 326), (4, 3.865893000610808)),
 ((690, 1028), (4, 2.6361907203806014)),
 ((321, 357), (4, 3.7774230959635897)),
 ((363, 849), (2, 1.8612352543655915))]

In [48]:
spark.stop()

In [90]:
p_movies =model.recommendProductsForUsers(5)

In [91]:
p_users = model.recommendUsersForProducts(5)

In [92]:
p_movies.collect()

[(451,
  (Rating(user=451, product=769, rating=6.780075034653405),
   Rating(user=451, product=34, rating=6.5905502856861276),
   Rating(user=451, product=1478, rating=6.500841588977142),
   Rating(user=451, product=1297, rating=6.344088448753819),
   Rating(user=451, product=1282, rating=6.271080260282858))),
 (454,
  (Rating(user=454, product=1278, rating=4.693832668565954),
   Rating(user=454, product=1446, rating=4.679378875080206),
   Rating(user=454, product=1242, rating=4.617925768279835),
   Rating(user=454, product=1286, rating=4.55351487067372),
   Rating(user=454, product=1218, rating=4.484985054577612))),
 (147,
  (Rating(user=147, product=1160, rating=7.627161218507422),
   Rating(user=147, product=1167, rating=7.53548753039938),
   Rating(user=147, product=1174, rating=7.390776807190488),
   Rating(user=147, product=1192, rating=7.373379207555097),
   Rating(user=147, product=1005, rating=7.331738123911395))),
 (155,
  (Rating(user=155, product=1643, rating=7.265983221003

In [93]:
df1=sqlContext.createDataFrame(p_movies)

In [88]:
df1.printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: struct (nullable = true)
 |    |-- _1: struct (nullable = true)
 |    |    |-- user: long (nullable = true)
 |    |    |-- product: long (nullable = true)
 |    |    |-- rating: double (nullable = true)
 |    |-- _2: struct (nullable = true)
 |    |    |-- user: long (nullable = true)
 |    |    |-- product: long (nullable = true)
 |    |    |-- rating: double (nullable = true)
 |    |-- _3: struct (nullable = true)
 |    |    |-- user: long (nullable = true)
 |    |    |-- product: long (nullable = true)
 |    |    |-- rating: double (nullable = true)
 |    |-- _4: struct (nullable = true)
 |    |    |-- user: long (nullable = true)
 |    |    |-- product: long (nullable = true)
 |    |    |-- rating: double (nullable = true)
 |    |-- _5: struct (nullable = true)
 |    |    |-- user: long (nullable = true)
 |    |    |-- product: long (nullable = true)
 |    |    |-- rating: double (nullable = true)
 |    |-- _6: struct (nullable = true)
