In [58]:
from __future__ import print_function

import sys
import itertools
from math import sqrt
from operator import add
from os.path import join, isfile, dirname
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType
import MySQLdb


conf = SparkConf().setAppName("app_collaborative")
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint/')
sqlContext = SQLContext(sc)

#USER_ID = 0

In [59]:
BEST_RANK = 15
BEST_ITERATION = 20
BEST_REGULATION = 0.100000

#Rank 15
#Regul 0.100000
#Iter 20
#Dist 1.084007

In [57]:
sc.stop()

In [60]:
ratings ='/home/bella/Downloads/ratings.csv'

In [61]:
jobs ='/home/bella/Downloads/jobs.csv'

In [62]:
dfratings = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").load(ratings)

In [63]:
dfratings.show()

+------+-----+------+
|userid|jobid|rating|
+------+-----+------+
|    10|    1|     1|
|    18|    1|     2|
|    13|    1|     1|
|     7|    2|     2|
|     4|    2|     2|
|    13|    2|     3|
|    19|    2|     2|
|    12|    2|     1|
|    11|    2|     1|
|     1|    2|     2|
|    20|    2|     2|
|     2|    2|     4|
|     3|    2|     1|
|     0|    3|     4|
|     4|    3|     5|
|     8|    3|     4|
|     7|    3|     4|
|    10|    3|     5|
|    16|    3|     5|
|    21|    3|     5|
+------+-----+------+
only showing top 20 rows



In [64]:
dfjobs = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").load(jobs)

In [65]:
dfjobs.show()

+---+-------+-------------+------+--------+--------------------+
| id|zipcode|         city|radius|category|               items|
+---+-------+-------------+------+--------+--------------------+
|  1|     50|    Vancouver|     3| cottage|  Comfy Quiet Chalet|
|  2|     65|       London|     2| cottage|       Cozy Calm Hut|
|  3|     65|       London|     4|   house| Agreable Calm Place|
|  4|   3400|        Paris|    16|  castle|Colossal Quiet Ch...|
|  5|     50|        Paris|     1| cottage|    Homy Quiet Shack|
|  6|     35|       Dublin|     5|   house|Pleasant Quiet Place|
|  7|   3200|      Seattle|    24|  castle|Vast Peaceful For...|
|  8|   3400|San Francisco|    12|  castle|Giant Quiet Fortress|
|  9|   1500|       London|    20|  castle|Giant Peaceful Pa...|
| 10|    650|     Auckland|     9| mansion|Sizable Calm Coun...|
| 11|     50|    Melbourne|     1| cottage|   Homy Quiet Shanty|
| 12|     90|      Seattle|     2|   house|Beautiful Peacefu...|
| 13|   3300|    Melbourn

In [66]:
dfratings.printSchema()

root
 |-- userid: string (nullable = true)
 |-- jobid: string (nullable = true)
 |-- rating: string (nullable = true)



In [67]:
dfjobs.printSchema()

root
 |-- id: string (nullable = true)
 |-- zipcode: string (nullable = true)
 |-- city: string (nullable = true)
 |-- radius: string (nullable = true)
 |-- category: string (nullable = true)
 |-- items: string (nullable = true)



In [68]:
# Get all the ratings rows of our user
dfUserRatings  = dfratings.filter(dfratings.userid == USER_ID).rdd.map(lambda r: r.rating).collect()
print(dfUserRatings)

[u'4', u'5', u'5', u'4', u'4', u'4', u'5', u'4', u'4']


In [86]:
# Returns only the accommodations that have not been rated by our user
rddPotential  = dfjobs.rdd.filter(lambda x: x[0] not in dfUserRatings)
pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))

In [87]:
rddPotential.take(10)

[Row(id=u'1', zipcode=u'50', city=u'Vancouver', radius=u'3', category=u'cottage', items=u'Comfy Quiet Chalet'),
 Row(id=u'2', zipcode=u'65', city=u'London', radius=u'2', category=u'cottage', items=u'Cozy Calm Hut'),
 Row(id=u'3', zipcode=u'65', city=u'London', radius=u'4', category=u'house', items=u'Agreable Calm Place'),
 Row(id=u'6', zipcode=u'35', city=u'Dublin', radius=u'5', category=u'house', items=u'Pleasant Quiet Place'),
 Row(id=u'7', zipcode=u'3200', city=u'Seattle', radius=u'24', category=u'castle', items=u'Vast Peaceful Fortress'),
 Row(id=u'8', zipcode=u'3400', city=u'San Francisco', radius=u'12', category=u'castle', items=u'Giant Quiet Fortress'),
 Row(id=u'9', zipcode=u'1500', city=u'London', radius=u'20', category=u'castle', items=u'Giant Peaceful Palace'),
 Row(id=u'10', zipcode=u'650', city=u'Auckland', radius=u'9', category=u'mansion', items=u'Sizable Calm Country House'),
 Row(id=u'11', zipcode=u'50', city=u'Melbourne', radius=u'1', category=u'cottage', items=u'Homy 

In [89]:
pairsPotential.take(10)

[(0, u'1'),
 (0, u'2'),
 (0, u'3'),
 (0, u'6'),
 (0, u'7'),
 (0, u'8'),
 (0, u'9'),
 (0, u'10'),
 (0, u'11'),
 (0, u'12')]

In [90]:
rddTraining, rddValidating, rddTesting = dfratings.rdd.randomSplit([6,2,2])

model = ALS.train(rddTraining, BEST_RANK, BEST_ITERATION, BEST_REGULATION)

# Calculate all predictions
predictions = model.predictAll(pairsPotential).map(lambda p: (str(p[0]), str(p[1]), float(p[2])))

# Take the top 5 ones
topPredictions = predictions.takeOrdered(10, key=lambda x: -x[2])
print(topPredictions)


[('0', '6', 4.980718420810856), ('0', '49', 4.930697708110878), ('0', '30', 4.761530993343464), ('0', '12', 4.7031064643388145), ('0', '75', 4.699257084373732), ('0', '76', 4.5257146667411785), ('0', '66', 4.517683683636025), ('0', '61', 4.350737627739173), ('0', '3', 4.3063850570006315), ('0', '59', 4.2360345375641355)]


In [91]:
schema = StructType([StructField("userId", StringType(), True), StructField("jobid", StringType(), True), StructField("prediction", FloatType(), True)])


In [92]:
dfToSave = sqlContext.createDataFrame(topPredictions, schema)

In [93]:
_delimiter=','

In [94]:
_output='/home/bella/Downloads/predictions'

In [95]:
_xy=dfToSave.coalesce(1).write.format('com.databricks.spark.csv').option('header','true').option('delimiter', _delimiter).mode("overwrite").save(_output)