In [None]:
import os
# Arguments PySpark passes to spark-submit: the Cassandra contact host, the
# Spark-Cassandra connector package to download, and the trailing
# "pyspark-shell" token. Set here, before the SparkContext is created below.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join([
    '--conf spark.cassandra.connection.host=cassandra',
    '--packages com.datastax.spark:spark-cassandra-connector_2.11:2.0.2',
    'pyspark-shell',
])

In [None]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [None]:
# Create the Spark entry points shared by all later cells.
sc = SparkContext(appName="BigDataRiver")
# Keep the notebook output readable: only warnings and errors are logged.
sc.setLogLevel("WARN")
# Relative directory Spark uses for RDD checkpoints.
sc.setCheckpointDir('checkpoint/')
# SQLContext bound to `sc`; used below for Cassandra reads, SQL queries and createDataFrame.
sql = SQLContext(sc)

In [None]:
def buildCFModel(train, rank=10, numIterations=20, lambdaFactor=0.01, alpha=0.01, seed=42):
    """Train an implicit-feedback ALS collaborative-filtering model.

    Each row of ``train`` is turned into a ``Rating(user_id, product, score)``
    where ``score = purchased_count * 3.0 + clicked_count``.

    Args:
        train: DataFrame whose rows expose ``user_id``, ``product``,
            ``purchased_count`` and ``clicked_count``.
        rank: number of latent factors.
        numIterations: number of ALS iterations.
        lambdaFactor: regularization parameter (``lambda_`` of ``trainImplicit``).
        alpha: confidence parameter of the implicit-feedback formulation.
        seed: random seed, fixed so that runs are reproducible.

    Returns:
        The model returned by ``ALS.trainImplicit``.
    """
    def toImplicitRating(purchasedCount, clickedCount):
        # A purchase weighs three times as much as a click.
        # Missing (None) counts are treated as 0 instead of raising TypeError.
        return ((purchasedCount or 0) * 3.0) + (clickedCount or 0)

    ratings = train.rdd.map(
        lambda r: Rating(r.user_id, r.product, toImplicitRating(r.purchased_count, r.clicked_count))
    )

    # Pass the hyperparameters by keyword: the fourth positional parameter of
    # trainImplicit is lambda_, not alpha, so passing alpha positionally would
    # silently set the regularization and leave alpha at its default.
    return ALS.trainImplicit(
        ratings,
        rank,
        iterations=numIterations,
        lambda_=lambdaFactor,
        alpha=alpha,
        seed=seed,
    )

In [None]:
# Load the bdr.users_interests table from Cassandra and cache it, since it is
# read again further down the notebook.
usersInterests = (
    sql.read
    .format("org.apache.spark.sql.cassandra")
    .options(table="users_interests", keyspace="bdr")
    .load()
    .cache()
)
usersInterests.show()

In [None]:
# Train the collaborative-filtering model on just the columns it reads.
trainingColumns = ["user_id", "product", "clicked_count", "purchased_count"]
dfModel = buildCFModel(usersInterests.select(*trainingColumns))

In [None]:
# Ask the model for the top-N products of every user.
numberOfRecommendationsRequired = 5
rdd = dfModel.recommendProductsForUsers(numberOfRecommendationsRequired)
# Each element is a (user, ratings) pair, i.e. RDD[(Int, Array[Rating])].
# print() in call form works on both Python 2 and Python 3 kernels.
print(rdd.take(10))

In [None]:
# Keep only the product ids of each user's recommendations: (user, [product, ...]).
# Index the pair instead of using a tuple-unpacking lambda (removed in Python 3),
# and build a real list so the value is not a lazy map() iterator on Python 3.
recommendations = rdd.map(
    lambda userRatings: (userRatings[0], [r.product for r in userRatings[1]])
)
# Preview a sample only; collect() would pull every user's list to the driver.
print(recommendations.take(10))

In [None]:
# Schema for the (user, recommended product ids) rows; both columns are non-nullable.
userIdField = StructField("user_id", IntegerType(), nullable=False)
recommendedProductsField = StructField(
    "recommended_products", ArrayType(IntegerType()), nullable=False
)
schema = StructType([userIdField, recommendedProductsField])

In [None]:
# Turn the recommendations RDD into a DataFrame, preview it, and append it to
# the bdr.cf table in Cassandra.
top5 = sql.createDataFrame(recommendations, schema=schema)
top5.show()
cfWriter = (
    top5.write
    .format("org.apache.spark.sql.cassandra")
    .options(table="cf", keyspace="bdr")
    .mode('append')
)
cfWriter.save()

In [None]:
# Spot check: the 5 best recommendations for a single user id.
sampleUserId = 2105
dfModel.recommendProducts(sampleUserId, 5)

## Also calculate the most popular products per generic and specific category

In [None]:
# Keep only rows with at least one purchase; cached because both category
# aggregations below read from it.
purchasedOnly = (
    usersInterests
    .select("product", "generic_cat", "specific_cat", "purchased_count")
    .filter(F.col("purchased_count") > 0)
    .cache()
)

In [None]:
def _purchasesPerCategory(categoryColumn):
    """Sum purchased_count per (categoryColumn, product) pair of purchasedOnly.

    Returns a DataFrame with columns (categoryColumn, "product", "count").
    """
    return (
        purchasedOnly
        .select(categoryColumn, "product", "purchased_count")
        .groupBy(categoryColumn, "product")
        .agg(F.sum("purchased_count").alias("count"))
    )


# Same aggregation at both category levels, exposed to Spark SQL as the
# "generic" and "specific" views queried by the following cells.
generic = _purchasesPerCategory("generic_cat")
specific = _purchasesPerCategory("specific_cat")
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
generic.createOrReplaceTempView("generic")
specific.createOrReplaceTempView("specific")
specific.show()

In [None]:
# Top 5 products per generic category, most purchased first.
# - "product" is a tie-breaker in ORDER BY so ROW_NUMBER is deterministic when
#   two products have the same count.
# - collect_list gives no ordering guarantee after a groupBy, so (rn, product)
#   structs are collected and sorted by rank before the product ids are extracted.
top5generic = (
    sql.sql("""
        SELECT
            *,
            ROW_NUMBER() OVER(PARTITION BY generic_cat ORDER BY count DESC, product) rn
        FROM generic
    """)
    .where("rn <= 5")
    .groupBy("generic_cat")
    .agg(F.sort_array(F.collect_list(F.struct("rn", "product"))).alias("ranked"))
    .select("generic_cat", F.col("ranked.product").alias("top_products"))
)
top5generic.show()
# Append the result to the bdr.top_generic table in Cassandra.
top5generic.write.format("org.apache.spark.sql.cassandra").mode('append').options(table="top_generic", keyspace="bdr").save()


In [None]:
# Top 5 products per specific category, most purchased first.
# - "product" is a tie-breaker in ORDER BY so ROW_NUMBER is deterministic when
#   two products have the same count.
# - collect_list gives no ordering guarantee after a groupBy, so (rn, product)
#   structs are collected and sorted by rank before the product ids are extracted.
top5specific = (
    sql.sql("""
        SELECT
            *,
            ROW_NUMBER() OVER(PARTITION BY specific_cat ORDER BY count DESC, product) rn
        FROM specific
    """)
    .where("rn <= 5")
    .groupBy("specific_cat")
    .agg(F.sort_array(F.collect_list(F.struct("rn", "product"))).alias("ranked"))
    .select("specific_cat", F.col("ranked.product").alias("top_products"))
)
top5specific.show(300)
# Append the result to the bdr.top_specific table in Cassandra.
top5specific.write.format("org.apache.spark.sql.cassandra").mode('append').options(table="top_specific", keyspace="bdr").save()