In [1]:
import os
# Arguments handed to PySpark at launch: the Cassandra host setting, the
# Cassandra connector package to fetch, and the mandatory trailing shell name.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join([
    '--conf spark.cassandra.connection.host=cassandra',
    '--packages com.datastax.spark:spark-cassandra-connector_2.11:2.0.2',
    'pyspark-shell',
])

In [2]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [3]:
# Spark context for this notebook, created under the application name "BigDataRiver".
sc = SparkContext(appName="BigDataRiver")
# Limit Spark logging to WARN and above.
sc.setLogLevel("WARN")
# Directory (relative path) where Spark writes checkpoint data.
sc.setCheckpointDir('checkpoint/')
# SQL entry point built on the context; later cells use it through the global name `sql`.
sql = SQLContext(sc)

In [4]:
def usersWhoBoughtXAlsoBought(df):
    """Count how often each ordered pair of different products shares a user.

    The (user_id, product) rows of ``df`` are joined against a renamed copy
    of themselves on ``user_id``; rows pairing a product with itself are
    dropped and the remaining rows are counted per (product, other_product).

    :param df: DataFrame providing ``user_id`` and ``product`` columns.
    :return: DataFrame with columns ``product``, ``other_product``, ``count``.
    """
    purchasesDf = df.select('user_id', 'product')
    # Same rows with the product column renamed, so the two sides of the
    # self-join expose distinguishable column names.
    companionDf = purchasesDf.toDF('user_id', 'other_product')

    sameUser = companionDf['user_id'] == purchasesDf['user_id']
    joinedDf = purchasesDf.join(companionDf, sameUser, 'inner')

    # Discard rows where a product was matched with itself.
    crossPairsDf = joinedDf.filter("`product` != `other_product`")

    pairCountsDf = crossPairsDf.select('product', 'other_product') \
        .groupby('product', 'other_product') \
        .count()
    return pairCountsDf.toDF("product", "other_product", "count")

In [5]:
def selectTopProducts(df, n=5):
    """Keep, for each product, its ``n`` highest-count companion products.

    Rows are ranked within each ``product`` by ``count`` descending, with
    ``other_product`` as a tie-breaker so that repeated runs select the same
    rows. The first ``n`` rows per product are kept and their
    ``other_product`` values are gathered into one list per product.

    Side effect: (re)registers ``df`` as the temporary view ``products`` on
    the notebook-level ``sql`` context.

    :param df: DataFrame with ``product``, ``other_product`` and ``count`` columns.
    :param n: maximum number of companion products kept per product (default 5).
    :return: DataFrame with columns ``product`` and ``other_products``.
    :raises ValueError: if ``n`` is not a positive integer.
    """
    # Validate before touching Spark so a bad argument fails fast and cheaply.
    if not isinstance(n, int) or n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))

    # createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
    df.createOrReplaceTempView("products")
    rankedDf = sql.sql("""
        SELECT
            *,
            ROW_NUMBER() OVER(PARTITION BY product ORDER BY count DESC, other_product) rn
        FROM products
    """)
    # NOTE(review): collect_list does not promise to keep the rank order inside
    # each list; sort explicitly downstream if the order matters.
    topProductsDf = rankedDf.where(F.col("rn") <= n) \
        .groupBy("product") \
        .agg(F.collect_list("other_product").alias("other_products"))
    return topProductsDf

In [6]:
def processBatch():
    """Recompute the per-product recommendations and append them to Cassandra.

    Loads table ``all_user_products`` from keyspace ``bdr``, derives the top
    companion products with ``usersWhoBoughtXAlsoBought`` and
    ``selectTopProducts``, prints a preview, and appends the result to table
    ``top_other_products_batch`` in the same keyspace. Cached data is
    released before returning, even if a step fails.
    """
    allUserProductsDf = sql.read.format("org.apache.spark.sql.cassandra").\
        options(table="all_user_products", keyspace="bdr").load().cache()
    topDf = None
    try:
        # Two actions (show, then save) run on the result; caching lets the
        # save reuse what was already computed and write the rows that were shown.
        topDf = selectTopProducts(usersWhoBoughtXAlsoBought(allUserProductsDf)).cache()

        topDf.show()

        topDf.write.format("org.apache.spark.sql.cassandra").\
            mode('append').options(table="top_other_products_batch", keyspace="bdr").save()
    finally:
        # Release cached blocks so repeated runs do not accumulate storage.
        if topDf is not None:
            topDf.unpersist()
        allUserProductsDf.unpersist()

In [7]:
# Run the batch job once: prints a preview of the result and appends it to Cassandra.
processBatch()

+-------+--------------------+
|product|      other_products|
+-------+--------------------+
|     29|[62, 85, 12, 89, 59]|
|     19|        [28, 42, 59]|
|      0|                 [9]|
|     22|        [61, 62, 59]|
|      7| [17, 39, 47, 77, 6]|
|     77| [17, 55, 6, 20, 99]|
|     50|[40, 62, 14, 83, 75]|
|     57|[61, 29, 12, 59, 85]|
|     32|                [75]|
|     39|             [47, 7]|
|     25| [99, 70, 38, 2, 41]|
|     95|            [18, 21]|
|     71|         [24, 1, 45]|
|      6| [17, 99, 7, 61, 77]|
|     68| [75, 40, 55, 76, 9]|
|     72|            [55, 41]|
|     58|                [82]|
|      9| [76, 40, 5, 75, 68]|
|     51| [75, 5, 76, 40, 68]|
|     17|  [77, 61, 99, 6, 7]|
+-------+--------------------+
only showing top 20 rows

