In [3]:
import os
# Submit arguments for PySpark: point the Cassandra connector at the host
# named "cassandra" and pull in the spark-cassandra-connector package.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--conf spark.cassandra.connection.host=cassandra '
    '--packages com.datastax.spark:spark-cassandra-connector_2.11:2.0.2 '
    'pyspark-shell'
)

In [4]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [5]:
# Spark entry point for this notebook, registered under the app name below.
sc = SparkContext(appName='BigDataRiver')
# Log at WARN level.
sc.setLogLevel('WARN')
# Checkpoints are written to a path relative to the working directory.
sc.setCheckpointDir('checkpoint/')
# SQL entry point; the name `sql` is referenced by the helper functions
# defined in later cells, so it must keep this name.
sql = SQLContext(sc)

In [6]:
def usersWhoBoughtXAlsoBought(df):
    """Count how often each ordered pair of distinct products shares a user.

    The input is self-joined on ``user_id`` so that every product of a user
    is paired with every other product of that same user; the resulting
    pairs are grouped and counted.

    Args:
        df: DataFrame exposing at least the columns ``user_id`` and
            ``product``.

    Returns:
        DataFrame with the columns ``product``, ``other_product`` and
        ``count``.
    """
    purchases = df.select('user_id', 'product')
    # Same rows again, with the product column renamed so both sides of the
    # self-join remain distinguishable.
    otherPurchases = purchases.toDF('user_id', 'other_product')

    sameUser = otherPurchases['user_id'] == purchases['user_id']
    pairs = purchases.join(otherPurchases, sameUser, 'inner')

    # Drop the trivial pairing of a product with itself.
    distinctPairs = pairs.filter("`product` != `other_product`")
    distinctPairs = distinctPairs.select('product', 'other_product')

    pairCounts = distinctPairs.groupby('product', 'other_product').count()
    # Pin the output column names explicitly.
    return pairCounts.toDF("product", "other_product", "count")

In [7]:
def selectTopProducts(df, topN=5):
    """Keep, for each product, its highest-count companion products as a list.

    Args:
        df: DataFrame with the columns ``product``, ``other_product`` and
            ``count``.
        topN: Maximum number of companion products kept per product.
            Defaults to 5.

    Returns:
        DataFrame with the columns ``product`` and ``other_products``. The
        latter is an array of ``other_product`` values ordered by descending
        ``count``; ties are broken by ascending ``other_product``.
    """
    # Local import: Window is not among the notebook's top-level imports.
    from pyspark.sql import Window

    # Rank through the DataFrame API instead of registering a session-wide
    # "products" temp table (registerTempTable is deprecated and would
    # overwrite any table of that name). The secondary sort key makes the
    # ranking reproducible when several companions share the same count.
    ranking = Window.partitionBy("product").\
        orderBy(F.col("count").desc(), F.col("other_product").asc())
    rankedDf = df.withColumn("rn", F.row_number().over(ranking)).\
        where(F.col("rn") <= topN)

    # collect_list gives no ordering guarantee, so collect (rank, product)
    # structs, sort them by rank, then project the product field back out.
    topProductsDf = rankedDf.groupBy("product").\
        agg(F.sort_array(F.collect_list(F.struct("rn", "other_product"))).alias("ranked")).\
        select("product", F.col("ranked.other_product").alias("other_products"))
    return topProductsDf

In [10]:
def processBatch():
    """Recompute the per-product top companion lists and store them.

    Reads the Cassandra table ``bdr.all_user_products``, derives the top
    co-purchased products per product via ``usersWhoBoughtXAlsoBought`` and
    ``selectTopProducts``, prints a preview, and appends the result to the
    Cassandra table ``bdr.top_other_products_batch``.
    """
    # Cached because the source rows feed both sides of the self-join and
    # are read by two actions (show and save).
    allUserProductsDf = sql.read.format("org.apache.spark.sql.cassandra").\
        options(table="all_user_products", keyspace="bdr").load().cache()

    try:
        topDf = selectTopProducts(usersWhoBoughtXAlsoBought(allUserProductsDf))

        topDf.show()

        topDf.write.format("org.apache.spark.sql.cassandra").\
            mode('append').options(table="top_other_products_batch", keyspace="bdr").save()
    finally:
        # Release the cached source rows even if the job fails; otherwise
        # every call leaves another cached copy behind.
        allUserProductsDf.unpersist()

In [13]:
# Run the batch job; the preview printed by topDf.show() appears below.
processBatch()

+-------+--------------------+
|product|      other_products|
+-------+--------------------+
|     26|            [13, 84]|
|     29|[59, 62, 85, 12, 96]|
|     65|         [0, 96, 29]|
|     19|        [28, 42, 59]|
|     54|         [39, 45, 1]|
|      0|     [96, 9, 29, 65]|
|     22|[98, 62, 61, 59, 10]|
|      7| [17, 39, 47, 77, 6]|
|     77| [17, 55, 6, 20, 99]|
|     50|[47, 40, 12, 62, 14]|
|     94|  [15, 42, 3, 2, 16]|
|     57|[12, 30, 85, 60, 61]|
|     32| [78, 6, 18, 95, 72]|
|     43|            [82, 41]|
|     84|[81, 60, 13, 38, 26]|
|     39|  [47, 48, 7, 54, 1]|
|     98|  [9, 30, 2, 95, 22]|
|     25| [70, 99, 38, 2, 37]|
|     95| [18, 2, 72, 21, 33]|
|     71|         [24, 1, 45]|
+-------+--------------------+
only showing top 20 rows

