### Some initial setup for Spark to access Cassandra tables

In [1]:
import json
import time
import pytz
import traceback
import time_uuid
from pytz import timezone
from datetime import datetime
from pyspark.sql import types
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SQLContext, Row
from pyspark import SparkContext, SparkConf
from config import *

In [2]:
# Stop any SparkContext that already exists (presumably one pre-created by the
# PySpark launcher -- confirm for your environment) so a new context can be
# built below with the Cassandra connection settings.
# `sc` is not defined earlier in this notebook, so on a kernel without a
# pre-created context the bare call would raise NameError and abort "Run All";
# only that specific case is tolerated here.
try:
    sc.stop()
except NameError:
    pass

In [3]:
# Build the Spark configuration: application identity first, then the
# Cassandra connector settings (host, port and credentials all come from config).
conf = (
    SparkConf()
    .setAppName(APPNAME)
    .setMaster(MASTER)
    .set("spark.cassandra.connection.host", CASSANDRA_HOST)
    .set("spark.cassandra.connection.port", CASSANDRA_PORT)
    .set("spark.cassandra.auth.username", CASSANDRA_USERNAME)
    .set("spark.cassandra.auth.password", CASSANDRA_PASSWORD)
)

In [4]:
# Create the Spark context from the configuration built above, then wrap it
# in a SQLContext so tables can be queried with SQL.
sc = SparkContext(master=MASTER, appName=APPNAME, conf=conf)
sqlContext = SQLContext(sc)

In [5]:
# DDL template shared by every Cassandra-backed temporary table.
# Placeholders, in order: temp-table name, Cassandra table name, keyspace.
CASSANDRA_TABLE_DDL = """CREATE TEMPORARY TABLE %s \
                  USING org.apache.spark.sql.cassandra \
                  OPTIONS ( table "%s", \
                            keyspace "%s", \
                            cluster "Test Cluster", \
                            pushdown "true") \
              """


def register_cassandra_table(table, keyspace=KEYSPACE):
    """Register a Cassandra table as a Spark SQL temporary table.

    The temporary table gets the same name as the Cassandra table.

    Args:
        table: Name of the Cassandra table (also used as the temp-table name).
        keyspace: Cassandra keyspace that holds the table; defaults to KEYSPACE.

    Returns:
        Whatever ``sqlContext.sql`` returns for the DDL statement.
    """
    return sqlContext.sql(CASSANDRA_TABLE_DDL % (table, table, keyspace))


# Same two registrations as before, now sharing one template instead of
# two copy-pasted statements.
register_cassandra_table(TABLE_EVENT_STORE)
register_cassandra_table(TABLE_QUERYABLE)

DataFrame[]

### The following tables are available to query from PySpark

In [11]:
# Show the names of the two registered temporary tables.
# The parenthesised single-argument form prints the same text under
# Python 2 and is also valid Python 3 syntax.
print(TABLE_EVENT_STORE)
print(TABLE_QUERYABLE)

events_store
queryable_users


### Some queries on the table "events_store"

In [14]:
# Peek at five rows of events_store; event_id is cast to string for display.
sqlContext.sql(
    'SELECT bucket_id, CAST(event_id AS string), event_name, payload FROM events_store LIMIT 5'
).show()

+----------+--------------------+-------------+--------------------+
| bucket_id|            event_id|   event_name|             payload|
+----------+--------------------+-------------+--------------------+
|2000-11-27|08aad1b4-05c4-11e...|EventName6619|{"city": "Port Sh...|
|2000-11-27|fcb337b6-05c3-11e...|EventName6649|{"city": "Starkfu...|
|2000-11-27|d1ef2850-05c3-11e...|EventName6921|{"city": "North A...|
|2000-11-27|b5bfdbfa-05c3-11e...|EventName3289|{"city": "Kimberl...|
|2000-11-27|9ec366da-05c3-11e...|EventName1161|{"city": "New Mic...|
+----------+--------------------+-------------+--------------------+



In [24]:
# Five buckets with the most events.
sqlContext.sql(
    'SELECT bucket_id, COUNT(*) AS count FROM events_store GROUP BY bucket_id ORDER BY count DESC LIMIT 5'
).show()

+----------+-----+
| bucket_id|count|
+----------+-----+
|1984-02-13|   16|
|1992-11-26|   16|
|1983-10-13|   16|
|1992-02-28|   16|
|1989-06-11|   15|
+----------+-----+



### Some queries on the table "queryable_users"

In [16]:
# Peek at five rows of the user-attribute columns in queryable_users.
sqlContext.sql(
    'SELECT email, city, job, gender, name, zipcode, age FROM queryable_users LIMIT 5'
).show()

+--------------------+--------------------+--------------------+------+-----------------+-------+---+
|               email|                city|                 job|gender|             name|zipcode|age|
+--------------------+--------------------+--------------------+------+-----------------+-------+---+
|bryantjill@yahoo.com|    Port Shellyshire|Engineer, civil (...|     M|      Sheryl Ford|  50226| 43|
|amandawilson@will...|           Starkfurt|Corporate investm...|     F|  Joseph Mcintosh|  04382| 58|
|   igreene@yahoo.com|North Alexandermouth|Environmental man...|     F|  Christian Drake|  65622| 40|
|christopherthomps...|         Kimberlyton|Medical laborator...|     M|Christopher Green|  19452| 23|
|raymondmcintosh@h...|         New Michael|Radiographer, the...|     F|Stephanie Elliott|  03702| 30|
+--------------------+--------------------+--------------------+------+-----------------+-------+---+



In [19]:
# Five most frequent event names in queryable_users.
sqlContext.sql(
    'SELECT event_name, COUNT(*) AS count FROM queryable_users GROUP BY event_name ORDER BY count DESC LIMIT 5'
).show()

+-------------+-----+
|   event_name|count|
+-------------+-----+
|EventName8278|   44|
|EventName4703|   40|
|EventName8275|   40|
|EventName4401|   40|
|EventName1381|   40|
+-------------+-----+



In [20]:
# Five most frequent jobs in queryable_users.
sqlContext.sql(
    'SELECT job, COUNT(*) AS count FROM queryable_users GROUP BY job ORDER BY count DESC LIMIT 5'
).show()

+--------------------+-----+
|                 job|count|
+--------------------+-----+
|Volunteer coordin...|  196|
|Manufacturing eng...|  196|
|Lecturer, further...|  194|
|     Charity officer|  192|
|Furniture conserv...|  192|
+--------------------+-----+



In [21]:
# Five email addresses with the most rows in queryable_users.
sqlContext.sql(
    'SELECT email, COUNT(*) AS count FROM queryable_users GROUP BY email ORDER BY count DESC LIMIT 5'
).show()

+-------------------+-----+
|              email|count|
+-------------------+-----+
|xmiller@hotmail.com|    9|
|   fsmith@gmail.com|    9|
| wsmith@hotmail.com|    9|
| yjohnson@yahoo.com|    8|
| abrown@hotmail.com|    8|
+-------------------+-----+



In [23]:
# City and job recorded on every row for one specific email address.
sqlContext.sql(
    'SELECT city, job FROM queryable_users WHERE email = "xmiller@hotmail.com"'
).show()

+------------+--------------------+
|        city|                 job|
+------------+--------------------+
|South Joseph|Education officer...|
|     Carlton|Research scientis...|
|   Erinville|Insurance risk su...|
| South Scott|Administrator, sp...|
|South Ronald|Designer, ceramic...|
|Velezborough|Hydrographic surv...|
|East Jeffery|Engineer, electro...|
|  Griffinton|Print production ...|
|  East Emily|Print production ...|
+------------+--------------------+



In [26]:
# Up to eight rows whose job contains the substring "Engineer".
sqlContext.sql(
    'SELECT email, job FROM queryable_users WHERE job LIKE "%Engineer%" LIMIT 8'
).show()

+--------------------+--------------------+
|               email|                 job|
+--------------------+--------------------+
|bryantjill@yahoo.com|Engineer, civil (...|
|ian95@caldwell-su...|Engineer, production|
|jeffreywarren@kan...|Engineer, civil (...|
|marcglover@stark.com|Engineer, biomedical|
|   jorge49@gmail.com|Engineer, agricul...|
|jonathan23@jones.com|  Engineer, drilling|
|spotter@stricklan...| Engineer, materials|
|  wjimenez@yahoo.com|    Engineer, energy|
+--------------------+--------------------+

