# Spark Notebook

### Create a Spark Session

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Build (or reuse) the Spark session that every cell below runs against.
spark = (
    SparkSession.builder
    .appName("Activity Tracker")
    .getOrCreate()
)


24/01/18 11:54:51 WARN Utils: Your hostname, Niharikas-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.0.77 instead (on interface en0)
24/01/18 11:54:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/18 11:54:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Source the input

In [4]:
# Batch-read the activity JSON files once so that the schema Spark infers can
# be captured and reused by the streaming reader.
static = spark.read.json("Data/Spark/data/activity-data/")
dataSchema = static.schema

# Show the inferred schema and a sample of the rows.
static.printSchema()
static.show()





root
 |-- Arrival_Time: long (nullable = true)
 |-- Creation_Time: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Index: long (nullable = true)
 |-- Model: string (nullable = true)
 |-- User: string (nullable = true)
 |-- gt: string (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)

+-------------+-------------------+--------+-----+------+----+-----+-------------+------------+------------+
| Arrival_Time|      Creation_Time|  Device|Index| Model|User|   gt|            x|           y|           z|
+-------------+-------------------+--------+-----+------+----+-----+-------------+------------+------------+
|1424686735090|1424686733090638193|nexus4_1|   18|nexus4|   g|stand|  3.356934E-4|-5.645752E-4|-0.018814087|
|1424686735292|1424688581345918092|nexus4_2|   66|nexus4|   g|stand| -0.005722046| 0.029083252| 0.005569458|
|1424686735500|1424686733498505625|nexus4_1|   99|nexus4|   g|stand|    0.0078125|-0.

                                                                                

In [5]:
# Streaming read over the same directory, using the schema captured from the
# batch read. maxFilesPerTrigger=1 limits each trigger to one new file.
streaming = (
    spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .json("Data/Spark/data/activity-data/")
)

In [6]:
# Row count per value of `gt`; this only defines the aggregation, the query is
# started in a later cell.
activityCounts = (
    streaming
    .groupBy("gt")
    .count()
)

### Partitions

In [7]:
# Set the number of shuffle partitions used by Spark SQL to 5.
spark.conf.set(key="spark.sql.shuffle.partitions", value=5)

### Write the data

In [8]:
# Start the streaming count. "complete" mode rewrites the whole result table
# on every trigger into the in-memory table named "activity_counts".
activityQuery = (
    activityCounts.writeStream
    .queryName("activity_counts")
    .format("memory")
    .outputMode("complete")
    .start()
)

24/01/18 11:54:59 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/tw/0sj1sbk54zjft6x3s_xgp0r00000gp/T/temporary-0572b2d4-8fc8-4e9d-8030-08e7294fa479. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/01/18 11:54:59 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [9]:
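# activityQuery.awaitTermination()  # NOTE(review): presumably left disabled because it blocks until the query stops, which would stall the cells below - confirm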


### Validate the streams

In [10]:
# List the streaming queries that are currently active on this session.
spark.streams.active


[<pyspark.sql.streaming.query.StreamingQuery at 0x107257b50>]

### Count using SQL

In [11]:
from time import sleep

# Query the in-memory "activity_counts" table five times, pausing one second
# between reads, so successive outputs show the counts as the stream advances.
for poll in range(5):
    spark.sql("SELECT * FROM activity_counts").show()
    sleep(1)

+---+-----+
| gt|count|
+---+-----+
+---+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|       sit|12309|
|     stand|11384|
|stairsdown| 9365|
|      walk|13256|
|  stairsup|10452|
|      null|10449|
|      bike|10796|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|       sit|73855|
|     stand|68309|
|stairsdown|56192|
|      walk|79536|
|  stairsup|62710|
|      null|62688|
|      bike|64781|
+----------+-----+

+----------+------+
|        gt| count|
+----------+------+
|       sit|123085|
|     stand|113849|
|stairsdown| 93648|
|      walk|132560|
|  stairsup|104521|
|      null|104482|
|      bike|107974|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|       sit|184620|
|     stand|170778|
|stairsdown|140456|
|      walk|198839|
|  stairsup|156800|
|      null|156721|
|      bike|161965|
+----------+------+



In [12]:
from pyspark.sql.functions import expr

# Append-mode stream of the stairs activities: flag rows whose `gt` contains
# "stairs", keep only the flagged, non-null rows, and publish four columns to
# the in-memory table "simple_transform".
simpleTransform = (
    streaming
    .withColumn("stairs", expr("gt like '%stairs%'"))
    .where("stairs")
    .where("gt is not null")
    .select("gt", "model", "arrival_time", "creation_time")
    .writeStream
    .queryName("simple_transform")
    .format("memory")
    .outputMode("append")
    .start()
)

24/01/18 11:55:04 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/tw/0sj1sbk54zjft6x3s_xgp0r00000gp/T/temporary-287a9c08-a159-4918-8d00-cab66f73c0be. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/01/18 11:55:04 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [13]:
# Average the x/y/z readings for every combination of `gt` and `model`
# (cube() also emits the subtotal rows, shown with NULL in the grouped column)
# and keep the full result in the in-memory table "device_counts".
# The averaged columns are named explicitly, so the output does not depend on
# dropping the other numeric averages afterwards by their exact generated names.
deviceModelStats = (
    streaming
    .cube("gt", "model")
    .avg("x", "y", "z")
    .writeStream
    .queryName("device_counts")
    .format("memory")
    .outputMode("complete")
    .start()
)

24/01/18 11:55:05 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/tw/0sj1sbk54zjft6x3s_xgp0r00000gp/T/temporary-6108089b-7963-446a-bb46-e924a019f532. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/01/18 11:55:05 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


### Device stats: average x/y/z per activity and model

In [16]:
# Display the current contents of the in-memory "device_counts" table.
(
    spark
    .sql("SELECT * FROM device_counts")
    .show()
)

+----------+------+--------------------+--------------------+--------------------+
|        gt| model|              avg(x)|              avg(y)|              avg(z)|
+----------+------+--------------------+--------------------+--------------------+
|       sit|  NULL| -5.4943324403959E-4|2.791446281700071E-4|-2.33994461689892...|
|      walk|nexus4|-0.00390116006094...|0.001052508689953...|-6.95435553042998...|
|      walk|  NULL|-0.00390116006094...|0.001052508689953...|-6.95435553042998...|
|  stairsup|  NULL|-0.02479965287771643|-0.00800392344379...|-0.10034088415060415|
|     stand|  NULL|-3.11082189691727...|3.218461665975321...|2.141300040636463...|
|      bike|  NULL|0.022688759550866838|-0.00877912156368...|-0.08251001663412372|
|  stairsup|nexus4|-0.02479965287771643|-0.00800392344379...|-0.10034088415060415|
|      NULL|nexus4|4.796918779024287E-4|-0.00601540958963...|-0.01013356489164...|
|      NULL|  NULL|4.796918779024287E-4|-0.00601540958963...|-0.01013356489164...|
|sta