# Initialization

In [1]:
import pyspark
sparkcontext = pyspark.SparkContext.getOrCreate(conf=(
        pyspark
        .SparkConf()
        .setAppName('RuangDataProject')
        .setMaster('local')
    ))
sparkcontext.setLogLevel("WARN")

spark = pyspark.sql.SparkSession(sparkcontext.getOrCreate())

In [2]:
spark

# Data Load

In [3]:
!ls /resources/data/activity-data/ | head -10

_committed_730451297822678341
part-00000-tid-730451297822678341-1dda7027-2071-4d73-a0e2-7fb6a91e1d1f-0-c000.json
part-00001-tid-730451297822678341-1dda7027-2071-4d73-a0e2-7fb6a91e1d1f-0-c000.json
part-00002-tid-730451297822678341-1dda7027-2071-4d73-a0e2-7fb6a91e1d1f-0-c000.json
part-00003-tid-730451297822678341-1dda7027-2071-4d73-a0e2-7fb6a91e1d1f-0-c000.json
part-00004-tid-730451297822678341-1dda7027-2071-4d73-a0e2-7fb6a91e1d1f-0-c000.json
part-00005-tid-730451297822678341-1dda7027-2071-4d73-a0e2-7fb6a91e1d1f-0-c000.json
part-00006-tid-730451297822678341-1dda7027-2071-4d73-a0e2-7fb6a91e1d1f-0-c000.json
part-00007-tid-730451297822678341-1dda7027-2071-4d73-a0e2-7fb6a91e1d1f-0-c000.json
part-00008-tid-730451297822678341-1dda7027-2071-4d73-a0e2-7fb6a91e1d1f-0-c000.json


In [4]:
static = spark.read.json('/resources/data/activity-data/')
dataSchema = static.schema
print(dataSchema)

StructType([StructField('Arrival_Time', LongType(), True), StructField('Creation_Time', LongType(), True), StructField('Device', StringType(), True), StructField('Index', LongType(), True), StructField('Model', StringType(), True), StructField('User', StringType(), True), StructField('gt', StringType(), True), StructField('x', DoubleType(), True), StructField('y', DoubleType(), True), StructField('z', DoubleType(), True)])


In [5]:
static.show(5)

+-------------+-------------------+--------+-----+------+----+-----+------------+-------------+------------+
| Arrival_Time|      Creation_Time|  Device|Index| Model|User|   gt|           x|            y|           z|
+-------------+-------------------+--------+-----+------+----+-----+------------+-------------+------------+
|1424686735011|1424686733015076670|nexus4_1|    3|nexus4|   g|stand|0.0014038086|   0.03147888|  0.01109314|
|1424686735214|1424688581265321168|nexus4_2|   50|nexus4|   g|stand|-0.008926392|  -0.04034424|0.0034332275|
|1424686735420|1424688581471925172|nexus4_2|   91|nexus4|   g|stand|-3.814697E-4|-0.0018920898|-0.015792847|
|1424686735618|1424686733619416269|nexus4_1|  123|nexus4|   g|stand| 3.356934E-4| -0.030471802|-0.025222778|
|1424686735821|1424688581874604615|nexus4_2|  171|nexus4|   g|stand|0.0038909912| -0.013641357|  0.01411438|
+-------------+-------------------+--------+-----+------+----+-----+------------+-------------+------------+
only showing top 5 

In [6]:
static.tail(5)

[Row(Arrival_Time=1424789459822, Creation_Time=1424791305142092507, Device='nexus4_2', Index=396182, Model='nexus4', User='e', gt='bike', x=0.027862549, y=-0.53941345, z=0.23950195),
 Row(Arrival_Time=1424789460017, Creation_Time=1424791305336825173, Device='nexus4_2', Index=396221, Model='nexus4', User='e', gt='bike', x=-1.2346497, y=0.6355133, z=-0.21444702),
 Row(Arrival_Time=1424789460218, Creation_Time=1424791305538241189, Device='nexus4_2', Index=396261, Model='nexus4', User='e', gt='bike', x=-0.85546875, y=-0.82032776, z=-0.5936279),
 Row(Arrival_Time=1424789460424, Creation_Time=1424791305739840310, Device='nexus4_2', Index=396301, Model='nexus4', User='e', gt='bike', x=0.98809814, y=0.65153503, z=0.093170166),
 Row(Arrival_Time=1424789460621, Creation_Time=1424791305941073220, Device='nexus4_2', Index=396341, Model='nexus4', User='e', gt='bike', x=1.1579285, y=-0.37278748, z=0.30252075)]

Metadata for the dataset

| Column | Description |
| --- | ----------- |
| Index         |  The row number.
| Arrival_Time  |  The time the measurement arrived to the sensing application
| Creation_Time |  The timestamp the OS attaches to the sample
| X,Y,Z | The values provided by the sensor for the three axes, X,y,z
| User          |  The user this sample originates from, the users are named a to i.
| Model         |  The phone/watch model this sample originates from
| Device        |  The specific device this sample is from. They are prefixed with the model name and then the number, e.g., nexus4_1 or nexus4_2.
| Gt            |  The activity the user was performing: bike sit, stand, walk, stairsup, stairsdown and null

# Structured Streaming

### Mock File Streaming (Throttle)

In [7]:
streaming = (
    spark
    .readStream
    .schema(dataSchema)
    .option('maxFilesPerTrigger', 1)
    .json('/resources/data/activity-data/')
)

### Simple Aggregations

In [8]:
# set partitions
spark.conf.set('spark.sql.shuffle.partitions', 5)

In [9]:
activityCounts = streaming.select('index').distinct()
activityQuery = (
    activityCounts.writeStream
    .queryName('activity_counts_3')
    .format('memory')
    .outputMode('append')
    .start()
)

# activityQuery.awaitTermination()

In [10]:
# activityQuery.awaitTermination()
activityQuery.stop()

In [11]:
from time import sleep
for x in range(5):
    spark.sql("SELECT COUNT(*) FROM activity_counts_3").show()
    sleep(1)

+--------+
|count(1)|
+--------+
|  353592|
+--------+

+--------+
|count(1)|
+--------+
|  353592|
+--------+

+--------+
|count(1)|
+--------+
|  353592|
+--------+

+--------+
|count(1)|
+--------+
|  353592|
+--------+

+--------+
|count(1)|
+--------+
|  353592|
+--------+



### Complex Aggregation

In [12]:
from pyspark.sql.functions import expr

simpleTransform = (
    streaming
    .withColumn('stairs', expr("gt like '%stairs%'"))
    .where('stairs')
    .where('gt is not null')
    .select('gt', 'model', 'arrival_time', 'creation_time')
    .writeStream
    .queryName('simple_transform')
    .format('memory')
    .outputMode('append')
    .start()
)

In [13]:
for x in range(2):
    spark.sql("SELECT * FROM simple_transform").show()
    sleep(1)

+--------+------+-------------+-------------------+
|      gt| model| arrival_time|      creation_time|
+--------+------+-------------+-------------------+
|stairsup|nexus4|1424687983719|1424687981726802718|
|stairsup|nexus4|1424687984000|1424687982009853255|
|stairsup|nexus4|1424687984404|1424687982411977009|
|stairsup|nexus4|1424687984805|1424687982814351277|
|stairsup|nexus4|1424687985210|1424687983217500861|
|stairsup|nexus4|1424687985620|1424687983620332892|
|stairsup|nexus4|1424687986016|1424687984023164923|
|stairsup|nexus4|1424687986420|1424687984425874884|
|stairsup|nexus4|1424687986820|1424687984828822915|
|stairsup|nexus4|1424687987225|1424687985231654946|
|stairsup|nexus4|1424687987625|1424687985634469017|
|stairsup|nexus4|1424687987992|1424687986002114280|
|stairsup|nexus4|1424687988191|1424689834237427627|
|stairsup|nexus4|1424687988392|1424689834438660537|
|stairsup|nexus4|1424687988592|1424689834640076553|
|stairsup|nexus4|1424687988794|1424689834841675674|
|stairsup|ne

In [14]:
deviceModelStats = (
    streaming
    .groupBy('gt')
    .count()
    .writeStream
    .queryName('device_counts')
    .format('memory')
    .outputMode('complete')
    .start()
)

In [34]:
deviceModelStats.stop()

In [35]:
for x in range(10):
    spark.sql("SELECT * FROM device_counts").show()
    sleep(1)

+----------+------+
|        gt| count|
+----------+------+
|       sit|196927|
|     stand|182165|
|stairsdown|149819|
|      walk|212095|
|  stairsup|167255|
|      null|167168|
|      bike|172762|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|       sit|196927|
|     stand|182165|
|stairsdown|149819|
|      walk|212095|
|  stairsup|167255|
|      null|167168|
|      bike|172762|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|       sit|196927|
|     stand|182165|
|stairsdown|149819|
|      walk|212095|
|  stairsup|167255|
|      null|167168|
|      bike|172762|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|       sit|196927|
|     stand|182165|
|stairsdown|149819|
|      walk|212095|
|  stairsup|167255|
|      null|167168|
|      bike|172762|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|       sit|196927|
|     stand|182165|
|stairsdown|1498

In [37]:
spark.stop()