In [1]:
sc

In [2]:
#import os
#os.chdir("/home/admin2/activity_description/data/")

In [3]:
from pyspark.sql.types import StringType, StructField, StructType, TimestampType, ArrayType, IntegerType, LongType, FloatType

event_schema = StructType([
    StructField("Index", IntegerType()),
    StructField("Arrival_Time", StringType()),
    StructField("Creation_Time", StringType()),
    StructField("x", FloatType()),
    StructField("y", FloatType()),
    StructField("z", FloatType()),
    StructField("User", StringType()),
    StructField("Model", StringType()),
    StructField("Device", StringType()),
    StructField("gt", StringType())
])

In [4]:
fp = "gs://dataproc-81e79564-8ae2-4a68-9b71-fbaaa2a28716-asia-south1/spark_data/Phones_accelerometer.csv"
df = spark.read.csv(fp, header = True, schema = event_schema)

In [5]:
type(df)

pyspark.sql.dataframe.DataFrame

In [6]:
df.printSchema()

root
 |-- Index: integer (nullable = true)
 |-- Arrival_Time: string (nullable = true)
 |-- Creation_Time: string (nullable = true)
 |-- x: float (nullable = true)
 |-- y: float (nullable = true)
 |-- z: float (nullable = true)
 |-- User: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Device: string (nullable = true)
 |-- gt: string (nullable = true)



In [7]:
from pyspark.sql import functions as f
from pyspark.sql import types as t

format = "yyyy-MM-dd HH:mm:ss"
df = df.withColumn('Arrival_Time_TS', f.from_unixtime(df.Arrival_Time/1000).cast(TimestampType()))
df = df.withColumn('Creation_Time_TS', f.from_unixtime(df.Creation_Time/1000).cast(TimestampType()))

In [8]:
df.printSchema()

root
 |-- Index: integer (nullable = true)
 |-- Arrival_Time: string (nullable = true)
 |-- Creation_Time: string (nullable = true)
 |-- x: float (nullable = true)
 |-- y: float (nullable = true)
 |-- z: float (nullable = true)
 |-- User: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Device: string (nullable = true)
 |-- gt: string (nullable = true)
 |-- Arrival_Time_TS: timestamp (nullable = true)
 |-- Creation_Time_TS: timestamp (nullable = true)



In [9]:
cols_to_drop = ["Arrival_Time", "Creation_Time"]
df = df.drop(*cols_to_drop)

In [10]:
df.show()

+-----+----------+----------+---------+----+------+--------+-----+-------------------+----------------+
|Index|         x|         y|        z|User| Model|  Device|   gt|    Arrival_Time_TS|Creation_Time_TS|
+-----+----------+----------+---------+----+------+--------+-----+-------------------+----------------+
|    0| -5.958191| 0.6880646| 8.135345|   a|nexus4|nexus4_1|stand|2015-02-23 13:03:53|            null|
|    1|  -5.95224| 0.6702118| 8.136536|   a|nexus4|nexus4_1|stand|2015-02-23 13:03:53|            null|
|    2|-5.9950867| 0.6535492| 8.204376|   a|nexus4|nexus4_1|stand|2015-02-23 13:03:53|            null|
|    3|-5.9427185| 0.6761627| 8.128204|   a|nexus4|nexus4_1|stand|2015-02-23 13:03:53|            null|
|    4| -5.991516|0.64164734| 8.135345|   a|nexus4|nexus4_1|stand|2015-02-23 13:03:53|            null|
|    5| -5.965332| 0.6297455| 8.128204|   a|nexus4|nexus4_1|stand|2015-02-23 13:03:53|            null|
|    6| -5.991516| 0.6356964|  8.16272|   a|nexus4|nexus4_1|stan

In [11]:
df.printSchema()

root
 |-- Index: integer (nullable = true)
 |-- x: float (nullable = true)
 |-- y: float (nullable = true)
 |-- z: float (nullable = true)
 |-- User: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Device: string (nullable = true)
 |-- gt: string (nullable = true)
 |-- Arrival_Time_TS: timestamp (nullable = true)
 |-- Creation_Time_TS: timestamp (nullable = true)



In [12]:
from pyspark.sql import SQLContext, Row
df.registerTempTable("activities")
sqlContext = SQLContext(sc)
df_user_e = sqlContext.sql("SELECT * FROM activities WHERE User == 'e'")

In [13]:
type(df_user_e)

pyspark.sql.dataframe.DataFrame

In [14]:
df_user_e.show()

+-----+----------+-----------+----------+----+------+--------+-----+-------------------+----------------+
|Index|         x|          y|         z|User| Model|  Device|   gt|    Arrival_Time_TS|Creation_Time_TS|
+-----+----------+-----------+----------+----+------+--------+-----+-------------------+----------------+
|    0|-2.0448608|-0.23313904|  9.965851|   e|nexus4|nexus4_1|stand|2015-02-24 14:13:09|            null|
|    1|-2.0412903|-0.29740906| 9.9765625|   e|nexus4|nexus4_1|stand|2015-02-24 14:13:09|            null|
|    2| -2.065094| -0.2771759| 10.005127|   e|nexus4|nexus4_1|stand|2015-02-24 14:13:09|            null|
|    3|-2.0853271|-0.27360535|  9.926575|   e|nexus4|nexus4_1|stand|2015-02-24 14:13:09|            null|
|    4|-2.0567627|-0.21290588|  9.992035|   e|nexus4|nexus4_1|stand|2015-02-24 14:13:09|            null|
|    5| -2.057953|-0.22480774|  9.986084|   e|nexus4|nexus4_1|stand|2015-02-24 14:13:09|            null|
|    6|-2.0960388|-0.20933533| 10.006317|   e|