In [1]:
# Read one partition directory. `basePath` points at the dataset root so the
# partition column encoded in the directory name (timestamp_interval) is kept
# as a regular column of the result.
df = (
    spark.read
    .option('basePath', 'tss/')
    .parquet('tss/timestamp_interval=1564850000')
)

In [28]:
# Preview the loaded rows (long cell values are truncated in the printout).
df.show()

+--------------------+--------------------+--------------------+---------------+------------------+-----------------+------------------+
|           tablet_id|            activity|         coordinates|timestamp_first|global_granularity|inner_granularity|timestamp_interval|
+--------------------+--------------------+--------------------+---------------+------------------+-----------------+------------------+
|[CH184703920, CH1...|[Rue, Métro, Rue,...|[[48.8919, 2.2318...|     1566878400|             month|           minute|        1564850000|
+--------------------+--------------------+--------------------+---------------+------------------+-----------------+------------------+



In [29]:
# Print column names, types and nullability of the loaded frame.
df.printSchema()

root
 |-- tablet_id: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- activity: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- coordinates: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- timestamp_first: string (nullable = true)
 |-- global_granularity: string (nullable = true)
 |-- inner_granularity: string (nullable = true)
 |-- timestamp_interval: integer (nullable = true)



In [30]:
from pyspark.sql.functions import *
from pyspark.sql import Window

In [31]:
# Read the sampling granularity and the starting timestamp from the first row
# in a single pass; the returned Row unpacks like a tuple.
step, first_timestamp = df.select('inner_granularity', 'timestamp_first').first()
print(step, first_timestamp)

minute 1566878400


In [50]:
# Merge the three array columns element-wise into one array of structs,
# stored in a new column called 'new'.
df1 = df.withColumn('new', arrays_zip('activity', 'tablet_id', 'coordinates'))

In [51]:
# Keep only the starting timestamp and the zipped array.
df1 = df1.select('timestamp_first', 'new')

In [52]:
# Produce one output row per array element; 'new' now holds a single struct
# (activity, tablet_id, coordinates) instead of an array of them.
df1 = df1.withColumn('new', explode('new'))
df1.show()

+---------------+--------------------+
|timestamp_first|                 new|
+---------------+--------------------+
|     1566878400|[Rue, CH184703920...|
|     1566878400|[Métro, CH1847039...|
|     1566878400|[Rue, CH184703920...|
|     1566878400|[Bureau, CH184703...|
|     1566878400|[Rue, CH184703920...|
|     1566878400|[Restaurant, CH18...|
|     1566878400|[Rue, CH184703920...|
|     1566878400|[Voiture, CH18470...|
|     1566878400|[Bureau, CH184703...|
|     1566878400|[Rue, CH184703920...|
|     1566878400|[Magasin, CH18470...|
|     1566878400|[Bureau, CH184703...|
|     1566878400|[Rue, CH184703916...|
|     1566878400|[Deux Roues, CH18...|
|     1566878400|[Domicile, CH1847...|
|     1566878400|[Voiture, CH18470...|
|     1566878400|[Domicile, CH1847...|
|     1566878400|[Domicile, CH1847...|
|     1566878400|[Domicile, CH1847...|
|     1566878400|[Voiture, CH18470...|
+---------------+--------------------+
only showing top 20 rows



In [55]:
# Lift every field of the exploded `new` struct to a top-level column of the
# same name, then discard the struct itself.
columns = ['activity', 'tablet_id', 'coordinates']
df1 = df1.select('*', *[col('new.' + name).alias(name) for name in columns]).drop('new')
df1.show()

+---------------+----------+-----------+--------------------+
|timestamp_first|  activity|  tablet_id|         coordinates|
+---------------+----------+-----------+--------------------+
|     1566878400|       Rue|CH184703920|[48.8919, 2.23187...|
|     1566878400|     Métro|CH184703920|[48.8901033333333...|
|     1566878400|       Rue|CH184703920|[48.890005, 2.230...|
|     1566878400|    Bureau|CH184703920|[48.891835, 2.231...|
|     1566878400|       Rue|CH184703920|[48.8910483333333...|
|     1566878400|Restaurant|CH184703920|[48.8906816666667...|
|     1566878400|       Rue|CH184703920|[48.8923516666667...|
|     1566878400|   Voiture|CH184703920|[48.8914583333333...|
|     1566878400|    Bureau|CH184703920|[48.8910716666667...|
|     1566878400|       Rue|CH184703920|[48.8916033333333...|
|     1566878400|   Magasin|CH184703920|[48.8916433333333...|
|     1566878400|    Bureau|CH184703920|[48.891415, 2.231...|
|     1566878400|       Rue|CH184703916|[48.8613864, 2.40...|
|     15

In [56]:
# Alternative (kept for reference, not executed): flatten the `new` struct
# with a single select that aliases each field.
# .select(
#     'timestamp_first', col('new.activity').alias('activity'), col('new.tablet_id').alias('tablet_id'),
#     col('new.coordinates').alias('coordinates'))
# df1.show(5, False)
# List the columns currently present on df1.
df1.columns

['timestamp_first', 'activity', 'tablet_id', 'coordinates']

In [22]:
# Zero-based, gap-free row index used later as "number of steps since
# timestamp_first".
# monotonically_increasing_id() on its own is only guaranteed to be unique and
# increasing, NOT consecutive (the partition id is encoded in the upper bits),
# so it is used here solely to keep the current row order while row_number()
# supplies the consecutive values.
# NOTE: a window without partitionBy pulls all rows into a single partition;
# acceptable for this exploration, but expensive on large inputs.
df1 = df1.withColumn(
    'counter',
    (row_number().over(Window.orderBy(monotonically_increasing_id())) - 1).cast('long'),
)

In [23]:
def calculate_step_in_seconds(step):
    """Return the duration in seconds of one granularity step.

    Parameters
    ----------
    step : str
        One of 'year', 'month', 'week', 'day', 'hour', 'minute', 'second'.

    Returns
    -------
    int
        Number of seconds in one `step`. 'year' is a 365-day year and
        'month' is a fixed approximation (2630000 s, about 30.44 days).

    Raises
    ------
    ValueError
        If `step` is not one of the supported names. (Previously an unknown
        value reached `return inSec` with the variable never assigned and
        failed with an UnboundLocalError.)
    """
    seconds_per_step = {
        'year': 31536000,   # 365 * 86400
        'month': 2630000,   # approximate average month
        'week': 604800,     # 7 * 86400
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    try:
        return seconds_per_step[step]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs such as a list.
        raise ValueError(
            'unsupported step %r; expected one of %s'
            % (step, ', '.join(seconds_per_step))
        ) from None
# Step length in seconds for the granularity read from `inner_granularity`.
inSec = calculate_step_in_seconds(step)

In [24]:
# Attach the step length as a constant column so it can take part in column
# arithmetic.
df1 = df1.withColumn('inSec', lit(inSec))

In [25]:
# Inspect the first rows, including the helper columns `counter` and `inSec`.
df1.show(5)

+---------------+--------+-----------+--------------------+-------+-----+
|timestamp_first|activity|  tablet_id|         coordinates|counter|inSec|
+---------------+--------+-----------+--------------------+-------+-----+
|     1566878400|     Rue|CH184703920|[48.8919, 2.23187...|      0|   60|
|     1566878400|   Métro|CH184703920|[48.8901033333333...|      1|   60|
|     1566878400|     Rue|CH184703920|[48.890005, 2.230...|      2|   60|
|     1566878400|  Bureau|CH184703920|[48.891835, 2.231...|      3|   60|
|     1566878400|     Rue|CH184703920|[48.8910483333333...|      4|   60|
+---------------+--------+-----------+--------------------+-------+-----+
only showing top 5 rows



In [26]:
# timestamp = starting timestamp + row index * step length (in seconds);
# the helper columns are dropped once the value has been derived.
df1 = (
    df1
    .withColumn(
        'timestamp',
        (col('timestamp_first') + (col('counter') * col('inSec'))).cast('long'),
    )
    .drop('timestamp_first', 'counter', 'inSec')
)
df1.show(5)

+--------+-----------+--------------------+----------+
|activity|  tablet_id|         coordinates| timestamp|
+--------+-----------+--------------------+----------+
|     Rue|CH184703920|[48.8919, 2.23187...|1566878400|
|   Métro|CH184703920|[48.8901033333333...|1566878460|
|     Rue|CH184703920|[48.890005, 2.230...|1566878520|
|  Bureau|CH184703920|[48.891835, 2.231...|1566878580|
|     Rue|CH184703920|[48.8910483333333...|1566878640|
+--------+-----------+--------------------+----------+
only showing top 5 rows



In [37]:
def unpack(base_dir, paths):
    """Expand packed parquet partitions into one row per array element.

    Reads ``base_dir/<path>`` for every entry of `paths` (with `basePath` set
    to ``base_dir/``), sorts the rows by `timestamp_interval`, zips the
    `activity`, `tablet_id` and `coordinates` arrays element-wise and emits
    one row per element. Each element's timestamp is computed as
    ``timestamp_first + position_in_array * step_in_seconds``.

    The position comes from `posexplode`, i.e. it is the element's index
    inside its own array. The previous implementation used a global
    `monotonically_increasing_id()` counter, which is not consecutive across
    partitions and keeps growing across input rows even though every input
    row carries its own `timestamp_first`.

    Parameters
    ----------
    base_dir : str
        Root directory of the dataset.
    paths : iterable of str
        Partition sub-paths relative to `base_dir`.

    Returns
    -------
    DataFrame
        Columns `activity`, `tablet_id`, `coordinates`, `timestamp` (long).

    Raises
    ------
    ValueError
        If the selected partitions contain no rows, or if the stored
        granularity is rejected by `calculate_step_in_seconds`.

    Notes
    -----
    Uses the notebook-global `spark` session. The step length is taken from
    `inner_granularity` of the first row and applied to every row - assumes
    all requested partitions share the same granularity (TODO confirm).
    """
    partition_paths = [base_dir + "/" + path for path in paths]
    df = (
        spark.read
        .option('basePath', base_dir + '/')
        .parquet(*partition_paths)
        .sort('timestamp_interval')
    )

    first_row = df.select('inner_granularity').first()
    if first_row is None:
        # first() returns None on an empty frame; fail with a clear message
        # instead of "'NoneType' object is not subscriptable".
        raise ValueError('no rows found under %r for the requested paths' % (base_dir,))
    inSec = calculate_step_in_seconds(first_row[0])

    zipped = df.select(
        'timestamp_first',
        arrays_zip('activity', 'tablet_id', 'coordinates').alias('new'),
    )
    # posexplode yields (index within the array, element) for every element.
    samples = zipped.select(
        'timestamp_first',
        posexplode('new').alias('pos', 'new'),
    )
    return samples.select(
        col('new.activity').alias('activity'),
        col('new.tablet_id').alias('tablet_id'),
        col('new.coordinates').alias('coordinates'),
        (col('timestamp_first') + (col('pos') * lit(inSec))).cast('long').alias('timestamp'),
    )