## Prepare all DataFrames that will be parsed through later 

**install_df**: get all unique installation_id values to build installation_list, which is later iterated through to get the relevant data for each installation \
**df**: get all BIN_UPDATE ASLog entries where bin mode is PORT \
**timestamp_df**: get local timestamp for each BIN_UPDATE where bin mode is PORT and convert that timestamp to a day of the week

In [0]:
import pandas
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format

# getOrCreate() returns the already-running session when there is one.
spark = SparkSession.builder.appName("Installation Data").getOrCreate()

# Distinct installation ids found in the log table. The later cells loop over
# installation_list and write one CSV per installation.
install_df = spark.sql("""
                select distinct installation_id
                from test_unify_analytics.bronze.logs_partition_1
            """)
installation_list = install_df.toPandas()['installation_id'].tolist()

# Log rows used for the "time between presentations" analysis.
# values[5] == 14 --> bin in PORT
# (tag = 242 is presumably the BIN_UPDATE tag named in the notebook header.)
df = spark.sql("""
            select installation_id, tag, unix_timestamp, values
            from test_unify_analytics.bronze.logs_partition_1
            where tag = 242 AND values[5] = 14
        """)

# Same row filter as df, but keeping the local timestamp for the
# day-of-the-week analysis.
# values[5] == 14 --> bin in PORT
# installation_id must be selected here: the day-of-the-week cell filters
# timestamp_df on it, and the filter fails if the column is absent.
timestamp_df = spark.sql("""
                    select installation_id, local_installation_timestamp, values[0] as BinNo
                    from test_unify_analytics.bronze.logs_partition_1
                    where tag = 242 AND values[5] = 14
                """)
# 'EEEE' formats the timestamp as the full day name (e.g. "Monday").
timestamp_df = timestamp_df.withColumn('dotw', date_format(col('local_installation_timestamp'), 'EEEE'))

In [0]:
import pandas
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# For every installation, compute the average time between consecutive
# PORT presentations of each bin and store the result as
# avg_bin_time/<installation>.csv.
dbutils.fs.mkdirs('dbfs:/FileStore/tables/avg_bin_time')

# The window does not depend on the installation, so it is built once:
# rows of the same bin, ordered by time.
bin_window = Window.partitionBy('bin num').orderBy('unix_timestamp')

for installation in installation_list:
    presentations = df.filter(col('installation_id') == installation) \
                      .withColumn('bin num', col('values')[0])

    # Gap between each row and the previous row of the same bin. The first row
    # of a bin has no predecessor, so lag() gives null and so does the diff.
    # NOTE(review): the "(sec)" labels assume unix_timestamp is in seconds -- confirm.
    presentations = presentations \
        .withColumn('prev_timestamp', lag('unix_timestamp').over(bin_window)) \
        .withColumn('time diff (sec)', col('unix_timestamp') - col('prev_timestamp'))

    # round/mean are the pyspark.sql.functions versions (wildcard import in this cell).
    result_df = presentations.groupBy('bin num') \
        .agg(round(mean('time diff (sec)')).alias('avg time between presentations (sec)'))
    result_df = result_df.withColumn(
        'avg time between presentations (min)',
        round(col('avg time between presentations (sec)') / 60))

    # Deliberately not named `pd`, so the pandas alias is never shadowed.
    avg_pdf = result_df.toPandas().sort_values(by=['avg time between presentations (sec)'])

    # f-string so a non-string installation_id still yields a valid file name.
    store_filepath = f'/dbfs/FileStore/tables/avg_bin_time/{installation}.csv'
    avg_pdf.to_csv(store_filepath)

In [0]:
import pandas as pd
from datetime import *
from pyspark.sql.functions import *

# For every installation, count rows per (bin, day of the week) and store the
# result as dotw/<installation>.csv.
dbutils.fs.mkdirs('dbfs:/FileStore/tables/dotw')
dotw_dir = '/dbfs/FileStore/tables/dotw/'

for installation in installation_list:
    # NOTE(review): this filter needs timestamp_df to expose an installation_id
    # column -- confirm it is selected where timestamp_df is built.
    rows_for_install = timestamp_df.filter(col('installation_id') == installation)

    # Row count for each bin / weekday combination.
    counts = rows_for_install.groupBy(['BinNo', 'dotw']).count()

    # Highest bin number first; within a bin, the busiest weekday first.
    ordered = counts.orderBy(desc("BinNo"), desc("count"))
    pd_dotw = ordered.toPandas()

    store_filepath = dotw_dir + installation + '.csv'
    pd_dotw.to_csv(store_filepath)

# time_list = timestamp_df.toPandas()['local_installation_timestamp'].tolist()
# print(time_list[0].day_name())
# for i, val in enumerate(time_list):
#     time_list[i] = val.to_pydatetime()
# print(time_list)

# dotw = time_list[0].strftime("%A")
# print(dotw)