In [0]:
def create_events_df(spark):
    events_df = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/tanujkuls@gmail.com/lidl/events.csv")
    return events_df

In [0]:
def create_events_per_10_minutes(events_df: DataFrame) -> DataFrame:
    events_per_10_mins = events_df.groupBy(f.window(col("time"), "10 minutes").alias("events_per_10_mins"), "action").count()
    events_per_10_mins.orderBy("events_per_10_mins")
    return events_per_10_mins

In [0]:
def total_actions_per_10_mins(events_per_10_mins: DataFrame) -> DataFrame:
    actions = events_per_10_mins.select("action").rdd \
        .distinct() \
        .map(lambda row: row[0]) \
        .collect()

    cols = [when(col("action") == action, col("count")).otherwise(0).alias(action) 
        for action in  actions]
    maxs = [max(col(action)).alias(action) for action in actions]


    total_actions_per_10_mins = (events_per_10_mins.select(col("events_per_10_mins"), *cols) \
        .groupBy("events_per_10_mins") \
        .agg(*maxs)) \
        .orderBy("events_per_10_mins")

    return total_actions_per_10_mins

In [0]:
def avg_number_of_actions(total_actions_per_10_mins: DataFrame) -> DataFrame:
    actions = [col("Close"), col("Open")]
    avg_func = sum(x for x in actions)/ len(actions)
    avg_number_of_actions = total_actions_per_10_mins.withColumn("avg_no_of_actions", avg_func)
    return avg_number_of_actions

In [0]:
def events_with_max_open_actions(total_actions_per_10_mins: DataFrame) -> DataFrame:
    max_value = total_actions_per_10_mins.select(f.max("Open")).collect()[0][0]
    events_with_max_open_actions = total_actions_per_10_mins.filter(col("Open")==max_value)
    return events_with_max_open_actions

In [0]:
from typing import List
from pyspark.sql import functions as f
from pyspark.sql.functions import window, col, when, max, dense_rank, desc
from pyspark.sql.window import Window
from pyspark.sql import SparkSession

if __name__ == '__main__':
    # Create Spark Session
    spark = SparkSession.builder.master("local[1]").appName("SparkByExamples.com").getOrCreate()
    
    # create events dataframe
    events_df = create_events_df(spark)
    events_df.show(5, False)
    
    # Create events per 10 minutes Dataframe
    events_per_10_mins = create_events_per_10_minutes(events_df)
    events_per_10_mins.show(5, False)
    
    # Create total actions per 10 minutes
    total_actions_per_10_mins = total_actions_per_10_mins(events_per_10_mins)
    total_actions_per_10_mins.show(15, False)
    
    # Average number of actions
    avg_number_of_actions = avg_number_of_actions(total_actions_per_10_mins)
    avg_number_of_actions.show(15, False)
    
    # Event with Maximum Open Actions
    events_with_max_open_actions = events_with_max_open_actions(total_actions_per_10_mins)
    events_with_max_open_actions.show(1, False)

+------------------------+------+
|time                    |action|
+------------------------+------+
|2016-07-28T04:19:28.000Z|Close |
|2016-07-28T04:19:28.000Z|Close |
|2016-07-28T04:19:29.000Z|Open  |
|2016-07-28T04:19:31.000Z|Close |
|2016-07-28T04:19:31.000Z|Open  |
+------------------------+------+
only showing top 5 rows

+------------------------------------------+------+-----+
|events_per_10_mins                        |action|count|
+------------------------------------------+------+-----+
|{2016-07-27 10:00:00, 2016-07-27 10:10:00}|Close |165  |
|{2016-07-28 01:10:00, 2016-07-28 01:20:00}|Close |170  |
|{2016-07-26 15:20:00, 2016-07-26 15:30:00}|Close |166  |
|{2016-07-27 04:50:00, 2016-07-27 05:00:00}|Open  |169  |
|{2016-07-27 18:20:00, 2016-07-27 18:30:00}|Close |193  |
+------------------------------------------+------+-----+
only showing top 5 rows

+------------------------------------------+-----+----+
|events_per_10_mins                        |Close|Open|
+---------