In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,FloatType
from pyspark.sql.functions import col,lag,avg,round,abs
from pyspark.sql.window import Window

# Create (or reuse) the notebook's Spark session: application name "app",
# running on the local master with 2 threads.
spark = (
    SparkSession.builder
    .appName("app")
    .master("local[2]")
    .getOrCreate()
)

In [0]:
# Schema of the activity log: one row per (machine, process, activity) event.
# All four columns are declared non-nullable.
schema = (
    StructType()
    .add("machine_id", IntegerType(), False)
    .add("process_id", IntegerType(), False)
    .add("activity_type", StringType(), False)
    .add("timestamp", FloatType(), False)
)

# Sample rows: (machine_id, process_id, activity_type, timestamp).
# Every (machine_id, process_id) pair has exactly one "start" and one "end" row.
data = [
    (0, 0, "start", 0.712),
    (0, 0, "end", 1.520),
    (0, 1, "start", 3.140),
    (0, 1, "end", 4.120),
    (1, 0, "start", 0.550),
    (1, 0, "end", 1.550),
    (1, 1, "start", 0.430),
    (1, 1, "end", 1.420),
    (2, 0, "start", 4.100),
    (2, 0, "end", 4.512),
    (2, 1, "start", 2.500),
    (2, 1, "end", 5.000),
]

# Build the DataFrame used by every solution cell below and display it.
activity = spark.createDataFrame(data, schema=schema)
activity.show()

+----------+----------+-------------+---------+
|machine_id|process_id|activity_type|timestamp|
+----------+----------+-------------+---------+
|         0|         0|        start|    0.712|
|         0|         0|          end|     1.52|
|         0|         1|        start|     3.14|
|         0|         1|          end|     4.12|
|         1|         0|        start|     0.55|
|         1|         0|          end|     1.55|
|         1|         1|        start|     0.43|
|         1|         1|          end|     1.42|
|         2|         0|        start|      4.1|
|         2|         0|          end|    4.512|
|         2|         1|        start|      2.5|
|         2|         1|          end|      5.0|
+----------+----------+-------------+---------+



In [0]:
# There is a factory website that has several machines each running the same number of processes. Write a solution to find the average time each machine takes to complete a process.
# The time to complete a process is the 'end' timestamp minus the 'start' timestamp. The average time is calculated by the total time to complete every process on the machine divided by the number of processes that were run.
# The resulting table should have the machine_id along with the average time as processing_time, which should be rounded to 3 decimal places.
# Return the result table in any order.

# Using Window Function
# Partition by (machine_id, process_id) and order the rows of each partition by
# timestamp, so the "start" row always comes before the "end" row.
# Ordering by the partition keys themselves would leave the row order inside a
# partition undefined (every row ties), and lag() on the "end" row would not be
# guaranteed to return the matching "start" timestamp.
window_specs = Window.partitionBy("machine_id", "process_id").orderBy("timestamp")

# `round` and `avg` here are the pyspark.sql.functions versions imported at the
# top of the notebook, not the Python builtins.
(
    activity
    # Previous row's timestamp within the same process: the start time for "end" rows.
    .withColumn("start", lag("timestamp").over(window_specs))
    # Keep only the "end" rows; each now carries its own start time.
    .filter(col("activity_type") == "end")
    .withColumn("time_taken", col("timestamp") - col("start"))
    # Average duration per machine, rounded to 3 decimal places.
    .groupBy("machine_id")
    .agg(round(avg("time_taken"), 3).alias("processing_time"))
    .show()
)

+----------+---------------+
|machine_id|processing_time|
+----------+---------------+
|         1|          0.995|
|         2|          1.456|
|         0|          0.894|
+----------+---------------+



In [0]:
# Using Inner Join
# Self-join the activity table: a1 supplies the "end" row and a2 supplies the
# row of the same machine/process whose activity_type differs from a1's.
a1 = activity.alias("a1")
a2 = activity.alias("a2")

same_machine = col("a1.machine_id") == col("a2.machine_id")
same_process = col("a1.process_id") == col("a2.process_id")
different_activity = col("a1.activity_type") != col("a2.activity_type")
left_is_end = col("a1.activity_type") == "end"

paired = a1.join(
    a2,
    same_machine & same_process & different_activity & left_is_end,
    "inner",
)

# One row per process with its end and start timestamps and their difference.
durations = paired.select(
    a1["machine_id"],
    col("a1.timestamp").alias("endtime"),
    col("a2.timestamp").alias("starttime"),
).withColumn("time", col("endtime") - col("starttime"))

# Average duration per machine, rounded to 3 decimal places.
durations.groupBy("machine_id").agg(
    round(avg("time"), 3).alias("processing_time")
).show()

+----------+---------------+
|machine_id|processing_time|
+----------+---------------+
|         1|          0.995|
|         2|          1.456|
|         0|          0.894|
+----------+---------------+



In [0]:
# Expose the activity DataFrame to Spark SQL under the temp view name "a".
activity.createOrReplaceTempView("a")

# SQL version of the self-join: a1 is restricted to 'end' rows and matched with
# the row of the same machine/process that has a different activity_type;
# the timestamp difference is averaged per machine (group by ordinal 1 = machine_id).
self_join_sql = """select a1.machine_id , round(avg(a1.timestamp - a2.timestamp),3) as processing_time 
         from a a1 join a a2 
         on a1.machine_id=a2.machine_id and 
            a1.process_id=a2.process_id and 
            a1.activity_type<>a2.activity_type and 
            a1.activity_type='end'
        group by 1     
         """
spark.sql(self_join_sql).show()

+----------+---------------+
|machine_id|processing_time|
+----------+---------------+
|         1|          0.995|
|         2|          1.456|
|         0|          0.894|
+----------+---------------+



In [0]:
# SQL version of the window-function approach, querying the temp view "a".
# The window is ordered by `timestamp`, so within each (machine_id, process_id)
# partition the 'start' row precedes the 'end' row and lag() on the 'end' row
# yields the matching start time. Ordering by activity_type instead would sort
# 'end' before 'start', swap the two timestamps, and produce negative durations
# that would need abs() to hide.
spark.sql("""
    with cte as (
        select machine_id,
               activity_type,
               `timestamp` as end_time,
               lag(`timestamp`) over (
                   partition by machine_id, process_id
                   order by `timestamp`
               ) as start_time
        from a
    )
    select machine_id,
           round(avg(end_time - start_time), 3) as processing_time
    from cte
    where activity_type = 'end'
      and start_time is not null
    group by machine_id
""").show()

+----------+---------------+
|machine_id|processing_time|
+----------+---------------+
|         1|          0.995|
|         2|          1.456|
|         0|          0.894|
+----------+---------------+



In [0]:
# Stop the Spark session now that every query in the notebook has been run.
spark.stop()