In [0]:
from datetime import datetime
from pyspark.sql.types import *

# Schema for swiggy_orders
swiggy_orders_schema = StructType([
    StructField("orderid", IntegerType(), True),
    StructField("custid", IntegerType(), True),
    StructField("city", StringType(), True),
    StructField("del_partner", StringType(), True),
    StructField("order_time", TimestampType(), True),
    StructField("deliver_time", TimestampType(), True),
    StructField("predicted_time", IntegerType(), True)
])

# Data for swiggy_orders (Convert string timestamps to datetime)
swiggy_orders_data = [
    (1, 101, 'Mumbai', 'Partner A', datetime.strptime('2024-12-18 10:00:00', '%Y-%m-%d %H:%M:%S'),
     datetime.strptime('2024-12-18 11:30:00', '%Y-%m-%d %H:%M:%S'), 60),
    (2, 102, 'Delhi', 'Partner A', datetime.strptime('2024-12-18 09:00:00', '%Y-%m-%d %H:%M:%S'),
     datetime.strptime('2024-12-18 10:00:00', '%Y-%m-%d %H:%M:%S'), 45),
    (3, 103, 'Pune', 'Partner A', datetime.strptime('2024-12-18 15:00:00', '%Y-%m-%d %H:%M:%S'),
     datetime.strptime('2024-12-18 15:30:00', '%Y-%m-%d %H:%M:%S'), 30),
    (4, 104, 'Mumbai', 'Partner A', datetime.strptime('2024-12-18 14:00:00', '%Y-%m-%d %H:%M:%S'),
     datetime.strptime('2024-12-18 14:50:00', '%Y-%m-%d %H:%M:%S'), 45),
    (5, 105, 'Bangalore', 'Partner B', datetime.strptime('2024-12-18 08:00:00', '%Y-%m-%d %H:%M:%S'),
     datetime.strptime('2024-12-18 08:29:00', '%Y-%m-%d %H:%M:%S'), 30),
    (6, 106, 'Hyderabad', 'Partner B', datetime.strptime('2024-12-18 13:00:00', '%Y-%m-%d %H:%M:%S'),
     datetime.strptime('2024-12-18 14:00:00', '%Y-%m-%d %H:%M:%S'), 70),
    (7, 107, 'Kolkata', 'Partner B', datetime.strptime('2024-12-18 10:00:00', '%Y-%m-%d %H:%M:%S'),
     datetime.strptime('2024-12-18 10:40:00', '%Y-%m-%d %H:%M:%S'), 45),
    (8, 108, 'Delhi', 'Partner B', datetime.strptime('2024-12-18 18:00:00', '%Y-%m-%d %H:%M:%S'),
     datetime.strptime('2024-12-18 18:30:00', '%Y-%m-%d %H:%M:%S'), 40),
    (9, 109, 'Chennai', 'Partner C', datetime.strptime('2024-12-18 07:00:00', '%Y-%m-%d %H:%M:%S'),
     datetime.strptime('2024-12-18 07:40:00', '%Y-%m-%d %H:%M:%S'), 30),
    (10, 110, 'Mumbai', 'Partner C', datetime.strptime('2024-12-18 12:00:00', '%Y-%m-%d %H:%M:%S'),
     datetime.strptime('2024-12-18 13:00:00', '%Y-%m-%d %H:%M:%S'), 50),
    (11, 111, 'Delhi', 'Partner C', datetime.strptime('2024-12-18 09:00:00', '%Y-%m-%d %H:%M:%S'),
     datetime.strptime('2024-12-18 09:35:00', '%Y-%m-%d %H:%M:%S'), 30),
    (12, 112, 'Hyderabad', 'Partner C', datetime.strptime('2024-12-18 16:00:00', '%Y-%m-%d %H:%M:%S'),
     datetime.strptime('2024-12-18 16:45:00', '%Y-%m-%d %H:%M:%S'), 30)
]

# Create DataFrame
swiggy_orders_df = spark.createDataFrame(swiggy_orders_data, schema=swiggy_orders_schema)
swiggy_orders_df.display()


orderid,custid,city,del_partner,order_time,deliver_time,predicted_time
1,101,Mumbai,Partner A,2024-12-18T10:00:00Z,2024-12-18T11:30:00Z,60
2,102,Delhi,Partner A,2024-12-18T09:00:00Z,2024-12-18T10:00:00Z,45
3,103,Pune,Partner A,2024-12-18T15:00:00Z,2024-12-18T15:30:00Z,30
4,104,Mumbai,Partner A,2024-12-18T14:00:00Z,2024-12-18T14:50:00Z,45
5,105,Bangalore,Partner B,2024-12-18T08:00:00Z,2024-12-18T08:29:00Z,30
6,106,Hyderabad,Partner B,2024-12-18T13:00:00Z,2024-12-18T14:00:00Z,70
7,107,Kolkata,Partner B,2024-12-18T10:00:00Z,2024-12-18T10:40:00Z,45
8,108,Delhi,Partner B,2024-12-18T18:00:00Z,2024-12-18T18:30:00Z,40
9,109,Chennai,Partner C,2024-12-18T07:00:00Z,2024-12-18T07:40:00Z,30
10,110,Mumbai,Partner C,2024-12-18T12:00:00Z,2024-12-18T13:00:00Z,50


In [0]:
from datetime import datetime, date
from pyspark.sql.types import *

# Schema for sales_data
sales_data_schema = StructType([
    StructField("order_date", DateType(), True),
    StructField("customer_id", IntegerType(), True),
    StructField("store_id", IntegerType(), True),
    StructField("product_id", IntegerType(), True),
    StructField("sale", IntegerType(), True),
    StructField("order_value", IntegerType(), True)
])

# Data for sales_data (Convert string dates to date objects)
sales_data = [
    (date(2024, 12, 1), 109, 1, 3, 2, 700),
    (date(2024, 12, 2), 110, 2, 2, 1, 300),
    (date(2024, 12, 3), 111, 1, 5, 3, 900),
    (date(2024, 12, 4), 112, 3, 1, 2, 500),
    (date(2024, 12, 5), 113, 3, 4, 4, 1200),
    (date(2024, 12, 5), 114, 3, 4, 2, 400),
    (date(2024, 12, 5), 115, 3, 4, 1, 300),
    (date(2024, 12, 1), 101, 1, 4, 2, 500),
    (date(2024, 12, 1), 102, 1, 4, 1, 300),
    (date(2024, 12, 2), 103, 2, 4, 3, 900),
    (date(2024, 12, 2), 104, 2, 4, 1, 400),
    (date(2024, 12, 3), 105, 1, 4, 2, 600),
    (date(2024, 12, 3), 106, 1, 4, 3, 800),
    (date(2024, 12, 4), 107, 3, 4, 1, 200),
    (date(2024, 12, 4), 108, 3, 4, 2, 500)
]

# Create DataFrame
sales_data_df = spark.createDataFrame(sales_data, schema=sales_data_schema)
sales_data_df.display()


order_date,customer_id,store_id,product_id,sale,order_value
2024-12-01,109,1,3,2,700
2024-12-02,110,2,2,1,300
2024-12-03,111,1,5,3,900
2024-12-04,112,3,1,2,500
2024-12-05,113,3,4,4,1200
2024-12-05,114,3,4,2,400
2024-12-05,115,3,4,1,300
2024-12-01,101,1,4,2,500
2024-12-01,102,1,4,1,300
2024-12-02,103,2,4,3,900


In [0]:
from pyspark.sql.functions import col, expr

# Calculate the actual delivery time in minutes and filter where it exceeds the predicted_time
result_df =swiggy_orders_df.withColumn("actual_time", 
        (col("deliver_time").cast("long") - col("order_time").cast("long")) / 60
    ).filter(col("actual_time") > col("predicted_time"))
result_df.display()

orderid,custid,city,del_partner,order_time,deliver_time,predicted_time,actual_time
1,101,Mumbai,Partner A,2024-12-18T10:00:00Z,2024-12-18T11:30:00Z,60,90.0
2,102,Delhi,Partner A,2024-12-18T09:00:00Z,2024-12-18T10:00:00Z,45,60.0
4,104,Mumbai,Partner A,2024-12-18T14:00:00Z,2024-12-18T14:50:00Z,45,50.0
9,109,Chennai,Partner C,2024-12-18T07:00:00Z,2024-12-18T07:40:00Z,30,40.0
10,110,Mumbai,Partner C,2024-12-18T12:00:00Z,2024-12-18T13:00:00Z,50,60.0
11,111,Delhi,Partner C,2024-12-18T09:00:00Z,2024-12-18T09:35:00Z,30,35.0
12,112,Hyderabad,Partner C,2024-12-18T16:00:00Z,2024-12-18T16:45:00Z,30,45.0


In [0]:
# Group by delivery partner and count the orders
grouped_count_df = result_df.groupBy("del_partner").count().withColumnRenamed("count", "exceeded_count")
grouped_count_df.display()

del_partner,exceeded_count
Partner A,3
Partner C,4


In [0]:
from pyspark.sql.functions import col, when, sum

# Calculate the actual delivery time in minutes
result_df = (
    swiggy_orders_df.withColumn(
        "actual_time", 
        (col("deliver_time").cast("long") - col("order_time").cast("long")) / 60
    )
)

# Group by `del_partner` and calculate the sum based on the case condition
grouped_sum_df = (
    result_df.groupBy("del_partner")
    .agg(
        sum(
            when(col("actual_time") > col("predicted_time"), 1).otherwise(0)
        ).alias("exceeded_count")
    )
)
grouped_sum_df.display()


del_partner,exceeded_count
Partner A,3
Partner B,0
Partner C,4


In [0]:
sales_data_df.createOrReplaceTempView('delayed_orders')
swiggy_orders_df.createOrReplaceTempView('swiggy_orders')

In [0]:
%sql
select a.del_partenr. isnull(delayed_oredrs,0) as delayed_orders from (select distinct del_partner from swiggy_orders) a left join (select del_operatoe, count(*) as delayed_oreds from swiggy_orders where datediff(minutr, order_time, deliver_time )>predicted_time group by del_partner ) b on a.del_partenr=b.del_partner



org.apache.spark.sql.AnalysisException: [UC_NOT_ENABLED] Unity Catalog is not enabled on this cluster. SQLSTATE: 56038
	at org.apache.spark.sql.connector.catalog.LookupCatalog$CatalogAndIdentifier$.unapply(LookupCatalog.scala:152)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$LookupFunctions$$anonfun$apply$21.applyOrElse(Analyzer.scala:2980)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$LookupFunctions$$anonfun$apply$21.applyOrElse(Analyzer.scala:2973)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:505)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:85)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:505)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$3(TreeNode.scala:510)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren(TreeNode.scala:1314)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren