# Problem Statement
Write a SQL query to find the cancellation rate of requests made by unbanned users (both client and driver must be unbanned) between Oct 1, 2013 and Oct 3, 2013. The cancellation rate is computed by dividing the number of canceled (by client or driver) requests made by unbanned users by the total number of requests made by unbanned users.

For the Trips and Users tables defined below, your SQL query should return one row per request date, with the cancellation rate rounded to two decimal places.

In [0]:
from pyspark.sql import SparkSession
# Obtain (or reuse) the Spark session that every later cell works through.
spark = (
    SparkSession.builder
    .appName("example")
    .getOrCreate()
)
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Trips table: one row per ride request.
# Column layout as (name, Spark type); every column is nullable.
trips_columns = [
    ("id", IntegerType()),
    ("client_id", IntegerType()),
    ("driver_id", IntegerType()),
    ("city_id", IntegerType()),
    ("status", StringType()),
    ("request_at", StringType()),
]
trips_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in trips_columns]
)

# Sample rows in schema order: (id, client_id, driver_id, city_id, status, request_at).
trips_data = [
    (1, 1, 10, 1, 'completed', '2013-10-01'),
    (2, 2, 11, 1, 'cancelled_by_driver', '2013-10-01'),
    (3, 3, 12, 6, 'completed', '2013-10-01'),
    (4, 4, 13, 6, 'cancelled_by_client', '2013-10-01'),
    (5, 1, 10, 1, 'completed', '2013-10-02'),
    (6, 2, 11, 6, 'completed', '2013-10-02'),
    (7, 3, 12, 6, 'completed', '2013-10-02'),
    (8, 2, 12, 12, 'completed', '2013-10-03'),
    (9, 3, 10, 12, 'completed', '2013-10-03'),
    (10, 4, 13, 12, 'cancelled_by_driver', '2013-10-03'),
]

trips_df = spark.createDataFrame(data=trips_data, schema=trips_schema)

# Users table: one row per user, clients and drivers alike.
# Column layout as (name, Spark type); every column is nullable.
users_columns = [
    ("users_id", IntegerType()),
    ("banned", StringType()),
    ("role", StringType()),
]
users_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in users_columns]
)

# Sample rows in schema order: (users_id, banned, role).
users_data = [
    (1, 'No', 'client'),
    (2, 'Yes', 'client'),
    (3, 'No', 'client'),
    (4, 'No', 'client'),
    (10, 'No', 'driver'),
    (11, 'No', 'driver'),
    (12, 'No', 'driver'),
    (13, 'No', 'driver'),
]

users_df = spark.createDataFrame(data=users_data, schema=users_schema)

# Render both input tables, users first and then trips.
for input_df in (users_df, trips_df):
    input_df.display()

users_id,banned,role
1,No,client
2,Yes,client
3,No,client
4,No,client
10,No,driver
11,No,driver
12,No,driver
13,No,driver


id,client_id,driver_id,city_id,status,request_at
1,1,10,1,completed,2013-10-01
2,2,11,1,cancelled_by_driver,2013-10-01
3,3,12,6,completed,2013-10-01
4,4,13,6,cancelled_by_client,2013-10-01
5,1,10,1,completed,2013-10-02
6,2,11,6,completed,2013-10-02
7,3,12,6,completed,2013-10-02
8,2,12,12,completed,2013-10-03
9,3,10,12,completed,2013-10-03
10,4,13,12,cancelled_by_driver,2013-10-03


### PySpark

In [0]:
from pyspark.sql.functions import col, count, sum, when, round
# Daily cancellation percentage for trips in which BOTH the client and the
# driver are unbanned, restricted to 2013-10-01 .. 2013-10-03 (inclusive).
# The result is reported as a percentage (rate * 100) rounded to two decimals.

# Users who are not banned; reused below for the client and the driver side.
non_banned_users_df = users_df.filter(col("banned") == "No")

# Keep only trips inside the reporting window. request_at holds 'yyyy-MM-dd'
# strings, so lexicographic comparison orders them the same way as dates.
trips_in_window_df = trips_df.filter(
    col("request_at").between("2013-10-01", "2013-10-03")
)

# Inner-join the unbanned users twice: once on the client and once on the
# driver. A trip survives only if both of its participants are unbanned.
joined_df = (
    trips_in_window_df.alias("t")
    .join(non_banned_users_df.alias("c"), col("c.users_id") == col("t.client_id"))
    .join(non_banned_users_df.alias("d"), col("d.users_id") == col("t.driver_id"))
)

# Per request date: total qualifying requests and how many were cancelled
# (by either the driver or the client).
metrics_df = joined_df.groupBy("t.request_at") \
    .agg(
        count("c.users_id").alias("countofusers"),
        sum(
            when(col("t.status").isin("cancelled_by_driver", "cancelled_by_client"), 1)
            .otherwise(0)
        ).alias("cancelled")
    )

# Cancelled / total, expressed as a percentage with two decimal places.
# Multiplying by 1.0 forces fractional (non-integer) division.
cancellation_percentage_df = metrics_df.select(
    col("request_at"),
    round((col("cancelled") * 1.0 / col("countofusers")) * 100, 2).alias("cancellationpercentage")
)

# Show the result
cancellation_percentage_df.display()

request_at,cancellationpercentage
2013-10-03,50.0
2013-10-01,33.33
2013-10-02,0.0


### Spark SQL

In [0]:
# Expose both DataFrames to Spark SQL under the table names the query uses.
for view_name, source_df in (("Trips", trips_df), ("Users", users_df)):
    source_df.createOrReplaceTempView(view_name)


In [0]:
%sql
-- Daily cancellation percentage for trips in which BOTH the client and the
-- driver are unbanned, restricted to 2013-10-01 .. 2013-10-03 (inclusive).
WITH cte AS (
    -- Users who are not banned; joined twice below (client side, driver side).
    SELECT users_id
    FROM users
    WHERE banned = 'No'
),
cte1 AS (
    SELECT
        COUNT(c.users_id) AS countofusers,
        t.request_at,
        SUM(CASE WHEN t.status IN ('cancelled_by_driver', 'cancelled_by_client') THEN 1 ELSE 0 END) AS cancelled
    FROM trips t
    -- Inner joins drop any trip whose client or driver is banned.
    JOIN cte c ON c.users_id = t.client_id
    JOIN cte d ON d.users_id = t.driver_id
    -- request_at holds 'yyyy-MM-dd' strings, which compare in date order.
    WHERE t.request_at BETWEEN '2013-10-01' AND '2013-10-03'
    GROUP BY t.request_at
)
SELECT
    request_at,
    -- Multiplying by 1.0 forces fractional division; the result is a percentage.
    ROUND((cancelled * 1.0) / countofusers * 100, 2) AS cancellationpercentage
FROM cte1;


request_at,cancellationpercentage
2013-10-03,50.0
2013-10-01,33.33
2013-10-02,0.0
