Problem Statement:

Given two datasets, trades and users, the goal is to identify the number of completed orders for each city. Specifically, the trades dataset contains details of orders, including the order status, while the users dataset contains user information, including each user's city.

Filter the trades data to include only rows where Status is "Completed".
Join the filtered trades data with the users data on the user ID to associate each order with a city.
Aggregate the data by city and count the total number of completed orders for each city.
Output the results, showing the city and corresponding total number of completed orders in descending order by count, limited to the top three cities.

In [0]:
from pyspark.sql.types import *
from datetime import datetime

# Column layout of the trades DataFrame; every column is declared nullable
schema = (
    StructType()
    .add("OrderID", IntegerType(), True)
    .add("CustomerID", IntegerType(), True)
    .add("ProductID", IntegerType(), True)
    .add("Status", StringType(), True)
    .add("OrderDate", TimestampType(), True)
    .add("Price", FloatType(), True)
)

# Trade rows, one tuple per order, in schema column order:
# (OrderID, CustomerID, ProductID, Status, OrderDate, Price).
# Minutes and seconds are omitted where they are zero (datetime's default).
data = [
    (100101, 111, 10, "Cancelled", datetime(2022, 8, 17, 12), 9.8),
    (100102, 111, 10, "Completed", datetime(2022, 8, 17, 12), 10.0),
    (100264, 148, 40, "Completed", datetime(2022, 8, 26, 12), 4.8),
    (100305, 300, 15, "Completed", datetime(2022, 9, 5, 12), 10.0),
    (100909, 488, 1, "Completed", datetime(2022, 7, 5, 12), 6.5),
    (100259, 148, 35, "Completed", datetime(2022, 8, 25, 12), 5.1),
    (100900, 148, 50, "Completed", datetime(2022, 7, 14, 12), 9.78),
    (101432, 265, 10, "Completed", datetime(2022, 8, 16, 12), 13.0),
    (102533, 488, 25, "Cancelled", datetime(2022, 11, 10, 12), 22.4),
    (100565, 265, 2, "Completed", datetime(2022, 9, 27, 12), 8.7),
    (100400, 178, 32, "Completed", datetime(2022, 9, 17, 12), 12.0),
    (100777, 178, 60, "Completed", datetime(2022, 7, 25, 17, 47), 3.56),
]
# Column layout of the users DataFrame; every column is declared nullable
users_schema = (
    StructType()
    .add("user_id", IntegerType(), True)
    .add("city", StringType(), True)
    .add("email", StringType(), True)
    .add("signup_date", TimestampType(), True)
)

# User rows in users_schema column order:
# (user_id, city, email, signup_date); every signup is stamped at 12:00:00.
users_data = [
    (111, "San Francisco", "rrok10@gmail.com", datetime(2021, 8, 3, 12)),
    (148, "Boston", "sailor9820@gmail.com", datetime(2021, 8, 20, 12)),
    (178, "San Francisco", "harrypotterfan182@gmail.com", datetime(2022, 1, 5, 12)),
    (265, "Denver", "shadower_@hotmail.com", datetime(2022, 2, 26, 12)),
    (300, "San Francisco", "houstoncowboy1122@hotmail.com", datetime(2022, 6, 30, 12)),
    (488, "New York", "empire_state78@outlook.com", datetime(2022, 7, 3, 12)),
]

# Materialise the user rows as a Spark DataFrame using the explicit schema
users_df = spark.createDataFrame(data=users_data, schema=users_schema)

# Render the users table in the notebook
users_df.display()

user_id,city,email,signup_date
111,San Francisco,rrok10@gmail.com,2021-08-03T12:00:00.000+0000
148,Boston,sailor9820@gmail.com,2021-08-20T12:00:00.000+0000
178,San Francisco,harrypotterfan182@gmail.com,2022-01-05T12:00:00.000+0000
265,Denver,shadower_@hotmail.com,2022-02-26T12:00:00.000+0000
300,San Francisco,houstoncowboy1122@hotmail.com,2022-06-30T12:00:00.000+0000
488,New York,empire_state78@outlook.com,2022-07-03T12:00:00.000+0000


In [0]:
# Materialise the trade rows as a Spark DataFrame using the explicit schema
df = spark.createDataFrame(data=data, schema=schema)

# Render the trades table in the notebook
df.display()

OrderID,CustomerID,ProductID,Status,OrderDate,Price
100101,111,10,Cancelled,2022-08-17T12:00:00.000+0000,9.8
100102,111,10,Completed,2022-08-17T12:00:00.000+0000,10.0
100264,148,40,Completed,2022-08-26T12:00:00.000+0000,4.8
100305,300,15,Completed,2022-09-05T12:00:00.000+0000,10.0
100909,488,1,Completed,2022-07-05T12:00:00.000+0000,6.5
100259,148,35,Completed,2022-08-25T12:00:00.000+0000,5.1
100900,148,50,Completed,2022-07-14T12:00:00.000+0000,9.78
101432,265,10,Completed,2022-08-16T12:00:00.000+0000,13.0
102533,488,25,Cancelled,2022-11-10T12:00:00.000+0000,22.4
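100565,265,2,Completed,2022-09-27T12:00:00.000+0000,8.7
100400,178,32,Completed,2022-09-17T12:00:00.000+0000,12.0
100777,178,60,Completed,2022-07-25T17:47:00.000+0000,3.56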


In [0]:
# Expose both DataFrames to Spark SQL as temporary views
users_df.createOrReplaceTempView("users")
df.createOrReplaceTempView("trades")

In [0]:
%sql
-- Count completed orders per city and keep the three cities with the most.
SELECT
  u.city,
  COUNT(t.orderid) AS total_orders
FROM trades AS t
JOIN users AS u
  ON t.CustomerID = u.user_id
WHERE t.status = 'Completed'
GROUP BY u.city
ORDER BY total_orders DESC
LIMIT 3;


city,total_orders
San Francisco,4
Boston,3
Denver,2


In [0]:
from pyspark.sql import functions as F

# Keep only completed trades; cancelled orders must not be counted.
completed_trades = df.filter(df.Status == "Completed")

# Join the *filtered* trades to users (CustomerID -> user_id) so each
# completed order is associated with its customer's city. Joining the
# unfiltered df here would count cancelled orders as well.
joined_df = users_df.join(
    completed_trades,
    completed_trades.CustomerID == users_df.user_id,
    "inner",
)

# Group by city and count the completed orders. The column is referenced by
# its declared name so the lookup does not rely on case-insensitive resolution.
completed_orders = joined_df.groupBy("city").agg(
    F.count("OrderID").alias("total_orders")
)

# Order by total_orders in descending order and keep the top 3 cities
top_cities = completed_orders.orderBy(F.desc("total_orders")).limit(3)

# Display the result
top_cities.display()

city,total_orders
San Francisco,4
Boston,3
Denver,2


Explanation:

Filtering: df is filtered into completed_trades, which keeps only the rows where Status is 'Completed'.
Joining: The completed_trades DataFrame is joined with users_df on CustomerID (from the trades data) and user_id (from users_df).
Grouping and Aggregating: The joined data is grouped by city, and a count of OrderID is calculated as total_orders.
Ordering and displaying: The result is sorted in descending order of total_orders and limited to the top three cities, and the final output displays those cities with their corresponding order counts.