# Ex-2190 - left and right join


In [1]:
import requests

! curl -L -o categories.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/categories.csv
! curl -L -o products.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/products.csv
! curl -L -o customers.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/customers.csv
! curl -L -o employees.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/employees.csv
! curl -L -o orders.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/orders.csv
! curl -L -o order_details.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/order_details.csv


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2448  100  2448    0     0   6746      0 --:--:-- --:--:-- --:--:--  6743
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4474  100  4474    0     0  10712      0 --:--:-- --:--:-- --:--:-- 10729
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 11841  100 11841    0     0  28955      0 --:--:-- --:--:-- --:--:-- 28951
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  6116  100  6116    0     0  32625      0 --:--:-- --:--:-- --:--:-- 32705
  % Total    % Received % Xferd  Average Speed   Tim

In [2]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
# The "local[*]" master runs Spark locally rather than on a cluster.
spark = (
    SparkSession.builder
    .appName("myApp")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# 2. Load every downloaded CSV into its own DataFrame. Each file is read with
#    its first row as the header, and Spark infers the column data types.
def read_csv_with_header(path):
    """Read one CSV file, treating row one as the header and inferring column types."""
    return spark.read.csv(path, header=True, inferSchema=True)


df_categories = read_csv_with_header("categories.csv")
df_products = read_csv_with_header("products.csv")
df_customers = read_csv_with_header("customers.csv")
df_employees = read_csv_with_header("employees.csv")
df_orders = read_csv_with_header("orders.csv")
df_order_details = read_csv_with_header("order_details.csv")


In [4]:
# 3. Print the inferred schema of each DataFrame, in the order they were loaded
loaded_frames = (
    df_categories,
    df_products,
    df_customers,
    df_employees,
    df_orders,
    df_order_details,
)
for frame in loaded_frames:
    frame.printSchema()

root
 |-- CategoryID: integer (nullable = true)
 |-- CategoryName: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Picture: string (nullable = true)

root
 |-- ProductID: integer (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- SupplierID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- QuantityPerUnit: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- UnitsInStock: integer (nullable = true)
 |-- UnitsOnOrder: integer (nullable = true)
 |-- ReorderLevel: integer (nullable = true)
 |-- Discontinued: integer (nullable = true)

root
 |-- CustomerID: string (nullable = true)
 |-- CompanyName: string (nullable = true)
 |-- ContactName: string (nullable = true)
 |-- ContactTitle: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- PostalCode: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Phon

In [5]:
from pyspark.sql.functions import col

# LEFT JOIN keeps every product; a product with no order-detail row comes back
# with NULL in the columns contributed by df_order_details.
products_with_details = df_products.join(df_order_details, on="ProductID", how="left")

# A NULL OrderID therefore marks a product that was never ordered.
unsold_products = (
    products_with_details
    .where(col("OrderID").isNull())
    .select("ProductID", "ProductName")
)

# Show results
unsold_products.show()

+---------+-----------+
|ProductID|ProductName|
+---------+-----------+
+---------+-----------+



In [6]:
# LEFT JOIN keeps every customer; a customer without any order comes back with
# NULL in the columns contributed by df_orders.
customers_with_orders = df_customers.join(df_orders, on="CustomerID", how="left")

# A NULL OrderID therefore marks a customer who never placed an order.
inactive_customers = (
    customers_with_orders
    .where(col("OrderID").isNull())
    .select("CustomerID", "CompanyName")
)

# Show results
inactive_customers.show()

+----------+--------------------+
|CustomerID|         CompanyName|
+----------+--------------------+
|     FISSA|FISSA Fabrica Int...|
|     PARIS|   Paris spécialités|
+----------+--------------------+



In [7]:
from pyspark.sql.functions import desc

# Keep only the orders placed during calendar year 1998. The upper bound
# matters: without it, any order dated 1999 or later would also be treated as
# a 1998 order.
# NOTE(review): assumes OrderDate was inferred as a date/timestamp (or is an
# ISO-formatted string) so the comparison with these literals is chronological
# - confirm against df_orders.printSchema().
placed_in_1998 = (col("OrderDate") >= "1998-01-01") & (col("OrderDate") < "1999-01-01")
df_orders_98 = df_orders.filter(placed_in_1998)

# Restrict the order details to lines that belong to a 1998 order
df_order_details_98 = df_order_details.join(df_orders_98, "OrderID", "inner")

# Perform LEFT JOIN: every product is kept, matched against 1998 order lines only
unsold_products_98 = df_products.join(df_order_details_98, "ProductID", "left")

# Filter products with no matching 1998 order details (OrderID is NULL)
unsold_products_98 = unsold_products_98.filter(col("OrderID").isNull()) \
                                       .select("ProductID", "ProductName")

# Show results
unsold_products_98.show()

+---------+------------+
|ProductID| ProductName|
+---------+------------+
|       15|Genen Shouyu|
+---------+------------+

