# Ex-2180 - inner join


In [1]:
import requests

! curl -L -o categories.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/categories.csv
! curl -L -o products.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/products.csv
! curl -L -o customers.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/customers.csv
! curl -L -o employees.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/employees.csv
! curl -L -o orders.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/orders.csv
! curl -L -o order_details.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/order_details.csv


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2448  100  2448    0     0   6081      0 --:--:-- --:--:-- --:--:--  6074
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4474  100  4474    0     0  13094      0 --:--:-- --:--:-- --:--:-- 13120
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 11841  100 11841    0     0  52010      0 --:--:-- --:--:-- --:--:-- 52162
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  6116  100  6116    0     0  18262      0 --:--:-- --:--:-- --:--:-- 18311
  % Total    % Received % Xferd  Average Speed   Tim

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("myApp").master("local[*]").getOrCreate()

In [3]:
# 2. Load the CSV file assuming headers exist and let Spark infer data types
df_categories = spark.read.option("header", "true").option("inferSchema", "true").csv("categories.csv")
df_products = spark.read.option("header", "true").option("inferSchema", "true").csv("products.csv")
df_customers = spark.read.option("header", "true").option("inferSchema", "true").csv("customers.csv")
df_employees = spark.read.option("header", "true").option("inferSchema", "true").csv("employees.csv")
df_orders = spark.read.option("header", "true").option("inferSchema", "true").csv("orders.csv")
df_order_details = spark.read.option("header", "true").option("inferSchema", "true").csv("order_details.csv")


In [4]:
# 3. Check the structure of the DataFrame
df_categories.printSchema()
df_products.printSchema()
df_customers.printSchema()
df_employees.printSchema()
df_orders.printSchema()
df_order_details.printSchema()

root
 |-- CategoryID: integer (nullable = true)
 |-- CategoryName: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Picture: string (nullable = true)

root
 |-- ProductID: integer (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- SupplierID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- QuantityPerUnit: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- UnitsInStock: integer (nullable = true)
 |-- UnitsOnOrder: integer (nullable = true)
 |-- ReorderLevel: integer (nullable = true)
 |-- Discontinued: integer (nullable = true)

root
 |-- CustomerID: string (nullable = true)
 |-- CompanyName: string (nullable = true)
 |-- ContactName: string (nullable = true)
 |-- ContactTitle: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- PostalCode: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Phon

In [5]:
cat_prod_df = df_categories.join(df_products, "CategoryID", "inner")

In [6]:
from pyspark.sql.functions import min, max

cat_prod_df.groupBy("CategoryName").agg(
    min("UnitPrice").alias("Min UnitPrice"),
    max("UnitPrice").alias("Max UnitPrice")
).show()

+--------------+-------------+-------------+
|  CategoryName|Min UnitPrice|Max UnitPrice|
+--------------+-------------+-------------+
|Dairy Products|          2.5|         55.0|
|  Meat/Poultry|         7.45|       123.79|
|    Condiments|         10.0|         43.9|
|     Beverages|          4.5|        263.5|
|Grains/Cereals|          7.0|         38.0|
|       Seafood|          6.0|         62.5|
|   Confections|          9.2|         81.0|
|       Produce|         10.0|         53.0|
+--------------+-------------+-------------+



In [7]:
order_all_df = df_orders.join(df_order_details, "OrderID", "inner")

In [8]:
order_all_emp_cust_df = order_all_df.join(df_employees, "EmployeeID", "inner") \
                                   .join(df_customers, "CustomerID", "inner")

In [9]:
from pyspark.sql.functions import min, max

order_all_emp_cust_df.groupBy("CustomerID", "CompanyName") \
                     .agg(
                         min("OrderDate").alias("Earliest Order"),
                         max("OrderDate").alias("Latest Order")
                     ).show()

+----------+--------------------+-------------------+-------------------+
|CustomerID|         CompanyName|     Earliest Order|       Latest Order|
+----------+--------------------+-------------------+-------------------+
|     CENTC|Centro comercial ...|1996-07-18 00:00:00|1996-07-18 00:00:00|
|     COMMI|    Comércio Mineiro|1996-08-27 00:00:00|1998-04-22 00:00:00|
|     OCEAN|Océano Atlántico ...|1997-01-09 00:00:00|1998-03-30 00:00:00|
|     ANATR|Ana Trujillo Empa...|1996-09-18 00:00:00|1998-03-04 00:00:00|
|     LACOR|La corne d'abondance|1998-01-29 00:00:00|1998-03-24 00:00:00|
|     ERNSH|        Ernst Handel|1996-07-17 00:00:00|1998-05-05 00:00:00|
|     FRANS|      Franchi S.p.A.|1997-01-22 00:00:00|1998-04-30 00:00:00|
|     GROSR|GROSELLA-Restaurante|1996-07-30 00:00:00|1997-12-18 00:00:00|
|     TOMSP|  Toms Spezialitäten|1996-07-05 00:00:00|1998-03-23 00:00:00|
|     QUEDE|         Que Delícia|1996-07-19 00:00:00|1998-03-31 00:00:00|
|     LILAS|   LILA-Supermercado|1996-

In [10]:
from pyspark.sql.functions import sum, col

order_all_emp_cust_df\
    .groupBy("FirstName", "LastName")\
    .agg(
     sum(col("Quantity") * col("UnitPrice")).alias("Total Sales")
).show()

+---------+---------+------------------+
|FirstName| LastName|       Total Sales|
+---------+---------+------------------+
| Margaret|  Peacock|250187.44999999992|
|    Nancy|  Davolio|202143.71000000002|
|   Andrew|   Fuller|177749.26000000004|
|    Laura| Callahan|133301.02999999997|
|    Janet|Leverling|213051.29999999996|
|   Robert|     King|         141295.99|
|  Michael|   Suyama| 78198.09999999999|
|   Steven| Buchanan| 75567.75000000001|
|     Anne|Dodsworth| 82963.99999999999|
+---------+---------+------------------+

