# Ex-2170 - partitions


In [1]:
import requests

! curl -L -o categories.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/categories.csv
! curl -L -o products.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/products.csv
! curl -L -o customers.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/customers.csv
! curl -L -o employees.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/employees.csv
! curl -L -o orders.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/orders.csv
! curl -L -o order_details.csv  https://raw.githubusercontent.com/raynaldmo/northwind-mongodb/refs/heads/master/collections/csv/order_details.csv


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2448  100  2448    0     0   9210      0 --:--:-- --:--:-- --:--:--  9237
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4474  100  4474    0     0  17478      0 --:--:-- --:--:-- --:--:-- 17476
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 11841  100 11841    0     0  53991      0 --:--:-- --:--:-- --:--:-- 54068
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  6116  100  6116    0     0  27324      0 --:--:-- --:--:-- --:--:-- 27426
  % Total    % Received % Xferd  Average Speed   Tim

In [2]:
from pyspark.sql import SparkSession

# Get (or create) the Spark session named "myApp" on the `local[*]` master.
spark = (
    SparkSession.builder
    .appName("myApp")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# 2. Read every downloaded CSV into its own DataFrame.
def _read_csv_with_header(path):
    """Read `path` as CSV, using the first row as the header and inferring column types."""
    reader = spark.read.option("header", "true")
    reader = reader.option("inferSchema", "true")
    return reader.csv(path)


df_categories = _read_csv_with_header("categories.csv")
df_products = _read_csv_with_header("products.csv")
df_customers = _read_csv_with_header("customers.csv")
df_employees = _read_csv_with_header("employees.csv")
df_orders = _read_csv_with_header("orders.csv")
df_order_details = _read_csv_with_header("order_details.csv")


In [4]:
# 3. Print the schema of each loaded DataFrame, in load order.
for loaded_frame in (
    df_categories,
    df_products,
    df_customers,
    df_employees,
    df_orders,
    df_order_details,
):
    loaded_frame.printSchema()

root
 |-- CategoryID: integer (nullable = true)
 |-- CategoryName: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Picture: string (nullable = true)

root
 |-- ProductID: integer (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- SupplierID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- QuantityPerUnit: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- UnitsInStock: integer (nullable = true)
 |-- UnitsOnOrder: integer (nullable = true)
 |-- ReorderLevel: integer (nullable = true)
 |-- Discontinued: integer (nullable = true)

root
 |-- CustomerID: string (nullable = true)
 |-- CompanyName: string (nullable = true)
 |-- ContactName: string (nullable = true)
 |-- ContactTitle: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- PostalCode: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Phon

In [5]:
# Collect the DataFrames in one list, with a parallel list of display names,
# then report how many partitions each one currently has.
table_names = [
    "df_categories",
    "df_products",
    "df_customers",
    "df_employees",
    "df_orders",
    "df_order_details",
]
tables = [
    df_categories,
    df_products,
    df_customers,
    df_employees,
    df_orders,
    df_order_details,
]

for table_name, df in zip(table_names, tables):
    partition_count = df.rdd.getNumPartitions()
    print(f"Table {table_name} has {partition_count} partitions")

Table df_categories has 1 partitions
Table df_products has 1 partitions
Table df_customers has 1 partitions
Table df_employees has 1 partitions
Table df_orders has 1 partitions
Table df_order_details has 1 partitions


In [6]:
# Double the partition count of every DataFrame in `tables`.
#
# `repartition` does not modify the DataFrame it is called on; it returns a new
# one. Rebinding the loop variable would discard that result, so the new
# DataFrame is written back into `tables` by index.
#
# NOTE: this cell doubles whatever `tables` currently holds, so re-running it
# doubles the partition count again.
table_names = ["df_categories", "df_products", "df_customers", "df_employees", "df_orders", "df_order_details"]

for idx, (table_name, df) in enumerate(zip(table_names, tables)):
    new_partition_count = df.rdd.getNumPartitions() * 2
    tables[idx] = df.repartition(new_partition_count)
    # Report the count measured on the stored DataFrame, not the requested one.
    print(f"{table_name} repartitioned to {tables[idx].rdd.getNumPartitions()} partitions")

df_categories repartitioned to 2 partitions
df_products repartitioned to 2 partitions
df_customers repartitioned to 2 partitions
df_employees repartitioned to 2 partitions
df_orders repartitioned to 2 partitions
df_order_details repartitioned to 2 partitions


In [7]:
# Reduce every DataFrame in `tables` to a single partition.
table_names = ["df_categories", "df_products", "df_customers", "df_employees", "df_orders", "df_order_details"]

# Partition counts as they were before coalescing, keyed by table name.
# Keying by name (rather than by DataFrame object) keeps the lookup usable
# after the DataFrames in `tables` are replaced below.
original_partition_counts = {
    table_name: df.rdd.getNumPartitions()
    for table_name, df in zip(table_names, tables)
}

for idx, (table_name, df) in enumerate(zip(table_names, tables)):
    # `coalesce` returns a new DataFrame, so the result must be stored;
    # assigning it to the loop variable alone would throw it away.
    tables[idx] = df.coalesce(1)
    print(f"{table_name} reduced back to minimum")

df_categories reduced back to minimum
df_products reduced back to minimum
df_customers reduced back to minimum
df_employees reduced back to minimum
df_orders reduced back to minimum
df_order_details reduced back to minimum
