In [0]:
# --- Configuration: where the raw CSVs live and which schema receives the tables ---
volume_path = "/Volumes/catalog1/instacart_db/raw_data/"
schema_name = "catalog1.instacart_db"

# The four CSV files uploaded so far. Each entry is the file name without the
# ".csv" extension and is reused as the name of the table it is saved to.
files_to_process = ["products", "aisles", "departments", "order_products__train"]

# --- Ingest: read each CSV and persist it as a Delta table ---
for file in files_to_process:
    print(f"Now processing: {file}...")

    source_csv = f"{volume_path}{file}.csv"
    target_table = f"{schema_name}.{file}"

    # header      -> column names come from the first row of the file
    # inferSchema -> Spark detects the column data types (integers, strings, etc.)
    reader = spark.read.format("csv").options(header="true", inferSchema="true")
    df = reader.load(source_csv)

    # "overwrite" replaces the table if it already exists.
    writer = df.write.format("delta").mode("overwrite")
    writer.saveAsTable(target_table)

    print(f"Success! Table '{file}' is ready.")

Now processing: products...
Success! Table 'products' is ready.
Now processing: aisles...
Success! Table 'aisles' is ready.
Now processing: departments...
Success! Table 'departments' is ready.
Now processing: order_products__train...
Success! Table 'order_products__train' is ready.


In [0]:
%sql
-- Preview five rows of the order_products__train table
SELECT *
FROM catalog1.instacart_db.order_products__train
LIMIT 5;

order_id,product_id,add_to_cart_order,reordered
1,49302,1,1
1,11109,2,1
1,10246,3,0
1,49683,4,0
1,43633,5,1


In [0]:
# Same source volume and target schema as the first ingest cell, redefined here.
volume_path = "/Volumes/catalog1/instacart_db/raw_data/"
schema_name = "catalog1.instacart_db"

# The last two files to ingest.
final_files = ["orders", "order_products__prior"]

# CSV reader settings: first row holds the column names; let Spark infer the types.
csv_options = {"header": "true", "inferSchema": "true"}

for file in final_files:
    print(f"Starting to process: {file}. Please wait, these are large files...")

    # Load the CSV from the Volume.
    df = (
        spark.read.format("csv")
        .options(**csv_options)
        .load(f"{volume_path}{file}.csv")
    )

    # Persist it as a Delta table, replacing any existing table of that name.
    (
        df.write.format("delta")
        .mode("overwrite")
        .saveAsTable(f"{schema_name}.{file}")
    )

    print(f"Finished! Table '{file}' is now ready in your catalog.")

Starting to process: orders. Please wait, these are large files...
Finished! Table 'orders' is now ready in your catalog.
Starting to process: order_products__prior. Please wait, these are large files...
Finished! Table 'order_products__prior' is now ready in your catalog.


In [0]:
# Print a small report with the row count of every table created by the ingest cells.
schema_name = "catalog1.instacart_db"
all_tables = ["products", "aisles", "departments", "order_products__train", "orders", "order_products__prior"]

print(f"{'Table Name':<25} | {'Row Count':<15}")
print("-" * 45)

for table_name in all_tables:
    # Deliberately NOT named `count`: later cells import and call
    # pyspark.sql.functions.count, and a notebook-global int called `count`
    # would shadow that function if this cell were re-run after them.
    row_count = spark.table(f"{schema_name}.{table_name}").count()
    print(f"{table_name:<25} | {row_count:<15,}")

Table Name                | Row Count      
---------------------------------------------
products                  | 49,688         
aisles                    | 134            
departments               | 21             
order_products__train     | 1,384,617      
orders                    | 3,421,083      
order_products__prior     | 32,434,489     


In [0]:
from pyspark.sql.functions import col, count, when

# Focus on the 'orders' table, which this notebook treats as the most important one.
orders_df = spark.table("catalog1.instacart_db.orders")

# Build one aggregate per column. when(...) produces a value only on rows where
# the column is NULL, and count() tallies non-null values, so each aggregate is
# the number of NULLs in that column (reported under the column's own name).
null_count_exprs = []
for column_name in orders_df.columns:
    is_missing = col(column_name).isNull()
    null_count_exprs.append(count(when(is_missing, column_name)).alias(column_name))

null_counts = orders_df.select(null_count_exprs)

print("Null value counts in Orders table:")
null_counts.show()

Null value counts in Orders table:
+--------+-------+--------+------------+---------+-----------------+----------------------+
|order_id|user_id|eval_set|order_number|order_dow|order_hour_of_day|days_since_prior_order|
+--------+-------+--------+------------+---------+-----------------+----------------------+
|       0|      0|       0|           0|        0|                0|                206209|
+--------+-------+--------+------------+---------+-----------------+----------------------+



In [0]:
# `col` is imported here as well so this cell does not silently depend on an
# earlier cell's import of it.
from pyspark.sql.functions import coalesce, col, lit

# Input: `orders_df`, the DataFrame over catalog1.instacart_db.orders that the
# preceding null-check cell loads with spark.table(...).

# 1. Replace NULLs in days_since_prior_order with 0.
#    coalesce returns the first non-null value among its arguments.
# NOTE(review): after this step a filled-in 0 cannot be told apart from a value
# that was 0 in the raw data. Presumably the NULLs mark orders that have no prior
# order - confirm that treating them as 0 days is acceptable downstream.
orders_clean_df = orders_df.withColumn(
    "days_since_prior_order",
    coalesce(col("days_since_prior_order"), lit(0)),
)

# 2. Overwrite the permanent table with the cleaned version.
#    This replaces the same table `orders_df` was read from, so the raw values
#    are no longer available from this table name afterwards.
orders_clean_df.write.format("delta").mode("overwrite").saveAsTable("catalog1.instacart_db.orders")

print("Success! The 'orders' table is now clean and null-free.")

Success! The 'orders' table is now clean and null-free.


In [0]:
from pyspark.sql.functions import col, count, when

# Reload the 'orders' table so the check below runs against its current contents.
orders_df = spark.table("catalog1.instacart_db.orders")


def _null_tally(column_name):
    """Return an aggregate that counts the NULL entries of `column_name`."""
    only_when_null = when(col(column_name).isNull(), column_name)
    return count(only_when_null).alias(column_name)


# One NULL tally per column, each reported under the column's own name.
null_counts = orders_df.select([_null_tally(name) for name in orders_df.columns])

print("Null value counts in Orders table:")
null_counts.show()

Null value counts in Orders table:
+--------+-------+--------+------------+---------+-----------------+----------------------+
|order_id|user_id|eval_set|order_number|order_dow|order_hour_of_day|days_since_prior_order|
+--------+-------+--------+------------+---------+-----------------+----------------------+
|       0|      0|       0|           0|        0|                0|                     0|
+--------+-------+--------+------------+---------+-----------------+----------------------+

