In [0]:
# Partitioning DataFrames
import datetime
import pyspark.sql.functions as F
from pyspark.sql import Row
import pandas as pd
from pyspark.sql.types import *

In [0]:
# Load the orders JSON data into a DataFrame; used below to explore the writer API.
df = spark.read.json('/public/retail_db_json/orders')

In [0]:
# Evaluate df.write to display the writer object that exposes partitionBy/json/parquet.
df.write

In [0]:
# Show the documentation for partitionBy, which selects the partitioning column(s).
help(df.write.partitionBy)

In [0]:
# Show the documentation for the JSON writer.
help(df.write.json)

In [0]:
# Show the documentation for the Parquet writer.
help(df.write.parquet)

In [0]:
# Load the orders JSON data; `orders` is the source for every partitioned write below.
orders = spark.read.json('/public/retail_db_json/orders')

In [0]:
# Preview the orders data.
orders.show()

In [0]:
import getpass
# Login name of the current user; the cells below use it to build
# per-user output paths of the form /user/<username>/...
username = getpass.getuser()

In [0]:
# Preview order_date reformatted as yyyyMMdd, the value later used as the
# partition key for the by-date write.
(
    orders
    .withColumn('order_date', F.date_format('order_date', 'yyyyMMdd'))
    .show()
)

In [0]:
# Recursively delete any earlier by-date output before it is written again.
dbutils.fs.rm(
    f'/user/{username}/retail_db/orders_partitioned_by_date',
    recurse=True,
)

In [0]:
# Write orders as Parquet with one sub-directory per order_date (yyyyMMdd).
# coalesce(1) reduces the DataFrame to a single partition before the write,
# which keeps the number of output files small.
# mode('overwrite') replaces existing output, so re-running this cell on its
# own does not fail when the target directory already exists.
(
    orders
    .withColumn('order_date', F.date_format('order_date', 'yyyyMMdd'))
    .coalesce(1)
    .write
    .mode('overwrite')
    .partitionBy('order_date')
    .parquet(f'/user/{username}/retail_db/orders_partitioned_by_date')
)

In [0]:
# List the output directory written with partitionBy('order_date').
dbutils.fs.ls(f'/user/{username}/retail_db/orders_partitioned_by_date')

In [0]:
# Row count of the source orders DataFrame.
orders.count()

In [0]:
# Read the by-date output back and inspect its column names and types.
(
    spark.read
    .parquet(f'/user/{username}/retail_db/orders_partitioned_by_date')
    .dtypes
)

In [0]:
# Read the by-date output back and preview its rows.
(
    spark.read
    .parquet(f'/user/{username}/retail_db/orders_partitioned_by_date')
    .show()
)

In [0]:
# Preview a derived order_month column (yyyyMM), the partition key for the
# by-month write.
(
    orders
    .withColumn('order_month', F.date_format('order_date', 'yyyyMM'))
    .show()
)

In [0]:
# Recursively delete any earlier by-month output before it is written again.
dbutils.fs.rm(
    f'/user/{username}/retail_db/orders_partitioned_by_month',
    recurse=True,
)

In [0]:
# Write orders as Parquet with one sub-directory per order_month (yyyyMM).
# Here the partition column is passed as the partitionBy keyword of parquet()
# rather than through a separate .partitionBy() call.
# mode='overwrite' replaces existing output, so re-running this cell on its
# own does not fail when the target directory already exists.
(
    orders
    .withColumn('order_month', F.date_format('order_date', 'yyyyMM'))
    .coalesce(1)
    .write
    .parquet(
        f'/user/{username}/retail_db/orders_partitioned_by_month',
        mode='overwrite',
        partitionBy='order_month',
    )
)

In [0]:
# List the output directory written with partitionBy='order_month'.
dbutils.fs.ls(f'/user/{username}/retail_db/orders_partitioned_by_month')

In [0]:
# Recursively delete any earlier year/month/day output before it is written again.
dbutils.fs.rm(
    f'/user/{username}/retail_db/orders_partitioned',
    recurse=True,
)

In [0]:
# Write orders as Parquet partitioned on three levels: year, then month, then
# day_of_month, each derived from order_date.
# mode('overwrite') replaces existing output, so re-running this cell on its
# own does not fail when the target directory already exists.
(
    orders
    .withColumn('year', F.date_format('order_date', 'yyyy'))
    .withColumn('month', F.date_format('order_date', 'MM'))
    .withColumn('day_of_month', F.date_format('order_date', 'dd'))
    .coalesce(1)
    .write
    .mode('overwrite')
    .partitionBy('year', 'month', 'day_of_month')
    .parquet(f'/user/{username}/retail_db/orders_partitioned')
)

In [0]:
# List one leaf directory of the three-level layout (year=2014, month=01, day_of_month=09).
dbutils.fs.ls(f'/user/{username}/retail_db/orders_partitioned/year=2014/month=01/day_of_month=09')

In [0]:
# List the contents of dbfs:/databricks-datasets.
# Uses dbutils.fs.ls (the call behind the %fs ls magic) so this cell matches
# the other directory listings in the notebook.
dbutils.fs.ls('dbfs:/databricks-datasets')

In [0]:
# List the airlines CSV files that are read as the source data set below.
# Uses dbutils.fs.ls (the call behind the %fs ls magic) so this cell matches
# the other directory listings in the notebook.
dbutils.fs.ls('dbfs:/databricks-datasets/asa/airlines')

In [0]:
# Recursively delete any earlier partitioned copy of the airlines data.
dbutils.fs.rm(
    f'/user/{username}/asa/airlines',
    recurse=True,
)

In [0]:
# Copy the airlines CSV data into the user's directory, partitioned by Year.
# header=True is set on both the read and the write; existing output is
# overwritten.
(
    spark.read
    .csv('dbfs:/databricks-datasets/asa/airlines', header=True)
    .write
    .partitionBy('Year')
    .csv(f'/user/{username}/asa/airlines', header=True, mode='overwrite')
)

In [0]:
# List the output directory written with partitionBy('Year').
dbutils.fs.ls(f'/user/{username}/asa/airlines')

In [0]:
# Partition Pruning
# Baseline: row count of the source airlines CSV data.
spark.read.csv('dbfs:/databricks-datasets/asa/airlines', header=True).count()

In [0]:
# Row count of the Year-partitioned copy, for comparison with the source count.
spark.read.csv(f'/user/{username}/asa/airlines', header=True).count()

In [0]:
# Count the 2004 rows by filtering the source airlines CSV data.
(
    spark.read
    .csv('dbfs:/databricks-datasets/asa/airlines', header=True)
    .filter('Year = 2004')
    .count()
)

In [0]:
# Count the 2004 rows by reading only the Year=2004 partition directory.
spark.read.csv(f'/user/{username}/asa/airlines/Year=2004', header=True).count()

In [0]:
# List the Year-partitioned output directory again.
dbutils.fs.ls(f'/user/{username}/asa/airlines')

In [0]:
# Partition pruning: this data set was written partitioned by Year, so a
# filter on Year should let Spark read only the Year=2004 directory instead
# of scanning every file under the root path.
(
    spark.read
    .csv(f'/user/{username}/asa/airlines', header=True)
    .filter('Year = 2004')
    .count()
)

In [0]:
# Keep a DataFrame over the Year-partitioned airlines data for use with Spark SQL.
airlines_df = spark.read.csv(f'/user/{username}/asa/airlines', header=True)

In [0]:
# Register the DataFrame as the temp view 'airlines' so it can be queried with SQL.
airlines_df.createOrReplaceTempView('airlines')

In [0]:
# Same 2004 row count, expressed as a SQL query against the 'airlines' temp view.
spark.sql("""
    SELECT count(*)
    FROM airlines
    WHERE year = 2004
""").show()