# Sakila
Building off of the midterm project, the data is sourced from the sakila_2 database. It has the addition of a new, fourth dimension table for store. The store dimension table for sakila_2 was created with SQL scripts in MySQLWorkbench. The tables from sakila_2 were exported from MySQL Workbench to be used in building the sakila data lakehouse. The date dimension table tale and the film dimension table were accessed as reference data from the Azure MySQL Server sakila_2 database. The store and customer dimension tables were place in the DBFS file system. The store .json data was uploaded to a MongoDB collection from the DBFS file system. It was read in from MongoDB to create the table for sakila_dlh. The customer data was in a .csv file and was read in from DBFS using PySpark. The fact_rental data was used as hot-path data and had been exported from MySQL Workbench in .json format, spread accross multiple files to simulate this. It was processed as bronze, then integrated with the cold path reference data to create the silver tables. Finally, two gold table aggregations were performed to assess the sakila DVD rental business. I made visualizations for the gold tables which are in the folder.

#### Import Required Libraries

In [0]:
import os
import json
import pymongo
import pyspark.pandas as pd  # This uses Koalas that is included in PySpark version 3.2 or newer.
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BinaryType
from pyspark.sql.types import ByteType, ShortType, IntegerType, LongType, FloatType, DecimalType

#### Instantiate Global Variables

In [0]:
# Azure MySQL Server Connection Information ###################
jdbc_hostname = "ran5tcw-mysql.mysql.database.azure.com"
jdbc_port = 3306
src_database = "sakila_2"

connection_properties = {
  "user" : "ran5tcw",
  "password" : "Mypassword123",
  "driver" : "org.mariadb.jdbc.Driver"
}

# MongoDB Atlas Connection Information ########################
atlas_cluster_name = "ds2002.xmxvni4"
atlas_database_name = "sakila_2"
atlas_user_name = "rachel1"
atlas_password = "mypassword123"

# Data Files (JSON) Information ###############################
dst_database = "sakila_dlh"

base_dir = "dbfs:/FileStore/ds2002-final"
database_dir = f"{base_dir}/{dst_database}"

data_dir = f"{base_dir}/source_data"
batch_dir = f"{data_dir}/batch"
stream_dir = f"{data_dir}/stream"

orders_stream_dir = f"{stream_dir}/rentals"

rentals_output_bronze = f"{database_dir}/fact_rentals/bronze"
rentals_output_silver = f"{database_dir}/fact_rentals/silver"
rentals_output_gold   = f"{database_dir}/fact_rentals/gold"

# Delete the Streaming Files  
dbutils.fs.rm(f"{database_dir}/fact_rentals", True) 

# Delete the Database Files 
dbutils.fs.rm(database_dir, True)

Out[88]: True

#### Define Global Functions

In [0]:

# Fetch DataFrame from the MongoDB Atlas database server using PyMongo.
def get_mongo_dataframe(user_id, pwd, cluster_name, db_name, collection, conditions, projection, sort):
    '''Create a client connection to MongoDB'''
    mongo_uri = f"mongodb+srv://{user_id}:{pwd}@{cluster_name}.mongodb.net/{db_name}"
    
    client = pymongo.MongoClient(mongo_uri)

    '''Query MongoDB, and fill a python list with documents to create a DataFrame'''
    db = client[db_name]
    if conditions and projection and sort:
        dframe = pd.DataFrame(list(db[collection].find(conditions, projection).sort(sort)))
    elif conditions and projection and not sort:
        dframe = pd.DataFrame(list(db[collection].find(conditions, projection)))
    else:
        dframe = pd.DataFrame(list(db[collection].find()))

    client.close()
    
    return dframe


# Create New Collections by Uploading JSON file(s) to the MongoDB Atlas server.
def set_mongo_collection(user_id, pwd, cluster_name, db_name, src_file_path, json_files):
    '''Create a client connection to MongoDB'''
    mongo_uri = f"mongodb+srv://{user_id}:{pwd}@{cluster_name}.mongodb.net/{db_name}"
    client = pymongo.MongoClient(mongo_uri)
    db = client[db_name]
    
    '''Read in a JSON file, and Use It to Create a New Collection'''
    for file in json_files:
        db.drop_collection(file)
        json_file = os.path.join(src_file_path, json_files[file])
        with open(json_file, 'r') as openfile:
            json_object = json.load(openfile)
            file = db[file]
            result = file.insert_many(json_object)

    client.close()
    
    return result

### Populate Dimensions by Ingesting Reference (Cold-path) Data

#### Fetch Reference Data From My Azure MySQL Database - Sakila_2

In [0]:
%sql
DROP DATABASE IF EXISTS sakila_dlh CASCADE;

CREATE DATABASE IF NOT EXISTS sakila_dlh
COMMENT "DS-2002 Lab 06 Database"
LOCATION "dbfs:/FileStore/ds2002-final/sakila_dlh"
WITH DBPROPERTIES (contains_pii = true, purpose = "DS-2002 Final Project");

##### Create a New Dimension Table that Sources Date Dimension Data from a Table in an Azure MySQL database (sakila_2).

In [0]:
%sql
--temp view for date dim
--extracts data from MySQL sakila database.

CREATE OR REPLACE TEMPORARY VIEW view_date
USING org.apache.spark.sql.jdbc
OPTIONS (
  url "jdbc:mysql://ran5tcw-mysql.mysql.database.azure.com:3306/sakila_2",
  dbtable "dim_date",
  user "ran5tcw",
  password "Mypassword123"
)

In [0]:
%sql
--create dim_date in sakila datalakehouse
USE DATABASE sakila_dlh;

CREATE OR REPLACE TABLE sakila_dlh.dim_date
COMMENT "Date Dimension Table"
LOCATION "dbfs:/FileStore/ds2002-final/sakila_dlh/dim_date"
AS SELECT * FROM view_date

num_affected_rows,num_inserted_rows


In [0]:
%sql
SELECT * FROM sakila_dlh.dim_date LIMIT 5

date_key,full_date,date_name,date_name_us,date_name_eu,day_of_week,day_name_of_week,day_of_month,day_of_year,weekday_weekend,week_of_year,month_name,month_of_year,is_last_day_of_month,calendar_quarter,calendar_year,calendar_year_month,calendar_year_qtr,fiscal_month_of_year,fiscal_quarter,fiscal_year,fiscal_year_month,fiscal_year_qtr
20050501,2005-05-01,2005/05/01,05/01/2005,01/05/2005,1,Sunday,1,121,Weekend,17,May,5,N,2,2005,2005-05,2005Q2,11,4,2005,2005-11,2005Q4
20050502,2005-05-02,2005/05/02,05/02/2005,02/05/2005,2,Monday,2,122,Weekday,18,May,5,N,2,2005,2005-05,2005Q2,11,4,2005,2005-11,2005Q4
20050503,2005-05-03,2005/05/03,05/03/2005,03/05/2005,3,Tuesday,3,123,Weekday,18,May,5,N,2,2005,2005-05,2005Q2,11,4,2005,2005-11,2005Q4
20050504,2005-05-04,2005/05/04,05/04/2005,04/05/2005,4,Wednesday,4,124,Weekday,18,May,5,N,2,2005,2005-05,2005Q2,11,4,2005,2005-11,2005Q4
20050505,2005-05-05,2005/05/05,05/05/2005,05/05/2005,5,Thursday,5,125,Weekday,18,May,5,N,2,2005,2005-05,2005Q2,11,4,2005,2005-11,2005Q4


##### Create a New Table that Sources Film  Dimension Data from the Azure MySQL sakila_2 database.

In [0]:
%sql
-- Creates a Temporary View named "view_film" that extracts data from MySQL sakila database.
create or replace temporary view view_film
using org.apache.spark.sql.jdbc
options(
  url "jdbc:mysql://ran5tcw-mysql.mysql.database.azure.com:3306/sakila_2", 
  dbtable "dim_film",
  user "ran5tcw",
  password "Mypassword123"
)

In [0]:
%sql
USE DATABASE sakila_dlh;

-- Create a new table "sakila_dlh.dim_film" using data from "view_film"
create or replace table sakila_dlh.dim_film
comment "film dimension table"
LOCATION "dbfs:/FileStore/ds2002-final/sakila_dlh/dim_film"
as select * from view_film;

num_affected_rows,num_inserted_rows


In [0]:
%sql
SELECT * FROM sakila_dlh.dim_film;

film_key,title,description,release_year,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update,film_language
1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42,English
2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China,2006,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42,English
3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory,2006,7,2.99,50,18.99,NC-17,"Trailers,Deleted Scenes",2006-02-15 05:03:42,English
4,AFFAIR PREJUDICE,A Fanciful Documentary of a Frisbee And a Lumberjack who must Chase a Monkey in A Shark Tank,2006,5,2.99,117,26.99,G,"Commentaries,Behind the Scenes",2006-02-15 05:03:42,English
5,AFRICAN EGG,A Fast-Paced Documentary of a Pastry Chef And a Dentist who must Pursue a Forensic Psychologist in The Gulf of Mexico,2006,6,2.99,130,22.99,G,Deleted Scenes,2006-02-15 05:03:42,English
6,AGENT TRUMAN,A Intrepid Panorama of a Robot And a Boy who must Escape a Sumo Wrestler in Ancient China,2006,3,2.99,169,17.99,PG,Deleted Scenes,2006-02-15 05:03:42,English
7,AIRPLANE SIERRA,A Touching Saga of a Hunter And a Butler who must Discover a Butler in A Jet Boat,2006,6,4.99,62,28.99,PG-13,"Trailers,Deleted Scenes",2006-02-15 05:03:42,English
8,AIRPORT POLLOCK,A Epic Tale of a Moose And a Girl who must Confront a Monkey in Ancient India,2006,6,4.99,54,15.99,R,Trailers,2006-02-15 05:03:42,English
9,ALABAMA DEVIL,A Thoughtful Panorama of a Database Administrator And a Mad Scientist who must Outgun a Mad Scientist in A Jet Boat,2006,3,2.99,114,21.99,PG-13,"Trailers,Deleted Scenes",2006-02-15 05:03:42,English
10,ALADDIN CALENDAR,A Action-Packed Tale of a Man And a Lumberjack who must Reach a Feminist in Ancient China,2006,6,4.99,63,24.99,NC-17,"Trailers,Deleted Scenes",2006-02-15 05:03:42,English


#### Fetch Reference Data from a MongoDB Atlas Database

In [0]:
#check that files exist in the right place
display(dbutils.fs.ls(batch_dir)) 

path,name,size,modificationTime
dbfs:/FileStore/ds2002-final/source_data/batch/sakila_DimCustomers.csv,sakila_DimCustomers.csv,100540,1683831042000
dbfs:/FileStore/ds2002-final/source_data/batch/sakila_DimFilm.csv,sakila_DimFilm.csv,204402,1683831041000
dbfs:/FileStore/ds2002-final/source_data/batch/sakila_DimStore.json,sakila_DimStore.json,472,1683831041000


##### Create a New MongoDB Database and Load JSON Data for Store Into a New MongoDB Collection

In [0]:
#connect from dbfs to mongodb
source_dir = '/dbfs/FileStore/ds2002-final/source_data/batch'
json_files = {"store" : 'sakila_DimStore.json'}
#the store dimension data
set_mongo_collection(atlas_user_name, atlas_password, atlas_cluster_name, atlas_database_name, source_dir, json_files) 

Out[101]: <pymongo.results.InsertManyResult at 0x7f70280d5600>

##### Fetch Store Dimension Data from the New MongoDB Collection

In [0]:
%scala
import com.mongodb.spark._
val df_store = spark.read.format("com.mongodb.spark.sql.DefaultSource")
.option("database", "sakila_2").option("collection", "store").option("uri", f"mongodb+srv://rachel1:mypassword123@ds2002.xmxvni4.mongodb.net/sakila_2").load()
.select("store_id","last_update","address","district","postal_code","phone","city","country")
display(df_store)

store_id,last_update,address,district,postal_code,phone,city,country
1,2006-02-15 04:57:12,47 MySakila Drive,Alberta,,,Lethbridge,Canada
2,2006-02-15 04:57:12,28 MySQL Boulevard,QLD,,,Woodridge,Australia


In [0]:
%scala
df_store.printSchema()

##### Use the Spark DataFrame to Create a New Store Dimension Table in sakila_dlh

In [0]:
%scala
df_store.write.format("delta").mode("overwrite").saveAsTable("sakila_dlh.dim_store")

In [0]:
%sql
DESCRIBE EXTENDED sakila_dlh.dim_store

col_name,data_type,comment
store_id,int,
last_update,string,
address,string,
district,string,
postal_code,string,
phone,string,
city,string,
country,string,
,,
# Detailed Table Information,,


In [0]:
%sql
SELECT * FROM sakila_dlh.dim_store;

store_id,last_update,address,district,postal_code,phone,city,country
1,2006-02-15 04:57:12,47 MySakila Drive,Alberta,,,Lethbridge,Canada
2,2006-02-15 04:57:12,28 MySQL Boulevard,QLD,,,Woodridge,Australia


#### Fetch Data from Databricks File System

##### Use PySpark to Read From a CSV File to get Customer Information

In [0]:
customer_csv = f"{batch_dir}/sakila_DimCustomers.csv"

df_customer = spark.read.format('csv').options(header='true', inferSchema='true').load(customer_csv)
display(df_customer)

customer_key,first_name,last_name,email,active,create_date,last_update,address,district,postal_code,phone,city,country
1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,1,2006-02-14T22:04:36.000+0000,2006-02-15T04:57:20.000+0000,1913 Hanoi Way,Nagasaki,35200,28303384290,Sasebo,Japan
2,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,1,2006-02-14T22:04:36.000+0000,2006-02-15T04:57:20.000+0000,1121 Loja Avenue,California,17886,838635286649,San Bernardino,United States
3,LINDA,WILLIAMS,LINDA.WILLIAMS@sakilacustomer.org,1,2006-02-14T22:04:36.000+0000,2006-02-15T04:57:20.000+0000,692 Joliet Street,Attika,83579,448477190408,Athenai,Greece
4,BARBARA,JONES,BARBARA.JONES@sakilacustomer.org,1,2006-02-14T22:04:36.000+0000,2006-02-15T04:57:20.000+0000,1566 Inegl Manor,Mandalay,53561,705814003527,Myingyan,Myanmar
5,ELIZABETH,BROWN,ELIZABETH.BROWN@sakilacustomer.org,1,2006-02-14T22:04:36.000+0000,2006-02-15T04:57:20.000+0000,53 Idfu Parkway,Nantou,42399,10655648674,Nantou,Taiwan
6,JENNIFER,DAVIS,JENNIFER.DAVIS@sakilacustomer.org,1,2006-02-14T22:04:36.000+0000,2006-02-15T04:57:20.000+0000,1795 Santiago de Compostela Way,Texas,18743,860452626434,Laredo,United States
7,MARIA,MILLER,MARIA.MILLER@sakilacustomer.org,1,2006-02-14T22:04:36.000+0000,2006-02-15T04:57:20.000+0000,900 Santiago de Compostela Parkway,Central Serbia,93896,716571220373,Kragujevac,Yugoslavia
8,SUSAN,WILSON,SUSAN.WILSON@sakilacustomer.org,1,2006-02-14T22:04:36.000+0000,2006-02-15T04:57:20.000+0000,478 Joliet Way,Hamilton,77948,657282285970,Hamilton,New Zealand
9,MARGARET,MOORE,MARGARET.MOORE@sakilacustomer.org,1,2006-02-14T22:04:36.000+0000,2006-02-15T04:57:20.000+0000,613 Korolev Drive,Masqat,45844,380657522649,Masqat,Oman
10,DOROTHY,TAYLOR,DOROTHY.TAYLOR@sakilacustomer.org,1,2006-02-14T22:04:36.000+0000,2006-02-15T04:57:20.000+0000,1531 Sal Drive,Esfahan,53628,648856936185,Esfahan,Iran


In [0]:
df_customer.printSchema()

root
 |-- customer_key: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- active: integer (nullable = true)
 |-- create_date: timestamp (nullable = true)
 |-- last_update: timestamp (nullable = true)
 |-- address: string (nullable = true)
 |-- district: string (nullable = true)
 |-- postal_code: integer (nullable = true)
 |-- phone: long (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)



In [0]:
df_customer.write.format("delta").mode("overwrite").saveAsTable("sakila_dlh.dim_customer")
#write customer df to sakila data lakehouse

In [0]:
%sql
SELECT * FROM sakila_dlh.dim_customer LIMIT 5;

customer_key,first_name,last_name,email,active,create_date,last_update,address,district,postal_code,phone,city,country
1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,1,2006-02-14T22:04:36.000+0000,2006-02-15T04:57:20.000+0000,1913 Hanoi Way,Nagasaki,35200,28303384290,Sasebo,Japan
2,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,1,2006-02-14T22:04:36.000+0000,2006-02-15T04:57:20.000+0000,1121 Loja Avenue,California,17886,838635286649,San Bernardino,United States
3,LINDA,WILLIAMS,LINDA.WILLIAMS@sakilacustomer.org,1,2006-02-14T22:04:36.000+0000,2006-02-15T04:57:20.000+0000,692 Joliet Street,Attika,83579,448477190408,Athenai,Greece
4,BARBARA,JONES,BARBARA.JONES@sakilacustomer.org,1,2006-02-14T22:04:36.000+0000,2006-02-15T04:57:20.000+0000,1566 Inegl Manor,Mandalay,53561,705814003527,Myingyan,Myanmar
5,ELIZABETH,BROWN,ELIZABETH.BROWN@sakilacustomer.org,1,2006-02-14T22:04:36.000+0000,2006-02-15T04:57:20.000+0000,53 Idfu Parkway,Nantou,42399,10655648674,Nantou,Taiwan


##### Verify Dimension Tables

In [0]:
%sql
USE sakila_dlh;
SHOW TABLES

database,tableName,isTemporary
sakila_dlh,dim_customer,False
sakila_dlh,dim_date,False
sakila_dlh,dim_film,False
sakila_dlh,dim_store,False
,display_query_14,True
,display_query_15,True
,fact_rentals_silver_tempview,True
,rentals_bronze_tempview,True
,rentals_raw_tempview,True
,rentals_silver_tempview,True


### Integrate Reference Data with Real-Time Data

#### Use AutoLoader to Process Streaming (Hot Path) Rentals Fact Data

##### Bronze Table: Process 'Raw' JSON Rentals Data

In [0]:
(spark.readStream
 .format("cloudFiles")
 .option("cloudFiles.format", "json")
 .option("cloudFiles.schemaHints", "fact_rental_key BIGINT")
 .option("cloudFiles.schemaHints", "rental_key BIGINT")
 .option("cloudFiles.schemaHints", "customer_key  BIGINT")
 .option("cloudFiles.schemaHints", "store_key BIGINT") 
 .option("cloudFiles.schemaHints", "film_key BIGINT")
 .option("cloudFiles.schemaHints", "amount DECIMAL")
 .option("cloudFiles.schemaHints", "rental_date_key DECIMAL")
 .option("cloudFiles.schemaHints", "return_date_key DECIMAL")
 .option("cloudFiles.schemaHints", "payment_date_key DECIMAL") 
 .option("cloudFiles.schemaHints", "last_update_key DECIMAL")
 .option("cloudFiles.schemaLocation", rentals_output_bronze)
 .option("cloudFiles.inferColumnTypes", "true")
 .option("multiLine", "true")
 .load(orders_stream_dir)
 .createOrReplaceTempView("rentals_raw_tempview"))

In [0]:
%sql
--add metadata for the bronze
CREATE OR REPLACE TEMPORARY VIEW rentals_bronze_tempview AS (
  SELECT *, current_timestamp() receipt_time, input_file_name() source_file
  FROM rentals_raw_tempview
)

In [0]:
%sql
SELECT * FROM rentals_bronze_tempview

amount,customer_key,fact_rental_key,film_key,last_update_key,payment_date_key,rental_date_key,rental_key,return_date_key,store_key,_rescued_data,receipt_time,source_file
6.99,468,101,134,20060215,20050525,20050525,101,20050531,2,,2023-05-12T16:04:54.547+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental03.json
3.99,343,102,82,20060215,20050525,20050525,102,20050531,1,,2023-05-12T16:04:54.547+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental03.json
4.99,384,103,735,20060215,20050525,20050525,103,20050603,1,,2023-05-12T16:04:54.547+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental03.json
0.99,310,104,932,20060215,20050525,20050525,104,20050527,1,,2023-05-12T16:04:54.547+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental03.json
4.99,108,105,173,20060215,20050525,20050525,105,20050530,2,,2023-05-12T16:04:54.547+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental03.json
11.99,196,106,791,20060215,20050525,20050525,106,20050604,2,,2023-05-12T16:04:54.547+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental03.json
6.99,317,107,621,20060215,20050525,20050525,107,20050603,2,,2023-05-12T16:04:54.547+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental03.json
2.99,242,108,724,20060215,20050525,20050525,108,20050530,2,,2023-05-12T16:04:54.547+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental03.json
1.99,503,109,233,20060215,20050525,20050525,109,20050529,1,,2023-05-12T16:04:54.547+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental03.json
9.99,19,110,893,20060215,20050525,20050525,110,20050603,2,,2023-05-12T16:04:54.547+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental03.json


In [0]:
(spark.table("rentals_bronze_tempview")
      .writeStream
      .format("delta")
      .option("checkpointLocation", f"{rentals_output_bronze}/_checkpoint")
      .outputMode("append")
      .table("fact_rentals_bronze"))

Out[114]: <pyspark.sql.streaming.query.StreamingQuery at 0x7f701f153e50>

##### Silver Table: Include Reference Data

In [0]:
(spark.readStream
  .table("fact_rentals_bronze")
  .createOrReplaceTempView("rentals_silver_tempview"))

In [0]:
%sql
SELECT * FROM rentals_silver_tempview

amount,customer_key,fact_rental_key,film_key,last_update_key,payment_date_key,rental_date_key,rental_key,return_date_key,store_key,_rescued_data,receipt_time,source_file
2.99,130,1,80,20060215,20050524,20050524,1,20050526,1,,2023-05-12T16:07:26.921+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental01.json
2.99,459,2,333,20060215,20050524,20050524,2,20050528,2,,2023-05-12T16:07:26.921+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental01.json
3.99,408,3,373,20060215,20050524,20050524,3,20050601,2,,2023-05-12T16:07:26.921+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental01.json
4.99,333,4,535,20060215,20050524,20050524,4,20050603,1,,2023-05-12T16:07:26.921+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental01.json
6.99,222,5,450,20060215,20050524,20050524,5,20050602,2,,2023-05-12T16:07:26.921+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental01.json
0.99,549,6,613,20060215,20050524,20050524,6,20050527,1,,2023-05-12T16:07:26.921+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental01.json
1.99,269,7,870,20060215,20050524,20050524,7,20050529,2,,2023-05-12T16:07:26.921+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental01.json
4.99,239,8,510,20060215,20050524,20050524,8,20050527,1,,2023-05-12T16:07:26.921+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental01.json
4.99,126,9,565,20060215,20050525,20050525,9,20050528,1,,2023-05-12T16:07:26.921+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental01.json
5.99,399,10,396,20060215,20050525,20050525,10,20050531,2,,2023-05-12T16:07:26.921+0000,dbfs:/FileStore/ds2002-final/source_data/stream/rentals/sakila_rental01.json


In [0]:
%sql
DESCRIBE EXTENDED rentals_silver_tempview

col_name,data_type,comment
amount,double,
customer_key,bigint,
fact_rental_key,bigint,
film_key,bigint,
last_update_key,"decimal(10,0)",
payment_date_key,bigint,
rental_date_key,bigint,
rental_key,bigint,
return_date_key,bigint,
store_key,bigint,


In [0]:
%sql
--constructing the fact_rentals_silver view
--combine in data from the dimension tables that will be useful for analysis
CREATE OR REPLACE TEMPORARY VIEW fact_rentals_silver_tempview AS (
  SELECT o.fact_rental_key,
  o.rental_key,
  o.customer_key,
  c.last_name AS customer_last_name,
  c.first_name AS customer_first_name,
  c.city as customer_city,
  c.country as customer_country,
  o.store_key,
  s.store_id as store_num,
  s.city as store_city,
  s.country as store_country,
  o.film_key,
  f.title as film_title,
  f.description as film_description,
  f.release_year as film_year,
  f.film_language ,
  f.rating as film_rating,
  o.amount,
  o.rental_date_key,
  rd.day_name_of_week AS rental_day_name_of_week,
  rd.day_of_month AS rental_day_of_month,
  rd.weekday_weekend AS rental_weekday_weekend,
  rd.month_name AS rental_month_name,
  rd.calendar_quarter AS rental_quarter,
  rd.calendar_year AS rental_year,
  o.return_date_key,
  rrd.day_name_of_week AS return_day_name_of_week,
  rrd.day_of_month AS return_day_of_month,
  rrd.weekday_weekend AS return_weekday_weekend,
  rrd.month_name AS return_month_name,
  rrd.calendar_quarter AS return_quarter,
  rrd.calendar_year AS return_year,
  o.payment_date_key,
  pd.day_name_of_week AS payment_day_name_of_week,
  pd.day_of_month AS payment_day_of_month,
  pd.weekday_weekend AS payment_weekday_weekend,
  pd.month_name AS payment_month_name,
  pd.calendar_quarter AS payment_quarter,
  pd.calendar_year AS payment_year,
  o.last_update_key,
  ld.day_name_of_week AS last_update_day_name_of_week,
  ld.day_of_month AS last_update_day_of_month,
  ld.weekday_weekend AS last_update_weekday_weekend,
  ld.month_name AS last_update_month_name,
  ld.calendar_quarter AS last_update_quarter,
  ld.calendar_year AS last_update_year
  from rentals_silver_tempview as o
  inner join sakila_dlh.dim_customer as c
  on c.customer_key=o.customer_key
  inner join sakila_dlh.dim_store as s
  on s.store_id=o.store_key
  inner join sakila_dlh.dim_film as f
  on f.film_key=o.film_key
  LEFT OUTER JOIN sakila_dlh.dim_date AS rd
  ON rd.date_key = o.rental_date_key
  LEFT OUTER JOIN sakila_dlh.dim_date AS rrd
  ON rrd.date_key = o.return_date_key
  LEFT OUTER JOIN sakila_dlh.dim_date AS pd
  ON pd.date_key = o.payment_date_key
  LEFT OUTER JOIN sakila_dlh.dim_date AS ld
  ON ld.date_key = o.last_update_key
)

In [0]:
(spark.table("fact_rentals_silver_tempview")
      .writeStream
      .format("delta")
      .option("checkpointLocation", f"{rentals_output_silver}/_checkpoint")
      .outputMode("append")
      .table("fact_rentals_silver"))

Out[119]: <pyspark.sql.streaming.query.StreamingQuery at 0x7f701f153760>

In [0]:
%sql
SELECT * FROM fact_rentals_silver

fact_rental_key,rental_key,customer_key,customer_last_name,customer_first_name,customer_city,customer_country,store_key,store_num,store_city,store_country,film_key,film_title,film_description,film_year,film_language,film_rating,amount,rental_date_key,rental_day_name_of_week,rental_day_of_month,rental_weekday_weekend,rental_month_name,rental_quarter,rental_year,return_date_key,return_day_name_of_week,return_day_of_month,return_weekday_weekend,return_month_name,return_quarter,return_year,payment_date_key,payment_day_name_of_week,payment_day_of_month,payment_weekday_weekend,payment_month_name,payment_quarter,payment_year,last_update_key,last_update_day_name_of_week,last_update_day_of_month,last_update_weekday_weekend,last_update_month_name,last_update_quarter,last_update_year
76,76,1,SMITH,MARY,Sasebo,Japan,2,2,Woodridge,Australia,663,PATIENT SISTER,A Emotional Epistle of a Squirrel And a Robot who must Confront a Lumberjack in Soviet Georgia,2006,English,NC-17,2.99,20050525,Wednesday,25,Weekday,May,2,2005,20050603,Friday,3,Weekday,June,2,2005,20050525,Wednesday,25,Weekday,May,2,2005,20060215,Wednesday,15,Weekday,February,1,2006
57,57,6,DAVIS,JENNIFER,Laredo,United States,2,2,Woodridge,Australia,858,SUBMARINE BED,A Amazing Display of a Car And a Monkey who must Fight a Teacher in Soviet Georgia,2006,English,R,4.99,20050525,Wednesday,25,Weekday,May,2,2005,20050529,Sunday,29,Weekend,May,2,2005,20050525,Wednesday,25,Weekday,May,2,2005,20060215,Wednesday,15,Weekday,February,1,2006
117,117,7,MILLER,MARIA,Kragujevac,Yugoslavia,2,2,Woodridge,Australia,931,VALENTINE VANISHING,A Thrilling Display of a Husband And a Butler who must Reach a Pastry Chef in California,2006,English,PG-13,0.99,20050525,Wednesday,25,Weekday,May,2,2005,20050531,Tuesday,31,Weekday,May,2,2005,20050525,Wednesday,25,Weekday,May,2,2005,20060215,Wednesday,15,Weekday,February,1,2006
46,46,7,MILLER,MARIA,Kragujevac,Yugoslavia,1,1,Lethbridge,Canada,730,RIDGEMONT SUBMARINE,A Unbelieveable Drama of a Waitress And a Composer who must Sink a Mad Cow in Ancient Japan,2006,English,PG-13,5.99,20050525,Wednesday,25,Weekday,May,2,2005,20050602,Thursday,2,Weekday,June,2,2005,20050525,Wednesday,25,Weekday,May,2,2005,20060215,Wednesday,15,Weekday,February,1,2006
116,116,18,GARCIA,CAROL,Kaduna,Nigeria,1,1,Lethbridge,Canada,973,WIFE TURN,A Awe-Inspiring Epistle of a Teacher And a Feminist who must Confront a Pioneer in Ancient Japan,2006,English,NC-17,4.99,20050525,Wednesday,25,Weekday,May,2,2005,20050526,Thursday,26,Weekday,May,2,2005,20050525,Wednesday,25,Weekday,May,2,2005,20060215,Wednesday,15,Weekday,February,1,2006
50,50,18,GARCIA,CAROL,Kaduna,Nigeria,2,2,Woodridge,Australia,432,HOPE TOOTSIE,A Amazing Documentary of a Student And a Sumo Wrestler who must Outgun a A Shark in A Shark Tank,2006,English,NC-17,2.99,20050525,Wednesday,25,Weekday,May,2,2005,20050528,Saturday,28,Weekend,May,2,2005,20050525,Wednesday,25,Weekday,May,2,2005,20060215,Wednesday,15,Weekday,February,1,2006
110,110,19,MARTINEZ,RUTH,Kimberley,South Africa,2,2,Woodridge,Australia,893,TITANS JERK,A Unbelieveable Panorama of a Feminist And a Sumo Wrestler who must Challenge a Technical Writer in Ancient China,2006,English,PG,9.99,20050525,Wednesday,25,Weekday,May,2,2005,20050603,Friday,3,Weekday,June,2,2005,20050525,Wednesday,25,Weekday,May,2,2005,20060215,Wednesday,15,Weekday,February,1,2006
18,18,19,MARTINEZ,RUTH,Kimberley,South Africa,1,1,Lethbridge,Canada,741,ROMAN PUNK,A Thoughtful Panorama of a Mad Cow And a Student who must Battle a Forensic Psychologist in Berlin,2006,English,NC-17,0.99,20050525,Wednesday,25,Weekday,May,2,2005,20050531,Tuesday,31,Weekday,May,2,2005,20050525,Wednesday,25,Weekday,May,2,2005,20060215,Wednesday,15,Weekday,February,1,2006
129,129,23,LEWIS,SARAH,Liepaja,Latvia,1,1,Lethbridge,Canada,902,TRADING PINOCCHIO,A Emotional Character Study of a Student And a Explorer who must Discover a Frisbee in The First Manned Space Station,2006,English,PG,8.99,20050525,Wednesday,25,Weekday,May,2,2005,20050604,Saturday,4,Weekend,June,2,2005,20050525,Wednesday,25,Weekday,May,2,2005,20060215,Wednesday,15,Weekday,February,1,2006
90,90,25,WALKER,DEBORAH,Shikarpur,Pakistan,2,2,Woodridge,Australia,654,PANKY SUBMARINE,A Touching Documentary of a Dentist And a Sumo Wrestler who must Overcome a Boy in The Gulf of Mexico,2006,English,G,7.99,20050525,Wednesday,25,Weekday,May,2,2005,20050601,Wednesday,1,Weekday,June,2,2005,20050525,Wednesday,25,Weekday,May,2,2005,20060215,Wednesday,15,Weekday,February,1,2006


In [0]:
%sql
DESCRIBE EXTENDED sakila_dlh.fact_rentals_silver

col_name,data_type,comment
fact_rental_key,bigint,
rental_key,bigint,
customer_key,bigint,
customer_last_name,string,
customer_first_name,string,
customer_city,string,
customer_country,string,
store_key,bigint,
store_num,int,
store_city,string,


##### Gold Table: Perform Aggregations
Create the gold table for fact_rentals that readily allows analysis of the sakila dvd rental business processes.

In [0]:
%sql
--the top customers that spent the most money on dvd rentals
SELECT CONCAT(rs.customer_first_name, ' '
, rs.customer_last_name) as full_name
, SUM(rs.amount) AS customer_spending
from sakila_dlh.fact_rentals_silver as rs
GROUP BY full_name
ORDER BY customer_spending DESC
LIMIT 10;


full_name,customer_spending
RONALD WEINER,13.98
ALMA AUSTIN,11.99
DELORES HANSEN,11.98
HARRY ARCE,11.98
DAVID ROYAL,10.99
MICHAEL SILVERMAN,10.98
RUTH MARTINEZ,10.98
APRIL BURNS,9.98
MANUEL MURRELL,9.98
REGINALD KINDER,8.99


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
--buying data for films: who buys the films, from where, and what type of day across the increasing prices.
SELECT 
rs.film_title
, CONCAT(rs.customer_first_name, ' '
, rs.customer_last_name) as full_name
, rs.customer_city
, rs.customer_country
, rs.store_city
, rs.store_country
, rs.rental_weekday_weekend as rented_on
, rs.return_weekday_weekend as returned_on
, rs.amount as rental_cost
from sakila_dlh.fact_rentals_silver as rs
GROUP BY full_name, film_title, amount, customer_city, store_city, customer_country, store_country, rented_on, returned_on
ORDER BY rental_cost asc;

film_title,full_name,customer_city,customer_country,store_city,store_country,rented_on,returned_on,rental_cost
BORROWERS BEDAZZLED,TANYA GILBERT,Naju,South Korea,Lethbridge,Canada,Weekday,Weekday,0.99
MONTEREY LABYRINTH,THEODORE CULP,Uluberia,India,Woodridge,Australia,Weekday,Weekday,0.99
GREEDY ROOTS,JENNIE TERRY,Olomouc,Czech Republic,Woodridge,Australia,Weekday,Weekday,0.99
PIRATES ROXANNE,GERTRUDE CASTILLO,Nuuk,Greenland,Lethbridge,Canada,Weekday,Weekend,0.99
SWEETHEARTS SUSPECTS,MARTIN BALES,Namibe,Angola,Lethbridge,Canada,Weekday,Weekday,0.99
CRAFT OUTFIELD,SHERRI RHODES,Ahmadnagar,India,Lethbridge,Canada,Weekday,Weekday,0.99
BANGER PINOCCHIO,SHERRY MARSHALL,Shubra al-Khayma,Egypt,Lethbridge,Canada,Weekday,Weekday,0.99
STALLION SUNDANCE,JOY GEORGE,Botosani,Romania,Woodridge,Australia,Weekday,Weekday,0.99
CHAINSAW UPTOWN,MARIE TURNER,Lipetsk,Russian Federation,Woodridge,Australia,Weekday,Weekday,0.99
CHICKEN HELLFIGHTERS,JUSTIN NGO,Santo André,Brazil,Lethbridge,Canada,Weekday,Weekday,0.99


Databricks visualization. Run in Databricks to view.