## Setup the connection

In [1]:
import json
from urllib.parse import quote

import boto3
import psycopg2

In [2]:
def get_secret(secret_name, region_name="us-east-1"):
    """Fetch a secret from AWS Secrets Manager and decode it as JSON.

    Args:
        secret_name: Identifier passed to Secrets Manager as ``SecretId``.
        region_name: Region used to build the Secrets Manager client
            (defaults to ``"us-east-1"``).

    Returns:
        The object obtained by JSON-decoding the ``SecretString`` field of
        the ``get_secret_value`` response.
    """
    secrets_client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    response = secrets_client.get_secret_value(SecretId=secret_name)
    return json.loads(response['SecretString'])

# Read the RDS connection settings from the "wysde" secret.
creds = get_secret("wysde")
USERNAME = creds["RDS_POSTGRES_USERNAME"]
PASSWORD = creds["RDS_POSTGRES_PASSWORD"]
HOST = creds["RDS_POSTGRES_HOST"]
DATABASE = 'pagila'

# Percent-encode the credentials before placing them in the URL: a raw "@",
# ":" or "/" inside the username or password would otherwise be parsed as a
# URL delimiter and corrupt the connection string. Credentials made only of
# unreserved characters are left unchanged.
conn_str = 'postgresql://{0}:{1}@{2}/{3}'.format(
    quote(str(USERNAME), safe=''),
    quote(str(PASSWORD), safe=''),
    HOST,
    DATABASE,
)

# Configure the SqlMagic class behind the %sql / %%sql magics.
# NOTE(review): the option meanings below are inferred from the option names;
# confirm against the ipython-sql documentation.
# autopandas: presumably returns query results as pandas DataFrames.
%config SqlMagic.autopandas=True
# displaycon: presumably hides the connection string in cell output.
%config SqlMagic.displaycon=False
# feedback: presumably suppresses the per-statement status messages.
%config SqlMagic.feedback=False
# displaylimit: presumably caps the number of rows rendered per result.
%config SqlMagic.displaylimit=5
# (Re)load the sql extension, then connect using the URL held in conn_str.
%reload_ext sql
%sql {conn_str}

## Creating Facts & Dimensions 

In [6]:
%%sql
-- Date dimension, keyed by an integer date_key.
CREATE TABLE dimDate (
    date_key    INTEGER   NOT NULL PRIMARY KEY,
    date        DATE      NOT NULL,
    year        SMALLINT  NOT NULL,
    quarter     SMALLINT  NOT NULL,
    month       SMALLINT  NOT NULL,
    day         SMALLINT  NOT NULL,
    week        SMALLINT  NOT NULL,
    is_weekend  BOOLEAN
)

In [7]:
%%sql
-- Customer dimension with the address, city and country attributes
-- stored directly on the customer row.
CREATE TABLE dimCustomer (
    customer_key  SERIAL       PRIMARY KEY,
    customer_id   SMALLINT     NOT NULL,
    first_name    VARCHAR(45)  NOT NULL,
    last_name     VARCHAR(45)  NOT NULL,
    email         VARCHAR(50),
    address       VARCHAR(50)  NOT NULL,
    address2      VARCHAR(50),
    district      VARCHAR(20)  NOT NULL,
    city          VARCHAR(50)  NOT NULL,
    country       VARCHAR(50)  NOT NULL,
    postal_code   VARCHAR(10),
    phone         VARCHAR(20)  NOT NULL,
    active        SMALLINT     NOT NULL,
    create_date   TIMESTAMP    NOT NULL,
    start_date    DATE         NOT NULL,
    end_date      DATE         NOT NULL
)

In [8]:
%%sql
-- Movie dimension with the language names stored directly on the movie row.
-- special_features is declared as unbounded text because the values loaded
-- from film.special_features do not fit in the previous varchar(40).
-- release_year uses the year type that the source database already provides.
CREATE TABLE dimMovie (
    movie_key SERIAL PRIMARY KEY,
    film_id smallint NOT NULL,
    title varchar(255) NOT NULL,
    description text,
    release_year year,
    language varchar(20) NOT NULL,
    original_language varchar(20),
    rental_duration smallint NOT NULL,
    length smallint NOT NULL,
    rating varchar(5) NOT NULL,
    special_features text NOT NULL
)

In [24]:
# Make sure dimMovie.special_features is unbounded text before the movie
# dimension is loaded; the statement changes nothing if it already is text.
%sql ALTER TABLE dimMovie ALTER COLUMN special_features TYPE text

In [9]:
%%sql
-- Store dimension with the store address and the manager name
-- stored directly on the store row.
CREATE TABLE dimStore (
    store_key           SERIAL       PRIMARY KEY,
    store_id            SMALLINT     NOT NULL,
    address             VARCHAR(50)  NOT NULL,
    address2            VARCHAR(50),
    district            VARCHAR(20)  NOT NULL,
    city                VARCHAR(50)  NOT NULL,
    country             VARCHAR(50)  NOT NULL,
    postal_code         VARCHAR(10),
    manager_first_name  VARCHAR(45)  NOT NULL,
    manager_last_name   VARCHAR(45)  NOT NULL,
    start_date          DATE         NOT NULL,
    end_date            DATE         NOT NULL
)

In [10]:
%%sql
-- Sales fact table. Each row references one date, customer, movie and store
-- dimension row and carries the sale amount.
CREATE TABLE factSales (
    sales_key     SERIAL         PRIMARY KEY,
    date_key      INTEGER        NOT NULL REFERENCES dimDate (date_key),
    customer_key  INTEGER        NOT NULL REFERENCES dimCustomer (customer_key),
    movie_key     INTEGER        NOT NULL REFERENCES dimMovie (movie_key),
    store_key     INTEGER        NOT NULL REFERENCES dimStore (store_key),
    sales_amount  NUMERIC(5, 2)  NOT NULL
)

In [None]:
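# NOTE: left disabled. This statement is not valid PostgreSQL (SERIAL cannot be
# used as an ALTER COLUMN ... TYPE target, and AUTO INCREMENT is MySQL syntax);
# factSales.sales_key is already declared as SERIAL PRIMARY KEY above.
# %sql ALTER TABLE factSales ALTER COLUMN sales_key TYPE SERIAL PRIMARY KEY AUTO INCREMENT;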

## ETL the data from 3NF to Facts & Dimensions

In [42]:
%%sql
-- Load the date dimension from the distinct calendar days found in payment.
-- date_key is the day rendered as a yyyymmdd integer.
INSERT INTO dimDate (date_key, date, year, quarter, month, day, week, is_weekend)
SELECT DISTINCT
    TO_CHAR(payment_date::DATE, 'yyyyMMDD')::integer AS date_key,
    date(payment_date)                  AS date,
    EXTRACT(year FROM payment_date)     AS year,
    EXTRACT(quarter FROM payment_date)  AS quarter,
    EXTRACT(month FROM payment_date)    AS month,
    EXTRACT(day FROM payment_date)      AS day,
    EXTRACT(week FROM payment_date)     AS week,
    -- ISODOW values 6 and 7 are Saturday and Sunday
    CASE
        WHEN EXTRACT(ISODOW FROM payment_date) IN (6, 7) THEN true
        ELSE false
    END AS is_weekend
FROM payment

In [15]:
%%sql
-- Load the customer dimension by joining customer to its address, city and
-- country rows. customer_key reuses customer_id, and start_date and end_date
-- are both set to the load time.
INSERT INTO dimCustomer (
    customer_key, customer_id, first_name, last_name, email,
    address, address2, district, city, country, postal_code, phone,
    active, create_date, start_date, end_date
)
SELECT
    cust.customer_id AS customer_key,
    cust.customer_id,
    cust.first_name,
    cust.last_name,
    cust.email,
    addr.address,
    addr.address2,
    addr.district,
    cty.city,
    ctry.country,
    addr.postal_code,
    addr.phone,
    cust.active,
    cust.create_date,
    now() AS start_date,
    now() AS end_date
FROM customer AS cust
    JOIN address AS addr ON addr.address_id = cust.address_id
    JOIN city AS cty ON cty.city_id = addr.city_id
    JOIN country AS ctry ON ctry.country_id = cty.country_id

In [25]:
%%sql
-- Load the movie dimension from film, resolving both language names.
-- movie_key reuses film_id.
-- original_language is looked up through film.original_language_id. The
-- previous join used film.language_id, which made original_language a copy
-- of language. The LEFT JOIN keeps films that have no original language.
INSERT INTO dimMovie (
        movie_key,
        film_id,
        title,
        description,
        release_year,
        language,
        original_language,
        rental_duration,
        length,
        rating,
        special_features
    )
SELECT f.film_id AS movie_key,
    f.film_id,
    f.title,
    f.description,
    f.release_year,
    l.name AS language,
    orig_lang.name AS original_language,
    f.rental_duration,
    f.length,
    f.rating,
    f.special_features
FROM film f
    JOIN language l ON (f.language_id = l.language_id)
    LEFT JOIN language orig_lang ON (f.original_language_id = orig_lang.language_id)

In [27]:
%%sql
-- Load the store dimension by joining store to its manager (staff) and to
-- its address, city and country rows. store_key reuses store_id, and
-- start_date and end_date are both set to the load time.
INSERT INTO dimStore (
    store_key, store_id, address, address2, district, city, country,
    postal_code, manager_first_name, manager_last_name, start_date, end_date
)
SELECT
    sto.store_id AS store_key,
    sto.store_id,
    addr.address,
    addr.address2,
    addr.district,
    cty.city,
    ctry.country,
    addr.postal_code,
    mgr.first_name AS manager_first_name,
    mgr.last_name AS manager_last_name,
    now() AS start_date,
    now() AS end_date
FROM store AS sto
    JOIN staff AS mgr ON mgr.staff_id = sto.manager_staff_id
    JOIN address AS addr ON addr.address_id = sto.address_id
    JOIN city AS cty ON cty.city_id = addr.city_id
    JOIN country AS ctry ON ctry.country_id = cty.country_id

In [31]:
%sql select * from inventory limit 5

Unnamed: 0,inventory_id,film_id,store_id,last_update
0,1,1,1,2006-02-15 10:09:17
1,2,1,1,2006-02-15 10:09:17
2,3,1,1,2006-02-15 10:09:17
3,4,1,1,2006-02-15 10:09:17
4,5,1,2,2006-02-15 10:09:17


In [43]:
%%sql
-- Load the sales facts, one row per payment that has a rental. The film and
-- store are reached through the rented inventory item, and the source ids
-- are used directly as the dimension keys.
INSERT INTO factSales (date_key, customer_key, movie_key, store_key, sales_amount)
SELECT
    TO_CHAR(pay.payment_date::DATE, 'yyyyMMDD')::integer AS date_key,
    pay.customer_id AS customer_key,
    inv.film_id AS movie_key,
    inv.store_id AS store_key,
    pay.amount AS sales_amount
FROM payment AS pay
    JOIN rental AS rent ON rent.rental_id = pay.rental_id
    JOIN inventory AS inv ON inv.inventory_id = rent.inventory_id

## 3NF vs Star Schema

In [44]:
%%time
%%sql
-- Star schema sample, a few fact rows joined to their movie, date and
-- customer dimension rows.
SELECT
    mov.title,
    dt.month,
    cust.city,
    fs.sales_amount
FROM factSales AS fs
    JOIN dimMovie AS mov ON mov.movie_key = fs.movie_key
    JOIN dimDate AS dt ON dt.date_key = fs.date_key
    JOIN dimCustomer AS cust ON cust.customer_key = fs.customer_key
LIMIT 5;

CPU times: user 7.72 ms, sys: 17.1 ms, total: 24.8 ms
Wall time: 770 ms


Unnamed: 0,title,month,city,sales_amount
0,Rules Human,2,Ede,7.99
1,Majestic Floats,2,Ede,1.99
2,Maiden Home,2,Ede,7.99
3,Hyde Doctor,2,Ede,2.99
4,Massacre Usual,2,Ede,7.99


In [45]:
%%time
%%sql
-- Star schema version, revenue per movie title, month and customer city.
SELECT
    mov.title,
    dt.month,
    cust.city,
    SUM(fs.sales_amount) AS revenue
FROM factSales AS fs
    JOIN dimMovie AS mov ON mov.movie_key = fs.movie_key
    JOIN dimDate AS dt ON dt.date_key = fs.date_key
    JOIN dimCustomer AS cust ON cust.customer_key = fs.customer_key
GROUP BY (mov.title, dt.month, cust.city)
ORDER BY
    mov.title,
    dt.month,
    cust.city,
    revenue DESC;

CPU times: user 56.5 ms, sys: 28.2 ms, total: 84.7 ms
Wall time: 2.85 s


Unnamed: 0,title,month,city,revenue
0,Academy Dinosaur,2,San Lorenzo,0.99
1,Academy Dinosaur,2,Sullana,1.99
2,Academy Dinosaur,2,Udaipur,0.99
3,Academy Dinosaur,3,Almirante Brown,1.99
4,Academy Dinosaur,3,Goinia,0.99
...,...,...,...,...
14535,Zorro Ark,4,Koriyama,4.99
14536,Zorro Ark,4,Lungtan,7.99
14537,Zorro Ark,4,Nanyang,9.99
14538,Zorro Ark,4,Qomsheh,9.99


In [46]:
%%time
%%sql
-- 3NF version of the same report, revenue per film title, payment month and
-- customer city, computed directly from the normalized source tables.
SELECT
    flm.title,
    EXTRACT(month FROM pay.payment_date) AS month,
    cty.city,
    SUM(pay.amount) AS revenue
FROM payment AS pay
    JOIN rental AS rent ON rent.rental_id = pay.rental_id
    JOIN inventory AS inv ON inv.inventory_id = rent.inventory_id
    JOIN film AS flm ON flm.film_id = inv.film_id
    JOIN customer AS cust ON cust.customer_id = pay.customer_id
    JOIN address AS addr ON addr.address_id = cust.address_id
    JOIN city AS cty ON cty.city_id = addr.city_id
GROUP BY (flm.title, month, cty.city)
ORDER BY
    flm.title,
    month,
    cty.city,
    revenue DESC;

CPU times: user 117 ms, sys: 25.2 ms, total: 142 ms
Wall time: 4.92 s


Unnamed: 0,title,month,city,revenue
0,Academy Dinosaur,2,San Lorenzo,0.99
1,Academy Dinosaur,2,Sullana,1.99
2,Academy Dinosaur,2,Udaipur,0.99
3,Academy Dinosaur,3,Almirante Brown,1.99
4,Academy Dinosaur,3,Goinia,0.99
...,...,...,...,...
14535,Zorro Ark,4,Koriyama,4.99
14536,Zorro Ark,4,Lungtan,7.99
14537,Zorro Ark,4,Nanyang,9.99
14538,Zorro Ark,4,Qomsheh,9.99


In [None]:
# %%sql 

# CREATE TABLE dimDate
# (
#     date_key integer NOT NULL PRIMARY KEY,
#     date date NOT NULL,
#     year smallint NOT NULL,
#     quarter smallint NOT NULL,
#     month smallint NOT NULL,
#     day smallint NOT NULL,
#     week smallint NOT NULL,
#     is_weekend boolean
# );


# CREATE TABLE dimCustomer
# (
#     customer_key SERIAL PRIMARY KEY,
#     customer_id smallint NOT NULL,
#     first_name varchar(45) NOT NULL,
#     last_name varchar(45) NOT NULL,
#     email varchar(50),
#     address varchar(50) NOT NULL,
#     address2 varchar(50),
#     district varchar(20) NOT NULL,
#     city varchar(50) NOT NULL,
#     country varchar(50) NOT NULL,
#     postal_code varchar(10),
#     phone varchar(20) NOT NULL,
#     active smallint NOT NULL,
#     create_date timestamp NOT NULL,
#     start_date date NOT NULL,
#     end_date date NOT NULL
# );



# CREATE TABLE dimMovie
# (
#     movie_key SERIAL PRIMARY KEY,
#     film_id smallint NOT NULL,
#     title varchar(255) NOT NULL,
#     description text,
#     release_year year,
#     language varchar(20) NOT NULL,
#     original_language varchar(20),
#     rental_duration smallint NOT NULL,
#     length smallint NOT NULL,
#     rating varchar(5) NOT NULL,
#     special_features varchar(400) NOT NULL
# );


# CREATE TABLE dimStore
# (
#     store_key SERIAL PRIMARY KEY,
#     store_id smallint NOT NULL,
#     address varchar(50) NOT NULL,
#     address2 varchar(50),
#     district varchar(20) NOT NULL,
#     city varchar(50) NOT NULL,
#     country varchar(50) NOT NULL,
#     postal_code varchar(10),
#     manager_first_name varchar(45) NOT NULL,
#     manager_last_name varchar(45) NOT NULL,
#     start_date date NOT NULL,
#     end_date date NOT NULL
# );


# CREATE TABLE factSales
# (
#     sales_key SERIAL PRIMARY KEY,
#     date_key INT NOT NULL REFERENCES dimDate(date_key),
#     customer_key INT NOT NULL REFERENCES dimCustomer(customer_key),
#     movie_key INT NOT NULL REFERENCES dimMovie(movie_key),
#     store_key INT NOT NULL REFERENCES dimStore(store_key),
#     sales_amount decimal(5,2) NOT NULL
# );

Tip: `ISODOW` is the ISO day-of-the-week field of `EXTRACT` (Monday = 1 … Sunday = 7), so the values 6 and 7 correspond to Saturday and Sunday.

In [None]:
# %%sql 
# INSERT INTO dimDate (date_key, date, year, quarter, month, day, week, is_weekend)
# SELECT DISTINCT (TO_CHAR(payment_date :: DATE, 'yyyyMMDD'):: integer) AS date_key,
#                          date(payment_date) AS date,
#                          EXTRACT(year FROM payment_date) AS year,
#                          EXTRACT(quarter FROM payment_date) AS quarter,
#                          EXTRACT(month FROM payment_date) AS month,
#                          EXTRACT(day FROM payment_date) AS day,
#                          EXTRACT(week FROM payment_date) AS week,
#                          CASE WHEN EXTRACT(ISODOW FROM payment_date) IN (6, 7) THEN true ELSE false END AS is_weekend
#                         FROM payment;
        
        
# INSERT INTO dimCustomer (customer_key, customer_id, first_name, last_name, email, address, address2, district, city, country, postal_code, phone, active, create_date, start_date, end_date)
# SELECT c.customer_id AS customer_key,
#                          c.customer_id,
#                          c.first_name,
#                          c.last_name,
#                          c.email,
#                          a.address,
#                          a.address2,
#                          a.district,
#                          ci.city,
#                          co.country,
#                          postal_code,
#                          a.phone,
#                          c.active,
#                          c.create_date,
#                          now() AS start_date,
#                          now() AS end_date
# FROM customer c
# JOIN address a ON (c.address_id = a.address_id)
# JOIN city ci ON (a.city_id = ci.city_id)
# JOIN country co ON (ci.country_id = co.country_id);        
                
        
# INSERT INTO dimMovie (movie_key, film_id, title, description, release_year, language, original_language, rental_duration, length, rating, special_features)
# SELECT f.film_id AS movie_key,
#                         film_id,
#                         f.title,
#                         f.description,
#                         f.release_year,
#                         l.name AS language,
#                         l.name AS original_language,
#                         f.rental_duration,
#                         f.length,
#                         f.rating,
#                         f.special_features
# FROM film f
# JOIN language l on (f.language_id = l.language_id);
    
    
# INSERT INTO dimStore (store_key, store_id, address, address2, district, city, country, postal_code, manager_first_name, manager_last_name, start_date, end_date)
# SELECT s.store_id AS store_key,
#         s.store_id,
#         a.address,
#         a.address2,
#         a.district,
#         c.city,
#         co.country,
#         a.postal_code,
#         st.first_name AS manager_first_name,
#         st.last_name AS manager_last_name,
#         now() AS start_date,
#         now() AS end_date
# FROM store s
# JOIN staff st ON (s.manager_staff_id = st.staff_id)
# JOIN address a ON (s.address_id = a.address_id)
# JOIN city c ON (a.city_id = c.city_id)
# JOIN country co ON (c.country_id = co.country_id);
    
    
# INSERT INTO factSales (date_key, customer_key, movie_key, store_key, sales_amount)
# SELECT TO_CHAR(p.payment_date :: DATE, 'yyyyMMDD')::integer AS date_key,
#                         p.customer_id AS customer_key,
#                         i.film_id AS movie_key,
#                         i.store_id AS store_key,
#                       p.amount AS sales_amount
# FROM payment p
# JOIN rental r ON (p.rental_id = r.rental_id)
# JOIN inventory i ON (r.inventory_id = i.inventory_id);