## Connect to Postgres

In [15]:
import json
from urllib.parse import quote_plus

import boto3
import psycopg2

In [16]:
def get_secret(secret_name, region_name="us-east-1"):
    """Fetch a secret from AWS Secrets Manager and decode it as JSON.

    Args:
        secret_name: Value passed as ``SecretId`` to ``get_secret_value``.
        region_name: AWS region used to build the Secrets Manager client.

    Returns:
        The result of ``json.loads`` applied to the ``SecretString`` field
        of the service response.
    """
    secrets_client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    response = secrets_client.get_secret_value(SecretId=secret_name)
    return json.loads(response['SecretString'])

# Read the RDS connection settings out of the "wysde" secret.
creds = get_secret("wysde")
USERNAME, PASSWORD, HOST = (
    creds[key]
    for key in (
        "RDS_POSTGRES_USERNAME",
        "RDS_POSTGRES_PASSWORD",
        "RDS_POSTGRES_HOST",
    )
)

In [17]:
# Connect to the default "postgres" database. Keyword arguments are used
# instead of an interpolated DSN string so that credentials containing
# spaces, quotes or backslashes are passed through unchanged.
conn = psycopg2.connect(
    host=HOST,
    dbname="postgres",
    user=USERNAME,
    password=PASSWORD,
)

cur = conn.cursor()

In [4]:
# Turn on autocommit for this connection; the next cell issues
# CREATE DATABASE, which PostgreSQL refuses to run inside a transaction block.
conn.set_session(autocommit=True)

## Create `pagila` database and sample table

In [5]:
# CREATE DATABASE has no IF NOT EXISTS form, so look the name up in the
# pg_database catalog first; this keeps the cell safe to re-run.
cur.execute("SELECT 1 FROM pg_database WHERE datname = %s", ("pagila",))
if cur.fetchone() is None:
    cur.execute("CREATE DATABASE pagila")

In [6]:
# Close the connection to the "postgres" database; the following cell
# opens a new one against "pagila".
conn.close()

In [7]:
# Reconnect, this time to the "pagila" database. Keyword arguments are used
# instead of an interpolated DSN string so that credentials containing
# spaces, quotes or backslashes are passed through unchanged.
conn = psycopg2.connect(
    host=HOST,
    dbname="pagila",
    user=USERNAME,
    password=PASSWORD,
)

cur = conn.cursor()

In [8]:
cur.execute("CREATE TABLE IF NOT EXISTS music(album_name varchar, artist_name varchar)")
# The connection to "pagila" was opened without autocommit, so the DDL above
# only becomes permanent once the transaction is committed.
conn.commit()

## Load the DVD rental database using psql tool

Note: run this command in a terminal. The `echo` below just prints the full command so that you can simply paste it into the terminal.

In [None]:
!echo pg_restore --user={USERNAME} --host={HOST} --password --dbname=pagila \< data/dvdrental.tar

## Explore the Pagila 3NF/Relational Schema

In [19]:
conn_str = 'postgresql://{0}:{1}@{2}/pagila'.format(USERNAME, PASSWORD, HOST)

%config SqlMagic.autopandas=True
%config SqlMagic.displaycon=False
%config SqlMagic.feedback=False
%config SqlMagic.displaylimit=5
%reload_ext sql
%sql {conn_str}

### How much? What data size do we have?

In [9]:
nStores = %sql select count(*) from store;
nFilms = %sql select count(*) from film;
nCustomers = %sql select count(*) from customer;
nRentals = %sql select count(*) from rental;
nPayment = %sql select count(*) from payment;
nStaff = %sql select count(*) from staff;
nCity = %sql select count(*) from city;
nCountry = %sql select count(*) from country;

print('\n')
print("nFilms\t\t=", nFilms.iloc[0][0])               # 1000
print("nCustomers\t=", nCustomers.iloc[0][0])         # 599
print("nRentals\t=", nRentals.iloc[0][0])             # 16044
print("nPayment\t=", nPayment.iloc[0][0])             # 16049
print("nStaff\t\t=", nStaff.iloc[0][0])               # 2
print("nStores\t\t=", nStores.iloc[0][0])             # 2
print("nCities\t\t=", nCity.iloc[0][0])               # 600
print("nCountry\t=", nCountry.iloc[0][0])             # 109



nFilms		= 1000
nCustomers	= 599
nRentals	= 16044
nPayment	= 14596
nStaff		= 2
nStores		= 2
nCities		= 600
nCountry	= 109


### When? What time period are we talking about?

In [10]:
%%sql
SELECT
    MIN(payment_date) AS start,
    MAX(payment_date) AS end
-- earliest and latest payment timestamps in the payment table
FROM payment;

Unnamed: 0,start,end
0,2007-02-14 21:21:59.996577,2007-05-14 13:44:29.996577


### Where? Where do events in this database occur?

In [11]:
%%sql
SELECT district, SUM(city_id) as n FROM address
GROUP BY district
ORDER BY n desc
limit 10 ;

Unnamed: 0,district,n
0,Shandong,3237
1,England,2974
2,So Paulo,2952
3,West Bengali,2623
4,Buenos Aires,2572
5,Uttar Pradesh,2462
6,California,2444
7,Southern Tagalog,1931
8,Tamil Nadu,1807
9,Hubei,1790


## Perform some simple data analysis

### Insight 1: Top Grossing Movies

* Payment amounts are in table `payment`
* Movies are in table `film`
* They are not directly linked: payment refers to a rental, rental
refers to inventory, and inventory refers to a film
* Payments -> rental -> inventory -> film

In [12]:
%%sql 
select film_id,
    title,
    release_year,
    rental_rate,
    rating
from film
limit 5

Unnamed: 0,film_id,title,release_year,rental_rate,rating
0,133,Chamber Italian,2006,4.99,NC-17
1,384,Grosse Wonderful,2006,4.99,R
2,8,Airport Pollock,2006,4.99,R
3,98,Bright Encounters,2006,4.99,PG-13
4,1,Academy Dinosaur,2006,0.99,PG


Get the movie of every payment using JOIN

In [13]:
%%sql
SELECT f.title,
    p.amount,
    p.payment_date,
    p.customer_id
FROM payment p
    JOIN rental r ON (p.rental_id = r.rental_id)
    JOIN inventory i ON (r.inventory_id = i.inventory_id)
    JOIN film f ON (i.film_id = f.film_id)
limit 5

Unnamed: 0,title,amount,payment_date,customer_id
0,Academy Dinosaur,0.99,2007-04-08 17:31:41.996577,431
1,Academy Dinosaur,3.99,2007-03-02 18:41:36.996577,518
2,Academy Dinosaur,3.99,2007-03-21 19:56:09.996577,279
3,Academy Dinosaur,0.99,2007-02-17 18:52:26.996577,170
4,Academy Dinosaur,0.99,2007-04-07 09:09:57.996577,161


Get the total rental revenue per movie

In [14]:
%%sql 
SELECT f.title,
    sum(p.amount) as revenue
FROM payment p
    JOIN rental r ON (p.rental_id = r.rental_id)
    JOIN inventory i ON (r.inventory_id = i.inventory_id)
    JOIN film f ON (i.film_id = f.film_id)
GROUP BY title
ORDER BY revenue desc
limit 10

Unnamed: 0,title,revenue
0,Telegraph Voyage,215.75
1,Zorro Ark,199.72
2,Wife Turn,198.73
3,Innocent Usual,191.74
4,Hustler Party,190.78
5,Saturday Lambs,190.74
6,Titans Jerk,186.73
7,Harry Idaho,177.73
8,Torque Bound,169.76
9,Dogma Family,168.72


### Insight 2: Top grossing cities

* Payment amounts are in table `payment`
* Cities are in table `city`
* payment -> customer -> address -> city

In [20]:
%%sql 
SELECT p.customer_id,
    p.rental_id,
    p.amount,
    ci.city as revenue
FROM payment p
    JOIN customer c ON (p.customer_id = c.customer_id)
    JOIN address a ON (c.address_id = a.address_id)
    JOIN city ci ON (a.city_id = ci.city_id)
ORDER BY p.payment_date
limit 10

Unnamed: 0,customer_id,rental_id,amount,revenue
0,416,1158,2.99,Dadu
1,516,1159,4.99,Battambang
2,239,1160,4.99,Ciomas
3,592,1163,6.99,Szkesfehrvr
4,49,1164,0.99,Jedda
5,264,1165,3.99,Higashiosaka
6,46,1166,4.99,Moscow
7,481,1168,2.99,Mwanza
8,139,1169,2.99,Touliu
9,595,1170,2.99,Jinzhou


Top grossing cities

In [21]:
%%sql 
SELECT ci.city,
    sum(p.amount) as revenue
FROM payment p
    JOIN customer c ON (p.customer_id = c.customer_id)
    JOIN address a ON (c.address_id = a.address_id)
    JOIN city ci ON (a.city_id = ci.city_id)
group BY ci.city
order by revenue desc
limit 10

Unnamed: 0,city,revenue
0,Saint-Denis,211.55
1,Cape Coral,208.58
2,Santa Brbara dOeste,194.61
3,Apeldoorn,191.62
4,Molodetno,189.6
5,Qomsheh,183.63
6,London,174.54
7,Memphis,167.67
8,Richmond Hill,167.62
9,Tanza,166.61


### Insight 3: Revenue of a movie by customer city and by month

In [22]:
%%sql 
SELECT sum(p.amount) as revenue,
    EXTRACT (
        month
        FROM p.payment_date
    ) as month
FROM payment p
group BY month
order by revenue desc
limit 10

Unnamed: 0,revenue,month
0,28559.46,4
1,23886.56,3
2,8351.84,2
3,514.18,5


Each movie by customer city and by month (data cube)

In [23]:
%%sql 
SELECT f.title,
    p.amount,
    p.customer_id,
    ci.city,
    p.payment_date,
    EXTRACT(
        month
        FROM p.payment_date
    ) as month
FROM payment p
    JOIN rental r ON (p.rental_id = r.rental_id)
    JOIN inventory i ON (r.inventory_id = i.inventory_id)
    JOIN film f ON (i.film_id = f.film_id)
    JOIN customer c ON (p.customer_id = c.customer_id)
    JOIN address a ON (c.address_id = a.address_id)
    JOIN city ci ON (a.city_id = ci.city_id)
order by p.payment_date
limit 10

Unnamed: 0,title,amount,customer_id,city,payment_date,month
0,Giant Troopers,2.99,416,Dadu,2007-02-14 21:21:59.996577,2
1,Wash Heavenly,4.99,516,Battambang,2007-02-14 21:23:39.996577,2
2,Name Detective,4.99,239,Ciomas,2007-02-14 21:29:00.996577,2
3,Truman Crazy,6.99,592,Szkesfehrvr,2007-02-14 21:41:12.996577,2
4,Sleuth Orient,0.99,49,Jedda,2007-02-14 21:44:52.996577,2
5,None Spiking,3.99,264,Higashiosaka,2007-02-14 21:44:53.996577,2
6,Maiden Home,4.99,46,Moscow,2007-02-14 21:45:29.996577,2
7,Wagon Jaws,2.99,481,Mwanza,2007-02-14 22:03:35.996577,2
8,Divine Resurrection,2.99,139,Touliu,2007-02-14 22:11:22.996577,2
9,Lost Bird,2.99,595,Jinzhou,2007-02-14 22:16:01.996577,2


In [24]:
%%sql 
SELECT f.title,
    ci.city,
    EXTRACT(
        month
        FROM p.payment_date
    ) as month,
    sum(p.amount) as revenue
FROM payment p
    JOIN rental r ON (p.rental_id = r.rental_id)
    JOIN inventory i ON (r.inventory_id = i.inventory_id)
    JOIN film f ON (i.film_id = f.film_id)
    JOIN customer c ON (p.customer_id = c.customer_id)
    JOIN address a ON (c.address_id = a.address_id)
    JOIN city ci ON (a.city_id = ci.city_id)
group by (f.title, ci.city, month)
order by month,
    revenue desc
limit 10

Unnamed: 0,title,city,month,revenue
0,Innocent Usual,Valparai,2,13.98
1,Stranger Strangers,Czestochowa,2,10.99
2,Flintstones Happiness,Alessandria,2,10.99
3,Mine Titans,Plock,2,10.99
4,Doors President,Zhoushan,2,10.99
5,Saturday Lambs,Wroclaw,2,10.99
6,Satisfaction Confidential,Suihua,2,10.99
7,Autumn Crow,Stockport,2,10.99
8,Behavior Runaway,Battambang,2,10.99
9,Telegraph Voyage,Datong,2,10.99
