In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
from sqlalchemy import create_engine, text, inspect

Load in the CSV data file to convert it to Postgres Table

In [2]:
## run --name is for new containers
# docker run --name my-postgres \
#   -e POSTGRES_USER=postgresuser \
#   -e POSTGRES_PASSWORD=postgressql \
#   -e POSTGRES_DB=movies \
#   -p 5432:5432 \
#   -d postgres:14

In [2]:
# file path
csv_path = 'data/actor-films.csv'

df = pd.read_csv(csv_path)
print(f"Loaded {len(df)} rows from CSV.")

Loaded 169770 rows from CSV.


In [None]:
def query_sql(sql_engine, sql_statement):
     """Executes standard sql query with sql alchemy engine and sql command"""
     return pd.read_sql_query(sql_statement, sql_engine)

def execute_sql(sql_engine, sql_statement):
    """Executes sql action with sql alchemy engine and sql command"""
    with sql_engine.connect() as conn:
        result = conn.execute(text(sql_statement))
        if conn.commit():
            print("Executions Succesful")

def inspect_database(sql_engine):
    """Inspects and prints active table names"""
    database_inspector = inspect(sql_engine)
    tables = database_inspector.get_table_names()
    print("Tables in the database:", tables)

In [4]:
# Load .env file and get username and password
load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

In [5]:
# Postgres connection details
username = DB_USER
password = DB_PASSWORD
host = DB_HOST
port = DB_PORT
database = DB_NAME

# Create the SQLalchemy engine
engine = create_engine(f'postgresql://{username}:{password}@{host}:{port}/{database}')

# Test connection
with engine.connect() as conn:
    result = conn.execute(text("SELECT version();"))
    version = result.fetchone()
    print("Connected to:", version[0])

Connected to: PostgreSQL 14.15 (Debian 14.15-1.pgdg120+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14) 12.2.0, 64-bit


In [6]:
inspect_database(engine)

Tables in the database: ['actor_films', 'actors']


In [None]:
# Drop table and types if they already exist
drop_data="""
DROP TABLE IF EXISTS actor_films;
DROP TABLE IF EXISTS actors;
DROP TYPE IF EXISTS films;
DROP TYPE IF EXISTS quality_class;
"""
execute_sql(engine, drop_data)

In [21]:
inspect_database(engine)

Tables in the database: []


In [7]:
# Write DataFrame to Postgres table named 'actor_films'
df.to_sql('actor_films', engine, if_exists='replace', index=False)
print("Data loaded into Postgres successfully!")

Data loaded into Postgres successfully!


In [8]:
# Query back data from Postgres
result = pd.read_sql_query("SELECT * FROM actor_films LIMIT 5;", engine)
result


Unnamed: 0,actor,actorid,film,year,votes,rating,filmid
0,Fred Astaire,nm0000001,Ghost Story,1981,7731,6.3,tt0082449
1,Fred Astaire,nm0000001,The Purple Taxi,1977,533,6.6,tt0076851
2,Fred Astaire,nm0000001,The Amazing Dobermans,1976,369,5.3,tt0074130
3,Fred Astaire,nm0000001,The Towering Inferno,1974,39888,7.0,tt0072308
4,Lauren Bacall,nm0000002,Ernest & Celestine,2012,18793,7.9,tt1816518


In [12]:
query_sql(engine, "SELECT * FROM actor_films WHERE actor = 'Hugh Jackman';")

Unnamed: 0,actor,actorid,film,year,votes,rating,filmid
0,Hugh Jackman,nm0413168,Bad Education,2019,33654,7.1,tt8206668
1,Hugh Jackman,nm0413168,Missing Link,2019,22582,6.7,tt6348138
2,Hugh Jackman,nm0413168,The Front Runner,2018,11768,6.1,tt7074886
3,Hugh Jackman,nm0413168,Logan,2017,658088,8.1,tt3315342
4,Hugh Jackman,nm0413168,The Greatest Showman,2017,243946,7.6,tt1485796
5,Hugh Jackman,nm0413168,X-Men: Apocalypse,2016,398093,6.9,tt3385516
6,Hugh Jackman,nm0413168,Chappie,2015,231315,6.8,tt1823672
7,Hugh Jackman,nm0413168,Me and Earl and the Dying Girl,2015,124443,7.7,tt2582496
8,Hugh Jackman,nm0413168,Eddie the Eagle,2015,86743,7.4,tt1083452
9,Hugh Jackman,nm0413168,Pan,2015,60912,5.7,tt3332064


In [9]:
create_sql = """
CREATE TYPE quality_class AS ENUM ('star', 'good', 'average', 'bad');

CREATE TYPE films AS (
    film TEXT,
    votes INTEGER,
    rating REAL,
    filmid TEXT
);

CREATE TABLE actors (
    actor TEXT,
    actorid TEXT,
    quality_class quality_class,
    is_active BOOL,
    current_year SMALLINT,
    films films[],
    PRIMARY KEY(actorid, current_year)
);
"""
execute_sql(engine, create_sql)
    


In [None]:
# %reload_ext sql
%load_ext sql
%config SqlMagic.style = 'SINGLE_BORDER'
%sql postgresql://postgresuser:postgressql@localhost:5432/movies

In [17]:
%config SqlMagic.style = '_DEPRECATED_PLAIN_COLUMNS'

In [18]:
%%sql

SELECT * FROM actor_films LIMIT 5;

 * postgresql://postgresuser:***@localhost:5432/movies
5 rows affected.


actor,actorid,film,year,votes,rating,filmid
Fred Astaire,nm0000001,Ghost Story,1981,7731,6.3,tt0082449
Fred Astaire,nm0000001,The Purple Taxi,1977,533,6.6,tt0076851
Fred Astaire,nm0000001,The Amazing Dobermans,1976,369,5.3,tt0074130
Fred Astaire,nm0000001,The Towering Inferno,1974,39888,7.0,tt0072308
Lauren Bacall,nm0000002,Ernest & Celestine,2012,18793,7.9,tt1816518


In [16]:
import prettytable

print("Keys in prettytable.__dict__:")
for key in prettytable.__dict__.keys():
    print(key)

print("\nAvailable styles via dir(prettytable):")
# Filter for uppercase attributes that might be styles
for attr in dir(prettytable):
    if attr.isupper() and not attr.startswith('__'):
        print(attr)

print("\nValue of prettytable.DEFAULT:")
try:
    print(prettytable.DEFAULT)
except AttributeError:
    print("prettytable.DEFAULT is not directly accessible as an attribute.")

Keys in prettytable.__dict__:
__name__
__doc__
__package__
__loader__
__spec__
__path__
__file__
__cached__
__builtins__
annotations
Any
_version
__version__
prettytable
_DEPRECATED_ALL
_DEPRECATED_DEFAULT
_DEPRECATED_DOUBLE_BORDER
_DEPRECATED_FRAME
_DEPRECATED_HEADER
_DEPRECATED_MARKDOWN
_DEPRECATED_MSWORD_FRIENDLY
_DEPRECATED_NONE
_DEPRECATED_ORGMODE
_DEPRECATED_PLAIN_COLUMNS
_DEPRECATED_RANDOM
_DEPRECATED_SINGLE_BORDER
HRuleStyle
PrettyTable
RowType
TableHandler
TableStyle
VRuleStyle
_warn_deprecation
from_csv
from_db_cursor
from_html
from_html_one
from_json
from_mediawiki
__all__
__getattr__

Available styles via dir(prettytable):
_DEPRECATED_ALL
_DEPRECATED_DEFAULT
_DEPRECATED_DOUBLE_BORDER
_DEPRECATED_FRAME
_DEPRECATED_HEADER
_DEPRECATED_MARKDOWN
_DEPRECATED_MSWORD_FRIENDLY
_DEPRECATED_NONE
_DEPRECATED_ORGMODE
_DEPRECATED_PLAIN_COLUMNS
_DEPRECATED_RANDOM
_DEPRECATED_SINGLE_BORDER

Value of prettytable.DEFAULT:
10


  print(prettytable.DEFAULT)


In [10]:
plpgsql_code = """
DO $$ 
DECLARE  
    start_year INT := 1969;  
    end_year INT := 2021;  
    current INT;  
BEGIN 
FOR current IN start_year..end_year-1 LOOP 
    INSERT INTO actors
    -- Your full CTE and SELECT block here --
    WITH previous_year AS (
        SELECT * FROM actors
        WHERE current_year = current
    ), 
    current_year AS (
        SELECT actor,
               actorid,
               ARRAY_AGG(
                   ARRAY[ROW(
                       film,
                       votes,
                       rating,
                       filmid
                   )::films] 
               ORDER BY rating DESC
               ) AS films,
               year AS current_year
        FROM actor_films
        WHERE year = current + 1
        GROUP BY actor, actorid, year
    ), 
    combined_years AS (
        SELECT COALESCE(cy.actor, py.actor) AS actor,
               COALESCE(cy.actorid, py.actorid) AS actorid,    
               CASE
                 WHEN cy.current_year IS NOT NULL THEN TRUE
                 ELSE FALSE
               END AS is_active,
               COALESCE(cy.current_year, py.current_year +1) AS current_year,
               CASE 
                 WHEN py.films IS NULL THEN cy.films
                 WHEN cy.current_year IS NOT NULL THEN py.films || cy.films
                 ELSE py.films
               END        
        FROM current_year as cy
        FULL OUTER JOIN previous_year as py
        ON cy.actorid = py.actorid
    )
    SELECT 
        cy.actor,
        cy.actorid,
        CASE 
            WHEN avg_rating > 8 THEN 'star'
            WHEN avg_rating > 7 THEN 'good'
            WHEN avg_rating > 6 THEN 'average'
            ELSE 'bad'
        END::quality_class AS quality_class,
        cy.is_active,
        cy.current_year,
        cy.films
    FROM (
        SELECT 
            actor,
            actorid,
            is_active,
            current_year,
            films,
            (SELECT AVG((f).rating) 
             FROM unnest(films) AS f) AS avg_rating
        FROM combined_years
    ) AS cy;
END LOOP;  
END $$;
"""

execute_sql(engine, plpgsql_code)

print("Actors table populated!")


Actors table populated!


In [14]:
query_sql(engine, "SELECT * FROM actors WHERE actor='Ralph Fiennes'")

Unnamed: 0,actor,actorid,quality_class,is_active,current_year,films
0,Ralph Fiennes,nm0000146,average,True,1992,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)""}}"
1,Ralph Fiennes,nm0000146,good,True,1996,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
2,Ralph Fiennes,nm0000146,good,True,1993,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
3,Ralph Fiennes,nm0000146,good,True,1995,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
4,Ralph Fiennes,nm0000146,good,True,1994,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
5,Ralph Fiennes,nm0000146,average,True,1998,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
6,Ralph Fiennes,nm0000146,good,True,1997,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
7,Ralph Fiennes,nm0000146,average,True,2007,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
8,Ralph Fiennes,nm0000146,average,True,2000,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
9,Ralph Fiennes,nm0000146,average,True,2002,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."


In [20]:
%%sql

CREATE TYPE quality_class AS ENUM ('star', 'good', 'average', 'bad');

CREATE TYPE films AS (
    film TEXT,
    votes INTEGER,
    rating REAL,
    filmid TEXT
);

CREATE TABLE actors (
    actor TEXT,
    actorid TEXT,
    quality_class quality_class,
    is_active BOOL,
    current_year SMALLINT,
    films films[],
    PRIMARY KEY(actorid, current_year)
);

 * postgresql://postgresuser:***@localhost:5432/movies
Done.
Done.
Done.


[]

In [21]:
%%sql

DO $$ 
DECLARE  
    start_year INT := 1969;  
    end_year INT := 2021;  
    current INT;  
BEGIN 
FOR current IN start_year..end_year-1 LOOP 
    INSERT INTO actors
    -- Your full CTE and SELECT block here --
    WITH previous_year AS (
        SELECT * FROM actors
        WHERE current_year = current
    ), 
    current_year AS (
        SELECT actor,
               actorid,
               ARRAY_AGG(
                   ARRAY[ROW(
                       film,
                       votes,
                       rating,
                       filmid
                   )::films] 
               ORDER BY rating DESC
               ) AS films,
               year AS current_year
        FROM actor_films
        WHERE year = current + 1
        GROUP BY actor, actorid, year
    ), 
    combined_years AS (
        SELECT COALESCE(cy.actor, py.actor) AS actor,
               COALESCE(cy.actorid, py.actorid) AS actorid,    
               CASE
                 WHEN cy.current_year IS NOT NULL THEN TRUE
                 ELSE FALSE
               END AS is_active,
               COALESCE(cy.current_year, py.current_year +1) AS current_year,
               CASE 
                 WHEN py.films IS NULL THEN cy.films
                 WHEN cy.current_year IS NOT NULL THEN py.films || cy.films
                 ELSE py.films
               END        
        FROM current_year as cy
        FULL OUTER JOIN previous_year as py
        ON cy.actorid = py.actorid
    )
    SELECT 
        cy.actor,
        cy.actorid,
        CASE 
            WHEN avg_rating > 8 THEN 'star'
            WHEN avg_rating > 7 THEN 'good'
            WHEN avg_rating > 6 THEN 'average'
            ELSE 'bad'
        END::quality_class AS quality_class,
        cy.is_active,
        cy.current_year,
        cy.films
    FROM (
        SELECT 
            actor,
            actorid,
            is_active,
            current_year,
            films,
            (SELECT AVG((f).rating) 
             FROM unnest(films) AS f) AS avg_rating
        FROM combined_years
    ) AS cy;
END LOOP;  
END $$;

 * postgresql://postgresuser:***@localhost:5432/movies
Done.


[]

In [22]:
%%sql

SELECT * FROM actors WHERE actor = 'Hugh Jackman' LIMIT 5;

 * postgresql://postgresuser:***@localhost:5432/movies
5 rows affected.


actor,actorid,quality_class,is_active,current_year,films
Hugh Jackman,nm0413168,average,True,1999,"{{""(\""Paperback Hero\"",2041,6.5,tt0180037)""},{""(\""Erskineville Kings\"",497,6.5,tt0212936)""}}"
Hugh Jackman,nm0413168,average,True,2000,"{{""(\""Paperback Hero\"",2041,6.5,tt0180037)""},{""(\""Erskineville Kings\"",497,6.5,tt0212936)""},{""(X-Men,575186,7.4,tt0120903)""}}"
Hugh Jackman,nm0413168,average,True,2001,"{{""(\""Paperback Hero\"",2041,6.5,tt0180037)""},{""(\""Erskineville Kings\"",497,6.5,tt0212936)""},{""(X-Men,575186,7.4,tt0120903)""},{""(Swordfish,176924,6.5,tt0244244)""},{""(\""Kate & Leopold\"",79430,6.4,tt0035423)""},{""(\""Someone Like You...\"",23635,6.1,tt0244970)""}}"
Hugh Jackman,nm0413168,average,False,2002,"{{""(\""Paperback Hero\"",2041,6.5,tt0180037)""},{""(\""Erskineville Kings\"",497,6.5,tt0212936)""},{""(X-Men,575186,7.4,tt0120903)""},{""(Swordfish,176924,6.5,tt0244244)""},{""(\""Kate & Leopold\"",79430,6.4,tt0035423)""},{""(\""Someone Like You...\"",23635,6.1,tt0244970)""}}"
Hugh Jackman,nm0413168,average,True,2003,"{{""(\""Paperback Hero\"",2041,6.5,tt0180037)""},{""(\""Erskineville Kings\"",497,6.5,tt0212936)""},{""(X-Men,575186,7.4,tt0120903)""},{""(Swordfish,176924,6.5,tt0244244)""},{""(\""Kate & Leopold\"",79430,6.4,tt0035423)""},{""(\""Someone Like You...\"",23635,6.1,tt0244970)""},{""(\""X-Men 2\"",515580,7.4,tt0290334)""}}"


In [46]:
%%sql
WITH get_actor AS (
    SELECT actor, quality_class, is_active,
       UNNEST(films) AS movies
    FROM actors
    WHERE actor = 'Hugh Jackman' AND current_year = 2012
)

SELECT actor, quality_class, is_active, 
     (movies::films).film as film,
     (movies::films).rating as rating

FROM get_actor;

 * postgresql://postgresuser:***@localhost:5432/movies
23 rows affected.


actor,quality_class,is_active,film,rating
Hugh Jackman,average,True,Paperback Hero,6.5
Hugh Jackman,average,True,Erskineville Kings,6.5
Hugh Jackman,average,True,X-Men,7.4
Hugh Jackman,average,True,Swordfish,6.5
Hugh Jackman,average,True,Kate & Leopold,6.4
Hugh Jackman,average,True,Someone Like You...,6.1
Hugh Jackman,average,True,X-Men 2,7.4
Hugh Jackman,average,True,Van Helsing,6.1
Hugh Jackman,average,True,The Prestige,8.5
Hugh Jackman,average,True,The Fountain,7.2


In [13]:
query_sql(engine, "SELECT actor, quality_class, is_active, UNNEST(films) AS movies FROM actors WHERE actor = 'Hugh Jackman' AND current_year = 2012;")

Unnamed: 0,actor,quality_class,is_active,movies
0,Hugh Jackman,average,True,"(""Paperback Hero"",2041,6.5,tt0180037)"
1,Hugh Jackman,average,True,"(""Erskineville Kings"",497,6.5,tt0212936)"
2,Hugh Jackman,average,True,"(X-Men,575186,7.4,tt0120903)"
3,Hugh Jackman,average,True,"(Swordfish,176924,6.5,tt0244244)"
4,Hugh Jackman,average,True,"(""Kate & Leopold"",79430,6.4,tt0035423)"
5,Hugh Jackman,average,True,"(""Someone Like You..."",23635,6.1,tt0244970)"
6,Hugh Jackman,average,True,"(""X-Men 2"",515580,7.4,tt0290334)"
7,Hugh Jackman,average,True,"(""Van Helsing"",246540,6.1,tt0338526)"
8,Hugh Jackman,average,True,"(""The Prestige"",1205150,8.5,tt0482571)"
9,Hugh Jackman,average,True,"(""The Fountain"",229813,7.2,tt0414993)"
