In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
from sqlalchemy import create_engine, text, inspect

Load in the CSV data file to convert it to Postgres Table

In [2]:
# file path
csv_path = 'data/actor-films.csv'

df = pd.read_csv(csv_path)
print(f"Loaded {len(df)} rows from CSV.")

Loaded 169770 rows from CSV.


In [3]:
def query_sql(sql_engine, sql_statement):
     return pd.read_sql_query(sql_statement, sql_engine)

def execute_sql(sql_engine, sql_statement):
    with sql_engine.connect() as conn:
        result = conn.execute(text(sql_statement))
        if conn.commit():
            print("Executions Succesful")

def inspect_database(sql_engine):
    database_inspector = inspect(sql_engine)
    tables = database_inspector.get_table_names()
    print("Tables in the database:", tables)

In [4]:
# Load .env file and get username and password
load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
# host = os.getenv("DB_HOST")
# port = os.getenv("DB_PORT")
# database = os.getenv("DB_NAME")

In [5]:
# Postgres connection details
username = DB_USER
password = DB_PASSWORD
host = 'localhost'
port = 5432
database = 'movies'

# Create the SQLAlchemy engine
engine = create_engine(f'postgresql://{username}:{password}@{host}:{port}/{database}')

# Test connection
with engine.connect() as conn:
    result = conn.execute(text("SELECT version();"))
    version = result.fetchone()
    print("Connected to:", version[0])

Connected to: PostgreSQL 14.15 (Debian 14.15-1.pgdg120+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14) 12.2.0, 64-bit


In [6]:
inspect_database(engine)

Tables in the database: []


In [None]:
# Drop table and types if they already exist
drop_data="""
DROP TABLE IF EXISTS actor_films;
DROP TABLE IF EXISTS actors;
DROP TYPE IF EXISTS films;
DROP TYPE IF EXISTS quality_class;
"""
execute_sql(engine, drop_data)

In [21]:
inspect_database(engine)

Tables in the database: []


In [7]:
# Write DataFrame to Postgres table named 'actor_films'
df.to_sql('actor_films', engine, if_exists='replace', index=False)
print("Data loaded into Postgres successfully!")

Data loaded into Postgres successfully!


In [8]:
# Query back data from Postgres
result = pd.read_sql_query("SELECT * FROM actor_films LIMIT 5;", engine)
result


Unnamed: 0,actor,actorid,film,year,votes,rating,filmid
0,Fred Astaire,nm0000001,Ghost Story,1981,7731,6.3,tt0082449
1,Fred Astaire,nm0000001,The Purple Taxi,1977,533,6.6,tt0076851
2,Fred Astaire,nm0000001,The Amazing Dobermans,1976,369,5.3,tt0074130
3,Fred Astaire,nm0000001,The Towering Inferno,1974,39888,7.0,tt0072308
4,Lauren Bacall,nm0000002,Ernest & Celestine,2012,18793,7.9,tt1816518


In [9]:
create_sql = """
CREATE TYPE quality_class AS ENUM ('star', 'good', 'average', 'bad');

CREATE TYPE films AS (
    film TEXT,
    votes INTEGER,
    rating REAL,
    filmid TEXT
);

CREATE TABLE actors (
    actor TEXT,
    actorid TEXT,
    quality_class quality_class,
    is_active BOOL,
    current_year SMALLINT,
    films films[],
    PRIMARY KEY(actorid, current_year)
);
"""
execute_sql(engine, create_sql)
    


In [None]:
%load_ext sql
%sql postgresql://postgresuser:postgressql@localhost:5432/movies

In [None]:
%%sql

SELECT * FROM actor_films;

In [10]:
plpgsql_code = """
DO $$ 
DECLARE  
    start_year INT := 1969;  
    end_year INT := 2021;  
    current INT;  
BEGIN 
FOR current IN start_year..end_year-1 LOOP 
    INSERT INTO actors
    -- Your full CTE and SELECT block here --
    WITH previous_year AS (
        SELECT * FROM actors
        WHERE current_year = current
    ), 
    current_year AS (
        SELECT actor,
               actorid,
               ARRAY_AGG(
                   ARRAY[ROW(
                       film,
                       votes,
                       rating,
                       filmid
                   )::films] 
               ORDER BY rating DESC
               ) AS films,
               year AS current_year
        FROM actor_films
        WHERE year = current + 1
        GROUP BY actor, actorid, year
    ), 
    combined_years AS (
        SELECT COALESCE(cy.actor, py.actor) AS actor,
               COALESCE(cy.actorid, py.actorid) AS actorid,    
               CASE
                 WHEN cy.current_year IS NOT NULL THEN TRUE
                 ELSE FALSE
               END AS is_active,
               COALESCE(cy.current_year, py.current_year +1) AS current_year,
               CASE 
                 WHEN py.films IS NULL THEN cy.films
                 WHEN cy.current_year IS NOT NULL THEN py.films || cy.films
                 ELSE py.films
               END        
        FROM current_year as cy
        FULL OUTER JOIN previous_year as py
        ON cy.actorid = py.actorid
    )
    SELECT 
        cy.actor,
        cy.actorid,
        CASE 
            WHEN avg_rating > 8 THEN 'star'
            WHEN avg_rating > 7 THEN 'good'
            WHEN avg_rating > 6 THEN 'average'
            ELSE 'bad'
        END::quality_class AS quality_class,
        cy.is_active,
        cy.current_year,
        cy.films
    FROM (
        SELECT 
            actor,
            actorid,
            is_active,
            current_year,
            films,
            (SELECT AVG((f).rating) 
             FROM unnest(films) AS f) AS avg_rating
        FROM combined_years
    ) AS cy;
END LOOP;  
END $$;
"""

execute_sql(engine, plpgsql_code)

print("Actors table populated!")


Actors table populated!


In [14]:
query_sql(engine, "SELECT * FROM actors WHERE actor='Ralph Fiennes'")

Unnamed: 0,actor,actorid,quality_class,is_active,current_year,films
0,Ralph Fiennes,nm0000146,average,True,1992,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)""}}"
1,Ralph Fiennes,nm0000146,good,True,1996,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
2,Ralph Fiennes,nm0000146,good,True,1993,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
3,Ralph Fiennes,nm0000146,good,True,1995,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
4,Ralph Fiennes,nm0000146,good,True,1994,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
5,Ralph Fiennes,nm0000146,average,True,1998,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
6,Ralph Fiennes,nm0000146,good,True,1997,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
7,Ralph Fiennes,nm0000146,average,True,2007,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
8,Ralph Fiennes,nm0000146,average,True,2000,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
9,Ralph Fiennes,nm0000146,average,True,2002,"{{""(\""Wuthering Heights\"",12384,6.8,tt0104181)..."
