# Sakila EDA - DuckDB + Pandas

In [1]:
import duckdb

con = duckdb.connect("../data/sakila.duckdb")
con.sql("SET search_path='staging';")
describe = con.sql("desc;").df()

In [2]:
describe

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,sakila,staging,_dlt_loads,"[load_id, schema_name, status, inserted_at, sc...","[VARCHAR, VARCHAR, BIGINT, TIMESTAMP WITH TIME...",False
1,sakila,staging,_dlt_pipeline_state,"[version, engine_version, pipeline_name, state...","[BIGINT, BIGINT, VARCHAR, VARCHAR, TIMESTAMP W...",False
2,sakila,staging,_dlt_version,"[version, engine_version, inserted_at, schema_...","[BIGINT, BIGINT, TIMESTAMP WITH TIME ZONE, VAR...",False
3,sakila,staging,actor,"[actor_id, first_name, last_name, last_update,...","[DECIMAL(38,9), VARCHAR, VARCHAR, TIMESTAMP, V...",False
4,sakila,staging,address,"[address_id, address, address2, district, city...","[BIGINT, VARCHAR, VARCHAR, VARCHAR, BIGINT, VA...",False
5,sakila,staging,category,"[category_id, name, last_update, _dlt_load_id,...","[BIGINT, VARCHAR, TIMESTAMP, VARCHAR, VARCHAR]",False
6,sakila,staging,city,"[city_id, city, country_id, last_update, _dlt_...","[BIGINT, VARCHAR, BIGINT, TIMESTAMP, VARCHAR, ...",False
7,sakila,staging,country,"[country_id, country, last_update, _dlt_load_i...","[BIGINT, VARCHAR, TIMESTAMP, VARCHAR, VARCHAR]",False
8,sakila,staging,customer,"[customer_id, store_id, first_name, last_name,...","[BIGINT, BIGINT, VARCHAR, VARCHAR, VARCHAR, BI...",False
9,sakila,staging,film,"[film_id, title, description, release_year, la...","[BIGINT, VARCHAR, VARCHAR, VARCHAR, BIGINT, BI...",False


### a) Which movies are longer than three hours? (Showing title and length)

In [3]:
con.sql("""--sql
SELECT
    title,
    length
FROM film
WHERE length > 180
ORDER BY length DESC;
""").df()

Unnamed: 0,title,length
0,WORST BANGER,185
1,CHICAGO NORTH,185
2,CONTROL ANTHEM,185
3,DARN FORRESTER,185
4,SWEET BROTHERHOOD,185
5,GANGS PRIDE,185
6,HOME PITY,185
7,SOLDIERS EVOLUTION,185
8,POND SEATTLE,185
9,MUSCLE BRIGHT,185


### b) Which movies have the word "love" in its title? (Showing title, rating, length, description)

In [11]:
con.sql("""--sql
SELECT
	title,
    rating,
    length,
    description
FROM
    film
WHERE
    title  ILIKE '%love%';
""").df()

Unnamed: 0,title,rating,length,description
0,GRAFFITI LOVE,PG,117,A Unbelieveable Epistle of a Sumo Wrestler And...
1,IDAHO LOVE,PG-13,172,A Fast-Paced Drama of a Student And a Crocodil...
2,IDENTITY LOVER,PG-13,119,A Boring Tale of a Composer And a Mad Cow who ...
3,INDIAN LOVE,NC-17,135,A Insightful Saga of a Mad Scientist And a Mad...
4,LAWRENCE LOVE,NC-17,175,A Fanciful Yarn of a Database Administrator An...
5,LOVE SUICIDES,R,181,A Brilliant Panorama of a Hunter And a Explore...
6,LOVELY JINGLE,PG,65,A Fanciful Yarn of a Crocodile And a Forensic ...
7,LOVER TRUMAN,G,75,A Emotional Yarn of a Robot And a Boy who must...
8,LOVERBOY ATTACKS,PG-13,162,A Boring Story of a Car And a Butler who must ...
9,STRANGELOVE DESIRE,NC-17,103,A Awe-Inspiring Panorama of a Lumberjack And a...


### c) Longest, shortest, median and average length movies

In [24]:
con.sql("""--sql
    SELECT
        MIN(length) || ' ' || 'minutes' as shortest_movie,
        MAX(length) || ' ' || 'minutes'  as longest_movie,
        AVG(length)::INT || ' ' || 'minutes'  as average_movie_length,
        MEDIAN(length)::INT  || ' ' || 'minutes' as median_movie_length
    FROM
        film
""").df()

Unnamed: 0,shortest_movie,longest_movie,average_movie_length,median_movie_length
0,46 minutes,185 minutes,115 minutes,114 minutes


### d) 10 most expensive movie to rent per day

In [40]:
rent_film = con.sql("""--sql
    SELECT
        title,
        release_year,
        description,
        (rental_rate / rental_duration) AS rent_per_day
    FROM film
    ORDER BY rent_per_day DESC
""").df()

rent_film.head(10)

Unnamed: 0,title,release_year,description,rent_per_day
0,PATHS CONTROL,2006,A Astounding Documentary of a Butler And a Cat...,1.663333
1,BACKLASH UNDEFEATED,2006,A Stunning Character Study of a Mad Scientist ...,1.663333
2,BEHAVIOR RUNAWAY,2006,A Unbelieveable Drama of a Student And a Husba...,1.663333
3,VIRTUAL SPOILERS,2006,A Fateful Tale of a Database Administrator And...,1.663333
4,TEEN APOLLO,2006,A Awe-Inspiring Drama of a Dog And a Man who m...,1.663333
5,TELEGRAPH VOYAGE,2006,A Fateful Yarn of a Husband And a Dog who must...,1.663333
6,BILKO ANONYMOUS,2006,A Emotional Reflection of a Teacher And a Man ...,1.663333
7,HEARTBREAKERS BRIGHT,2006,A Awe-Inspiring Documentary of a A Shark And a...,1.663333
8,CLERKS ANGELS,2006,A Thrilling Display of a Sumo Wrestler And a G...,1.663333
9,TIES HUNGER,2006,A Insightful Saga of a Astronaut And a Explore...,1.663333


### e) Actor who played in most movies

In [52]:
actors_in_movies = con.sql("""--sql
SELECT
	a.first_name || ' ' || a.last_name AS actor_name,
    f.title,
FROM
    actor a
LEFT JOIN film_actor fa ON fa.actor_id = a.actor_id
LEFT JOIN film f ON f.film_id = fa.film_id;
""").df()

actors_in_movies["actor_name"].value_counts().head(5)

actor_name
SUSAN DAVIS       54
GINA DEGENERES    42
WALTER TORN       41
MARY KEITEL       40
MATTHEW CARREY    39
Name: count, dtype: int64