In [None]:
#================================================
# DATA
#================================================

import duckdb, pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

# Creates the database file, or connects to it if it already exists:
con = duckdb.connect(database="movielens100K.duckdb")

In [None]:
# Folder that holds the IMDb TSV dumps (relative path: ../data/Imdb).
IMDB_DIR = Path("..", "data", "Imdb")
IMDB_DIR

### Análise ficheiro 1: "name.basics.tsv"

In [None]:
# Load the IMDb "name.basics.tsv" dump into the imdb_name_basics table.
# IMDb files are plain TSV: fields are never quoted and missing values are
# written as \N. Quoting is therefore disabled and \N is mapped to NULL;
# otherwise \N would be read as literal text instead of a missing value.
con.sql(f"""
CREATE OR REPLACE TABLE imdb_name_basics AS
SELECT *
FROM read_csv_auto(
    '{IMDB_DIR / "name.basics.tsv"}',
    delim='\t', header=TRUE, quote='', nullstr='\\N'
)
""")


In [None]:
# Show the column names and types of imdb_name_basics.
schema_sql = "PRAGMA table_info('imdb_name_basics')"
con.sql(schema_sql).df()

In [None]:
# Preview 10 rows of imdb_name_basics.
preview_sql = "SELECT * FROM imdb_name_basics LIMIT 10"
con.sql(preview_sql).df()

### Análise ficheiro 2: "title.akas.tsv"

In [None]:
# Load the IMDb "title.akas.tsv" dump into the imdb_title_akas table.
# IMDb files are plain TSV: fields are never quoted and missing values are
# written as \N. Quoting is therefore disabled and \N is mapped to NULL;
# otherwise \N would be read as literal text instead of a missing value.
con.sql(f"""
CREATE OR REPLACE TABLE imdb_title_akas AS
SELECT *
FROM read_csv_auto(
    '{IMDB_DIR / "title.akas.tsv"}',
    delim='\t', header=TRUE, quote='', nullstr='\\N'
)
""")


In [None]:
# Show the column names and types of imdb_title_akas.
schema_sql = "PRAGMA table_info('imdb_title_akas')"
con.sql(schema_sql).df()

In [None]:
# Preview 10 rows of imdb_title_akas.
preview_sql = "SELECT * FROM imdb_title_akas LIMIT 10"
con.sql(preview_sql).df()

### Análise ficheiro 3: "title.basics.tsv"

In [None]:
# Load the IMDb "title.basics.tsv" dump into the imdb_title_basics table.
# IMDb files are plain TSV: fields are never quoted and missing values are
# written as \N. Quoting is therefore disabled and \N is mapped to NULL;
# otherwise \N would be read as literal text instead of a missing value.
con.sql(f"""
CREATE OR REPLACE TABLE imdb_title_basics AS
SELECT *
FROM read_csv_auto(
    '{IMDB_DIR / "title.basics.tsv"}',
    delim='\t', header=TRUE, quote='', nullstr='\\N'
)
""")


In [None]:
# Show the column names and types of imdb_title_basics.
schema_sql = "PRAGMA table_info('imdb_title_basics')"
con.sql(schema_sql).df()

In [None]:
# Preview 10 rows of imdb_title_basics.
preview_sql = "SELECT * FROM imdb_title_basics LIMIT 10"
con.sql(preview_sql).df()

### Análise ficheiro 4: "title.crew.tsv"

In [None]:
# Load the IMDb "title.crew.tsv" dump into the imdb_title_crew table.
# IMDb files are plain TSV: fields are never quoted and missing values are
# written as \N. Quoting is therefore disabled and \N is mapped to NULL;
# otherwise \N would be read as literal text instead of a missing value.
con.sql(f"""
CREATE OR REPLACE TABLE imdb_title_crew AS
SELECT *
FROM read_csv_auto(
    '{IMDB_DIR / "title.crew.tsv"}',
    delim='\t', header=TRUE, quote='', nullstr='\\N'
)
""")

In [None]:
# Show the column names and types of imdb_title_crew.
schema_sql = "PRAGMA table_info('imdb_title_crew')"
con.sql(schema_sql).df()

In [None]:
# Preview 10 rows of imdb_title_crew.
preview_sql = "SELECT * FROM imdb_title_crew LIMIT 10"
con.sql(preview_sql).df()

### Análise ficheiro 5: "title.episode.tsv"

In [None]:
# Load the IMDb "title.episode.tsv" dump into the imdb_title_episode table.
# IMDb files are plain TSV: fields are never quoted and missing values are
# written as \N. Quoting is therefore disabled and \N is mapped to NULL;
# otherwise \N would be read as literal text instead of a missing value.
con.sql(f"""
CREATE OR REPLACE TABLE imdb_title_episode AS
SELECT *
FROM read_csv_auto(
    '{IMDB_DIR / "title.episode.tsv"}',
    delim='\t', header=TRUE, quote='', nullstr='\\N'
)
""")

In [None]:
# Show the column names and types of imdb_title_episode.
schema_sql = "PRAGMA table_info('imdb_title_episode')"
con.sql(schema_sql).df()

In [None]:
# Preview 10 rows of imdb_title_episode.
preview_sql = "SELECT * FROM imdb_title_episode LIMIT 10"
con.sql(preview_sql).df()

### Análise ficheiro 6: "title.principals.tsv"

In [None]:
# Load the IMDb "title.principals.tsv" dump into the imdb_principals table.
# IMDb files are plain TSV: fields are never quoted and missing values are
# written as \N. Quoting is therefore disabled and \N is mapped to NULL;
# otherwise \N would be read as literal text instead of a missing value.
con.sql(f"""
CREATE OR REPLACE TABLE imdb_principals AS
SELECT *
FROM read_csv_auto(
    '{IMDB_DIR / "title.principals.tsv"}',
    delim='\t', header=TRUE, quote='', nullstr='\\N'
)
""")

In [None]:
# Show the column names and types of imdb_principals.
schema_sql = "PRAGMA table_info('imdb_principals')"
con.sql(schema_sql).df()

In [None]:
# Preview up to 500 rows of imdb_principals.
# NOTE(review): every other preview in this notebook uses LIMIT 10 — confirm 500 is intentional.
preview_sql = "SELECT * FROM imdb_principals LIMIT 500"
con.sql(preview_sql).df()

### Análise ficheiro 7: "title.ratings.tsv"

In [None]:
# Load the IMDb "title.ratings.tsv" dump into the imdb_ratings table.
# IMDb files are plain TSV: fields are never quoted and missing values are
# written as \N. Quoting is therefore disabled and \N is mapped to NULL;
# otherwise \N would be read as literal text instead of a missing value.
con.sql(f"""
CREATE OR REPLACE TABLE imdb_ratings AS
SELECT *
FROM read_csv_auto(
    '{IMDB_DIR / "title.ratings.tsv"}',
    delim='\t', header=TRUE, quote='', nullstr='\\N'
)
""")

In [None]:
# Show the column names and types of imdb_ratings.
schema_sql = "PRAGMA table_info('imdb_ratings')"
con.sql(schema_sql).df()

In [None]:
# Preview 10 rows of imdb_ratings.
preview_sql = "SELECT * FROM imdb_ratings LIMIT 10"
con.sql(preview_sql).df()

In [None]:
# View that exposes each row of `links` with its IMDb id rewritten as a
# tconst ('tt' followed by the id, zero-padded to at least 7 digits).
# The `links` table is not created in this notebook; it must already exist
# in the connected database.
#
# DuckDB's lpad() truncates inputs longer than the requested width, so ids
# that already have 7 or more digits are passed through unpadded; otherwise
# an 8-digit id would lose its last digit.
con.sql("""
CREATE OR REPLACE VIEW links_norm AS
SELECT
  movieId,
  'tt' || CASE
    WHEN length(CAST(imdbId AS VARCHAR)) >= 7 THEN CAST(imdbId AS VARCHAR)
    ELSE lpad(CAST(imdbId AS VARCHAR), 7, '0')
  END AS tconst,
  tmdbId
FROM links
""");

In [None]:
# Ad-hoc verification helper: compare one MovieLens movie with its linked IMDb entry.

def check_movie(movie_id: int) -> "pd.DataFrame":
    """Compare a MovieLens movie with the IMDb title it is linked to.

    Looks up ``movie_id`` in ``movies``, follows ``links_norm`` to the IMDb
    ``tconst`` and joins ``imdb_title_basics`` to fetch the IMDb title and year.
    Both joins are LEFT JOINs, so a movie without a link (or without a matching
    IMDb row) is still returned, with NULLs in the IMDb columns.

    Args:
        movie_id: MovieLens ``movieId`` to inspect. It is coerced with ``int()``
            and bound as a query parameter rather than spliced into the SQL.

    Returns:
        A DataFrame with the columns ``movieId``, ``movielens_title``,
        ``imdb_tconst``, ``imdb_url``, ``imdb_title``, ``imdb_year``,
        ``ml_year``, ``same_title`` and ``same_year``. It is empty when
        ``movie_id`` is not present in ``movies``.

    Raises:
        TypeError, ValueError: if ``movie_id`` cannot be converted to ``int``.
    """
    # Raw string: the backslashes belong to the SQL regex patterns.
    query = r"""
        WITH base AS (
          SELECT
            m.movieId,
            m.title                       AS movielens_title,
            ln.tconst                     AS imdb_tconst,
            b.primaryTitle                AS imdb_title,
            -- TRY_CAST yields NULL (instead of an error or a text value) when
            -- startYear is not a number, e.g. a literal \N placeholder.
            TRY_CAST(b.startYear AS BIGINT) AS imdb_year,
            -- Year in the trailing "(YYYY)"; regexp_extract returns '' when the
            -- title has no year, which TRY_CAST turns into NULL.
            TRY_CAST(regexp_extract(m.title, '\((\d{4})\)\s*$', 1) AS INTEGER) AS ml_year,
            lower(trim(regexp_replace(m.title, '\s*\(\d{4}\)\s*$', ''))) AS ml_title_clean
          FROM movies m
          LEFT JOIN links_norm ln USING (movieId)
          LEFT JOIN imdb_title_basics b ON b.tconst = ln.tconst
          WHERE m.movieId = ?
        )
        SELECT
          movieId,
          movielens_title,
          imdb_tconst,
          'https://www.imdb.com/title/' || imdb_tconst || '/' AS imdb_url,
          imdb_title,
          imdb_year,
          ml_year,
          (imdb_title IS NOT NULL AND ml_title_clean = lower(trim(imdb_title))) AS same_title,
          (imdb_year  IS NOT NULL AND ml_year IS NOT NULL AND imdb_year = ml_year) AS same_year
        FROM base
    """
    return con.execute(query, [int(movie_id)]).df()

# Example: inspect movieId = 100
check_movie(100)


#### Fechar a ligação

In [None]:
# Close the DuckDB connection; later cells can no longer query through `con`.
con.close()
# Prints a confirmation message (Portuguese for "Connection closed.").
print("Ligação fechada.")