In [63]:
#%pip install -q duckdb python-dotenv

import pandas as pd

import os, sys
sys.path.append(os.path.abspath(".."))

from src.utils import get_athena_connection, read_sql_df
from src.config import DB_ATHENA


# Athena connection: opened once here and shared by run_sql for every query in the notebook.
conn = get_athena_connection()
print(f"Connected with Athena base:'{DB_ATHENA}'")

def run_sql(sql: str) -> pd.DataFrame:
    """Run a SQL statement on Athena (MovieLens 1M) and return a pandas DataFrame.

    The statement is forwarded to ``read_sql_df`` together with the
    notebook-level ``conn`` connection.
    """
    result = read_sql_df(sql, conn=conn)
    return result

Connected with Athena base:'movielens1m'


In [56]:
# S3 location string; no other cell visible in this notebook reads it.
BASE = "s3://bdf25-20-movielens/curated"


# 4) Verification, now in Athena.
# A notebook cell only renders its LAST expression, so the table listing has to be
# passed to display() explicitly; otherwise the query runs and its result is dropped.
display(run_sql("SHOW TABLES"))
run_sql("SELECT * FROM movies_parquet LIMIT 5")



  df = pd.read_sql(sql, conn)


Unnamed: 0,movieid,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [57]:
#================================================
# INITIAL EXPLORATION OF THE "movies" TABLE
#================================================


In [58]:
# List the column names of movies_parquet.
run_sql("SHOW COLUMNS FROM movies_parquet")


  df = pd.read_sql(sql, conn)


Unnamed: 0,field
0,movieid
1,title
2,genres


In [59]:
# Column metadata (name, data type, nullability) for movies_parquet, read from
# information_schema.columns for the DB_ATHENA schema and sorted by ordinal_position.
run_sql(f"""
SELECT
    column_name,
    data_type,
    is_nullable
FROM information_schema.columns
WHERE table_schema = '{DB_ATHENA}'
  AND table_name   = 'movies_parquet'
ORDER BY ordinal_position
""")


  df = pd.read_sql(sql, conn)


Unnamed: 0,column_name,data_type,is_nullable
0,movieid,bigint,YES
1,title,varchar,YES
2,genres,varchar,YES


#### Comment
- `movieId`: BIGINT (64-bit integer value)  
- `title`: movie name (VARCHAR, text)  
- `genres`: genre names (VARCHAR, text)

- The column `is_nullable` is `YES` for all fields.  
  - This means the table allows null (NULL) values.  
  - Because the data was loaded from external files, there are no enforced NOT NULL constraints.


In [60]:
# Preview the first 10 rows.
# ORDER BY makes the preview deterministic: a bare LIMIT may return any 10 rows,
# and the notes below compare "the first 10" movies against another dataset.

run_sql("SELECT * FROM movies_parquet ORDER BY movieid LIMIT 10")


  df = pd.read_sql(sql, conn)


Unnamed: 0,movieid,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [61]:
# Count the total number of rows in movies_parquet.
run_sql("SELECT COUNT(*) AS total_movies FROM movies_parquet")




  df = pd.read_sql(sql, conn)


Unnamed: 0,total_movies
0,3883


#### Conclusion

- This table is used to identify the movies present in the current database.  
- It lists the ID of each movie and its associated genre(s).  
- The first 10 movies in the 1M DB are the same as in the 100K DB.
- The table contains a total of 3,883 movies (vs. the 86,537 of the 100K DB).
- Each movie can belong to more than one genre.

In [62]:
# Count missing values per column.
# COUNT(*) counts every row while COUNT(col) skips NULLs, so the difference is the
# number of NULLs in that column (empty strings are not counted as missing).
run_sql("""
SELECT
    COUNT(*) - COUNT(movieId) AS missing_movieId,
    COUNT(*) - COUNT(title)   AS missing_title,
    COUNT(*) - COUNT(genres)  AS missing_genres
FROM movies_parquet
""")


  df = pd.read_sql(sql, conn)


Unnamed: 0,missing_movieId,missing_title,missing_genres
0,0,0,0


#### Conclusion:
 - There are no missing values

In [None]:
# CHECK - number of genres of the first movie (movieId = 1), obtained by splitting
# `genres` on '|' and taking the size of the resulting array.
# NOTE(review): the WHERE clause restricts the result to movieId = 1; a saved output
# that lists every movie is stale, so re-run this cell before sharing the notebook.

run_sql("""
SELECT
    movieId,
    title,
    genres,
    cardinality(split(genres, '|')) AS n_generos
FROM movies_parquet
WHERE movieId = 1
""")

  df = pd.read_sql(sql, conn)


Unnamed: 0,movieId,title,genres,n_generos
0,1,Toy Story (1995),Animation|Children's|Comedy,3
1,2,Jumanji (1995),Adventure|Children's|Fantasy,3
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama,2
4,5,Father of the Bride Part II (1995),Comedy,1
...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,1
3879,3949,Requiem for a Dream (2000),Drama,1
3880,3950,Tigerland (2000),Drama,1
3881,3951,Two Family House (2000),Drama,1


In [75]:
# Distribution of the raw `genres` strings: the 20 most frequent combinations.
# Reads movies_parquet, like every other analysis cell in this notebook, so all
# results refer to the same table.
run_sql("""
SELECT genres, COUNT(*) AS n
FROM movies_parquet
GROUP BY genres
ORDER BY n DESC
LIMIT 20
""")



  df = pd.read_sql(sql, conn)


Unnamed: 0,genres,n
0,Drama,843
1,Comedy,521
2,Horror,178
3,Comedy|Drama,162
4,Comedy|Romance,142
5,Drama|Romance,134
6,Documentary,116
7,Thriller,101
8,Action,65
9,Drama|Thriller,63


In [78]:
# Total number of rows in movies_parquet.
run_sql("SELECT COUNT(*) AS total_filmes FROM movies_parquet")

  df = pd.read_sql(sql, conn)


Unnamed: 0,total_filmes
0,3883


In [79]:
# Total number of rows in the `movies` table (presumably a cross-check against movies_parquet).
run_sql("SELECT COUNT(*) AS total_filmes FROM movies")

  df = pd.read_sql(sql, conn)


Unnamed: 0,total_filmes
0,3883


### Conclusion

It is observed that there may be mixed genres, which can make it difficult to quantify each genre.  
For example, a movie may be labelled `Drama`, `Comedy`, or the combined `Comedy|Drama`, so counting the raw strings does not give the number of movies per individual genre.

In [81]:
# Count how many movies there are per genre, given that the MovieLens `genres` field
# holds several genres per movie, separated by pipes (|).
# split + UNNEST produces one row per (movie, genre) pair before grouping by genre.
run_sql("""
SELECT
    genre,
    COUNT(DISTINCT movieId) AS total_filmes
FROM movies_parquet
CROSS JOIN UNNEST(split(genres, '|')) AS t(genre)
GROUP BY genre
ORDER BY total_filmes DESC
""")

Unnamed: 0,genre,total_filmes
0,Drama,1603
1,Comedy,1200
2,Action,503
3,Thriller,492
4,Romance,471
5,Horror,343
6,Adventure,283
7,Sci-Fi,276
8,Children's,251
9,Crime,211


In [83]:
# Average number of genres per movie (size of the '|'-split genres array, averaged over all rows).
run_sql("""
SELECT
    AVG(cardinality(split(genres, '|'))) AS media_generos_por_filme
FROM movies_parquet
""")


Unnamed: 0,media_generos_por_filme
0,1.65027


In [86]:
# Movies with the largest number of genres: top 10 by genre count, ties broken by title.
run_sql("""
SELECT
    title,
    genres,
    cardinality(split(genres, '|')) AS n_generos
FROM movies_parquet
ORDER BY n_generos DESC, title
LIMIT 10
""")


Unnamed: 0,title,genres,n_generos
0,"Transformers: The Movie, The (1986)",Action|Animation|Children's|Sci-Fi|Thriller|War,6
1,Army of Darkness (1993),Action|Adventure|Comedy|Horror|Sci-Fi,5
2,Diva (1981),Action|Drama|Mystery|Romance|Thriller,5
3,From Dusk Till Dawn (1996),Action|Comedy|Crime|Horror|Thriller,5
4,Heavy Metal (1981),Action|Adventure|Animation|Horror|Sci-Fi,5
5,Hercules (1997),Adventure|Animation|Children's|Comedy|Musical,5
6,"Honey, I Shrunk the Kids (1989)",Adventure|Children's|Comedy|Fantasy|Sci-Fi,5
7,"Kid in King Arthur's Court, A (1995)",Adventure|Children's|Comedy|Fantasy|Romance,5
8,Lady and the Tramp (1955),Animation|Children's|Comedy|Musical|Romance,5
9,"Little Mermaid, The (1989)",Animation|Children's|Comedy|Musical|Romance,5


### Conclusion:

- Among the movies with the highest number of genres, the title “Transformers: The Movie (1986)” stands out with 6 different categories, making it the most multi-genre film in the dataset.

- The majority of the top multi-genre movies contain 5 genres, often combining elements such as Adventure, Animation, Comedy, Fantasy, Romance, Sci-Fi, and Thriller.

- This indicates that while most films in the dataset tend to have one or two categories, there is a notable subset of titles with much richer and more complex genre classifications.

- On average, movies still fall into a relatively small number of categories, but these exceptions highlight the breadth and thematic diversity within the dataset.

- Fun fact: multi-genre movies often blend unexpected combinations—which may explain why some of these titles feel particularly unique or unconventional to viewers!

1. Movies with the widest genre diversity

In the 100k dataset, the movie “Rubber (2010)” is the one with the highest number of genres, reaching 10 different categories.

In the 1M dataset, the most multi-genre movie (“Transformers: The Movie (1986)”) has 6 genres, and most of the top titles range between 5 and 6 genres.
**Conclusion**: the 100k dataset includes movies with a wider spread of genre labels at the extreme end.

2. Distribution of multi-genre movies

The 100k dataset features several films with 7–10 genres, showing a more extreme tail of highly multi-classified titles.

The 1M dataset still includes films with rich classifications, but their maximum is lower (mostly 5–6 genres).
**Conclusion**: Movies in 100k tend to have more genre tags per film at the top of the distribution than those in 1M.

3. Genre labeling consistency

In the 100k dataset, the presence of movies with very large genre combinations (up to 10) suggests less standardization or more permissive tagging.

In the 1M dataset, genre assignments appear more consistent and limited, with fewer extreme outliers and a more stable range of genre counts.
**Conclusion**: The 1M dataset seems to follow a more coherent or controlled tagging scheme, resulting in more balanced genre assignments.

#### Close the connection

In [87]:
# Close the Athena connection that run_sql has been using; later queries need a new connection.
conn.close()
print("Athena's connection closed.")

Athena's connection closed.
