In [1]:
import pandas as pd
import os, sys

sys.path.append(os.path.abspath(".."))

from src.utils import get_athena_connection, read_sql_df
from src.config import DB_ATHENA


# Athena connection: notebook-level handle that run_sql() (defined just below)
# reads as a global. It is closed explicitly by conn.close() in the final cell.
conn = get_athena_connection()
# Printed after get_athena_connection() returns; DB_ATHENA comes from src.config.
print(f"Connected to Athena database:'{DB_ATHENA}'")

def run_sql(sql: str) -> pd.DataFrame:
    """
    Run one SQL statement on Athena (MovieLens 32M) and return the result.

    Delegates to ``read_sql_df`` from ``src.utils``, passing the notebook-level
    ``conn`` object as the connection.

    Parameters
    ----------
    sql : str
        The SQL text to execute.

    Returns
    -------
    pd.DataFrame
        The DataFrame produced by ``read_sql_df`` for this statement.
    """
    result = read_sql_df(sql, conn=conn)
    return result

Connected to Athena database:'movielens32m'


In [2]:
# 4) Verification in Athena

# Show all tables in movielens32m.
# A notebook cell only renders its LAST expression, so this intermediate result
# must be passed to display() explicitly; as a bare expression it would be
# computed (one Athena query) and then silently thrown away.
# `display` is the IPython built-in available in every Jupyter kernel.
display(run_sql(f"SHOW TABLES IN {DB_ATHENA}"))

# Preview the movies_parquet table (rendered automatically as the cell's last expression)
run_sql(f"SELECT * FROM {DB_ATHENA}.movies_parquet LIMIT 5")

  df = pd.read_sql(sql, conn)


Unnamed: 0,movieid,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
#================================================
# INITIAL EXPLORATION OF THE "movies" TABLE
#================================================


In [3]:
# List the columns of the movies table
columns_sql = f"SHOW COLUMNS FROM {DB_ATHENA}.movies_parquet"
run_sql(columns_sql)

  df = pd.read_sql(sql, conn)


Unnamed: 0,field
0,movieid
1,title
2,genres


In [4]:
# Column name, data type and nullability from information_schema,
# returned in the table's own column order (ordinal_position).
schema_sql = f"""
SELECT
    column_name,
    data_type,
    is_nullable
FROM information_schema.columns
WHERE table_schema = '{DB_ATHENA}'
  AND table_name   = 'movies_parquet'
ORDER BY ordinal_position
"""
run_sql(schema_sql)

  df = pd.read_sql(sql, conn)


Unnamed: 0,column_name,data_type,is_nullable
0,movieid,bigint,YES
1,title,varchar,YES
2,genres,varchar,YES


#### Comment
- `movieId`: BIGINT (64-bit integer value)  
- `title`: movie name (VARCHAR, text)  
- `genres`: genre names (VARCHAR, text)

- The column `is_nullable` is `YES` for all fields.  
  - This means the table allows null (NULL) values.  
  - Because the data was loaded from external files, there are no enforced NOT NULL constraints.


In [5]:
# Peek at 10 rows of the movies table
preview_sql = f"SELECT * FROM {DB_ATHENA}.movies_parquet LIMIT 10"
run_sql(preview_sql)

  df = pd.read_sql(sql, conn)


Unnamed: 0,movieid,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [6]:
# Total number of rows (one row per movie) in the table
total_movies_sql = f"SELECT COUNT(*) AS total_movies FROM {DB_ATHENA}.movies_parquet"
run_sql(total_movies_sql)

  df = pd.read_sql(sql, conn)


Unnamed: 0,total_movies
0,87585


#### Conclusion

- This table is used to identify the movies present in the current database.  
- It lists the ID of each movie and its associated genre(s).  
- The first 10 rows of the 32M DB are the same as in the 100K DB.
- The table contains a total of 87,585 movies (vs. 86,537 in the 100K DB).
- Each movie can belong to more than one genre.

In [7]:
# NULL count per column: COUNT(*) counts every row, COUNT(col) skips NULLs,
# so the difference is the number of NULL entries in that column.
missing_values_sql = f"""
SELECT
    COUNT(*) - COUNT(movieId) AS missing_movieId,
    COUNT(*) - COUNT(title)   AS missing_title,
    COUNT(*) - COUNT(genres)  AS missing_genres
FROM {DB_ATHENA}.movies_parquet
"""
run_sql(missing_values_sql)

  df = pd.read_sql(sql, conn)


Unnamed: 0,missing_movieId,missing_title,missing_genres
0,0,0,0


#### Conclusion:
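 - There are no NULL values in any column. Note that a missing genre is encoded with the placeholder string `(no genres listed)` rather than NULL, so such rows are not counted as missing here.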

In [8]:
# VALIDATION - split the pipe-delimited genres of movieId = 1 and count the parts
first_movie_sql = f"""
SELECT
    movieId,
    title,
    genres,
    cardinality(split(genres, '|')) AS n_genres
FROM {DB_ATHENA}.movies_parquet
WHERE movieId = 1
"""
run_sql(first_movie_sql)

  df = pd.read_sql(sql, conn)


Unnamed: 0,movieId,title,genres,n_genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5


In [9]:
# 20 most frequent raw `genres` strings (each distinct combination is one group)
genre_combo_sql = f"""
SELECT genres, COUNT(*) AS n
FROM {DB_ATHENA}.movies_parquet
GROUP BY genres
ORDER BY n DESC
LIMIT 20
"""
run_sql(genre_combo_sql)

  df = pd.read_sql(sql, conn)


Unnamed: 0,genres,n
0,Drama,12443
1,Documentary,8132
2,Comedy,7761
3,(no genres listed),7080
4,Comedy|Drama,3245
5,Drama|Romance,2825
6,Horror,2487
7,Comedy|Romance,2229
8,Thriller,1410
9,Comedy|Drama|Romance,1335


### Conclusion

It is observed that many movies carry combined genres, which makes it difficult to quantify each genre individually.  
For example, `Drama`, `Comedy`, and `Comedy|Drama` are treated as three separate values, which complicates counting and analysis.  
The values are very similar to those of the 100K DB, practically the same.

In [10]:
# Movies per individual genre: explode the pipe-delimited `genres` string into
# one row per genre (UNNEST), then count distinct movies within each genre.
per_genre_sql = f"""
SELECT
    genre,
    COUNT(DISTINCT movieId) AS total_movies
FROM {DB_ATHENA}.movies_parquet
CROSS JOIN UNNEST(split(genres, '|')) AS g(genre)
GROUP BY genre
ORDER BY total_movies DESC
"""
run_sql(per_genre_sql)

  df = pd.read_sql(sql, conn)


Unnamed: 0,genre,total_movies
0,Drama,34175
1,Comedy,23124
2,Thriller,11823
3,Romance,10369
4,Action,9668
5,Documentary,9363
6,Horror,8654
7,(no genres listed),7080
8,Crime,6976
9,Adventure,5402


In [11]:
# Average number of genre tags per movie.
# NOTE(review): rows whose genres value is the placeholder '(no genres listed)'
# contain no '|' and therefore contribute 1 to this average, not 0.
avg_genres_sql = f"""
SELECT
    AVG(cardinality(split(genres, '|'))) AS avg_genres_per_movie
FROM {DB_ATHENA}.movies_parquet
"""
run_sql(avg_genres_sql)


  df = pd.read_sql(sql, conn)


Unnamed: 0,avg_genres_per_movie
0,1.760233


In [12]:
# Top 10 movies by number of genre tags; ties are broken alphabetically by title
most_genres_sql = f"""
SELECT
    title,
    genres,
    cardinality(split(genres, '|')) AS n_genres
FROM {DB_ATHENA}.movies_parquet
ORDER BY n_genres DESC, title
LIMIT 10
"""
run_sql(most_genres_sql)

  df = pd.read_sql(sql, conn)


Unnamed: 0,title,genres,n_genres
0,Rubber (2010),Action|Adventure|Comedy|Crime|Drama|Film-Noir|...,10
1,Motorama (1991),Adventure|Comedy|Crime|Drama|Fantasy|Mystery|S...,8
2,"Wonderful World of the Brothers Grimm, The (1962)",Adventure|Animation|Children|Comedy|Drama|Fant...,8
3,2012: An Awakening (2009),Action|Documentary|Drama|Horror|Mystery|Sci-Fi...,7
4,Aelita: The Queen of Mars (Aelita) (1924),Action|Adventure|Drama|Fantasy|Romance|Sci-Fi|...,7
5,Aqua Teen Hunger Force Colon Movie Film for Th...,Action|Adventure|Animation|Comedy|Fantasy|Myst...,7
6,Black Butler (2014),Action|Adventure|Crime|Drama|Fantasy|Horror|My...,7
7,Calcutta (1947),Action|Crime|Drama|Film-Noir|Mystery|Romance|T...,7
8,Creators: The Past (2020),Action|Adventure|Drama|Fantasy|Mystery|Romance...,7
9,Enchanted (2007),Adventure|Animation|Children|Comedy|Fantasy|Mu...,7


### Conclusion:
The MovieLens 32M dataset shows patterns very similar to the smaller 100k version, but with clearer and more stable distributions due to its larger size.

First, the structure and content of the movies table are consistent between both datasets: same columns, same genre format, and no missing values in the core fields.

Second, the average number of genres per movie remains low in both datasets, with most titles having one or two genres, and only a small set combining three or more.

Finally, when ranking movies by number of genres, both datasets show similar behavior: a few movies accumulate many genre tags, but the vast majority fall into simple, common combinations such as Drama, Comedy, or Action.

#### Close the connection

In [13]:
# Close the Athena connection created in the first cell. run_sql() reads this
# same `conn`, so presumably no further queries can run until a new connection
# is opened (re-run the first cell) — TODO confirm against src.utils.
conn.close()
print("Athena's connection closed.")

Athena's connection closed.
