In [1]:
import pandas as pd
import os, sys

sys.path.append(os.path.abspath(".."))

from src.utils import get_athena_connection, read_sql_df
from src.config import DB_ATHENA


# Athena connection
# One connection is opened here and shared by every query in the notebook:
# `run_sql` passes it to `read_sql_df`, and the last code cell closes it.
conn = get_athena_connection()
# Echo the configured database name so the reader can see which dataset is queried.
print(f"Connected to Athena database:'{DB_ATHENA}'")

def run_sql(sql: str) -> pd.DataFrame:
    """Run a SQL statement on Athena (MovieLens 32M) and return the result.

    Thin convenience wrapper around ``read_sql_df`` that always passes the
    notebook-level ``conn``, so individual cells only supply the SQL text.

    Parameters
    ----------
    sql : str
        The SQL statement to execute.

    Returns
    -------
    pd.DataFrame
        The value returned by ``read_sql_df`` for this statement.
    """
    result_df = read_sql_df(sql, conn=conn)
    return result_df

Connected to Athena database:'movielens32m'


In [2]:
# 4) Verification in Athena

# Build the preview query, then run it: 5 rows of the links_parquet table
preview_sql = f"SELECT * FROM {DB_ATHENA}.links_parquet LIMIT 5"
run_sql(preview_sql)

  df = pd.read_sql(sql, conn)


Unnamed: 0,movieid,imdbid,tmdbid
0,91548,1687247,66150
1,91554,154467,278822
2,91556,348572,54157
3,91558,113077,26824
4,91560,1706596,73448


### Initial exploration of the `links` table


 - Good to know beforehand:
 - movieId is an identifier for movies used by <https://movielens.org>. E.g., the movie Toy Story has the link <https://movielens.org/movies/1>.

 - imdbId is an identifier for movies used by <http://www.imdb.com>. E.g., the movie Toy Story has the link <http://www.imdb.com/title/tt0114709/>.

 - tmdbId is an identifier for movies used by <https://www.themoviedb.org>. E.g., the movie Toy Story has the link <https://www.themoviedb.org/movie/862>.

 - the use of the resources listed above is subject to the terms of each provider.


In [4]:
# List the column names of links_parquet. The table is qualified with
# DB_ATHENA, like the other queries in this notebook, so the statement does
# not depend on the connection's default database.
run_sql(f"SHOW COLUMNS FROM {DB_ATHENA}.links_parquet")

Unnamed: 0,field
0,movieid
1,imdbid
2,tmdbid


In [6]:
# Structure of links_parquet: column names, data types and nullability,
# read from information_schema and listed in the table's column order
schema_sql = f"""
    SELECT
        column_name,
        data_type,
        is_nullable
    FROM information_schema.columns
    WHERE table_schema = '{DB_ATHENA}'
      AND table_name = 'links_parquet'
    ORDER BY ordinal_position
"""
run_sql(schema_sql)

  df = pd.read_sql(sql, conn)


Unnamed: 0,column_name,data_type,is_nullable
0,movieid,bigint,YES
1,imdbid,varchar,YES
2,tmdbid,bigint,YES


####  Comment

 - `movieid`: BIGINT
 - `imdbid`: VARCHAR
 - `tmdbid`: BIGINT
 
 - The column `is_nullable` is `YES` for all fields.  
    - This means the table allows null (NULL) values.  
    - Because the data was loaded from external files, there are no enforced NOT NULL constraints.

In [7]:
# Preview the first 10 rows of links_parquet. The table is qualified with
# DB_ATHENA for consistency with the other queries in this notebook.
# There is no ORDER BY, so which 10 rows are returned is not guaranteed.
run_sql(f"SELECT * FROM {DB_ATHENA}.links_parquet LIMIT 10")

  df = pd.read_sql(sql, conn)


Unnamed: 0,movieid,imdbid,tmdbid
0,91548,1687247,66150
1,91554,154467,278822
2,91556,348572,54157
3,91558,113077,26824
4,91560,1706596,73448
5,91562,6886,56801
6,91564,190882,25653
7,91566,208178,104465
8,91571,1372686,101173
9,91573,1742023,103597


In [8]:
# Total number of rows in the links_parquet table
row_count_sql = f"""
    SELECT COUNT(*) AS total_rows
    FROM {DB_ATHENA}.links_parquet
"""
run_sql(row_count_sql)


  df = pd.read_sql(sql, conn)


Unnamed: 0,total_rows
0,87585


In [9]:
# Missing values per column of links_parquet: COUNT(*) counts every row while
# COUNT(col) skips NULLs, so their difference is the number of NULLs in col
missing_sql = f"""
    SELECT
        COUNT(*) - COUNT(movieid) AS missing_movieid,
        COUNT(*) - COUNT(imdbid)  AS missing_imdbid,
        COUNT(*) - COUNT(tmdbid)  AS missing_tmdbid
    FROM {DB_ATHENA}.links_parquet
"""
run_sql(missing_sql)


  df = pd.read_sql(sql, conn)


Unnamed: 0,missing_movieid,missing_imdbid,missing_tmdbid
0,0,0,124


#### Close connection

In [10]:
# Close the Athena connection opened in the first cell (the one `run_sql` uses).
conn.close()
print("Athena's connection closed.")

Athena's connection closed.


### Comparison: 100K (small) vs. 32M
- Both datasets have only a small number of missing values:
    - in both, every MovieLens `movieId` has an IMDb id
    - in both, very few ids from The Movie DB (TMDB) are missing (8 in 100K and 124 in 32M)