In [1]:
import pandas as pd
import os, sys

sys.path.append(os.path.abspath(".."))

from src.utils import get_athena_connection, read_sql_df
from src.config import DB_ATHENA


# Open the Athena connection that run_sql() reuses for every query below
conn = get_athena_connection()
connection_banner = f"Connected to Athena database:'{DB_ATHENA}'"
print(connection_banner)

def run_sql(sql: str) -> pd.DataFrame:
    """Run a SQL statement through the notebook's Athena connection.

    Args:
        sql: The SQL text to execute.

    Returns:
        Whatever ``read_sql_df`` produces for the statement when given the
        module-level ``conn`` (annotated as a pandas DataFrame).
    """
    result = read_sql_df(sql, conn=conn)
    return result

Connected to Athena database:'movielens32m'


In [2]:
# 4) Verification in Athena

# Quick look at a handful of rows from tags_parquet
preview5_sql = f"SELECT * FROM {DB_ATHENA}.tags_parquet LIMIT 5"
run_sql(preview5_sql)

  df = pd.read_sql(sql, conn)


Unnamed: 0,userid,movieid,tag,timestamp
0,78213,50954,school,2018-06-07 05:42:37
1,78213,50954,soul transference,2018-06-07 05:42:37
2,78213,50954,spell,2018-06-07 05:42:37
3,78213,50954,teen comedy,2018-06-07 05:42:37
4,78213,50954,teenager,2018-06-07 05:42:37


In [3]:
# List the column names of tags_parquet. The table is qualified with the
# database name, like the other queries in this notebook, so the statement
# does not depend on the connection's default schema.
run_sql(f"SHOW COLUMNS FROM {DB_ATHENA}.tags_parquet")

  df = pd.read_sql(sql, conn)


Unnamed: 0,field
0,userid
1,movieid
2,tag
3,timestamp


In [4]:
# Column names, data types and nullability of tags_parquet, read from
# information_schema and listed in the table's column order
schema_sql = f"""
    SELECT
        column_name,
        data_type,
        is_nullable
    FROM information_schema.columns
    WHERE table_schema = '{DB_ATHENA}'
      AND table_name = 'tags_parquet'
    ORDER BY ordinal_position
"""
run_sql(schema_sql)

  df = pd.read_sql(sql, conn)


Unnamed: 0,column_name,data_type,is_nullable
0,userid,integer,YES
1,movieid,integer,YES
2,tag,varchar,YES
3,timestamp,timestamp(3),YES


### Comment

 - `userid`: INTEGER
 - `movieid`: INTEGER
 - `tag`: VARCHAR
 - `timestamp`: TIMESTAMP(3), i.e. a timestamp with millisecond precision

- The column `is_nullable` is `YES` for all fields.  
  - This means every column in the table is allowed to contain NULL values.  
  - Because the data was loaded from external files, no NOT NULL constraints are enforced.


In [5]:
# Show ten sample rows from tags_parquet
preview10_sql = f"""
    SELECT *
    FROM {DB_ATHENA}.tags_parquet
    LIMIT 10
"""
run_sql(preview10_sql)

  df = pd.read_sql(sql, conn)


Unnamed: 0,userid,movieid,tag,timestamp
0,78213,50954,school,2018-06-07 05:42:37
1,78213,50954,soul transference,2018-06-07 05:42:37
2,78213,50954,spell,2018-06-07 05:42:37
3,78213,50954,teen comedy,2018-06-07 05:42:37
4,78213,50954,teenager,2018-06-07 05:42:37
5,78213,50970,lgbt,2018-05-26 22:26:05
6,78213,50977,pimp,2018-05-25 03:57:32
7,78213,50977,prostitute,2018-05-25 03:57:32
8,78213,50977,vice,2018-05-25 03:57:32
9,78213,51004,fear,2018-06-09 23:00:24


In [6]:
# Total row count of tags_parquet
row_count_sql = f"""
    SELECT COUNT(*) AS total_rows
    FROM {DB_ATHENA}.tags_parquet
"""
run_sql(row_count_sql)


  df = pd.read_sql(sql, conn)


Unnamed: 0,total_rows
0,2000072


In [7]:
# Per-column missing-value counts for tags_parquet.
# COUNT(*) counts every row while COUNT(col) skips NULLs, so the difference
# is the number of NULLs in that column.
missing_values_sql = f"""
    SELECT
        COUNT(*) - COUNT(userid)    AS missing_userid,
        COUNT(*) - COUNT(movieid)   AS missing_movieid,
        COUNT(*) - COUNT(tag)       AS missing_tag,
        COUNT(*) - COUNT(timestamp) AS missing_timestamp
    FROM {DB_ATHENA}.tags_parquet
"""
run_sql(missing_values_sql)


  df = pd.read_sql(sql, conn)


Unnamed: 0,missing_userid,missing_movieid,missing_tag,missing_timestamp
0,0,0,0,0


#### Close connection

In [8]:
# Release the Athena connection now that all verification queries have run
conn.close()
closed_notice = "Athena's connection closed."
print(closed_notice)

Athena's connection closed.
