In [1]:
import pandas as pd
import os, sys

sys.path.append(os.path.abspath(".."))

from src.utils import get_athena_connection, read_sql_df
from src.config import DB_ATHENA


# Open the Athena connection once; it is stored in the notebook-level `conn`
# that run_sql passes to every query, and it is closed in the last cell.
conn = get_athena_connection()
print(f"Connected to Athena database:'{DB_ATHENA}'")

def run_sql(sql: str) -> pd.DataFrame:
    """Run one SQL statement on Athena and hand back the rows as a DataFrame.

    Thin convenience wrapper: the statement is forwarded unchanged to
    ``read_sql_df`` together with the notebook-level ``conn`` created above,
    so callers never have to pass the connection themselves.

    Args:
        sql: The SQL text to execute.

    Returns:
        Whatever ``read_sql_df`` returns for the statement (a pandas DataFrame).
    """
    result_df = read_sql_df(sql, conn=conn)
    return result_df

Connected to Athena database:'movielens32m'


In [2]:
# 4) Verification in Athena

# Show all tables in the configured Athena database.
# A notebook cell only renders its LAST expression, so this intermediate result
# has to be passed to display() explicitly; otherwise the query runs on Athena
# and its result is silently thrown away.
display(run_sql(f"SHOW TABLES IN {DB_ATHENA}"))

# Preview the ratings_parquet table (rendered automatically as the cell's last expression)
run_sql(f"SELECT * FROM {DB_ATHENA}.ratings_parquet LIMIT 5")


  df = pd.read_sql(sql, conn)


Unnamed: 0,userid,movieid,rating,timestamp
0,124206,2600,3.0,2000-08-01 05:56:41
1,124206,2605,2.0,2000-08-01 05:44:59
2,124206,2676,2.0,2000-08-01 05:48:35
3,124206,2692,5.0,2000-08-01 05:52:24
4,124206,2707,4.0,2000-08-01 05:39:21


In [3]:
# List the column names of ratings_parquet.
# NOTE(review): the table name is not qualified with DB_ATHENA here, so it is
# presumably resolved against the connection's default database — confirm.
run_sql("SHOW COLUMNS FROM ratings_parquet")

  df = pd.read_sql(sql, conn)


Unnamed: 0,field
0,userid
1,movieid
2,rating
3,timestamp


In [4]:
# Schema of ratings_parquet: column name, data type and nullability, read from
# information_schema for the DB_ATHENA schema and listed in column order.
run_sql(f"""
SELECT
    column_name,
    data_type,
    is_nullable
FROM information_schema.columns
WHERE table_schema = '{DB_ATHENA}'
  AND table_name   = 'ratings_parquet'
ORDER BY ordinal_position
""")

  df = pd.read_sql(sql, conn)


Unnamed: 0,column_name,data_type,is_nullable
0,userid,integer,YES
1,movieid,integer,YES
2,rating,double,YES
3,timestamp,timestamp(3),YES


#### Comment

- `userId`: INTEGER  
- `movieId`: INTEGER  
- `rating`: DOUBLE  
- `timestamp`: TIMESTAMP with millisecond precision (`timestamp(3)`)

- The column `is_nullable` is `YES` for all fields.  
  - This means the table allows null (NULL) values.  
  - Because the data was loaded from external files, there are no enforced NOT NULL constraints.


In [5]:
# Preview 10 rows of ratings_parquet.
# The query has no ORDER BY, so which 10 rows come back is not guaranteed.

run_sql("SELECT * FROM ratings_parquet LIMIT 10")

  df = pd.read_sql(sql, conn)


Unnamed: 0,userid,movieid,rating,timestamp
0,176974,141,3.0,1997-05-25 16:09:46
1,176974,145,3.0,1997-02-26 16:07:57
2,176974,150,3.0,1997-06-17 13:49:27
3,176974,163,3.0,1997-02-26 15:25:04
4,176974,164,3.0,1997-02-26 15:22:44
5,176974,165,1.0,1997-02-26 15:26:44
6,176974,151,3.0,1997-02-26 15:49:11
7,176974,153,3.0,1997-06-17 13:55:33
8,176974,159,3.0,1997-02-26 15:19:28
9,176974,161,3.0,1997-02-26 15:20:26


In [7]:
# Count missing values per column.
# COUNT(*) counts every row while COUNT(column) skips NULLs, so the difference
# is the number of NULLs in that column.
run_sql("""
SELECT
    COUNT(*) - COUNT(userId)    AS missing_userId,
    COUNT(*) - COUNT(movieId)   AS missing_movieId,
    COUNT(*) - COUNT(rating)    AS missing_rating,
    COUNT(*) - COUNT(timestamp) AS missing_timestamp
FROM ratings_parquet
""")


  df = pd.read_sql(sql, conn)


Unnamed: 0,missing_userId,missing_movieId,missing_rating,missing_timestamp
0,0,0,0,0


In [8]:
# One-row summary of ratings_parquet: min/max and distinct counts of user and
# movie IDs, min/max/average rating, first and last timestamp, and total rows.
run_sql("""
SELECT
    MIN(userId)                AS min_userId,
    MAX(userId)                AS max_userId,
    COUNT(DISTINCT userId)     AS total_users,
    MIN(movieId)               AS min_movieId,
    MAX(movieId)               AS max_movieId,
    COUNT(DISTINCT movieId)    AS total_movies,
    MIN(rating)                AS min_rating,
    MAX(rating)                AS max_rating,
    AVG(rating)                AS avg_rating,
    MIN(timestamp)             AS min_timestamp,
    MAX(timestamp)             AS max_timestamp,
    COUNT(*)                   AS total_ratings
FROM ratings_parquet
""")


  df = pd.read_sql(sql, conn)


Unnamed: 0,min_userId,max_userId,total_users,min_movieId,max_movieId,total_movies,min_rating,max_rating,avg_rating,min_timestamp,max_timestamp,total_ratings
0,1,200948,200948,1,292757,84432,0.5,5.0,3.540396,1995-01-09 11:46:44,2023-10-13 02:29:07,32000204


#### Comments

- There are 200,948 unique user IDs providing ratings.  
- A total of 84,432 movies have been evaluated.  
- Ratings range from 0.5 to 5.0, with an average value of 3.540396.  
- The earliest rating timestamp is from January 9, 1995, at 11:46:44.  
- The most recent rating timestamp is from October 13, 2023, at 02:29:07.  
- The dataset contains a total of 32,000,204 ratings (rows).


In [9]:
# Number of ratings and average rating (rounded to 2 decimals) per user.
# Sorted by number of ratings, most active users first; ties are broken by the
# higher average rating. There is no LIMIT, so one row per user is returned.
run_sql("""
SELECT
    userId,
    COUNT(*)              AS total_ratings,
    ROUND(AVG(rating), 2) AS avg_rating
FROM ratings_parquet
GROUP BY userId
ORDER BY total_ratings DESC, avg_rating DESC
""")


  df = pd.read_sql(sql, conn)


Unnamed: 0,userId,total_ratings,avg_rating
0,175325,33332,3.08
1,17035,9577,2.57
2,55653,9178,3.28
3,123465,9044,2.53
4,171795,9016,3.18
...,...,...,...
200943,5333,20,0.50
200944,64896,20,0.50
200945,135854,20,0.50
200946,56314,20,0.50


In [10]:
# Count the users who have exactly one rating: the inner query keeps only the
# userIds whose group has a single row, the outer query counts them.
run_sql("""
SELECT COUNT(*) AS users_with_one_rating
FROM (
    SELECT userId
    FROM ratings_parquet
    GROUP BY userId
    HAVING COUNT(*) = 1
)
""")


  df = pd.read_sql(sql, conn)


Unnamed: 0,users_with_one_rating
0,0


In [11]:
# Total number of rows (ratings) in ratings_parquet, used as a completeness check.
run_sql("SELECT COUNT(*) FROM ratings_parquet")


  df = pd.read_sql(sql, conn)


Unnamed: 0,_col0
0,32000204


In [13]:
# Smallest number of ratings given by any single user:
# count rows per userId, then take the minimum of those counts.
run_sql("""
SELECT MIN(cnt) AS min_ratings_per_user
FROM (
    SELECT COUNT(*) AS cnt
    FROM ratings_parquet
    GROUP BY userId
)
""")



Unnamed: 0,min_ratings_per_user
0,20


In [14]:
# Count the users who have fewer than 20 ratings.
run_sql("""
SELECT COUNT(*) AS users_below_20
FROM (
    SELECT userId, COUNT(*) AS cnt
    FROM ratings_parquet
    GROUP BY userId
)
WHERE cnt < 20
""")


  df = pd.read_sql(sql, conn)


Unnamed: 0,users_below_20
0,0


In [15]:
# Number of distinct users that appear in ratings_parquet.
run_sql("SELECT COUNT(DISTINCT userId) FROM ratings_parquet")


  df = pd.read_sql(sql, conn)


Unnamed: 0,_col0
0,200948


#### Comments
While exploring the data, we noticed something that at first looked unexpected: there is not a single user with fewer than 20 ratings (the minimum number of ratings per user is exactly 20).
We had initially assumed that the official MovieLens datasets — including the smaller 100K version — contain many users who rated only a handful of movies, sometimes just one.

We verified that our table is complete and contains the full 32,000,204 ratings, with no missing or truncated data.
To investigate further, we counted how many users had exactly one rating and how many had fewer than 20 ratings; both counts were zero.

This is consistent with how GroupLens builds its releases: the MovieLens README files state that all selected users had rated at least 20 movies, so low-activity users are excluded before distribution.
The dataset is therefore internally consistent and matches the official release, but it does not include the long tail of low-activity users that exists on the MovieLens platform itself.

In [17]:
# List the column names of movies_parquet.
run_sql("SHOW COLUMNS FROM movies_parquet")

Unnamed: 0,field
0,movieid
1,title
2,genres


In [19]:
# Average rating (rounded to 2 decimals) and number of ratings per movie,
# ordered from best to worst average; ties are broken by the number of ratings.
# The inner JOIN keeps only movies that appear in both ratings_parquet and
# movies_parquet. No minimum number of ratings is required, so movies with very
# few ratings can rank first.
run_sql("""
SELECT
    m.movieid,
    m.title,
    ROUND(AVG(r.rating), 2) AS avg_rating,
    COUNT(*)                AS total_ratings
FROM ratings_parquet AS r
JOIN movies_parquet  AS m
  ON r.movieid = m.movieid
GROUP BY m.movieid, m.title
ORDER BY avg_rating DESC, total_ratings DESC
""")


Unnamed: 0,movieid,title,avg_rating,total_ratings
0,234089,"Love, Kennedy (2017)",5.0,4
1,202936,ReMoved (2013),5.0,3
2,179731,Sound of Christmas (2016),5.0,3
3,200016,The Nagano Tapes (2018),5.0,3
4,165787,Lonesome Dove Church (2014),5.0,3
...,...,...,...,...
84427,248050,Howl (2021),0.5,1
84428,274633,Respect the Jux (2022),0.5,1
84429,163288,Those Three French Girls (1930),0.5,1
84430,248972,June & Kopi (2021),0.5,1


In [20]:
# Average rating (rounded to 2 decimals) and number of ratings per movie,
# ordered from most rated to least rated; ties are broken by the higher average.
# Same aggregation as the previous query, only the sort order differs.
run_sql("""
SELECT
    m.movieid,
    m.title,
    ROUND(AVG(r.rating), 2) AS avg_rating,
    COUNT(*)                AS total_ratings
FROM ratings_parquet r
JOIN movies_parquet  m
  ON r.movieid = m.movieid
GROUP BY m.movieid, m.title
ORDER BY total_ratings DESC, avg_rating DESC
""")

  df = pd.read_sql(sql, conn)


Unnamed: 0,movieid,title,avg_rating,total_ratings
0,318,"Shawshank Redemption, The (1994)",4.40,102929
1,356,Forrest Gump (1994),4.05,100296
2,296,Pulp Fiction (1994),4.20,98409
3,2571,"Matrix, The (1999)",4.16,93808
4,593,"Silence of the Lambs, The (1991)",4.15,90330
...,...,...,...,...
84427,128699,The Virgin and the Gypsy (1970),0.50,1
84428,179807,Generation Zero (2010),0.50,1
84429,287621,The Devil's Sword (1983),0.50,1
84430,134769,Way Down South (1939),0.50,1


In [21]:
# Distribution of the number of ratings per movie (percentiles and summary stats).
# The CTE counts ratings per movieid; the outer query summarises those counts.
# The quartiles come from approx_percentile, so they are approximate values.
# The count is cast to DOUBLE before AVG so the mean is computed as a float.
run_sql("""
WITH counts AS (
    SELECT
        movieid,
        COUNT(*) AS n_ratings
    FROM ratings_parquet
    GROUP BY movieid
)
SELECT
    approx_percentile(n_ratings, 0.25)          AS p25_ratings,
    approx_percentile(n_ratings, 0.50)          AS median_ratings,
    approx_percentile(n_ratings, 0.75)          AS p75_ratings,
    MIN(n_ratings)                              AS min_ratings,
    MAX(n_ratings)                              AS max_ratings,
    AVG(CAST(n_ratings AS DOUBLE))              AS mean_ratings
FROM counts
""")

  df = pd.read_sql(sql, conn)


Unnamed: 0,p25_ratings,median_ratings,p75_ratings,min_ratings,max_ratings,mean_ratings
0,2,5,25,1,102929,379.005638


### Conclusion

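- The number of ratings per movie is heavily right-skewed: 75% of movies have at most 25 ratings (p75) and the median is only 5, while the mean is about 379 because a few very popular movies have up to 102,929 ratings.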

#### Close connection

In [22]:
# Close the shared Athena connection now that every query above has run.
conn.close()
print("Athena's connection closed.")

Athena's connection closed.
