In [10]:
import pandas as pd

import os, sys
sys.path.append(os.path.abspath(".."))

from src.utils import get_athena_connection, read_sql_df
from src.config import DB_ATHENA


# ligação Athena
conn = get_athena_connection()
print(f"Connected with Athena base:'{DB_ATHENA}'")

def run_sql(sql: str) -> pd.DataFrame:
    """
    Executa SQL no Athena (MovieLens 1M) e devolve um DataFrame pandas.
    """
    return read_sql_df(sql, conn=conn)

Connected with Athena base:'movielens1m'


In [None]:
BASE = "s3://bdf25-20-movielens/curated/"


# 4) Verificação, agora em Athena
run_sql("SHOW TABLES")
run_sql("SELECT * FROM ratings_parquet LIMIT 5")


Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5.0,2000-12-31 22:12:40
1,1,661,3.0,2000-12-31 22:35:09
2,1,914,3.0,2000-12-31 22:32:48
3,1,3408,4.0,2000-12-31 22:04:35
4,1,2355,5.0,2001-01-06 23:38:11


In [3]:
run_sql("SHOW COLUMNS FROM ratings_parquet")

  df = pd.read_sql(sql, conn)


Unnamed: 0,field
0,user_id
1,movie_id
2,rating
3,rating_ts


In [4]:
run_sql(f"""
SELECT
    column_name,
    data_type,
    is_nullable
FROM information_schema.columns
WHERE table_schema = '{DB_ATHENA}'
  AND table_name   = 'ratings_parquet'
ORDER BY ordinal_position
""")

  df = pd.read_sql(sql, conn)


Unnamed: 0,column_name,data_type,is_nullable
0,user_id,integer,YES
1,movie_id,integer,YES
2,rating,double,YES
3,rating_ts,timestamp(3),YES


#### Comment

- `userId`: INTEGER  
- `movieId`: INTEGER  
- `rating`: DOUBLE  
- `timestamp`: TIMESTAMP WITH PRECISION OF MILISECONDS 

- The column `is_nullable` is `YES` for all fields.  
  - This means the table allows null (NULL) values.  
  - Because the data was loaded from external files, there are no enforced NOT NULL constraints.


In [5]:
#see first 10 rows

run_sql("SELECT * FROM ratings_parquet LIMIT 10")

  df = pd.read_sql(sql, conn)


Unnamed: 0,user_id,movie_id,rating,rating_ts
0,1,1193,5.0,2000-12-31 22:12:40
1,1,661,3.0,2000-12-31 22:35:09
2,1,914,3.0,2000-12-31 22:32:48
3,1,3408,4.0,2000-12-31 22:04:35
4,1,2355,5.0,2001-01-06 23:38:11
5,1,1197,3.0,2000-12-31 22:37:48
6,1,1287,5.0,2000-12-31 22:33:59
7,1,2804,5.0,2000-12-31 22:11:59
8,1,594,4.0,2000-12-31 22:37:48
9,1,919,4.0,2000-12-31 22:22:48


In [9]:
#Count number of missing values
run_sql("""
SELECT
    COUNT(*) - COUNT(movieId) AS missing_movieId,
    COUNT(*) - COUNT(title)   AS missing_title,
    COUNT(*) - COUNT(genres)  AS missing_genres
FROM ratings_parquet
""")

  df = pd.read_sql(sql, conn)


DatabaseError: Execution failed on sql: 
SELECT
    COUNT(*) - COUNT(movieId) AS missing_movieId,
    COUNT(*) - COUNT(title)   AS missing_title,
    COUNT(*) - COUNT(genres)  AS missing_genres
FROM ratings_parquet

COLUMN_NOT_FOUND: line 2:22: Column 'movieid' cannot be resolved or requester is not authorized to access requested resources
unable to rollback

In [18]:
#Identification of maximum, minimum values and counts of ratings
con.sql("""
SELECT
    MIN(userId)                  AS min_userId,
    MAX(userId)                  AS max_userId,
    COUNT(DISTINCT userId)       AS total_users,
    MIN(movieId)                 AS min_movieId,
    MAX(movieId)                 AS max_movieId,
    COUNT(DISTINCT movieId)      AS total_movies,
    MIN(rating)                  AS min_rating,
    MAX(rating)                  AS max_rating,
    AVG(rating)                  AS med_rating,
    MIN(timestamp)               AS min_timestamp,
    MAX(timestamp)               AS max_timestamp,
    COUNT(*)                     AS total_ratings
        
FROM ratings
""").df()

Unnamed: 0,min_userId,max_userId,total_users,min_movieId,max_movieId,total_movies,min_rating,max_rating,med_rating,min_timestamp,max_timestamp,total_ratings
0,1,330975,330975,1,288983,83239,0.5,5.0,3.54254,1995-01-09 12:46:44+01:00,2023-07-20 09:53:33+01:00,33832162


#### Comments

- There are 330,975 unique user IDs providing ratings.  
- A total of 83,239 movies have been evaluated.  
- Ratings range from 0.5 to 5.0, with an average value of 3.54254.  
- The earliest rating timestamp is from January 9, 1995, at 12:46:44+01:00.  
- The most recent rating timestamp is from July 7, 2023, at 09:53:33+01:00.  
- The dataset contains a total of 33,832,162 ratings (rows).


In [19]:
#Number of ratings and average rating per user
con.sql("""
SELECT
    userId,
    COUNT(*)              AS total_ratings,
    ROUND(AVG(rating), 2) AS media_rating
FROM ratings
GROUP BY userId
ORDER BY total_ratings DESC, media_rating DESC
""").df()



Unnamed: 0,userId,total_ratings,media_rating
0,189614,33332,3.08
1,48766,9554,2.57
2,207216,9178,3.28
3,175998,9016,3.18
4,76618,8919,2.54
...,...,...,...
330970,56881,1,0.50
330971,95533,1,0.50
330972,126444,1,0.50
330973,283799,1,0.50


#### Comentários
 - User 189,614 had rated 33332 movies, with an average rating of 3.08 . This means that if this user watched 1 movie a day, he/she would need 91 years to watch all these movies, thus there's probably a bot or an agencie behind this number. Also to mentioned that this number very far from the second to fourth user with around 9K movies rated.

In [20]:
#average ratings per movie ordered from best to worst rating
con.sql("""
SELECT
    m.title,
    ROUND(AVG(r.rating), 2) AS media_rating,
    COUNT(*)                AS total_ratings
FROM ratings r
JOIN movies m USING (movieId)
GROUP BY m.title
ORDER BY media_rating DESC, total_ratings DESC
""").df()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,title,media_rating,total_ratings
0,The Matrix Revolutions Revisited (2004),5.0,4
1,David Attenborough's Tasmania (2018),5.0,3
2,Sound of Christmas (2016),5.0,3
3,Awaken (2013),5.0,3
4,"Love, Kennedy (2017)",5.0,3
...,...,...,...
83038,The Cop Cam (2016),0.5,1
83039,Chainsaw Maid 2 (2010),0.5,1
83040,Yesterday (2018),0.5,1
83041,Akte Grüninger (2014),0.5,1


In [21]:
#average ratings per movie ordered from most rated to least rated
con.sql("""
SELECT
    m.title,
    ROUND(AVG(r.rating), 2) AS media_rating,
    COUNT(*)                AS total_ratings
FROM ratings r
JOIN movies m USING (movieId)
GROUP BY m.title
ORDER BY total_ratings DESC, media_rating DESC
""").df()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,title,media_rating,total_ratings
0,"Shawshank Redemption, The (1994)",4.42,122296
1,Forrest Gump (1994),4.07,113581
2,Pulp Fiction (1994),4.19,108756
3,"Matrix, The (1999)",4.16,107056
4,"Silence of the Lambs, The (1991)",4.15,101802
...,...,...,...
83038,Zombie Infection (2011),0.50,1
83039,Humanoids from Atlantis (1992),0.50,1
83040,Yesterday (2018),0.50,1
83041,Akte Grüninger (2014),0.50,1


In [25]:
con.sql("""
WITH counts AS (
  SELECT movieId, COUNT(*)::BIGINT AS n_ratings
  FROM ratings
  GROUP BY movieId
)
SELECT
  quantile_cont(n_ratings, 0.25) AS p25_ratings,
  quantile_cont(n_ratings, 0.50) AS median_ratings,
  quantile_cont(n_ratings, 0.75) AS p75_ratings,
  MIN(n_ratings) AS min_ratings,
  MAX(n_ratings) AS max_ratings,
  AVG(n_ratings)::DOUBLE AS mean_ratings
FROM counts
""").df()


Unnamed: 0,p25_ratings,median_ratings,p75_ratings,min_ratings,max_ratings,mean_ratings
0,2.0,5.0,26.0,1,122296,406.446041


### Conclusion

- Most movies have less than 26 ratings (q3), with a median of 5 ratings.

#### Fechar a ligação

In [26]:
con.close()
print("Ligação fechada.")

Ligação fechada.
