In [11]:
#================================================
# EXPLORAÇÃO INICIAL DA TABELA "ratings"
#================================================

import duckdb, pandas as pd
from pathlib import Path

# cria ou liga se já existir:
con = duckdb.connect("movielens100K.duckdb")

In [12]:
con.sql("DESCRIBE ratings").df()


Unnamed: 0,column_name,column_type,null,key,default,extra
0,userId,INTEGER,YES,,,
1,movieId,INTEGER,YES,,,
2,rating,DOUBLE,YES,,,
3,timestamp,TIMESTAMP WITH TIME ZONE,YES,,,


#### Comentário

 - "userID": INTEGER;
 - "movieID":  INTEGER;
 - "rating": DOUBLE
 - "timestamp":  Timestamp with time zone;

 - Coluna "null": indica se a coluna pode conter valores nulos (NULL).
 - Neste caso pode.

 - Coluna "key": indica se coluna é chave primária (PRIMARY KEY).
 - Não faz.

 - Coluna "default": indica se mostra o valor por defeito (DEFAULT).
 - Não tem

 - Coluna "extra": mostra informações adicionais sobre a coluna, como: auto_increment ou generated.
 - Neste caso não tem

In [13]:
#Ver o tipo de dados de cada coluna
con.sql("PRAGMA table_info('ratings')").df()

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,userId,INTEGER,False,,False
1,1,movieId,INTEGER,False,,False
2,2,rating,DOUBLE,False,,False
3,3,timestamp,TIMESTAMP WITH TIME ZONE,False,,False


In [14]:
#Ver primeiras 10 linhas
con.sql("SELECT * FROM ratings LIMIT 10;").df()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2000-07-30 19:45:03+01:00
1,1,3,4.0,2000-07-30 19:20:47+01:00
2,1,6,4.0,2000-07-30 19:37:04+01:00
3,1,47,5.0,2000-07-30 20:03:35+01:00
4,1,50,5.0,2000-07-30 19:48:51+01:00
5,1,70,3.0,2000-07-30 19:40:00+01:00
6,1,101,5.0,2000-07-30 19:14:28+01:00
7,1,110,4.0,2000-07-30 19:36:16+01:00
8,1,151,5.0,2000-07-30 20:07:21+01:00
9,1,157,5.0,2000-07-30 20:08:20+01:00


In [15]:
#Contar o número de missing values
con.sql("""
SELECT
    COUNT(*) - COUNT(userId)   AS missing_userId,
    COUNT(*) - COUNT(movieId)  AS missing_movieId,
    COUNT(*) - COUNT(rating)   AS missing_rating,
    COUNT(*) - COUNT(timestamp) AS missing_timestamp
FROM ratings
""").df()


Unnamed: 0,missing_userId,missing_movieId,missing_rating,missing_timestamp
0,0,0,0,0


In [16]:
#Identificação dos valores máximos, mínimos e contagens das avaliações 
con.sql("""
SELECT
    MIN(userId)                  AS min_userId,
    MAX(userId)                  AS max_userId,
    COUNT(DISTINCT userId)       AS total_users,
    MIN(movieId)                 AS min_movieId,
    MAX(movieId)                 AS max_movieId,
    COUNT(DISTINCT movieId)      AS total_movies,
    MIN(rating)                  AS min_rating,
    MAX(rating)                  AS max_rating,
    AVG(rating)                  AS med_rating,
    MIN(timestamp)               AS min_timestamp,
    MAX(timestamp)               AS max_timestamp,
    COUNT(*)                     AS total_ratings
        
FROM ratings
""").df()

Unnamed: 0,min_userId,max_userId,total_users,min_movieId,max_movieId,total_movies,min_rating,max_rating,med_rating,min_timestamp,max_timestamp,total_ratings
0,1,610,610,1,193609,9724,0.5,5.0,3.501557,1996-03-29 19:36:55+01:00,2018-09-24 15:27:30+01:00,100836


#### Comentários
 - Temos 610 userID a comentar.
 - 9724 movies evaluated
 - Rating between 0,5 and 5,0 and mean value of 3,50155
 - O time Stamp mais antigo de uma classificação foi em 29 Marços de 1996 às 19.36.55s
 - Timestamp mais recente em 2018 24 de Setembro às 15:27 e 30 minutos.
 - Temos um total de 100836 ratings (linhas).

In [17]:
#Número de ratings e nota média por utilizador
con.sql("""
SELECT
    userId,
    COUNT(*)              AS total_ratings,
    ROUND(AVG(rating), 2) AS media_rating
FROM ratings
GROUP BY userId
ORDER BY total_ratings DESC, media_rating DESC
""").df()



Unnamed: 0,userId,total_ratings,media_rating
0,414,2698,3.39
1,599,2478,2.64
2,474,2108,3.40
3,448,1864,2.85
4,274,1346,3.24
...,...,...,...
605,257,20,3.20
606,576,20,3.10
607,207,20,2.88
608,431,20,2.73


#### Comentários
 - User 414 foi o que mais classificou com 2698 ratings e um valor médio de 3.39 por rating.
 - Depois temos 6 User com 20 classificações cada um e com uma média entre 1.725 e 5.0

In [18]:
#Classificações médias por filme ordenados da melhor para a pior nota 
con.sql("""
SELECT
    m.title,
    ROUND(AVG(r.rating), 2) AS media_rating,
    COUNT(*)                AS total_ratings
FROM ratings r
JOIN movies m USING (movieId)
GROUP BY m.title
ORDER BY media_rating DESC, total_ratings DESC
""").df()


Unnamed: 0,title,media_rating,total_ratings
0,Lamerica (1994),5.0,2
1,Jonah Who Will Be 25 in the Year 2000 (Jonas q...,5.0,2
2,Enter the Void (2009),5.0,2
3,Lesson Faust (1994),5.0,2
4,Heidi Fleiss: Hollywood Madam (1995),5.0,2
...,...,...,...
9714,Superfast! (2015),0.5,1
9715,Arthur Christmas (2011),0.5,1
9716,Dead of Night (1945),0.5,1
9717,Derailed (2002),0.5,1


In [19]:
#Classificações médias por filme ordenados do mais classificado para o menor
con.sql("""
SELECT
    m.title,
    ROUND(AVG(r.rating), 2) AS media_rating,
    COUNT(*)                AS total_ratings
FROM ratings r
JOIN movies m USING (movieId)
GROUP BY m.title
ORDER BY total_ratings DESC, media_rating DESC
""").df()


Unnamed: 0,title,media_rating,total_ratings
0,Forrest Gump (1994),4.16,329
1,"Shawshank Redemption, The (1994)",4.43,317
2,Pulp Fiction (1994),4.20,307
3,"Silence of the Lambs, The (1991)",4.16,279
4,"Matrix, The (1999)",4.19,278
...,...,...,...
9714,The Emoji Movie (2017),0.50,1
9715,"Haunted House 2, A (2014)",0.50,1
9716,Oblivion 2: Backlash (1996),0.50,1
9717,Iron Man (1931),0.50,1


#### Fechar a ligação

In [20]:
con.close()
print("Ligação fechada.")

Ligação fechada.
