In [None]:
#================================================
# EXPLORING DATA
#================================================

import duckdb, pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

# Open the DuckDB database file (created if missing, otherwise connected to):
con = duckdb.connect("movielens100K.duckdb")

In [None]:
# =================================================================
# Top 20 best-rated movies (with at least 50 ratings)
# =================================================================
# Notes on the query:
#   * HAVING uses >= so movies with exactly 50 ratings are kept
#     ("at least 50").
#   * Grouping by movieId as well as title keeps distinct movies that
#     happen to share a title from being merged into one row.
#   * The trailing m.title sort key makes the LIMIT deterministic when
#     several movies tie on both rounded mean and rating count.
df1 = con.sql("""
SELECT
    m.title,
    ROUND(AVG(r.rating), 2) AS media_rating,
    COUNT(*)                AS total_ratings
FROM ratings r
JOIN movies m USING (movieId)
GROUP BY m.movieId, m.title
HAVING COUNT(*) >= 50
ORDER BY media_rating DESC, total_ratings DESC, m.title
LIMIT 20
""").df()

df1


In [None]:
# Combined plot: bars (mean rating) + line (number of ratings).
# Bars and line are both drawn at explicit integer positions so they always
# line up, even if two rows of df1 carry the same title (a categorical
# x-axis would collapse duplicate titles into a single bar).
fig, ax1 = plt.subplots(figsize=(12, 6))
positions = range(len(df1))

# Bars = mean rating (blue)
ax1.bar(positions, df1["media_rating"], color="skyblue")
ax1.set_ylabel("M√©dia de rating")
ax1.set_xticks(positions)
ax1.set_xticklabels(df1["title"], rotation=45, ha="right")

# Line = number of ratings (red), on a secondary y-axis
ax2 = ax1.twinx()
ax2.plot(positions, df1["total_ratings"], marker="o", color="red", linewidth=2)
ax2.set_ylabel("N¬∫ de ratings")

plt.title("Top 20 filmes por m√©dia (‚â•50 ratings)\nBarras = m√©dia | Linha = n¬∫ ratings")
plt.tight_layout()
plt.show()



In [None]:
# =================================================================
# Top 20 movies by number of ratings
# =================================================================
# Grouping by movieId as well as title prevents distinct movies that share
# a title from having their rating counts added together. The trailing
# m.title sort key makes the LIMIT deterministic when rows tie.

df2 = con.sql("""
SELECT
    m.title,
    COUNT(*) AS total_ratings,
    ROUND(AVG(r.rating), 2) AS media_rating
FROM ratings r
JOIN movies m USING (movieId)
GROUP BY m.movieId, m.title
ORDER BY total_ratings DESC, media_rating DESC, m.title
LIMIT 20
""").df()

df2




In [None]:
# Combined plot: bars (number of ratings) + line (mean rating).
# Bars and line share explicit integer positions so they stay aligned even
# if two rows of df2 carry the same title (a categorical x-axis would
# collapse duplicate titles into a single bar).
fig, ax1 = plt.subplots(figsize=(12,6))
positions = range(len(df2))

# Bars = number of ratings (blue)
ax1.bar(positions, df2["total_ratings"], color="skyblue")
ax1.set_ylabel("N¬∫ de ratings")
ax1.set_xticks(positions)
ax1.set_xticklabels(df2["title"], rotation=45, ha="right")

# Line = mean rating (red), on a secondary y-axis
ax2 = ax1.twinx()
ax2.plot(positions, df2["media_rating"], marker="o", color="red", linewidth=2)
ax2.set_ylabel("M√©dia de rating")

plt.title("Top 20 filmes com mais ratings\nBarras = n¬∫ ratings | Linha = rating")
plt.tight_layout()
plt.show()


In [None]:
# =================================================================
# Best-rated movie per genre (movies with at least 50 ratings)
# =================================================================
# Pipeline:
#   genero_filme        -> one row per (movie, genre); genres are '|'-separated
#   stats_genero_filme  -> rating count and mean per (genre, movie), >= 50 ratings
#   ranking_por_genero  -> rank movies inside each genre by mean, then count
# Only the first-ranked movie of each genre is returned. movieId is the last
# ranking key so that ties are broken deterministically.
df3 = con.sql("""
WITH genero_filme AS (
    SELECT
        m.movieId,
        m.title,
        unnest(string_split(m.genres, '|')) AS genre
    FROM movies m
),
stats_genero_filme AS (
    SELECT
        gf.genre,
        gf.movieId,
        gf.title,
        COUNT(*) AS total_ratings,
        AVG(r.rating) AS media_rating
    FROM genero_filme gf
    JOIN ratings r
      ON r.movieId = gf.movieId
    GROUP BY gf.genre, gf.movieId, gf.title
    HAVING COUNT(*) >= 50          -- keep only movies with at least 50 ratings
),
ranking_por_genero AS (
    SELECT
        genre,
        movieId,
        title,
        total_ratings,
        media_rating,
        ROW_NUMBER() OVER (
            PARTITION BY genre
            ORDER BY media_rating DESC, total_ratings DESC, movieId
        ) AS posicao
    FROM stats_genero_filme
)
SELECT
    genre,
    title,
    ROUND(media_rating, 2) AS media_rating,
    total_ratings
FROM ranking_por_genero
WHERE posicao = 1
ORDER BY genre
""").df()

df3

In [None]:
# Best movie per genre: bars = mean rating, line = number of ratings.
# Genres are ordered by the rating count of their best movie.
df3_plot = df3.sort_values("total_ratings", ascending=False)

x = np.arange(len(df3_plot))   # 0..N-1

fig, ax1 = plt.subplots(figsize=(18,10))
# Bars = mean rating
ax1.bar(x, df3_plot["media_rating"], color="skyblue")
ax1.set_ylabel("M√©dia de rating")

# Genres as vertical tick labels. These are set on ax1 (the axes whose
# x-axis is actually drawn): calling plt.xticks(..., rotation=90) after
# twinx() would style the twin axes' hidden tick labels instead, leaving
# the visible labels unrotated.
ax1.set_xticks(x)
ax1.set_xticklabels(df3_plot["genre"], rotation=90)

# Line = number of ratings, on a secondary y-axis
ax2 = ax1.twinx()
ax2.plot(x, df3_plot["total_ratings"], marker="o", color="red", linewidth=2)
ax2.set_ylabel("N¬∫ de ratings")

plt.title("Melhor filme por g√©nero (‚â•50 ratings)\nBarras = m√©dia | Linha = n¬∫ ratings")

plt.tight_layout()
plt.show()


In [None]:
# =================================================================
# Most-rated movie per genre (movies with at least 50 ratings)
# =================================================================
# The SQL is kept in a named variable so the cell reads as "define query,
# then run it"; the resulting DataFrame is the cell's displayed value.

most_rated_per_genre_sql = """
WITH genero_filme AS (
    SELECT
        m.movieId,
        m.title,
        unnest(string_split(m.genres, '|')) AS genre
    FROM movies m
),
stats_genero_filme AS (
    SELECT
        gf.genre,
        gf.movieId,
        gf.title,
        COUNT(*) AS total_ratings,
        AVG(r.rating) AS media_rating
    FROM genero_filme gf
    JOIN ratings r
      ON r.movieId = gf.movieId
    GROUP BY gf.genre, gf.movieId, gf.title
    HAVING COUNT(*) >= 50  -- mant√©m s√≥ filmes com pelo menos 50 avalia√ß√µes
),
ranking_por_genero AS (
    SELECT
        genre,
        movieId,
        title,
        total_ratings,
        media_rating,
        ROW_NUMBER() OVER (
            PARTITION BY genre
            ORDER BY total_ratings DESC, media_rating DESC
        ) AS posicao
    FROM stats_genero_filme
)
SELECT
    genre,
--    posicao,
    title,
    total_ratings,
    ROUND(media_rating, 2) AS media_rating
FROM ranking_por_genero
WHERE posicao <= 1
ORDER BY genre, posicao
"""

con.sql(most_rated_per_genre_sql).df()

In [None]:
# ================================================
# Most frequent words used in tags (top 30)
# ================================================
# Each tag is split on spaces, lower-cased and trimmed; empty strings, words
# of 1-2 characters and a small stop-word list are discarded. The stop-word
# list includes 'was' so it matches the per-movie tag query in this notebook.
# `palavra` is a secondary sort key so the LIMIT is deterministic on ties.

con.sql("""
SELECT
    LOWER(TRIM(word)) AS palavra,
    COUNT(*) AS total
FROM (
    SELECT unnest(string_split(tag, ' ')) AS word
    FROM tags
)
WHERE palavra <> ''
  AND LENGTH(palavra) > 2
  AND palavra NOT IN ('the', 'and', 'for', 'with', 'this', 'that', 'are', 'was')
GROUP BY palavra
ORDER BY total DESC, palavra
LIMIT 30
""").df()

In [None]:
# ================================================
# Most repeated tag words per movie
# ================================================
# palavras_por_filme: word counts per movie (tags split on spaces, lower-cased,
#                     trimmed; short words and stop-words removed).
# ranking_por_filme:  ranks the words inside each movie by frequency.
# The final SELECT keeps up to 5 words per movie and returns the 20 most
# frequent (movie, word) pairs overall.
# Extra sort keys (palavra / title) make ROW_NUMBER and LIMIT deterministic
# when several words have the same count.

df_tags=con.sql("""
WITH palavras_por_filme AS (
    SELECT
        t.movieId,
        LOWER(TRIM(word)) AS palavra,
        COUNT(*) AS total
    FROM (
        SELECT movieId, unnest(string_split(tag, ' ')) AS word
        FROM tags
    ) t
    WHERE palavra <> ''
      AND LENGTH(palavra) > 2
      AND palavra NOT IN ('the', 'and', 'for', 'with', 'this', 'that', 'are', 'was')
    GROUP BY t.movieId, palavra
),
ranking_por_filme AS (
    SELECT
        movieId,
        palavra,
        total,
        ROW_NUMBER() OVER (PARTITION BY movieId ORDER BY total DESC, palavra) AS posicao
    FROM palavras_por_filme
)
SELECT
    m.movieId,
    m.title,
    r.palavra,
    r.total
FROM ranking_por_filme r
JOIN movies m USING (movieId)
WHERE r.posicao <= 5
ORDER BY r.total DESC, m.title, r.palavra
LIMIT 20
""").df()

df_tags

In [None]:
# Top 20 (movie, word) pairs by number of occurrences.
# df_tags holds one row per (movie, word), so the same word can appear for
# several movies. Bars are drawn at explicit positions and labelled
# "word (title)"; using the bare word as a categorical x value would stack
# the bars of repeated words on top of each other.
df_top = df_tags.sort_values("total", ascending=False).head(20)
bar_labels = df_top["palavra"] + " (" + df_top["title"] + ")"
positions = range(len(df_top))

fig, ax = plt.subplots(figsize=(16,8))
ax.bar(positions, df_top["total"])
ax.set_xticks(positions)
ax.set_xticklabels(bar_labels, rotation=45, ha="right")
ax.set_title("Top 20 palavras mais usadas em tags (por filme)")
ax.set_ylabel("N¬∫ de ocorr√™ncias")
plt.tight_layout()
plt.show()

In [None]:
# ================================================
# Users with the most ratings, with their mean rating (top 20)
# ================================================
# Query text first, execution second; the DataFrame is the cell's output.

top_users_sql = """
SELECT
    r.userId,
    COUNT(*) AS total_ratings,
    ROUND(AVG(r.rating), 2) AS media_rating
FROM ratings r
GROUP BY r.userId
ORDER BY total_ratings DESC, media_rating DESC
LIMIT 20
"""

con.sql(top_users_sql).df()

In [None]:
# Per-user summary: number of ratings and mean rating (rounded to 2 decimals),
# one row per userId, materialised as a pandas DataFrame.
users_summary_sql = """
SELECT
    userId,
    COUNT(*) AS total_ratings,
    ROUND(AVG(rating), 2) AS media_rating
FROM ratings
GROUP BY userId
"""
df_users = con.sql(users_summary_sql).df()


In [None]:
import matplotlib.pyplot as plt

# Two side-by-side histograms over df_users, 30 bins each:
# left = ratings per user, right = mean rating per user.
fig, axes = plt.subplots(1, 2, figsize=(14,6))

# (column, panel title, x-axis label) for each panel, left to right
panels = [
    ("total_ratings", "Distribui√ß√£o do n¬∫ de ratings por user", "n¬∫ ratings"),
    ("media_rating", "Distribui√ß√£o da m√©dia de rating por user", "m√©dia"),
]

for panel_ax, (column, panel_title, x_label) in zip(axes, panels):
    panel_ax.hist(df_users[column], bins=30)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel("freq")

plt.tight_layout()
plt.show()


#### Fechar a ligação

In [None]:
# Close the DuckDB connection opened at the top of the notebook and
# print a confirmation message.
con.close()
print("Liga√ß√£o fechada.")