### CSV vs PARQUET (Arrow) ###

- escalabilidade do formato

- usando DuckDB local

- com MovieLens 100k e 33M

In [2]:
# ===============================================
#   BLOCO INICIAL — IMPORTS & PATHS & SETUP
# ===============================================

import duckdb
import pandas as pd
import polars as pl
import time
from pathlib import Path

# === DATASET PATHS ===
# Every MovieLens release lives in its own folder under ../data and ships
# the same four CSV files, so the per-file paths are derived from one list.
_CSV_FILES = ("ratings.csv", "movies.csv", "tags.csv", "links.csv")

## MovieLens 100k
DATA_100k = Path("..", "data", "100K")
ratings_100k, movies_100k, tags_100k, links_100k = [
    DATA_100k / nome for nome in _CSV_FILES
]

## MovieLens 33M
DATA_33m = Path("..", "data", "Full33M")
ratings_33m, movies_33m, tags_33m, links_33m = [
    DATA_33m / nome for nome in _CSV_FILES
]

print("Paths definidos com sucesso.")

# === Local DuckDB connection (optional persistent file) ===
# Connecting to a file name (rather than in-memory) keeps the database on disk.
con = duckdb.connect("movielens_local.duckdb")
print("Conexão DuckDB aberta.")

Paths definidos com sucesso.
Conexão DuckDB aberta.


In [3]:
# Show both ratings CSV locations, one per line.
print(ratings_100k, ratings_33m, sep="\n")

..\data\100K\ratings.csv
..\data\Full33M\ratings.csv


In [4]:
# Convert each ratings CSV into a Parquet file written next to it
# (same file name, ".parquet" suffix).

for csv_path in (ratings_100k, ratings_33m):
    parquet_path = csv_path.with_suffix(".parquet")
    duckdb.sql(f"""
COPY (SELECT * FROM '{csv_path}')
TO '{parquet_path}'
(FORMAT 'parquet');
""")

print("Conversão concluída.")


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Conversão concluída.


In [5]:
def medir_tempo(query, repeticoes=1):
    """Time a DuckDB query, including materialising its result as a DataFrame.

    Args:
        query: SQL text executed through the module-level ``duckdb.sql``.
        repeticoes: How many times to run the query; the fastest run is
            reported. Defaults to 1, i.e. a single measurement.

    Returns:
        Elapsed seconds of the fastest run, rounded to 3 decimal places.

    Raises:
        ValueError: If ``repeticoes`` is smaller than 1.
    """
    if repeticoes < 1:
        raise ValueError("repeticoes must be >= 1")

    melhor = None
    for _ in range(repeticoes):
        # perf_counter() is monotonic and uses the highest-resolution clock
        # available; time.time() can be too coarse for millisecond-scale
        # queries and may jump if the system clock is adjusted.
        inicio = time.perf_counter()
        duckdb.sql(query).df()
        decorrido = time.perf_counter() - inicio
        if melhor is None or decorrido < melhor:
            melhor = decorrido
    return round(melhor, 3)


In [6]:
# Actual measurement: the same per-movie average is timed against each
# file variant, in the order CSV/Parquet for 100k, then CSV/Parquet for 33M.

_CONSULTA_MEDIA = """
        SELECT movieId, AVG(rating)
        FROM '{fonte}'
        GROUP BY movieId
    """

_fontes = {
    "CSV_100k": ratings_100k,
    "PARQUET_100k": ratings_100k.with_suffix('.parquet'),
    "CSV_33M": ratings_33m,
    "PARQUET_33M": ratings_33m.with_suffix('.parquet'),
}

tempos = {
    rotulo: medir_tempo(_CONSULTA_MEDIA.format(fonte=caminho))
    for rotulo, caminho in _fontes.items()
}

tempos


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

{'CSV_100k': 0.124,
 'PARQUET_100k': 0.016,
 'CSV_33M': 2.292,
 'PARQUET_33M': 0.201}

In [7]:
# Comparison table: one row per dataset size, one column per file format.

df_tempo = pd.DataFrame(
    {
        "Dataset": ["100k", "33M"],
        "Tempo CSV (s)": [tempos["CSV_100k"], tempos["CSV_33M"]],
        "Tempo Parquet (s)": [tempos["PARQUET_100k"], tempos["PARQUET_33M"]],
    }
)

df_tempo


Unnamed: 0,Dataset,Tempo CSV (s),Tempo Parquet (s)
0,100k,0.124,0.016
1,33M,2.292,0.201


## Close the connection (when done)

In [None]:
# Close the handle to movielens_local.duckdb that the setup cell stored in `con`.
# NOTE(review): the cells shown in this notebook run their SQL through the
# module-level `duckdb.sql`, not through `con` — confirm `con` is needed at all.
con.close()
print("Connection closed.")