### 1 Introduction

**CSV vs PARQUET (Arrow)**
- escalabilidade do formato
- usando DuckDB local
- com MovieLens 100k e 33M

#### 2 Library import, Data and files import, duckdb import and conversion of files

In [8]:
#   BLOCO INICIAL — IMPORTS & PATHS & SETUP

import duckdb
import pandas as pd
import polars as pl
import time
from pathlib import Path

# === Dataset paths ===

## MovieLens 100k: the four CSV files live side by side in one folder
DATA_100k = Path("..", "data", "100K")
ratings_100k = DATA_100k.joinpath("ratings.csv")
movies_100k = DATA_100k.joinpath("movies.csv")
tags_100k = DATA_100k.joinpath("tags.csv")
links_100k = DATA_100k.joinpath("links.csv")

## MovieLens 33M: same file names, different folder
DATA_33m = Path("..", "data", "Full33M")
ratings_33m = DATA_33m.joinpath("ratings.csv")
movies_33m = DATA_33m.joinpath("movies.csv")
tags_33m = DATA_33m.joinpath("tags.csv")
links_33m = DATA_33m.joinpath("links.csv")

print("Paths definidos com sucesso.")

# === Local DuckDB connection, backed by a database file in the working directory ===
con = duckdb.connect("movielens_local.duckdb")
print("Conexão DuckDB aberta.")

Paths definidos com sucesso.
Conexão DuckDB aberta.


In [4]:
# Show the name and type of every table visible through the open connection
tables_query = """
SELECT table_name, table_type
FROM information_schema.tables
"""
con.sql(tables_query).df()

Unnamed: 0,table_name,table_type


In [5]:
# Echo the two ratings CSV locations, one per line
for ratings_csv in (ratings_100k, ratings_33m):
    print(ratings_csv)

..\data\100K\ratings.csv
..\data\Full33M\ratings.csv


In [6]:
# Write a Parquet copy of each ratings CSV next to the original file
# (same name, ".parquet" suffix). The 100k file is converted first, then 33M.

for csv_path in (ratings_100k, ratings_33m):
    parquet_path = csv_path.with_suffix(".parquet")
    copy_statement = (
        f"\nCOPY (SELECT * FROM '{csv_path}')\n"
        f"TO '{parquet_path}'\n"
        "(FORMAT 'parquet');\n"
    )
    duckdb.sql(copy_statement)

print("Conversão concluída.")


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Conversão concluída.


#### 3 Performance tests and measurements
##### 3.1 Helper function to measure query time

In [9]:
def medir_tempo(query):
    """Run a SQL query with DuckDB and return how long it took.

    The query is executed through ``duckdb.sql`` and materialised as a pandas
    DataFrame, so the measured interval covers both the execution and the
    conversion of the result.

    Parameters
    ----------
    query : str
        SQL text to execute.

    Returns
    -------
    float
        Elapsed time in seconds, rounded to 3 decimal places.
    """
    # perf_counter() is the monotonic, highest-resolution clock meant for
    # measuring short durations; time.time() is a wall clock that can jump and
    # may tick too coarsely for millisecond-scale queries.
    start = time.perf_counter()
    duckdb.sql(query).df()
    return round(time.perf_counter() - start, 3)

##### 3.2 Test with a simple query grouped by movieId: CSV 100k vs Parquet 100k, and CSV 33M vs Parquet 33M

In [10]:
# Actual measurement: the same aggregation is timed once per input file,
# in the order CSV 100k, Parquet 100k, CSV 33M, Parquet 33M.

benchmark_inputs = [
    ("CSV_100k", ratings_100k),
    ("PARQUET_100k", ratings_100k.with_suffix('.parquet')),
    ("CSV_33M", ratings_33m),
    ("PARQUET_33M", ratings_33m.with_suffix('.parquet')),
]

tempos = {}
for label, data_file in benchmark_inputs:
    tempos[label] = medir_tempo(f"""
        SELECT movieId, AVG(rating)
        FROM '{data_file}'
        GROUP BY movieId
    """)

tempos


{'CSV_100k': 0.065,
 'PARQUET_100k': 0.014,
 'CSV_33M': 1.045,
 'PARQUET_33M': 0.184}

In [11]:
# Comparison table: one row per dataset size, one column per file format

df_tempo = pd.DataFrame(
    {
        "Dataset": ["100k", "33M"],
        "Tempo CSV (s)": [tempos["CSV_100k"], tempos["CSV_33M"]],
        "Tempo Parquet (s)": [tempos["PARQUET_100k"], tempos["PARQUET_33M"]],
    }
)

df_tempo


Unnamed: 0,Dataset,Tempo CSV (s),Tempo Parquet (s)
0,100k,0.065,0.014
1,33M,1.045,0.184


__Close the connection (when done)__

In [12]:
# NOTE(review): `con` is opened in the setup cell, and this close is left
# disabled, so nothing in the cells visible here ever closes the connection.
# Uncomment the two lines below once no remaining cell needs `con`.
#con.close()
#print("Connection closed.")

### DuckDB vs Polars ###

- mesmo cálculo

- em DuckDB e Polars

- usando Parquet apenas (formato colunar ideal)

In [None]:
# Parquet counterparts of the two ratings CSV files (same location, new suffix)

p100k, p33m = (
    ratings_csv.with_suffix(".parquet") for ratings_csv in (ratings_100k, ratings_33m)
)

In [None]:
# Funções de benchmark

def run_duckdb(path):
    """Time the per-movie average rating computed by DuckDB over one file.

    The file at ``path`` is queried directly by name and the result is
    materialised as a pandas DataFrame, so the conversion is part of the
    measured interval.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the data file; it is interpolated into the SQL text.

    Returns
    -------
    float
        Elapsed time in seconds, rounded to 3 decimal places.
    """
    # perf_counter() is the monotonic, high-resolution clock intended for
    # benchmarks; time.time() can be too coarse for millisecond-scale runs.
    start = time.perf_counter()
    duckdb.sql(f"""
        SELECT movieId, AVG(rating)
        FROM '{path}'
        GROUP BY movieId
    """).df()
    return round(time.perf_counter() - start, 3)

def run_polars(path):
    """Time the per-movie average rating computed by Polars over a Parquet file.

    The file is scanned lazily, grouped by ``movieId`` and reduced to the mean
    of ``rating``; ``collect()`` triggers the actual work and is therefore
    inside the measured interval.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the Parquet file; converted to ``str`` before scanning.

    Returns
    -------
    float
        Elapsed time in seconds, rounded to 3 decimal places.
    """
    # perf_counter() is the monotonic, high-resolution clock intended for
    # benchmarks; time.time() can be too coarse for millisecond-scale runs.
    start = time.perf_counter()
    (
        pl.scan_parquet(str(path))
        .group_by("movieId")
        .agg(pl.col("rating").mean())
        .collect()
    )
    return round(time.perf_counter() - start, 3)


In [None]:
# Comparison: for each dataset, time DuckDB first and then Polars on the same
# Parquet file, collecting one table row per dataset.

timing_rows = []
for dataset_label, parquet_file in (("100k", p100k), ("33m", p33m)):
    duckdb_seconds = run_duckdb(parquet_file)
    polars_seconds = run_polars(parquet_file)
    timing_rows.append([dataset_label, duckdb_seconds, polars_seconds])

df_duck_polars = pd.DataFrame(
    timing_rows,
    columns=["Dataset", "DuckDB (s)", "Polars (s)"],
)

df_duck_polars


Unnamed: 0,Dataset,DuckDB (s),Polars (s)
0,100k,0.005,0.008
1,33m,0.178,0.502
