In [1]:
from pathlib import Path
import pandas as pd

def empilhar_csvs(pasta_csv: str, padrao: str = "*.csv") -> pd.DataFrame:
    """
    Stack every CSV matched inside a folder into one DataFrame.

    Files are matched with ``Path(pasta_csv).glob(padrao)`` and read in sorted
    path order; the resulting frames are concatenated row-wise and the index
    is renumbered. Every file is read with ``dtype_backend="pyarrow"``
    (available in pandas >= 2.0), chosen by the original author to lower
    memory use. The files are expected to share the same column schema.

    Args:
        pasta_csv: Folder that holds the CSV files.
        padrao: Glob pattern, relative to ``pasta_csv``, selecting the files.

    Returns:
        A single DataFrame holding the rows of every matched file.

    Raises:
        FileNotFoundError: If the pattern matches nothing in ``pasta_csv``.
    """
    caminhos = list(Path(pasta_csv).glob(padrao))
    caminhos.sort()
    if len(caminhos) == 0:
        raise FileNotFoundError(f"Nenhum CSV encontrado em {pasta_csv} com padrão {padrao}")

    def _ler(caminho: Path) -> pd.DataFrame:
        # Identical reader options for every file so the frames stack cleanly.
        return pd.read_csv(caminho, dtype_backend="pyarrow", low_memory=False)

    # map() is lazy: each file is only read when concat pulls it from the iterator.
    return pd.concat(map(_ler, caminhos), ignore_index=True)

In [2]:
# Folder that holds the CSV files to load.
pasta = r"C:\Riguel\Master\Clases\TFM\py\bike 2021"

# Stack every CSV in that folder into a single DataFrame.
df = empilhar_csvs(pasta)

In [3]:
# Preview the first 100 rows of the stacked frame.
df.head(100)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,2F0248F2E85771EA,electric_bike,2021-01-19 19:43:36.986,2021-01-19 19:45:50.414,Rivington St & Ridge St,5406.02,Allen St & Rivington St,5414.06,40.718502,-73.983299,40.720196,-73.989978,member
1,49985469DD6C5EC9,classic_bike,2021-01-29 06:38:32.423,2021-01-29 06:40:28.603,Clark St & Henry St,4789.03,Columbia Heights & Cranberry St,4829.01,40.697601,-73.993446,40.700379,-73.995481,member
2,E3B2362D59B6182D,classic_bike,2021-01-23 06:29:31.470,2021-01-23 06:31:18.457,Clark St & Henry St,4789.03,Columbia Heights & Cranberry St,4829.01,40.697601,-73.993446,40.700379,-73.995481,member
3,1C82E20D9DB94A58,classic_bike,2021-01-24 06:28:00.463,2021-01-24 06:29:52.060,Clark St & Henry St,4789.03,Columbia Heights & Cranberry St,4829.01,40.697601,-73.993446,40.700379,-73.995481,member
4,B82510C13F251703,classic_bike,2021-01-09 06:33:39.383,2021-01-09 06:35:31.520,Clark St & Henry St,4789.03,Columbia Heights & Cranberry St,4829.01,40.697601,-73.993446,40.700379,-73.995481,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,A39CC1201D8C0CEC,electric_bike,2021-01-10 14:24:56.856,2021-01-10 14:26:50.676,Perry St & Bleecker St,5922.07,Greenwich Ave & Charles St,5914.08,40.735354,-74.004831,40.735238,-74.000271,member
96,697E9322C7FFA7E1,classic_bike,2021-01-10 14:42:25.690,2021-01-10 14:54:06.022,E 20 St & 2 Ave,5971.08,Mercer St & Spring St,5532.01,40.735877,-73.98205,40.723627,-73.999496,member
97,E488FDE9314E499C,classic_bike,2021-01-16 12:13:40.146,2021-01-16 12:26:01.597,E 20 St & 2 Ave,5971.08,Mercer St & Spring St,5532.01,40.735877,-73.98205,40.723627,-73.999496,member
98,98BB9DA836CDB8A8,classic_bike,2021-01-10 14:28:57.769,2021-01-10 14:37:32.795,Perry St & Bleecker St,5922.07,Mercer St & Spring St,5532.01,40.735354,-74.004831,40.723627,-73.999496,member


In [4]:
# Total number of rows in the stacked frame.
len(df)

27130122

In [5]:
# Cast straight to the Arrow-backed string dtype. Going through Python `str`
# would first build an object column with one Python string per row, and may
# turn missing values into literal text, only for the following cell to
# convert the same columns to "string[pyarrow]" anyway.
df["started_at"] = df["started_at"].astype("string[pyarrow]")
df["ended_at"] = df["ended_at"].astype("string[pyarrow]")

In [6]:
# Source timestamp column -> name of the date-only column derived from it.
colunas_data = {"started_at": "started_date", "ended_at": "ended_date"}

for origem, destino in colunas_data.items():
    # Make sure the column uses the Arrow-backed string dtype.
    df[origem] = df[origem].astype("string[pyarrow]")
    # Keep only the text before the first space, i.e. the date part.
    df[destino] = df[origem].str.partition(" ")[0]

# The full timestamp columns are no longer needed once the dates exist.
# NOTE: this mutates `df` in place, so the cell cannot be re-run without
# reloading the data first.
df.drop(columns=list(colunas_data), inplace=True)

In [7]:
# Preview the first 100 rows; the frame now carries started_date / ended_date
# instead of the full timestamp columns.
df.head(100)

Unnamed: 0,ride_id,rideable_type,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,started_date,ended_date
0,2F0248F2E85771EA,electric_bike,Rivington St & Ridge St,5406.02,Allen St & Rivington St,5414.06,40.718502,-73.983299,40.720196,-73.989978,member,2021-01-19,2021-01-19
1,49985469DD6C5EC9,classic_bike,Clark St & Henry St,4789.03,Columbia Heights & Cranberry St,4829.01,40.697601,-73.993446,40.700379,-73.995481,member,2021-01-29,2021-01-29
2,E3B2362D59B6182D,classic_bike,Clark St & Henry St,4789.03,Columbia Heights & Cranberry St,4829.01,40.697601,-73.993446,40.700379,-73.995481,member,2021-01-23,2021-01-23
3,1C82E20D9DB94A58,classic_bike,Clark St & Henry St,4789.03,Columbia Heights & Cranberry St,4829.01,40.697601,-73.993446,40.700379,-73.995481,member,2021-01-24,2021-01-24
4,B82510C13F251703,classic_bike,Clark St & Henry St,4789.03,Columbia Heights & Cranberry St,4829.01,40.697601,-73.993446,40.700379,-73.995481,member,2021-01-09,2021-01-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,A39CC1201D8C0CEC,electric_bike,Perry St & Bleecker St,5922.07,Greenwich Ave & Charles St,5914.08,40.735354,-74.004831,40.735238,-74.000271,member,2021-01-10,2021-01-10
96,697E9322C7FFA7E1,classic_bike,E 20 St & 2 Ave,5971.08,Mercer St & Spring St,5532.01,40.735877,-73.98205,40.723627,-73.999496,member,2021-01-10,2021-01-10
97,E488FDE9314E499C,classic_bike,E 20 St & 2 Ave,5971.08,Mercer St & Spring St,5532.01,40.735877,-73.98205,40.723627,-73.999496,member,2021-01-16,2021-01-16
98,98BB9DA836CDB8A8,classic_bike,Perry St & Bleecker St,5922.07,Mercer St & Spring St,5532.01,40.735354,-74.004831,40.723627,-73.999496,member,2021-01-10,2021-01-10


In [8]:
import duckdb

# Glob pattern covering every CSV in the data folder; it is interpolated into
# the read_csv_auto() call below.
pasta = r"C:\Riguel\Master\Clases\TFM\py\bike 2021\*.csv"

# Connection opened with default arguments (no database file is given).
con = duckdb.connect()

# Count trips per start date and per (start station, end station) pair.
# Station ids, names and coordinates are all part of the grouping key, so two
# trips only fall in the same group when every one of those values matches.
q = f"""
WITH trips AS (
  SELECT
    -- take the calendar date directly from the timestamp column; this
    -- assumes read_csv_auto infers started_at as a timestamp
    CAST(started_at AS DATE) AS started_date,

    start_station_id,
    start_station_name,
    CAST(start_lat AS DOUBLE) AS start_lat,
    CAST(start_lng AS DOUBLE) AS start_lng,

    end_station_id,
    end_station_name,
    CAST(end_lat AS DOUBLE) AS end_lat,
    CAST(end_lng AS DOUBLE) AS end_lng
  FROM read_csv_auto('{pasta}',
        header = true,
        union_by_name = true,   -- junta colunas por nome entre arquivos
        SAMPLE_SIZE = -1        -- inspeciona todas as linhas para inferir tipos
  )
)
SELECT
  started_date,
  start_station_id,
  start_station_name,
  start_lat, start_lng,
  end_station_id,
  end_station_name,
  end_lat, end_lng,
  COUNT(*) AS total
FROM trips
GROUP BY
  started_date,
  start_station_id,
  start_station_name,
  start_lat, start_lng,
  end_station_id,
  end_station_name,
  end_lat, end_lng
ORDER BY started_date
"""

# Materialise the aggregate as a pandas DataFrame. NOTE(review): the result is
# not small -- the recorded run reports 18,103,307 rows -- so it still needs a
# substantial amount of memory.
out = con.query(q).to_df()
print(out.head())


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  started_date start_station_id        start_station_name  start_lat  \
0   2019-04-14          5164.04       Broadway & Berry St  40.710446   
1   2019-05-07          7655.22      Lenox Ave & W 117 St  40.802557   
2   2019-05-17          4386.05  Fulton St & Clermont Ave  40.684157   
3   2019-05-17          5297.02  Vesey Pl & River Terrace  40.715338   
4   2019-06-24          4392.04   Putnam Ave & Throop Ave  40.685153   

   start_lng end_station_id            end_station_name    end_lat    end_lng  \
0 -73.965251        4732.04      Carlton Ave & Park Ave  40.695807 -73.973556   
1 -73.949078         SYS033                  Pier 40 X2  40.728487 -74.011693   
2 -73.969223        8198.05  Shakespeare Ave & W 169 St  40.839312 -73.922423   
3 -74.016584         SYS014      NYCBS DEPOT - DELANCEY  40.716444 -73.982331   
4 -73.941110         SYS033                  Pier 40 X2  40.728487 -74.011693   

   total  
0      1  
1      1  
2      1  
3      1  
4      1  


In [9]:
# Number of rows in the aggregated result.
len(out)

18103307

In [10]:
from pathlib import Path

# Destination: the Parquet file is written into the same folder as the CSVs.
pasta = Path(r"C:\Riguel\Master\Clases\TFM\py\bike 2021")
arquivo_saida = pasta.joinpath("citibike_21.parquet")

# Persist the aggregated frame without its index (pyarrow engine).
out.to_parquet(path=arquivo_saida, engine="pyarrow", index=False)

print(f"✅ Arquivo Parquet salvo com sucesso em:\n{arquivo_saida}")

✅ Arquivo Parquet salvo com sucesso em:
C:\Riguel\Master\Clases\TFM\py\bike 2021\citibike_21.parquet
