In [1]:
from pathlib import Path
import pandas as pd

def empilhar_csvs(pasta_csv: str, padrao: str = "*.csv") -> pd.DataFrame:
    """
    Read every CSV in a folder and stack them into a single DataFrame.

    The files are expected to share the same column layout. They are read in
    sorted path order and concatenated row-wise with a fresh 0..n-1 index.
    Columns are requested with ``dtype_backend="pyarrow"`` (pandas >= 2.0),
    which the original author chose to lower memory use.

    Parameters
    ----------
    pasta_csv : str
        Folder that contains the CSV files.
    padrao : str, default "*.csv"
        Glob pattern used to select files inside ``pasta_csv``.

    Returns
    -------
    pd.DataFrame
        All matched files concatenated, with the index reset.

    Raises
    ------
    FileNotFoundError
        If no file in ``pasta_csv`` matches ``padrao``.
    """
    caminhos = list(Path(pasta_csv).glob(padrao))
    caminhos.sort()
    if len(caminhos) == 0:
        raise FileNotFoundError(f"Nenhum CSV encontrado em {pasta_csv} com padrão {padrao}")

    def _ler(caminho):
        # Same reader options for every file so the resulting dtypes line up.
        return pd.read_csv(caminho, dtype_backend="pyarrow", low_memory=False)

    # Files are read lazily, one at a time, as concat consumes the iterator.
    return pd.concat(map(_ler, caminhos), ignore_index=True)

In [None]:
# NOTE(review): hard-coded absolute local path; the folder is named "2022" — confirm it matches the data being loaded.
pasta = r"C:\Riguel\Master\Clases\TFM\py\bike 2022"

# Stack every CSV in the folder into a single DataFrame.
df = empilhar_csvs(pasta)

In [3]:
# Preview the first 100 rows of the stacked frame.
df.head(100)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,A8518A6C4BE513DE,classic_bike,2023-01-03 23:14:52.325,2023-01-03 23:33:42.737,E 1 St & Bowery,5636.13,Spruce St & Nassau St,5137.10,40.724861,-73.992131,40.711464,-74.005524,casual
1,A3911E4F5B9B5773,electric_bike,2023-01-07 07:57:40.054,2023-01-07 08:01:27.330,E 1 St & Bowery,5636.13,Ave A & E 11 St,5703.13,40.724861,-73.992131,40.728547,-73.981759,casual
2,AE7F74C32AEBF6F2,electric_bike,2023-01-09 18:37:44.830,2023-01-09 18:48:56.233,1 Ave & E 39 St,6303.01,E 14 St & 1 Ave,5779.10,40.74714,-73.97113,40.731393,-73.982867,member
3,6E10997509D2B7F6,electric_bike,2023-01-05 19:06:15.350,2023-01-05 19:08:33.547,E Burnside Ave & Ryer Ave,8397.02,E Burnside Ave & Ryer Ave,8397.02,40.850535,-73.901318,40.850535,-73.901318,casual
4,AA546E74A9330BD4,electric_bike,2023-01-02 20:25:23.300,2023-01-03 10:51:25.164,Clermont Ave & Park Ave,4692.01,Clermont Ave & Park Ave,4692.01,40.695734,-73.971297,40.695734,-73.971297,casual
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,6A338EC25B9FB134,electric_bike,2023-01-06 11:37:33.657,2023-01-06 11:42:55.072,Alexander Ave & E 134 St,7712.04,Brook Ave & E 138 St,7707.08,40.807466,-73.927107,40.807408,-73.91924,member
96,74CDB1D3CC2F6AC8,electric_bike,2023-01-07 09:41:17.927,2023-01-07 09:48:39.599,Riverside Dr & W 82 St,7388.1,Riverside Dr & W 104 St,7623.13,40.787209,-73.981281,40.801343,-73.971146,member
97,1F552880DF9D1243,classic_bike,2023-01-11 08:31:00.282,2023-01-11 08:38:46.392,5 Ave & 3 St,3987.06,Berkeley Pl & 7 Ave,4051.01,40.672815,-73.983524,40.675147,-73.975232,member
98,190D82A72B925ECE,electric_bike,2023-01-14 13:54:06.855,2023-01-14 13:56:08.216,2 Ave & E 122 St,7622.12,E 123 St & Lexington Ave,7636.05,40.800672,-73.9349,40.802926,-73.9379,member


In [4]:
# Total number of rows loaded across all CSV files.
len(df)

35106986

In [5]:
# Cast straight to Arrow-backed strings. Going through the built-in `str`
# would build a Python string object for every row and turn missing values
# into literal text instead of keeping them as <NA>.
df["started_at"] = df["started_at"].astype("string[pyarrow]")
df["ended_at"] = df["ended_at"].astype("string[pyarrow]")

In [6]:
# Derive the date-only columns: cast to Arrow-backed strings and keep the text
# before the first space. The source columns are not rewritten, because they
# are dropped immediately afterwards.
df["started_date"] = df["started_at"].astype("string[pyarrow]").str.partition(" ")[0]
df["ended_date"] = df["ended_at"].astype("string[pyarrow]").str.partition(" ")[0]

# Remove the original timestamp columns; rebind `df` instead of mutating in place.
df = df.drop(columns=["started_at", "ended_at"])

In [7]:
# Preview the first 100 rows now that the timestamps were replaced by date-only columns.
df.head(100)

Unnamed: 0,ride_id,rideable_type,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,started_date,ended_date
0,A8518A6C4BE513DE,classic_bike,E 1 St & Bowery,5636.13,Spruce St & Nassau St,5137.10,40.724861,-73.992131,40.711464,-74.005524,casual,2023-01-03,2023-01-03
1,A3911E4F5B9B5773,electric_bike,E 1 St & Bowery,5636.13,Ave A & E 11 St,5703.13,40.724861,-73.992131,40.728547,-73.981759,casual,2023-01-07,2023-01-07
2,AE7F74C32AEBF6F2,electric_bike,1 Ave & E 39 St,6303.01,E 14 St & 1 Ave,5779.10,40.74714,-73.97113,40.731393,-73.982867,member,2023-01-09,2023-01-09
3,6E10997509D2B7F6,electric_bike,E Burnside Ave & Ryer Ave,8397.02,E Burnside Ave & Ryer Ave,8397.02,40.850535,-73.901318,40.850535,-73.901318,casual,2023-01-05,2023-01-05
4,AA546E74A9330BD4,electric_bike,Clermont Ave & Park Ave,4692.01,Clermont Ave & Park Ave,4692.01,40.695734,-73.971297,40.695734,-73.971297,casual,2023-01-02,2023-01-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,6A338EC25B9FB134,electric_bike,Alexander Ave & E 134 St,7712.04,Brook Ave & E 138 St,7707.08,40.807466,-73.927107,40.807408,-73.91924,member,2023-01-06,2023-01-06
96,74CDB1D3CC2F6AC8,electric_bike,Riverside Dr & W 82 St,7388.1,Riverside Dr & W 104 St,7623.13,40.787209,-73.981281,40.801343,-73.971146,member,2023-01-07,2023-01-07
97,1F552880DF9D1243,classic_bike,5 Ave & 3 St,3987.06,Berkeley Pl & 7 Ave,4051.01,40.672815,-73.983524,40.675147,-73.975232,member,2023-01-11,2023-01-11
98,190D82A72B925ECE,electric_bike,2 Ave & E 122 St,7622.12,E 123 St & Lexington Ave,7636.05,40.800672,-73.9349,40.802926,-73.9379,member,2023-01-14,2023-01-14


In [None]:
import duckdb

# Glob pattern covering every CSV in the data folder; it is interpolated into
# the SQL below and expanded by DuckDB's CSV reader.
pasta = r"C:\Riguel\Master\Clases\TFM\py\bike 2022\*.csv"

# Connection opened with default settings (no database file is given).
con = duckdb.connect()

# Count trips per start date and start/end station (id, name, coordinates).
# The date is taken with a direct CAST rather than formatting the timestamp to
# text and parsing it back.
q = f"""
WITH trips AS (
  SELECT
    -- extrai a data diretamente do timestamp de texto
    CAST(started_at AS DATE) AS started_date,

    start_station_id,
    start_station_name,
    CAST(start_lat AS DOUBLE) AS start_lat,
    CAST(start_lng AS DOUBLE) AS start_lng,

    end_station_id,
    end_station_name,
    CAST(end_lat AS DOUBLE) AS end_lat,
    CAST(end_lng AS DOUBLE) AS end_lng
  FROM read_csv_auto('{pasta}',
        header = true,
        union_by_name = true,   -- junta colunas por nome entre arquivos
        SAMPLE_SIZE = -1        -- inspeciona todas as linhas para inferir tipos
  )
)
SELECT
  started_date,
  start_station_id,
  start_station_name,
  start_lat, start_lng,
  end_station_id,
  end_station_name,
  end_lat, end_lng,
  COUNT(*) AS total
FROM trips
GROUP BY
  started_date,
  start_station_id,
  start_station_name,
  start_lat, start_lng,
  end_station_id,
  end_station_name,
  end_lat, end_lng
ORDER BY started_date
"""

# Materialize the aggregated result as a pandas DataFrame.
# NOTE(review): the coordinates are grouping keys too, so the result can stay
# large (a recorded run returned ~24M rows) — confirm lat/lng should be grouped on.
out = con.query(q).to_df()
print(out.head())


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  started_date start_station_id             start_station_name  start_lat  \
0   2022-12-14          4903.09             Dock St & Front St  40.702709   
1   2022-12-28          5329.03          West St & Chambers St  40.717548   
2   2022-12-28          7803.02            E 138 St & Park Ave  40.812636   
3   2022-12-28          4762.05  Flushing Ave & Vanderbilt Ave  40.697950   
4   2022-12-30          4637.06           Fulton St & Adams St  40.692418   

   start_lng end_station_id            end_station_name    end_lat    end_lng  \
0 -73.992530           None                        None        NaN        NaN   
1 -74.013221        6425.04            11 Ave & W 27 St  40.751396 -74.005226   
2 -73.929250           None                        None        NaN        NaN   
3 -73.970776        4724.03   Washington Ave & Park Ave  40.696102 -73.967510   
4 -73.989495        5106.04  Greene Ave & Grandview Ave  40.709697 -73.907856   

   total  
0      1  
1      1  
2      1  
3     

In [15]:
# Number of aggregated rows returned by the DuckDB query.
len(out)

24022074

In [None]:
from pathlib import Path

# Destination: a Parquet file placed inside the data folder.
pasta = Path(r"C:\Riguel\Master\Clases\TFM\py\bike 2022")
arquivo_saida = pasta.joinpath("citibike_22.parquet")

# Write the aggregated frame without its index, using the pyarrow engine.
out.to_parquet(path=arquivo_saida, engine="pyarrow", index=False)

print(f"✅ Arquivo Parquet salvo com sucesso em:\n{arquivo_saida}")

✅ Arquivo Parquet salvo com sucesso em:
C:\Riguel\Master\Clases\TFM\py\bike 2023\citibike_23.parquet
