In [None]:
from pathlib import Path
import pandas as pd

def empilhar_csvs(pasta_csv: str, padrao: str = "*.csv") -> pd.DataFrame:
    """Read every CSV in a folder that matches a glob pattern and stack them.

    Args:
        pasta_csv: Directory in which the files are searched.
        padrao: Glob pattern, relative to ``pasta_csv``, selecting the files.

    Returns:
        A single DataFrame with the rows of all matching files, read in
        sorted path order and re-indexed from 0.

    Raises:
        FileNotFoundError: If no path in ``pasta_csv`` matches ``padrao``.
    """
    caminhos = list(Path(pasta_csv).glob(padrao))
    caminhos.sort()  # deterministic read order regardless of filesystem listing
    if len(caminhos) == 0:
        raise FileNotFoundError(f"Nenhum CSV encontrado em {pasta_csv} com padrão {padrao}")

    # Same reader options for every file: Arrow-backed dtypes, and the whole
    # file parsed at once instead of in low-memory chunks.
    opcoes_leitura = {"dtype_backend": "pyarrow", "low_memory": False}

    partes = []
    for caminho in caminhos:
        partes.append(pd.read_csv(caminho, **opcoes_leitura))

    return pd.concat(partes, ignore_index=True)

In [None]:
# Local folder that holds the CSV files to be stacked.
pasta = r"C:\Riguel\Master\Clases\TFM\NYC\bici\csv"

# Load every CSV in that folder (default "*.csv" pattern) into one DataFrame.
df = empilhar_csvs(pasta_csv=pasta)

In [3]:
# Preview the first 100 rows of the stacked DataFrame.
df.head(100)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,8E865410DBDE0CA9,electric_bike,2024-01-01 13:00:04.563,2024-01-01 13:04:04.652,3 St & 3 Ave,4028.03,Carroll St & Smith St,4225.14,40.67507,-73.987752,40.680611,-73.994758,casual
1,0403D0B3FC9CA77D,electric_bike,2024-01-08 19:36:43.520,2024-01-08 19:53:16.266,Franklin Ave & St Marks Ave,4107.05,Bedford Ave & Bergen St,4066.15,40.675832,-73.956168,40.676368,-73.952918,casual
2,F6DE7BB42FF550BE,electric_bike,2024-01-12 15:00:41.580,2024-01-12 15:36:29.622,W 67 St & Broadway,7116.04,Central Park W & W 103 St,7577.27,40.774925,-73.982666,40.79559,-73.961884,casual
3,84A995BFD98030D4,classic_bike,2024-01-12 16:52:19.025,2024-01-12 17:17:29.773,Central Park West & W 68 St,7079.06,E 5 St & Ave C,5545.04,40.773407,-73.977825,40.722992,-73.979955,member
4,7BBEAD4F2B535813,electric_bike,2024-01-05 19:50:19.202,2024-01-05 20:34:42.517,W 67 St & Broadway,7116.04,Ave A & E 14 St,5779.11,40.774925,-73.982666,40.730311,-73.980472,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,F3A3EF2E2C3CD50F,classic_bike,2024-01-10 10:32:30.451,2024-01-10 10:43:52.881,Dean St & Franklin Ave,4107.13,Lafayette Ave & Ft Greene Pl,4470.09,40.677592,-73.955637,40.687002,-73.97665,member
96,CF6A91FEE8EE7FE9,classic_bike,2024-01-04 15:45:13.372,2024-01-04 16:03:44.822,Central Park West & W 72 St,7141.07,9 Ave & W 39 St,6644.08,40.775794,-73.976206,40.756404,-73.994101,casual
97,6738F2BA4B35CA86,electric_bike,2024-01-12 15:01:19.804,2024-01-12 15:24:12.731,Wyckoff Ave & Gates Ave,4847.03,Lafayette Ave & Ft Greene Pl,4470.09,40.699871,-73.911719,40.687002,-73.97665,member
98,F47DC30E11982DBF,classic_bike,2024-01-08 15:57:29.484,2024-01-08 16:00:11.984,1 Ave & E 18 St,5854.09,Ave A & E 14 St,5779.11,40.733812,-73.980544,40.730311,-73.980472,member


In [4]:
# Total number of rows loaded across all CSV files.
len(df)

44303209

In [5]:
# Normalise both timestamp columns to Arrow-backed strings.
# A plain `astype(str)` would build one Python str object per value (object
# dtype, very memory-hungry at this row count) and would turn missing values
# into literal text; the Arrow-backed string dtype stays compact and keeps
# missing values as <NA>.
df["started_at"] = df["started_at"].astype("string[pyarrow]")
df["ended_at"] = df["ended_at"].astype("string[pyarrow]")

In [None]:

# Derive date-only columns from the timestamp text. The values look like
# "2024-01-01 13:00:04.563", so everything before the first space is the date.
for origem, destino in (("started_at", "started_date"), ("ended_at", "ended_date")):
    df[origem] = df[origem].astype("string[pyarrow]")
    # partition(" ") yields (before, separator, after); column 0 is the date part.
    df[destino] = df[origem].str.partition(" ")[0]

# The full timestamps are not needed once the dates have been extracted.
df = df.drop(columns=["started_at", "ended_at"])

In [7]:
# Preview `df` after the timestamp columns were replaced by date-only columns.
df.head(100)

Unnamed: 0,ride_id,rideable_type,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,started_date,ended_date
0,8E865410DBDE0CA9,electric_bike,3 St & 3 Ave,4028.03,Carroll St & Smith St,4225.14,40.67507,-73.987752,40.680611,-73.994758,casual,2024-01-01,2024-01-01
1,0403D0B3FC9CA77D,electric_bike,Franklin Ave & St Marks Ave,4107.05,Bedford Ave & Bergen St,4066.15,40.675832,-73.956168,40.676368,-73.952918,casual,2024-01-08,2024-01-08
2,F6DE7BB42FF550BE,electric_bike,W 67 St & Broadway,7116.04,Central Park W & W 103 St,7577.27,40.774925,-73.982666,40.79559,-73.961884,casual,2024-01-12,2024-01-12
3,84A995BFD98030D4,classic_bike,Central Park West & W 68 St,7079.06,E 5 St & Ave C,5545.04,40.773407,-73.977825,40.722992,-73.979955,member,2024-01-12,2024-01-12
4,7BBEAD4F2B535813,electric_bike,W 67 St & Broadway,7116.04,Ave A & E 14 St,5779.11,40.774925,-73.982666,40.730311,-73.980472,member,2024-01-05,2024-01-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,F3A3EF2E2C3CD50F,classic_bike,Dean St & Franklin Ave,4107.13,Lafayette Ave & Ft Greene Pl,4470.09,40.677592,-73.955637,40.687002,-73.97665,member,2024-01-10,2024-01-10
96,CF6A91FEE8EE7FE9,classic_bike,Central Park West & W 72 St,7141.07,9 Ave & W 39 St,6644.08,40.775794,-73.976206,40.756404,-73.994101,casual,2024-01-04,2024-01-04
97,6738F2BA4B35CA86,electric_bike,Wyckoff Ave & Gates Ave,4847.03,Lafayette Ave & Ft Greene Pl,4470.09,40.699871,-73.911719,40.687002,-73.97665,member,2024-01-12,2024-01-12
98,F47DC30E11982DBF,classic_bike,1 Ave & E 18 St,5854.09,Ave A & E 14 St,5779.11,40.733812,-73.980544,40.730311,-73.980472,member,2024-01-08,2024-01-08


In [None]:
import duckdb

# Count rows per combination of start date, start station (id, name, lat, lng)
# and end station (id, name, lat, lng). The FROM clause names `df`, i.e. the
# query relies on DuckDB resolving that name to the in-memory pandas DataFrame.
query = """
SELECT
    started_date,
    start_station_id,
    start_station_name,
    start_lat,
    start_lng,
    end_station_id,
    end_station_name,
    end_lat,
    end_lng,
    COUNT(*) AS total
FROM df
GROUP BY
    started_date,
    start_station_id,
    start_station_name,
    end_station_id,
    end_station_name,
    start_lat,
    start_lng,
    end_lat,
    end_lng
"""


# NOTE: this rebinds `df` to the aggregated result, so the row-level frame is
# no longer reachable under this name. Re-running this cell without reloading
# would aggregate the already-aggregated frame and produce different totals.
df = duckdb.query(query).to_df()

print("✅ DataFrame agrupado com sucesso!")
# Plain-text preview of the first rows of the aggregated frame.
print(df.head())



FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✅ DataFrame agrupado com sucesso!
  started_date start_station_id                 start_station_name  start_lat  \
0   2024-01-01          6013.12  Gramercy Park N & Gramercy Park E  40.737950   
1   2024-01-03          6756.01                   W 43 St & 10 Ave  40.760094   
2   2024-01-06          5506.14                     E 6 St & Ave D  40.722281   
3   2024-01-08          5382.06            Grand St & Elizabeth St  40.718822   
4   2024-01-05          4729.01             Myrtle Ave & Lewis Ave  40.696820   

   start_lng end_station_id           end_station_name    end_lat    end_lng  \
0 -73.985090        5414.07  Delancey St & Eldridge St  40.719383 -73.991479   
1 -73.994618        6560.01         Broadway & W 41 St  40.755136 -73.986580   
2 -73.976687        5414.07  Delancey St & Eldridge St  40.719383 -73.991479   
3 -73.995960        5181.04                  Cherry St  40.712199 -73.979481   
4 -73.937569        4214.03    Macon St & Nostrand Ave  40.680983 -73.950048   

In [10]:
# Preview the first 100 rows of the aggregated result (`df` now holds the query output).
df.head(100)

Unnamed: 0,started_date,start_station_id,start_station_name,start_lat,start_lng,end_station_id,end_station_name,end_lat,end_lng,total
0,2024-01-01,6013.12,Gramercy Park N & Gramercy Park E,40.737950,-73.985090,5414.07,Delancey St & Eldridge St,40.719383,-73.991479,1
1,2024-01-03,6756.01,W 43 St & 10 Ave,40.760094,-73.994618,6560.01,Broadway & W 41 St,40.755136,-73.986580,3
2,2024-01-06,5506.14,E 6 St & Ave D,40.722281,-73.976687,5414.07,Delancey St & Eldridge St,40.719383,-73.991479,1
3,2024-01-08,5382.06,Grand St & Elizabeth St,40.718822,-73.995960,5181.04,Cherry St,40.712199,-73.979481,1
4,2024-01-05,4729.01,Myrtle Ave & Lewis Ave,40.696820,-73.937569,4214.03,Macon St & Nostrand Ave,40.680983,-73.950048,1
...,...,...,...,...,...,...,...,...,...,...
95,2024-01-04,4953.04,South St & Gouverneur Ln,40.703554,-74.006702,5181.04,Cherry St,40.712199,-73.979481,2
96,2024-01-09,6432.1,E 41 St & Madison Ave (SE corner),40.752049,-73.979635,6955.05,W 54 St & 11 Ave,40.768333,-73.992573,1
97,2024-01-13,4368.05,Monroe St & Bedford Ave,40.685144,-73.953809,4033.16,Sterling Pl & Franklin Ave,40.673248,-73.956895,1
98,2024-01-11,6998.08,E 72 St & Park Ave,40.771183,-73.964094,6560.01,Broadway & W 41 St,40.755136,-73.986580,1


In [11]:
# Number of distinct groups produced by the aggregation.
len(df)

29919547

In [None]:
from pathlib import Path


# Destination folder and file name for the exported dataset.
pasta = Path(r"C:\Riguel\Master\Clases\TFM\NYC\bici")
arquivo_saida = pasta.joinpath("citibike.parquet")

# Write the current `df` as Parquet via pyarrow, leaving the index out of the file.
df.to_parquet(path=arquivo_saida, engine="pyarrow", index=False)

print(f"✅ Arquivo Parquet salvo com sucesso em:\n{arquivo_saida}")

✅ Arquivo Parquet salvo com sucesso em:
C:\Riguel\Master\Clases\TFM\NYC\bici\citibike.parquet
