In [None]:
from pathlib import Path
import pandas as pd

def empilhar_csvs(pasta_csv: str, padrao: str = "*.csv") -> pd.DataFrame:
    """Stack every CSV file found in a folder into a single DataFrame.

    Args:
        pasta_csv: Folder that is searched (non-recursively) with ``padrao``.
        padrao: Glob pattern used to select the files; defaults to ``"*.csv"``.

    Returns:
        One DataFrame with the rows of all matched files, read in sorted path
        order and concatenated with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If no file in ``pasta_csv`` matches ``padrao``.
    """
    caminhos = list(Path(pasta_csv).glob(padrao))
    caminhos.sort()  # deterministic file order regardless of the filesystem
    if len(caminhos) == 0:
        raise FileNotFoundError(f"Nenhum CSV encontrado em {pasta_csv} com padrão {padrao}")

    # Read each file with Arrow-backed dtypes, then stack them in one concat call.
    partes = []
    for caminho in caminhos:
        partes.append(
            pd.read_csv(caminho, dtype_backend="pyarrow", low_memory=False)
        )
    return pd.concat(partes, ignore_index=True)

In [None]:
# Folder containing the CSV files to stack.
# NOTE(review): absolute local path; the notebook only runs on a machine with this layout.
pasta = r"C:\Riguel\Master\Clases\TFM\py\bike 2020"

# Read every CSV in that folder into one DataFrame.
df = empilhar_csvs(pasta_csv=pasta)

In [3]:
# Preview the first 100 rows of the stacked data.
df.head(n=100)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,BFD29218AB271154,electric_bike,2022-01-21 13:13:43.392,2022-01-21 13:22:31.463,West End Ave & W 107 St,7650.05,Mt Morris Park W & W 120 St,7685.14,40.802117,-73.968181,40.804038,-73.945925,member
1,7C953F2FD7BE1302,classic_bike,2022-01-10 11:30:54.162,2022-01-10 11:41:43.422,4 Ave & 3 St,4028.04,Boerum Pl\t& Pacific St,4488.09,40.673746,-73.985649,40.688489,-73.99116,member
2,95893ABD40CED4B8,electric_bike,2022-01-26 10:52:43.096,2022-01-26 11:06:35.227,1 Ave & E 62 St,6753.08,5 Ave & E 29 St,6248.06,40.761227,-73.96094,40.745168,-73.986831,member
3,F853B50772137378,classic_bike,2022-01-03 08:35:48.247,2022-01-03 09:10:50.475,2 Ave & E 96 St,7338.02,5 Ave & E 29 St,6248.06,40.783964,-73.947167,40.745168,-73.986831,member
4,7590ADF834797B4B,classic_bike,2022-01-22 14:14:23.043,2022-01-22 14:34:57.474,6 Ave & W 34 St,6364.10,5 Ave & E 29 St,6248.06,40.74964,-73.98805,40.745168,-73.986831,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,113BBBA59A036626,electric_bike,2022-01-07 15:40:33.636,2022-01-07 16:21:58.651,Park Cir & East Dr,3344.02,W 25 St & 6 Ave,6215.04,40.651566,-73.972212,40.743954,-73.991449,member
96,36087FEE423BAC05,electric_bike,2022-01-19 12:05:09.925,2022-01-19 12:13:05.620,Cathedral Pkwy & Broadway,7680.03,Mt Morris Park W & W 120 St,7685.14,40.804213,-73.966991,40.804038,-73.945925,member
97,1CE4A3035760CA46,electric_bike,2022-01-18 15:51:20.697,2022-01-18 15:59:06.378,Cleveland Pl & Spring St,5492.05,Rivington St & Ridge St,5406.02,40.722104,-73.997249,40.718502,-73.983299,member
98,C2DEE105F544943A,electric_bike,2022-01-22 01:47:57.864,2022-01-22 01:55:13.778,49 Ave & 21 St,6128.04,Calyer St & Jewel St,5743.02,40.74252,-73.948852,40.72984,-73.94839,member


In [4]:
# Total number of rows loaded from all CSV files.
df.shape[0]

29838806

In [5]:
# Normalise both timestamp columns to text.
# Convert straight to the Arrow-backed nullable string dtype instead of
# `astype(str)`: the latter materialises one Python object per row and can turn
# missing values into literal text (e.g. "<NA>" / "nan"), which would then be
# treated as a real date by later string operations. "string[pyarrow]" keeps
# missing values as missing.
df["started_at"] = df["started_at"].astype("string[pyarrow]")
df["ended_at"] = df["ended_at"].astype("string[pyarrow]")

In [None]:

# Make sure both timestamp columns are Arrow-backed strings before using `.str`.
for nome in ("started_at", "ended_at"):
    df[nome] = df[nome].astype("string[pyarrow]")

# Keep only the text before the first space of each value
# (the date part of values such as "2022-01-21 13:13:43.392").
for origem, destino in (("started_at", "started_date"), ("ended_at", "ended_date")):
    df[destino] = df[origem].str.partition(" ")[0]

# The full timestamp columns are no longer needed; remove them in place.
df.drop(["started_at", "ended_at"], axis="columns", inplace=True)

In [7]:
# Preview the first 100 rows after replacing the timestamps with date columns.
df.head(n=100)

Unnamed: 0,ride_id,rideable_type,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,started_date,ended_date
0,BFD29218AB271154,electric_bike,West End Ave & W 107 St,7650.05,Mt Morris Park W & W 120 St,7685.14,40.802117,-73.968181,40.804038,-73.945925,member,2022-01-21,2022-01-21
1,7C953F2FD7BE1302,classic_bike,4 Ave & 3 St,4028.04,Boerum Pl\t& Pacific St,4488.09,40.673746,-73.985649,40.688489,-73.99116,member,2022-01-10,2022-01-10
2,95893ABD40CED4B8,electric_bike,1 Ave & E 62 St,6753.08,5 Ave & E 29 St,6248.06,40.761227,-73.96094,40.745168,-73.986831,member,2022-01-26,2022-01-26
3,F853B50772137378,classic_bike,2 Ave & E 96 St,7338.02,5 Ave & E 29 St,6248.06,40.783964,-73.947167,40.745168,-73.986831,member,2022-01-03,2022-01-03
4,7590ADF834797B4B,classic_bike,6 Ave & W 34 St,6364.10,5 Ave & E 29 St,6248.06,40.74964,-73.98805,40.745168,-73.986831,member,2022-01-22,2022-01-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,113BBBA59A036626,electric_bike,Park Cir & East Dr,3344.02,W 25 St & 6 Ave,6215.04,40.651566,-73.972212,40.743954,-73.991449,member,2022-01-07,2022-01-07
96,36087FEE423BAC05,electric_bike,Cathedral Pkwy & Broadway,7680.03,Mt Morris Park W & W 120 St,7685.14,40.804213,-73.966991,40.804038,-73.945925,member,2022-01-19,2022-01-19
97,1CE4A3035760CA46,electric_bike,Cleveland Pl & Spring St,5492.05,Rivington St & Ridge St,5406.02,40.722104,-73.997249,40.718502,-73.983299,member,2022-01-18,2022-01-18
98,C2DEE105F544943A,electric_bike,49 Ave & 21 St,6128.04,Calyer St & Jewel St,5743.02,40.74252,-73.948852,40.72984,-73.94839,member,2022-01-22,2022-01-22


In [None]:
import duckdb

# Glob pattern matching every CSV in the data folder; it is interpolated into
# the SQL text below as the argument of read_csv_auto.
pasta = r"C:\Riguel\Master\Clases\TFM\py\bike 2020\*.csv"

# Open a DuckDB connection with default arguments.
# NOTE(review): the connection is not closed anywhere in this cell.
con = duckdb.connect()

# Query overview:
#   * CTE `trips` reads the CSVs and keeps started_at reduced to a DATE
#     (formatted as '%Y-%m-%d' and cast back), plus the start/end station
#     id, name and coordinates (coordinates cast to DOUBLE).
#   * The outer SELECT counts rows per distinct combination of those columns
#     (COUNT(*) AS total) and orders the result by started_date.
# NOTE(review): union_by_name / SAMPLE_SIZE = -1 presumably align columns by
# name across files and use every row for type detection — confirm against the
# DuckDB CSV reader documentation.
q = f"""
WITH trips AS (
  SELECT
    
    CAST(strftime(started_at, '%Y-%m-%d') AS DATE) AS started_date,

    start_station_id,
    start_station_name,
    CAST(start_lat AS DOUBLE) AS start_lat,
    CAST(start_lng AS DOUBLE) AS start_lng,

    end_station_id,
    end_station_name,
    CAST(end_lat AS DOUBLE) AS end_lat,
    CAST(end_lng AS DOUBLE) AS end_lng
  FROM read_csv_auto('{pasta}',
        header = true,
        union_by_name = true,   
        SAMPLE_SIZE = -1        
  )
)
SELECT
  started_date,
  start_station_id,
  start_station_name,
  start_lat, start_lng,
  end_station_id,
  end_station_name,
  end_lat, end_lng,
  COUNT(*) AS total
FROM trips
GROUP BY
  started_date,
  start_station_id,
  start_station_name,
  start_lat, start_lng,
  end_station_id,
  end_station_name,
  end_lat, end_lng
ORDER BY started_date
"""

# Run the query, convert the result with .to_df(), then print its first rows.
out = con.query(q).to_df()   
print(out.head())


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  started_date start_station_id                   start_station_name  \
0   2021-01-30          7932.10  W 144 St & Adam Clayton Powell Blvd   
1   2021-02-15          5062.01                   Kent Ave & S 11 St   
2   2021-03-11          4175.15                     Union St & 4 Ave   
3   2021-03-14          5450.04                N 12 St & Bedford Ave   
4   2021-03-31          4628.05             Willoughby St & Fleet St   

   start_lat  start_lng end_station_id                    end_station_name  \
0  40.820877 -73.939249        7631.23  Frederick Douglass Blvd & W 112 St   
1  40.707645 -73.968415        5300.06              Union Ave & Jackson St   
2  40.677274 -73.982820        3704.04            Flatbush Ave & Ocean Ave   
3  40.720798 -73.954847        4354.05          S Portland Ave & Hanson Pl   
4  40.691966 -73.981302        4528.01           DeKalb Ave & Franklin Ave   

     end_lat    end_lng  total  
0  40.801694 -73.957145      1  
1  40.716075 -73.952029      1  

In [9]:
# Number of aggregated rows returned by the DuckDB query.
out.shape[0]

21468339

In [None]:
from pathlib import Path


# Build the destination path for the Parquet file.
pasta = Path(r"C:\Riguel\Master\Clases\TFM\py\bike 2020")
arquivo_saida = pasta.joinpath("citibike_20.parquet")

# Write the aggregated table with the pyarrow engine, leaving out the pandas index.
out.to_parquet(path=arquivo_saida, engine="pyarrow", index=False)

print(f"✅ Arquivo Parquet salvo com sucesso em:\n{arquivo_saida}")

✅ Arquivo Parquet salvo com sucesso em:
C:\Riguel\Master\Clases\TFM\py\bike 2022\citibike_22.parquet
