In [1]:
# Import libraries
import pandas as pd
from pathlib import Path
import glob
import zipfile

In [2]:
# Examine each zip file to ensure columns/data match

# Path to the each ZIP file
zip_path = "Resources/202412-citibike-tripdata.zip"

# Open the ZIP file and read its first CSV
with zipfile.ZipFile(zip_path, 'r') as z:
    csv_filename = z.namelist()[0]  # Get the first file inside the ZIP
    print(f"Extracting: {csv_filename}")  # Show the file name inside the ZIP
    with z.open(csv_filename) as f:
        df = pd.read_csv(f,low_memory=False)  # Load CSV into DataFrame

# Display basic info about the DataFrame
print(df.info())  # Shows column names, data types, and missing values
print(df.head())  # Shows first 5 rows


Extracting: 202412-citibike-tripdata_1.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   ride_id             1000000 non-null  object 
 1   rideable_type       1000000 non-null  object 
 2   started_at          1000000 non-null  object 
 3   ended_at            1000000 non-null  object 
 4   start_station_name  999375 non-null   object 
 5   start_station_id    999375 non-null   object 
 6   end_station_name    996417 non-null   object 
 7   end_station_id      995525 non-null   object 
 8   start_lat           1000000 non-null  float64
 9   start_lng           1000000 non-null  float64
 10  end_lat             999795 non-null   float64
 11  end_lng             999795 non-null   float64
 12  member_casual       1000000 non-null  object 
dtypes: float64(4), object(9)
memory usage: 99.2+ MB
None
            ride_id  r

In [3]:
df.nunique()

ride_id               1000000
rideable_type               2
started_at             999269
ended_at               999235
start_station_name       2137
start_station_id         2138
end_station_name         2129
end_station_id           2148
start_lat                2160
start_lng                2153
end_lat                  2144
end_lng                  2135
member_casual               2
dtype: int64

In [10]:
# Get all ZIP files in the directory
zip_files = glob.glob("Resources/*.zip")

# Define chunk size (adjust based on RAM capacity)
chunk_size = 100_000  # Process 100,000 rows at a time

# Placeholder for merged DataFrame (use a list to store processed chunks)
dfs = []

# Iterate over each ZIP file
for zip_file in zip_files:
    with zipfile.ZipFile(zip_file, 'r') as z:
        # Iterate over all CSV files inside the ZIP
        for csv_filename in z.namelist():
            if csv_filename.endswith('.csv'):  # Check if it's a CSV file
                print(f"Processing file: {csv_filename}")  # Show the current file being processed
                
                # Open the CSV file inside the ZIP and read it in chunks
                with z.open(csv_filename) as f:
                    for chunk in pd.read_csv(f, chunksize=chunk_size, low_memory=False):
                        chunk.drop_duplicates(inplace=True)
                        chunk.dropna()
                        dfs.append(chunk)

# Merge all chunks into one final DataFrame
df_final = pd.concat(dfs, ignore_index=True)

# Display basic information about the final DataFrame
print(df_final.info())


Processing file: 202412-citibike-tripdata_1.csv
Processing file: 202412-citibike-tripdata_3.csv
Processing file: 202412-citibike-tripdata_2.csv
Processing file: 202501-citibike-tripdata_1.csv
Processing file: 202501-citibike-tripdata_3.csv
Processing file: 202501-citibike-tripdata_2.csv
Processing file: 202411-citibike-tripdata_3.csv
Processing file: 202411-citibike-tripdata_2.csv
Processing file: 202411-citibike-tripdata_1.csv
Processing file: 202411-citibike-tripdata_4.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8145780 entries, 0 to 8145779
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_la

In [11]:
# Save to CSV
df_final.to_csv("Output/citybike_cleaned_data.csv", index=False)