# Data ETL 

In [11]:
from email.contentmanager import raw_data_manager

# Dependencies
import pandas as pd
import os
import random

from warnings import filterwarnings
filterwarnings('ignore')

In [12]:
# Function to read and sample data from multiple CSVs for a specific month
def read_and_sample_monthly_data(month_folder_path, sample_frac=0.05):
    """Read every CSV in a month folder and return a random sample of the rows.

    Each file directly inside ``month_folder_path`` whose name ends with
    ``.csv`` is loaded with ``pd.read_csv`` and sampled on its own, so every
    file contributes ``sample_frac`` of its rows.

    Args:
        month_folder_path: Directory holding the month's CSV files.
        sample_frac: Fraction of rows to keep from each file (default 0.05).

    Returns:
        A DataFrame with the sampled rows of all files concatenated under a
        fresh 0..n-1 index, or an empty DataFrame if the folder contains no
        CSV files.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be
            listed (e.g. ``FileNotFoundError`` if it does not exist).
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible across machines and runs.
    csv_names = sorted(
        name for name in os.listdir(month_folder_path) if name.endswith('.csv')
    )

    monthly_data = []
    for name in csv_names:
        file_path = os.path.join(month_folder_path, name)
        print(f"Reading file: {file_path}")
        df = pd.read_csv(file_path)
        # Fixed seed so the same file always yields the same sample
        monthly_data.append(df.sample(frac=sample_frac, random_state=42))

    # pd.concat raises on an empty list, so handle "no CSVs" explicitly
    if not monthly_data:
        return pd.DataFrame()
    return pd.concat(monthly_data, ignore_index=True)

In [13]:
# Function to process all months and sample a fraction of each CSV
def process_all_data(base_folder, year_month_folders, sample_frac=0.05):
    """Sample every listed month folder and combine the results.

    Args:
        base_folder: Directory that the entries of ``year_month_folders`` are
            joined onto.
        year_month_folders: Iterable of folder paths relative to
            ``base_folder``, one per month to process.
        sample_frac: Fraction of rows to keep from each CSV file; forwarded
            to ``read_and_sample_monthly_data`` (default 0.05).

    Returns:
        A DataFrame with the sampled rows of all folders concatenated under a
        fresh 0..n-1 index, or an empty DataFrame if nothing was sampled.
    """
    all_sampled_data = []

    for year_month_folder in year_month_folders:
        folder_path = os.path.join(base_folder, year_month_folder)
        print(f"Processing folder: {folder_path}")
        monthly_sampled_data = read_and_sample_monthly_data(
            folder_path, sample_frac=sample_frac
        )
        # Skip months that produced no rows so they do not enter the concat
        if not monthly_sampled_data.empty:
            all_sampled_data.append(monthly_sampled_data)

    # pd.concat raises on an empty list, so handle "no data" explicitly
    if not all_sampled_data:
        return pd.DataFrame()
    return pd.concat(all_sampled_data, ignore_index=True)

In [14]:
# The notebook sits in the same folder as the data, so the month folders
# listed below are resolved relative to the current directory.
base_folder = './'

In [15]:
# Folders to process: one per month of 2022 and 2023, each laid out as
# '<year>-citibike-tripdata/<year><month>-citibike-tripdata'
year_month_folders = [
    f'{year}-citibike-tripdata/{year}{month:02d}-citibike-tripdata'
    for year in (2022, 2023)
    for month in range(1, 13)
]

In [16]:
# Run the full pipeline: sample each listed month folder (5% of every CSV,
# the default fraction) and combine the results into one DataFrame
all_sampled_data = process_all_data(base_folder, year_month_folders)

# Print (rows, columns) of the combined sample as a quick sanity check
print(f"Total sampled data shape: {all_sampled_data.shape}")

Processing folder: ./2022-citibike-tripdata/202201-citibike-tripdata
Reading file: ./2022-citibike-tripdata/202201-citibike-tripdata/202201-citibike-tripdata_2.csv
Reading file: ./2022-citibike-tripdata/202201-citibike-tripdata/202201-citibike-tripdata_1.csv
Processing folder: ./2022-citibike-tripdata/202202-citibike-tripdata
Reading file: ./2022-citibike-tripdata/202202-citibike-tripdata/202202-citibike-tripdata_1.csv
Reading file: ./2022-citibike-tripdata/202202-citibike-tripdata/202202-citibike-tripdata_2.csv
Processing folder: ./2022-citibike-tripdata/202203-citibike-tripdata
Reading file: ./2022-citibike-tripdata/202203-citibike-tripdata/202203-citibike-tripdata_2.csv
Reading file: ./2022-citibike-tripdata/202203-citibike-tripdata/202203-citibike-tripdata_1.csv
Processing folder: ./2022-citibike-tripdata/202204-citibike-tripdata
Reading file: ./2022-citibike-tripdata/202204-citibike-tripdata/202204-citibike-tripdata_1.csv
Reading file: ./2022-citibike-tripdata/202204-citibike-trip

In [17]:
# Export the combined sample to a single CSV file in the current directory;
# index=False keeps the DataFrame's row index out of the file
output_file = './NYC_tripdata_sampled.csv'  # Output file in the same directory
all_sampled_data.to_csv(output_file, index=False)
print(f"Sampled data saved to {output_file}")

Sampled data saved to ./NYC_tripdata_sampled.csv
