In [1]:
import os
import gc
from google.colab import drive
from google.colab import userdata

In [2]:
# Mount Google Drive into the Colab filesystem so the project folder
# under /content/drive is reachable from the cells below.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Root folder of this project on the mounted Drive.
project_folder = '/content/drive/MyDrive/SU Works/CPSC_5310_Project'
# Subfolder that holds the taxi-trip CSV files.
dataset_folder = os.path.join(project_folder, 'dataset')

In [4]:
# Make sure the dataset folder exists on Drive and report which case applied.
if os.path.exists(dataset_folder):
    print(f"Folder already exists: {dataset_folder}")
else:
    os.makedirs(dataset_folder)
    print(f"Created new folder: {dataset_folder}")

Folder already exists: /content/drive/MyDrive/SU Works/CPSC_5310_Project/dataset


In [5]:
# Check if the data is already there
# (Checking for one of the expected CSV files)
expected_file = os.path.join(dataset_folder, 'yellow_tripdata_2016-01.csv')

if not os.path.exists(expected_file):
    print("Data not found in Drive. Downloading from Kaggle...")

    # Authenticate securely
    os.environ['KAGGLE_USERNAME'] = userdata.get('KAGGLE_USERNAME')
    os.environ['KAGGLE_KEY'] = userdata.get('KAGGLE_KEY')

    # Change directory to Drive folder so files land there
    %cd "$dataset_folder"

    # Download and Unzip
    !kaggle datasets download -d elemento/nyc-yellow-taxi-trip-data
    !unzip -q nyc-yellow-taxi-trip-data.zip

    # Cleanup: Remove the zip file to save Drive space
    !rm nyc-yellow-taxi-trip-data.zip
    print("Download and extraction complete!")

else:
    print("Data found in Drive! Skipping download.")

Data found in Drive! Skipping download.


# Loading the Data

In [6]:
import glob
import pandas as pd
import time
import shutil

In [7]:
# Collect every CSV in the dataset folder. glob returns matches in arbitrary
# (filesystem-dependent) order, so sort them to make the load order -- and
# therefore the row order of the combined frame -- reproducible.
drive_files = sorted(glob.glob(os.path.join(dataset_folder, "*.csv")))
print(f"Found {len(drive_files)} files: {[os.path.basename(f) for f in drive_files]}")

Found 4 files: ['yellow_tripdata_2015-01.csv', 'yellow_tripdata_2016-01.csv', 'yellow_tripdata_2016-02.csv', 'yellow_tripdata_2016-03.csv']


**Issue Identified:**
During the initial inspection, column names were found to be inconsistent across the monthly CSV files. Specifically, the rate code column appears as either `RateCodeID` (capital 'C') or `RatecodeID` (lowercase 'c'), depending on the file version.

To ensure data integrity during concatenation:
1.  All column names are stripped of leading and trailing whitespace.
2.  Any instance of `RatecodeID` is programmatically renamed to `RateCodeID` to enforce a unified schema.

This prevents the same field from being split into two columns that are padded with `NaN` values in the final dataset.

In [8]:
# Read every CSV listed in drive_files, normalise its header, and stack the
# parts into one frame (full_df). total_expected_rows accumulates the per-file
# row counts so the combined frame can be checked against it afterwards.
start_read = time.time()
total_expected_rows = 0

dataframes = []
for f in drive_files:
    df_part = pd.read_csv(f, engine='pyarrow')

    # Header spellings differ between file versions: strip stray whitespace
    # and map 'RatecodeID' onto 'RateCodeID' so the parts share one schema.
    # Assigning the header in one step needs neither a conditional nor
    # inplace=True.
    df_part.columns = [
        'RateCodeID' if col_name == 'RatecodeID' else col_name
        for col_name in df_part.columns.str.strip()
    ]

    rows = len(df_part)
    total_expected_rows += rows
    print(f" -> {os.path.basename(f)}: {rows:,} rows")

    dataframes.append(df_part)

if not dataframes:
    # pd.concat on an empty list raises a bare "No objects to concatenate"
    # ValueError; raise the same exception type with an actionable message.
    raise ValueError(
        "No CSV files were loaded: the dataset folder contains no *.csv files. "
        "Run the download cell first."
    )

full_df = pd.concat(dataframes, ignore_index=True)

print(f"Loaded {len(full_df):,} rows in {time.time() - start_read:.2f} seconds")

 -> yellow_tripdata_2015-01.csv: 12,748,986 rows
 -> yellow_tripdata_2016-01.csv: 10,906,858 rows
 -> yellow_tripdata_2016-02.csv: 11,382,049 rows
 -> yellow_tripdata_2016-03.csv: 12,210,952 rows
Loaded 47,248,845 rows in 94.73 seconds


In [9]:
# full_df was already built by the loading cell, so it is deliberately NOT
# concatenated again here: a second pd.concat would keep the per-file frames
# and two complete combined frames alive at the same time.
if drive_files:
    # The per-file frames are no longer needed. Emptying the list (instead of
    # `del dataframes`) releases them while keeping this cell re-runnable.
    dataframes.clear()
    gc.collect()

    # Check memory footprint
    mem_usage = full_df.memory_usage(deep=True).sum() / 1024**3  # Convert to GB
    print(f"Total Memory Usage: {mem_usage:.2f} GB")

    print("\n--- Integrity Check ---")
    print(f"Sum of CSV rows:    {total_expected_rows:,}")
    print(f"Final Dataframe rows: {len(full_df):,}")

    if total_expected_rows == len(full_df):
        print("Integrity check passed!")
    else:
        # Stop here rather than let later cells optimise and save a frame
        # whose row count does not match the source files.
        raise ValueError(
            f"Integrity check FAILED: expected {total_expected_rows:,} rows "
            f"but the combined dataframe has {len(full_df):,}."
        )
else:
    print("No files found to combine.")

Total Memory Usage: 8.54 GB

--- Integrity Check ---
Sum of CSV rows:    47,248,845
Final Dataframe rows: 47,248,845
Integrity check passed!


## Memory Optimization

In [10]:
# Baseline footprint of the combined frame, in GB (deep=True also counts the
# contents of object columns), kept for the before/after comparison.
_bytes_before = full_df.memory_usage(deep=True).sum()
initial_memory = _bytes_before / 1024**3
print(f"Initial Memory Usage: {initial_memory:.2f} GB")

Initial Memory Usage: 8.54 GB


In [11]:
# Target dtype for each column, grouped by the kind of saving it provides.
# Key order matches the order in which the columns are converted.
optimization_dtypes = {
    # Small integer codes: one byte each
    **dict.fromkeys(
        ('VendorID', 'passenger_count', 'RateCodeID', 'payment_type'),
        'int8',
    ),

    # Repeated short strings such as 'Y'/'N' are stored once as categories
    'store_and_fwd_flag': 'category',

    # Floats downcast from 64 to 32 bits
    **dict.fromkeys(
        (
            'trip_distance',
            'pickup_longitude',
            'pickup_latitude',
            'dropoff_longitude',
            'dropoff_latitude',
            'fare_amount',
            'extra',
            'mta_tax',
            'tip_amount',
            'tolls_amount',
            'improvement_surcharge',
            'total_amount',
        ),
        'float32',
    ),
}

In [12]:
# Cast each configured column to its smaller dtype and report the saving.
print("Optimizing data types...")
for col, dtype in optimization_dtypes.items():
    if col not in full_df.columns:
        continue

    converted = full_df[col].astype(dtype)

    # Narrowing to a small integer type wraps out-of-range values (and drops
    # fractional parts) without raising, so confirm the cast round-trips
    # before replacing the column; otherwise keep the original dtype.
    if pd.api.types.is_integer_dtype(converted.dtype) and not converted.eq(full_df[col]).all():
        print(f"  Skipping {col}: values do not fit losslessly in {dtype}; keeping {full_df[col].dtype}")
        continue

    full_df[col] = converted

final_memory = full_df.memory_usage(deep=True).sum() / 1024**3
print(f"Final Memory Usage:   {final_memory:.2f} GB")
print(f"Saved:                {initial_memory - final_memory:.2f} GB ({(initial_memory - final_memory)/initial_memory:.1%} reduction)")

Optimizing data types...
Final Memory Usage:   3.04 GB
Saved:                5.50 GB (64.4% reduction)


In [13]:
# List every column with its dtype to confirm the casts were applied.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47248845 entries, 0 to 47248844
Data columns (total 19 columns):
 #   Column                 Dtype        
---  ------                 -----        
 0   VendorID               int8         
 1   tpep_pickup_datetime   datetime64[s]
 2   tpep_dropoff_datetime  datetime64[s]
 3   passenger_count        int8         
 4   trip_distance          float32      
 5   pickup_longitude       float32      
 6   pickup_latitude        float32      
 7   RateCodeID             int8         
 8   store_and_fwd_flag     category     
 9   dropoff_longitude      float32      
 10  dropoff_latitude       float32      
 11  payment_type           int8         
 12  fare_amount            float32      
 13  extra                  float32      
 14  mta_tax                float32      
 15  tip_amount             float32      
 16  tolls_amount           float32      
 17  improvement_surcharge  float32      
 18  total_amount           float32      
dty

### Saving the Full Dataframe

In [14]:
# Output folder (inside the project folder) for saved datasets.
save_data_dir = os.path.join(project_folder, 'saved_data')

In [15]:
# Make sure the output folder exists and report which case applied.
if os.path.exists(save_data_dir):
    print(f"Folder already exists: {save_data_dir}")
else:
    os.makedirs(save_data_dir)
    print(f"Created new folder: {save_data_dir}")

# Write the combined, dtype-optimised frame to a single Parquet file.
save_path = os.path.join(save_data_dir, 'full_dataset.parquet')
full_df.to_parquet(save_path)
print("Saved to Parquet!")

Folder already exists: /content/drive/MyDrive/SU Works/CPSC_5310_Project/saved_data
Saved to Parquet!
