In [2]:
import os
import pandas as pd
import json

In [6]:
# Directory that holds every input, intermediate and merged file of this notebook.
data_folder = "../raw_data"

# All file locations are resolved inside data_folder; the names on the left
# pair up one-to-one, in order, with the file names listed on the right.
(
    yellow_data_path,
    green_data_path,
    preprocessed_data_path,
    weather_data_path,
    zone_data_path,
    merged_data_path,
    merged2_data_path,
    merged3_data_path,
) = (
    os.path.join(data_folder, file_name)
    for file_name in (
        "yellow_data.csv",
        "green_data.csv",
        "preprocessed_taxi_data.csv",
        "weather_data.json",
        "taxi_zones.csv",
        "merged_data.csv",
        "merged2_data.csv",
        "merged3_data.csv",
    )
)

# Number of rows per chunk handed to pd.read_csv(chunksize=...).
CHUNK_SIZE = 10000

In [3]:
def preprocess_chunk(chunk, taxi_type, is_green=False):
    """Tag, rename and clean one chunk of taxi trips, modifying it in place.

    Args:
        chunk: DataFrame holding one chunk of trip rows; it is mutated.
        taxi_type: Value stored in the new 'taxi_type' column for every row.
        is_green: When True, the 'trip_type' and 'ehail_fee' columns are
            removed (a KeyError is raised if either one is absent).

    Returns:
        The same DataFrame object that was passed in, after every row
        containing at least one missing value has been dropped.
    """
    # Maps both the 'tpep_' and 'lpep_' prefixed datetime columns onto the
    # shared names 'pickup_datetime' / 'dropoff_datetime'.
    datetime_aliases = {
        f'{prefix}_{suffix}': suffix
        for prefix in ('tpep', 'lpep')
        for suffix in ('pickup_datetime', 'dropoff_datetime')
    }

    chunk['taxi_type'] = taxi_type
    chunk.rename(columns=datetime_aliases, inplace=True)

    if is_green:
        # Dropped one at a time, in this order, before the NaN filter runs.
        for green_only_column in ('trip_type', 'ehail_fee'):
            chunk.drop(columns=[green_only_column], inplace=True)

    # Any row with a missing value in any remaining column is discarded.
    chunk.dropna(inplace=True)
    return chunk

In [None]:
# True until the first chunk has been written; controls whether the output
# file is (re)created with a header or appended to without one.
is_first_chunk = True

# Column layout of the output CSV. It is fixed by the first chunk written,
# because only that chunk emits a header row and to_csv writes values
# positionally: a later chunk with a different column order (or a different
# set of columns) would otherwise end up under the wrong headers.
output_columns = None

# (label used in the progress message, source file, taxi_type tag, is_green)
# Green is processed first, so its cleaned column order defines the layout.
taxi_sources = [
    ("Green", green_data_path, "green", True),
    ("Yellow", yellow_data_path, "yellow", False),
]

for source_label, source_path, source_taxi_type, source_is_green in taxi_sources:
    print(f"Processing {source_label} Taxi Data...")
    for raw_chunk in pd.read_csv(source_path, chunksize=CHUNK_SIZE, low_memory=False):
        clean_chunk = preprocess_chunk(
            raw_chunk, taxi_type=source_taxi_type, is_green=source_is_green
        )

        if output_columns is None:
            output_columns = list(clean_chunk.columns)
        else:
            # Align to the header already on disk. Columns that the first
            # chunk did not have are dropped; columns this chunk lacks are
            # added and filled with NaN.
            clean_chunk = clean_chunk.reindex(columns=output_columns)

        clean_chunk.to_csv(
            preprocessed_data_path,
            mode='w' if is_first_chunk else 'a',
            header=is_first_chunk,
            index=False,
        )
        is_first_chunk = False

print(f"Final preprocessed data saved to '{preprocessed_data_path}'")


In [7]:
# Read the weather JSON; the records live under its top-level "data" key.
with open(weather_data_path, 'r') as f:
    weather_json = json.load(f)
weather_data = weather_json["data"]

# Weather table: remove exact duplicate rows, then drop the 'tsun' and
# 'wpgt' columns.
weather_df = (
    pd.DataFrame(weather_data)
    .drop_duplicates()
    .drop(columns=['tsun', 'wpgt'])
)

# Parse 'date' as YYYY-MM-DD; values that do not match become NaT.
weather_df['date'] = pd.to_datetime(
    weather_df['date'],
    format='%Y-%m-%d',
    errors='coerce',
)

# Zone lookup table, with exact duplicate rows removed.
zone_df = pd.read_csv(zone_data_path).drop_duplicates()

# Show both tables.
print(weather_df)
print(zone_df)


          date  tavg  tmin  tmax  prcp  snow   wdir  wspd    pres
0   2021-01-01   3.6   2.2   4.4  17.3   0.0   36.0  13.2  1029.4
1   2021-01-02   6.3   3.3  12.2   6.1   0.0  319.0  11.6  1012.9
2   2021-01-03   3.4   2.8   4.4   5.7   0.0   33.0  22.4  1017.1
3   2021-01-04   4.6   2.8   7.2   0.7   0.0    7.0   8.1  1015.0
4   2021-01-05   4.4   3.3   5.6   0.0   0.0    0.0   8.1  1013.5
..         ...   ...   ...   ...   ...   ...    ...   ...     ...
360 2021-12-27   1.4  -0.6   4.1   5.3   0.0   33.0   8.9  1017.2
361 2021-12-28   6.1   3.7   9.0   3.4   0.0  297.0   8.0  1010.3
362 2021-12-29   6.3   4.4   9.9   7.7   0.0   49.0   9.4  1012.0
363 2021-12-30   6.4   4.1   9.9   1.7   0.0   33.0   5.7  1013.9
364 2021-12-31   8.2   5.7  10.1   0.3   0.0  218.0   8.0  1013.9

[365 rows x 9 columns]
     location_id        borough                       zone
0              1            EWR             Newark Airport
1              2         Queens                Jamaica Bay
2      

In [None]:


print("Merging taxi data with weather data...")

# Reset the first-chunk flag for THIS output file. Without this the cell
# depends on whatever value an earlier cell left behind, and the merged CSV
# would be appended to (not recreated) and written without a header row.
is_first_chunk = True

# Loop-invariant: make the weather join key a midnight-normalised datetime
# once, instead of on every chunk.
weather_df['date'] = pd.to_datetime(weather_df['date'], errors='coerce').dt.normalize()

for chunk in pd.read_csv(
    preprocessed_data_path,
    chunksize=CHUNK_SIZE,
    parse_dates=['dropoff_datetime']):
    # Join key: the drop-off timestamp truncated to midnight. The explicit
    # to_datetime keeps this working (unparseable values become NaT) even if
    # read_csv could not parse the column and left it as plain objects.
    chunk['date'] = pd.to_datetime(chunk['dropoff_datetime'], errors='coerce').dt.normalize()

    # Sanity check of the derived key, shown for the first chunk only.
    if is_first_chunk:
        print(f"chunk['date'] preview: {chunk['date'].head()}")

    # Left join keeps every trip; trips without a matching weather row get
    # NaN in the weather columns.
    merged_chunk = chunk.merge(weather_df, how='left', on='date')

    # Save merged data incrementally: create the file with a header on the
    # first chunk, append without a header afterwards.
    merged_chunk.to_csv(
        merged_data_path,
        mode='w' if is_first_chunk else 'a',
        header=is_first_chunk,
        index=False,
    )
    is_first_chunk = False

print(f"Merged data saved to '{merged_data_path}'")

In [1]:
CHUNK_SIZE = 10000

# Running totals of rows whose location ID is the literal string 'N':
# n_total_count for DOLocationID, n2_total_count for PULocationID.
n_total_count = 0
n2_total_count = 0

# Only the two location columns are needed, and they are read as text so the
# comparison with 'N' does not depend on per-chunk dtype inference.
location_columns = ['DOLocationID', 'PULocationID']

for chunk in pd.read_csv(
    merged_data_path,
    chunksize=CHUNK_SIZE,
    usecols=location_columns,
    dtype=str,
):
    n_total_count += (chunk['DOLocationID'] == 'N').sum()
    n2_total_count += (chunk['PULocationID'] == 'N').sum()

# Totals over the whole file.
print(f"Total count of 'N' in DOLocationID: {n_total_count}")
print(f"Total count of 'N' in PULocationID: {n2_total_count}")


NameError: name 'pd' is not defined

In [9]:
import pandas as pd

# NOTE(review): not used anywhere in this cell; kept in case another cell
# reads it — confirm and remove if not.
processed_trip_data = pd.DataFrame()
is_first_chunk = True

CHUNK_SIZE = 10000

# Loop-invariant: the zone key is compared as stripped text, so convert it
# once up front rather than on every chunk.
zone_df['location_id'] = zone_df['location_id'].astype(str).str.strip()

# PULocationID is read as text so that numeric inference cannot turn an ID
# such as 132 into '132.0' (which would never match the zone key).
for chunk in pd.read_csv(merged_data_path, chunksize=CHUNK_SIZE, dtype={'PULocationID': str}):

    chunk['PULocationID'] = chunk['PULocationID'].astype(str).str.strip()

    # Attach the pickup zone's borough and zone name to each trip.
    chunk = chunk.merge(zone_df, how='left', left_on='PULocationID', right_on='location_id')
    chunk.rename(columns={'borough': 'PUBorough', 'zone': 'PUZone'}, inplace=True)

    # Trips whose pickup ID matched no zone have no borough; discard them.
    chunk.dropna(subset=['PUBorough'], inplace=True)

    # Recreate the file on the first chunk so that re-running this cell does
    # not append a second copy (and a second header) to an existing file.
    chunk.to_csv(
        merged2_data_path,
        mode='w' if is_first_chunk else 'a',
        header=is_first_chunk,
        index=False,
    )
    is_first_chunk = False

print(f"Merged 2 data saved to '{merged2_data_path}'")


Merged 2 data saved to '../raw_data/merged2_data.csv'
