In [5]:
# Importing libraries
import pyarrow.parquet as pq
import pandas as pd

In [6]:
# Function to process the parquet files and return a single dataframe
def process_data(file_paths):
    dataframes = []
    for file_path in file_paths:
        table = pq.read_table(file_path)# Reading the parquet file as a table
        df = table.to_pandas()# Converting the table to a pandas dataframe
        dataframes.append(df)# Appending the dataframe to the list
    
    # Standardizing column names based on the first dataframe
    col_names_std = dataframes[0].columns.tolist()
    for df in dataframes:
        df.columns = col_names_std
    
    # Printing size of the dataframes
    for i, df in enumerate(dataframes):
        print(f"Shape of dataframe {i+1}: ", df.shape)
    print("Total Size of all dataframes: ", sum([df.shape[0] for df in dataframes]))

    # Concatenating the dataframes
    combined_df = pd.concat(dataframes, ignore_index=True)
    print("Shape of combined dataframe: ", combined_df.shape)
    
    return combined_df

In [7]:
# Reading the parquet files
file_paths = ['D:\TLC-NYC-Data-Analytics-End-To-End\env\Data\Monthly\y2021\yellow_tripdata_2021-01.parquet',
              'D:\TLC-NYC-Data-Analytics-End-To-End\env\Data\Monthly\y2021\yellow_tripdata_2021-02.parquet',
              'D:\TLC-NYC-Data-Analytics-End-To-End\env\Data\Monthly\y2021\yellow_tripdata_2021-03.parquet',
              'D:\TLC-NYC-Data-Analytics-End-To-End\env\Data\Monthly\y2021\yellow_tripdata_2021-04.parquet',
              'D:\TLC-NYC-Data-Analytics-End-To-End\env\Data\Monthly\y2021\yellow_tripdata_2021-05.parquet',
              'D:\TLC-NYC-Data-Analytics-End-To-End\env\Data\Monthly\y2021\yellow_tripdata_2021-06.parquet',
              'D:\TLC-NYC-Data-Analytics-End-To-End\env\Data\Monthly\y2021\yellow_tripdata_2021-07.parquet',
              'D:\TLC-NYC-Data-Analytics-End-To-End\env\Data\Monthly\y2021\yellow_tripdata_2021-08.parquet',
              'D:\TLC-NYC-Data-Analytics-End-To-End\env\Data\Monthly\y2021\yellow_tripdata_2021-09.parquet',
              'D:\TLC-NYC-Data-Analytics-End-To-End\env\Data\Monthly\y2021\yellow_tripdata_2021-10.parquet',
              'D:\TLC-NYC-Data-Analytics-End-To-End\env\Data\Monthly\y2021\yellow_tripdata_2021-11.parquet',
              'D:\TLC-NYC-Data-Analytics-End-To-End\env\Data\Monthly\y2021\yellow_tripdata_2021-12.parquet'
              ]
yellow_tripdata_2021 = process_data(file_paths)

Shape of dataframe 1:  (1369769, 19)
Shape of dataframe 2:  (1371709, 19)
Shape of dataframe 3:  (1925152, 19)
Shape of dataframe 4:  (2171187, 19)
Shape of dataframe 5:  (2507109, 19)
Shape of dataframe 6:  (2834264, 19)
Shape of dataframe 7:  (2821746, 19)
Shape of dataframe 8:  (2788757, 19)
Shape of dataframe 9:  (2963793, 19)
Shape of dataframe 10:  (3463504, 19)
Shape of dataframe 11:  (3472949, 19)
Shape of dataframe 12:  (3214369, 19)
Total Size of all dataframes:  30904308
Shape of combined dataframe:  (30904308, 19)


In [8]:
# Checking for null values
nan_in_each_column = yellow_tripdata_2021.isna().sum()
print(nan_in_each_column)

VendorID                       0
tpep_pickup_datetime           0
tpep_dropoff_datetime          0
passenger_count          1478695
trip_distance                  0
RatecodeID               1478695
store_and_fwd_flag       1478695
PULocationID                   0
DOLocationID                   0
payment_type                   0
fare_amount                    0
extra                          0
mta_tax                        0
tip_amount                     0
tolls_amount                   0
improvement_surcharge          0
total_amount                   0
congestion_surcharge     1478695
airport_fee              5641418
dtype: int64


In [9]:
# Filling null values
yellow_tripdata_2021['congestion_surcharge'] = yellow_tripdata_2021['congestion_surcharge'].fillna(0)
yellow_tripdata_2021['airport_fee'] = yellow_tripdata_2021['airport_fee'].fillna(0)
yellow_tripdata_2021['passenger_count'] = yellow_tripdata_2021['passenger_count'].fillna(0)
yellow_tripdata_2021['RatecodeID'] = yellow_tripdata_2021['RatecodeID'].fillna(1.0)
yellow_tripdata_2021['store_and_fwd_flag'] = yellow_tripdata_2021['store_and_fwd_flag'].fillna('N')

In [10]:
# Checking for null values again
nan_in_each_column = yellow_tripdata_2021.isna().sum()
print(nan_in_each_column)

VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
congestion_surcharge     0
airport_fee              0
dtype: int64


In [11]:
# Checking datatypes
yellow_tripdata_2021.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object

In [12]:
# Converting datatypes
yellow_tripdata_2021['passenger_count'] = yellow_tripdata_2021['passenger_count'].astype('int64')
yellow_tripdata_2021['RatecodeID'] = yellow_tripdata_2021['RatecodeID'].astype('int64')
yellow_tripdata_2021['store_and_fwd_flag'] = yellow_tripdata_2021['store_and_fwd_flag'].astype(str)

In [None]:
# Checking datatypes again
yellow_tripdata_2021.dtypes

In [13]:
# Checking for invalid values
mask1 = ~yellow_tripdata_2021['VendorID'].isin([1, 2]) # Check 'VendorID' column contains values other than 1 and 2
mask2 = ~yellow_tripdata_2021['RatecodeID'].isin([1, 2, 3, 4, 5, 6]) # Check 'RatecodeID' column contains values other than 1, 2, 3, 4, 5, and 6
mask3 = ~yellow_tripdata_2021['store_and_fwd_flag'].isin(['Y', 'N']) # Check if 'store_and_fwd_flag' column contains values other than 'Y' and 'N'
mask4 = ~yellow_tripdata_2021['payment_type'].isin([1, 2, 3, 4, 5, 6]) # Check if 'payment_type' column contains values other than 1, 2, 3, 4, 5, and 6
print(mask1.sum())
print(mask2.sum())
print(mask3.sum())
print(mask4.sum())

123917
11538
0
1478695


In [14]:
# Dropping invalid values for VendorID
yellow_tripdata_2022 = yellow_tripdata_2021.drop(yellow_tripdata_2021[mask1].index)

In [15]:
# Replace the values in RatecodeID column with 1 if they are invalid
yellow_tripdata_2021.loc[mask2, 'RatecodeID'] = 1

In [16]:
# Replace the values in payment_type column with 5 if they are invalid
yellow_tripdata_2021.loc[mask4, 'payment_type'] = 5

In [None]:
# Checking for invalid values again
mask1 = ~yellow_tripdata_2021['VendorID'].isin([1, 2]) # Check 'VendorID' column contains values other than 1 and 2
mask2 = ~yellow_tripdata_2021['RatecodeID'].isin([1, 2, 3, 4, 5, 6]) # Check 'RatecodeID' column contains values other than 1, 2, 3, 4, 5, and 6
mask3 = ~yellow_tripdata_2021['store_and_fwd_flag'].isin(['Y', 'N']) # Check if 'store_and_fwd_flag' column contains values other than 'Y' and 'N'
mask4 = ~yellow_tripdata_2021['payment_type'].isin([1, 2, 3, 4, 5, 6]) # Check if 'payment_type' column contains values other than 1, 2, 3, 4, 5, and 6
print(mask1.sum())
print(mask2.sum())
print(mask3.sum())
print(mask4.sum())

In [17]:
# Total no. of rows after dropping invalid values
yellow_tripdata_2021.shape[0]

30904308

In [18]:
# Viewing the dataframe after preprocessing
yellow_tripdata_2021.head(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1,2.1,1,N,142,43,2,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5,0.0
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1,0.2,1,N,238,151,2,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0,0.0
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1,14.7,1,N,132,165,1,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0,0.0
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0,10.6,1,N,138,132,1,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0,0.0
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1,4.94,1,N,68,33,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5,0.0
5,1,2021-01-01 00:16:29,2021-01-01 00:24:30,1,1.6,1,N,224,68,1,8.0,3.0,0.5,2.35,0.0,0.3,14.15,2.5,0.0
6,1,2021-01-01 00:00:28,2021-01-01 00:17:28,1,4.1,1,N,95,157,2,16.0,0.5,0.5,0.0,0.0,0.3,17.3,0.0,0.0
7,1,2021-01-01 00:12:29,2021-01-01 00:30:34,1,5.7,1,N,90,40,2,18.0,3.0,0.5,0.0,0.0,0.3,21.8,2.5,0.0
8,1,2021-01-01 00:39:16,2021-01-01 01:00:13,1,9.1,1,N,97,129,4,27.5,0.5,0.5,0.0,0.0,0.3,28.8,0.0,0.0
9,1,2021-01-01 00:26:12,2021-01-01 00:39:46,2,2.7,1,N,263,142,1,12.0,3.0,0.5,3.15,0.0,0.3,18.95,2.5,0.0


In [19]:
# Viewing the dataframe after preprocessing
yellow_tripdata_2021.tail(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
30904298,2,2021-12-31 23:37:00,2022-01-01 00:03:00,0,8.84,1,N,152,173,5,39.15,0.0,0.5,0.0,6.55,0.3,46.5,0.0,0.0
30904299,2,2021-12-31 23:18:00,2021-12-31 23:30:00,0,1.53,1,N,79,211,5,13.2,0.0,0.5,0.75,0.0,0.3,17.25,0.0,0.0
30904300,2,2021-12-31 23:11:43,2021-12-31 23:35:19,0,5.13,1,N,164,41,5,21.07,0.0,0.5,5.25,0.0,0.3,29.62,0.0,0.0
30904301,2,2021-12-31 23:16:00,2021-12-31 23:39:00,0,3.83,1,N,90,236,5,17.66,0.0,0.5,4.49,0.0,0.3,25.45,0.0,0.0
30904302,2,2021-12-31 23:09:17,2021-12-31 23:15:10,0,0.95,1,N,236,141,5,13.2,0.0,0.5,4.38,0.0,0.3,20.88,0.0,0.0
30904303,2,2021-12-31 23:18:02,2021-12-31 23:32:19,0,3.99,1,N,145,107,5,17.42,0.0,0.5,5.8,0.0,0.3,26.52,0.0,0.0
30904304,2,2021-12-31 23:46:00,2022-01-01 00:07:00,0,5.96,1,N,236,158,5,23.05,0.0,0.5,2.85,0.0,0.3,29.2,0.0,0.0
30904305,2,2021-12-31 23:46:00,2022-01-01 00:04:00,0,4.99,1,N,48,13,5,20.97,0.0,0.5,5.23,0.0,0.3,29.5,0.0,0.0
30904306,2,2021-12-31 23:53:00,2022-01-01 00:02:00,0,1.36,1,N,75,75,5,13.2,0.0,0.5,2.0,0.0,0.3,16.0,0.0,0.0
30904307,2,2021-12-31 23:19:28,2021-12-31 23:26:30,0,1.99,1,N,170,237,5,13.2,0.0,0.5,2.63,0.0,0.3,19.13,0.0,0.0


In [20]:
# Writing the dataframe to a parquet file
yellow_tripdata_2021.to_parquet('D:\TLC-NYC-Data-Analytics-End-To-End\env\Data\Yearly\yellow_tripdata_2021.parquet.gzip', compression='gzip')