In [1]:
# Importing libraries
import warnings

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
def process_data(file_paths):
    """Read several parquet files and stack them into one DataFrame.

    Every file is loaded with ``pq.read_table`` and converted to pandas.
    Each frame is then relabelled, position by position, with the column
    names of the first file before all frames are concatenated row-wise.

    Parameters
    ----------
    file_paths : iterable of path-like
        Parquet files to load, in the order they should be stacked.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``file_paths`` is empty, or if a file does not have the same
        number of columns as the first file.

    Warns
    -----
    UserWarning
        If a file's column names differ from the first file's by more than
        letter case. The positional relabelling still happens, but it may be
        attaching the wrong name to a column.
    """
    # Materialise once so generators work and emptiness can be checked.
    file_paths = list(file_paths)
    if not file_paths:
        raise ValueError("process_data() needs at least one parquet file path")

    # Read each parquet file as an Arrow table and convert it to pandas.
    dataframes = [pq.read_table(file_path).to_pandas() for file_path in file_paths]

    # Standardizing column names based on the first dataframe.
    col_names_std = dataframes[0].columns.tolist()
    reference_folded = [str(name).casefold() for name in col_names_std]
    for file_path, df in zip(file_paths, dataframes):
        current_names = df.columns.tolist()
        if len(current_names) != len(col_names_std):
            raise ValueError(
                f"{file_path!r} has {len(current_names)} columns, "
                f"expected {len(col_names_std)} (as in {file_paths[0]!r})"
            )
        # Renaming is positional, so anything beyond a case difference could
        # mean the data ends up under the wrong header: surface it loudly.
        if [str(name).casefold() for name in current_names] != reference_folded:
            warnings.warn(
                f"Columns of {file_path!r} differ from {file_paths[0]!r} by more "
                f"than letter case; they are renamed by position: "
                f"{current_names} -> {col_names_std}",
                stacklevel=2,
            )
        df.columns = col_names_std

    # Printing size of the dataframes
    for i, df in enumerate(dataframes, start=1):
        print(f"Shape of dataframe {i}: ", df.shape)
    print("Total Size of all dataframes: ", sum(len(df) for df in dataframes))

    # Concatenating the dataframes
    combined_df = pd.concat(dataframes, ignore_index=True)
    print("Shape of combined dataframe: ", combined_df.shape)

    return combined_df

In [3]:
# Folder holding the monthly yellow-taxi parquet files. The raw string keeps
# the Windows backslashes literal: in a normal literal, sequences such as
# "\T", "\e" or "\y" are invalid escapes that newer Python versions warn
# about and will eventually reject.
DATA_DIR = r'D:\TLC-NYC-Data-Analytics-End-To-End\env\Src\Data\Monthly'

# One file per month, January (01) through July (07) 2023.
file_paths = [
    rf'{DATA_DIR}\yellow_tripdata_2023-{month:02d}.parquet'
    for month in range(1, 8)
]
yellow_tripdata_2023 = process_data(file_paths)

Shape of dataframe 1:  (3066766, 19)
Shape of dataframe 2:  (2913955, 19)
Shape of dataframe 3:  (3403766, 19)
Shape of dataframe 4:  (3288250, 19)
Shape of dataframe 5:  (3513649, 19)
Shape of dataframe 6:  (3307234, 19)
Shape of dataframe 7:  (2907108, 19)
Total Size of all dataframes:  22400728
Shape of combined dataframe:  (22400728, 19)


In [4]:
# Preview the first five rows of the combined frame.
yellow_tripdata_2023.head(n=5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [5]:
# Preview the last five rows of the combined frame.
yellow_tripdata_2023.tail(n=5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
22400723,2,2023-07-31 23:18:11,2023-07-31 23:18:23,,0.0,,,106,106,0,21.58,0.0,0.5,0.0,0.0,1.0,23.08,,
22400724,2,2023-07-31 23:29:31,2023-07-31 23:51:17,,4.97,,,238,243,0,22.12,0.0,0.5,0.0,0.0,1.0,23.62,,
22400725,2,2023-07-31 23:16:57,2023-07-31 23:17:25,,0.03,,,141,141,0,23.93,0.0,0.5,0.0,0.0,1.0,27.93,,
22400726,2,2023-07-31 23:29:00,2023-07-31 23:46:00,,3.24,,,65,62,0,9.54,0.0,0.5,3.0,0.0,1.0,14.04,,
22400727,2,2023-07-31 23:34:13,2023-07-31 23:48:51,,2.08,,,162,143,0,14.33,0.0,0.5,0.0,0.0,1.0,18.33,,


In [7]:
# Tally the missing entries of every column and print the resulting Series.
nan_in_each_column = yellow_tripdata_2023.isnull().sum(axis=0)
print(nan_in_each_column)

VendorID                      0
tpep_pickup_datetime          0
tpep_dropoff_datetime         0
passenger_count          613638
trip_distance                 0
RatecodeID               613638
store_and_fwd_flag       613638
PULocationID                  0
DOLocationID                  0
payment_type                  0
fare_amount                   0
extra                         0
mta_tax                       0
tip_amount                    0
tolls_amount                  0
improvement_surcharge         0
total_amount                  0
congestion_surcharge     613638
airport_fee              613638
dtype: int64


In [8]:
# Replacement value used for missing entries, per column.
fill_defaults = {
    'congestion_surcharge': 0,
    'airport_fee': 0,
    'passenger_count': 0,
    'RatecodeID': 1.0,
    'store_and_fwd_flag': 'N',
}
# Fill one column at a time and write the result back into the frame.
for column, default in fill_defaults.items():
    yellow_tripdata_2023[column] = yellow_tripdata_2023[column].fillna(default)

In [9]:
# Tally the missing entries of every column and print the resulting Series.
nan_in_each_column = yellow_tripdata_2023.isnull().sum(axis=0)
print(nan_in_each_column)

VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
congestion_surcharge     0
airport_fee              0
dtype: int64


In [10]:
# Preview the last five rows of the frame.
yellow_tripdata_2023.tail(n=5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
22400723,2,2023-07-31 23:18:11,2023-07-31 23:18:23,0.0,0.0,1.0,N,106,106,0,21.58,0.0,0.5,0.0,0.0,1.0,23.08,0.0,0.0
22400724,2,2023-07-31 23:29:31,2023-07-31 23:51:17,0.0,4.97,1.0,N,238,243,0,22.12,0.0,0.5,0.0,0.0,1.0,23.62,0.0,0.0
22400725,2,2023-07-31 23:16:57,2023-07-31 23:17:25,0.0,0.03,1.0,N,141,141,0,23.93,0.0,0.5,0.0,0.0,1.0,27.93,0.0,0.0
22400726,2,2023-07-31 23:29:00,2023-07-31 23:46:00,0.0,3.24,1.0,N,65,62,0,9.54,0.0,0.5,3.0,0.0,1.0,14.04,0.0,0.0
22400727,2,2023-07-31 23:34:13,2023-07-31 23:48:51,0.0,2.08,1.0,N,162,143,0,14.33,0.0,0.5,0.0,0.0,1.0,18.33,0.0,0.0


In [11]:
# Show the dtype of each column.
yellow_tripdata_2023.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object

In [16]:
# Target dtype for each column that gets converted.
target_dtypes = {
    'passenger_count': 'int64',
    'RatecodeID': 'int64',
    'store_and_fwd_flag': str,
}
# Cast one column at a time and write the result back into the frame.
for column, dtype in target_dtypes.items():
    yellow_tripdata_2023[column] = yellow_tripdata_2023[column].astype(dtype)

In [17]:
# Show the dtype of each column.
yellow_tripdata_2023.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                   int64
trip_distance                   float64
RatecodeID                        int64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object

In [18]:
# Preview the last ten rows of the frame.
yellow_tripdata_2023.tail(n=10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
22400718,2,2023-07-31 23:35:12,2023-08-01 00:00:39,0,10.7,1,N,255,74,0,39.27,0.0,0.5,0.0,6.55,1.0,47.32,0.0,0.0
22400719,2,2023-07-31 23:11:00,2023-07-31 23:49:00,0,7.46,1,N,143,256,0,35.38,0.0,0.5,0.0,0.0,1.0,39.38,0.0,0.0
22400720,2,2023-07-31 23:16:00,2023-07-31 23:29:00,0,2.85,1,N,68,162,0,15.78,0.0,0.5,3.96,0.0,1.0,23.74,0.0,0.0
22400721,2,2023-07-31 23:01:00,2023-07-31 23:19:00,0,0.92,1,N,142,224,0,22.95,0.0,0.5,5.39,0.0,1.0,32.34,0.0,0.0
22400722,2,2023-07-31 23:55:10,2023-07-31 23:58:40,0,1.0,1,N,61,188,0,14.44,0.0,0.5,0.0,0.0,1.0,15.94,0.0,0.0
22400723,2,2023-07-31 23:18:11,2023-07-31 23:18:23,0,0.0,1,N,106,106,0,21.58,0.0,0.5,0.0,0.0,1.0,23.08,0.0,0.0
22400724,2,2023-07-31 23:29:31,2023-07-31 23:51:17,0,4.97,1,N,238,243,0,22.12,0.0,0.5,0.0,0.0,1.0,23.62,0.0,0.0
22400725,2,2023-07-31 23:16:57,2023-07-31 23:17:25,0,0.03,1,N,141,141,0,23.93,0.0,0.5,0.0,0.0,1.0,27.93,0.0,0.0
22400726,2,2023-07-31 23:29:00,2023-07-31 23:46:00,0,3.24,1,N,65,62,0,9.54,0.0,0.5,3.0,0.0,1.0,14.04,0.0,0.0
22400727,2,2023-07-31 23:34:13,2023-07-31 23:48:51,0,2.08,1,N,162,143,0,14.33,0.0,0.5,0.0,0.0,1.0,18.33,0.0,0.0


In [None]:
# Write the frame to a gzip-compressed parquet file; the relative path is
# resolved against the current working directory.
output_path = 'yellow_tripdata_2023.parquet.gzip'
yellow_tripdata_2023.to_parquet(output_path, compression='gzip')