In [1]:
# Importing libraries
import pandas as pd
import pyarrow.parquet as pq

In [2]:
import pandas as pd

def process_data(file_paths):
    """Read several parquet files and stack them into a single DataFrame.

    Every file is loaded with ``pd.read_parquet``. The column labels of the
    first file are then applied *positionally* to all the other frames before
    concatenation, so the frames line up even when their labels differ
    (presumably to smooth over naming differences between files -- verify
    that column order is the same in every source file).

    The shape of each frame, the total row count and the combined shape are
    printed as a sanity check.

    Parameters
    ----------
    file_paths : iterable of str
        Paths or URLs of the parquet files, in the order they should be stacked.

    Returns
    -------
    pandas.DataFrame
        All rows from all files, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``file_paths`` is empty, or if a file does not have the same number
        of columns as the first file.
    """
    # Materialise once so generators are accepted and emptiness can be checked.
    file_paths = list(file_paths)
    if not file_paths:
        raise ValueError("file_paths must contain at least one parquet file")

    dataframes = [pd.read_parquet(file_path) for file_path in file_paths]

    # Standardizing column names based on the first dataframe
    col_names_std = dataframes[0].columns.tolist()
    for file_path, df in zip(file_paths, dataframes):
        # Positional relabelling is only meaningful when the widths agree;
        # fail with a message that names the offending file.
        if len(df.columns) != len(col_names_std):
            raise ValueError(
                f"{file_path} has {len(df.columns)} columns, "
                f"expected {len(col_names_std)} (as in {file_paths[0]})"
            )
        df.columns = col_names_std

    # Printing size of the dataframes
    for i, df in enumerate(dataframes, start=1):
        print(f"Shape of dataframe {i}: ", df.shape)
    print("Total Size of all dataframes: ", sum(df.shape[0] for df in dataframes))

    # Concatenating the dataframes
    combined_df = pd.concat(dataframes, ignore_index=True)
    print("Shape of combined dataframe: ", combined_df.shape)

    return combined_df


In [3]:
# Reading the parquet files
# The monthly files differ only in the two-digit month, so the URLs are
# generated instead of being listed by hand; extend MONTHS to load more months.
BASE_URL = 'https://storage.googleapis.com/tlc-nyc-data-analytics/y2023'
MONTHS = range(1, 8)  # months 01 through 07 of 2023
file_paths = [f'{BASE_URL}/yellow_tripdata_2023-{month:02d}.parquet' for month in MONTHS]
yellow_tripdata_2023 = process_data(file_paths)

Shape of dataframe 1:  (3066766, 19)
Shape of dataframe 2:  (2913955, 19)
Shape of dataframe 3:  (3403766, 19)
Shape of dataframe 4:  (3288250, 19)
Shape of dataframe 5:  (3513649, 19)
Shape of dataframe 6:  (3307234, 19)
Shape of dataframe 7:  (2907108, 19)
Total Size of all dataframes:  22400728
Shape of combined dataframe:  (22400728, 19)


In [4]:
# Checking for null values
# Count the missing entries in every column and show the per-column totals.
nan_in_each_column = yellow_tripdata_2023.isnull().sum(axis=0)
print(nan_in_each_column)

VendorID                      0
tpep_pickup_datetime          0
tpep_dropoff_datetime         0
passenger_count          613638
trip_distance                 0
RatecodeID               613638
store_and_fwd_flag       613638
PULocationID                  0
DOLocationID                  0
payment_type                  0
fare_amount                   0
extra                         0
mta_tax                       0
tip_amount                    0
tolls_amount                  0
improvement_surcharge         0
total_amount                  0
congestion_surcharge     613638
airport_fee              613638
dtype: int64


In [5]:
# Filling null values
# One default per column that contains nulls; each column is filled on its own.
fill_defaults = {
    'congestion_surcharge': 0,
    'airport_fee': 0,
    'passenger_count': 0,
    'RatecodeID': 1.0,
    'store_and_fwd_flag': 'N',
}
for column, default in fill_defaults.items():
    yellow_tripdata_2023[column] = yellow_tripdata_2023[column].fillna(default)

In [6]:
# Checking for null values again
# Recount the missing entries per column after the fill step.
nan_in_each_column = yellow_tripdata_2023.isnull().sum(axis=0)
print(nan_in_each_column)

VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
congestion_surcharge     0
airport_fee              0
dtype: int64


In [7]:
# Checking datatypes
# Displays the dtype of every column as the cell output.
yellow_tripdata_2023.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object

In [8]:
# Converting datatypes
# Target dtype for each column that needs casting; applied column by column.
target_dtypes = {
    'passenger_count': 'int64',
    'RatecodeID': 'int64',
    'store_and_fwd_flag': str,
}
for column, dtype in target_dtypes.items():
    yellow_tripdata_2023[column] = yellow_tripdata_2023[column].astype(dtype)

In [9]:
# Checking datatypes again
# Displays the per-column dtypes once more so the effect of the casts can be seen.
yellow_tripdata_2023.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                   int64
trip_distance                   float64
RatecodeID                        int64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object

In [10]:
# Checking for invalid values
# Each mask is True on the rows whose value lies outside the accepted set
# for that column; the printed numbers are the counts of such rows.
codes_1_to_6 = [1, 2, 3, 4, 5, 6]
mask1 = ~yellow_tripdata_2023['VendorID'].isin([1, 2])               # VendorID other than 1 or 2
mask2 = ~yellow_tripdata_2023['RatecodeID'].isin(codes_1_to_6)       # RatecodeID other than 1..6
mask3 = ~yellow_tripdata_2023['store_and_fwd_flag'].isin(['Y', 'N']) # flag other than 'Y' or 'N'
mask4 = ~yellow_tripdata_2023['payment_type'].isin(codes_1_to_6)     # payment_type other than 1..6
for mask in (mask1, mask2, mask3, mask4):
    print(mask.sum())

5820
110397
0
613638


In [11]:
# Dropping invalid values for VendorID
# Collect the index labels of the rows flagged by mask1, then remove those rows.
invalid_vendor_index = yellow_tripdata_2023[mask1].index
yellow_tripdata_2023 = yellow_tripdata_2023.drop(index=invalid_vendor_index)

In [12]:
# Replace the values in RatecodeID column with 1 if they are invalid
# The mask is recomputed on the current frame: mask2 from the earlier check
# was built before the invalid-VendorID rows were dropped, so reusing it only
# works through pandas' implicit index alignment.
invalid_ratecode = ~yellow_tripdata_2023['RatecodeID'].isin([1, 2, 3, 4, 5, 6])
yellow_tripdata_2023.loc[invalid_ratecode, 'RatecodeID'] = 1

In [13]:
# Replace the values in payment_type column with 5 if they are invalid
# The mask is recomputed on the current frame: mask4 from the earlier check
# was built before the invalid-VendorID rows were dropped, so reusing it only
# works through pandas' implicit index alignment.
invalid_payment_type = ~yellow_tripdata_2023['payment_type'].isin([1, 2, 3, 4, 5, 6])
yellow_tripdata_2023.loc[invalid_payment_type, 'payment_type'] = 5

In [14]:
# Checking for invalid values again
# Rebuild the four masks on the cleaned frame and print how many rows
# each one still flags.
codes_1_to_6 = [1, 2, 3, 4, 5, 6]
mask1 = ~yellow_tripdata_2023['VendorID'].isin([1, 2])               # VendorID other than 1 or 2
mask2 = ~yellow_tripdata_2023['RatecodeID'].isin(codes_1_to_6)       # RatecodeID other than 1..6
mask3 = ~yellow_tripdata_2023['store_and_fwd_flag'].isin(['Y', 'N']) # flag other than 'Y' or 'N'
mask4 = ~yellow_tripdata_2023['payment_type'].isin(codes_1_to_6)     # payment_type other than 1..6
for mask in (mask1, mask2, mask3, mask4):
    print(mask.sum())

0
0
0
0


In [15]:
# Total number of rows after dropping invalid values
# shape[0] is the row count of the frame; it is shown as the cell output.
yellow_tripdata_2023.shape[0]

22394908

In [16]:
# Shortening the dataframe by removing random rows to reduce its size
# Reducing size for dataset to fit in google compute engine and mage engine
# The number of rows to drop is derived from the target size instead of being
# hard-coded, so the cell keeps working if the upstream row count changes and
# is a no-op when re-run on an already reduced frame.
TARGET_ROWS = 100_000  # rows to keep
print("Size before: ",yellow_tripdata_2023.shape[0])
n = len(yellow_tripdata_2023) - TARGET_ROWS  # rows that have to go
if n > 0:
    # Fixed random_state keeps the selection reproducible between runs.
    drop_indices = yellow_tripdata_2023.sample(n, random_state=42).index
    yellow_tripdata_2023 = yellow_tripdata_2023.drop(drop_indices)
else:
    print(f"Your DataFrame has only {len(yellow_tripdata_2023)} rows. Nothing to drop to reach {TARGET_ROWS} rows.")
print("Size After: ",yellow_tripdata_2023.shape[0])

Size before:  22394908
Size After:  100000


In [17]:
# Dropping duplicates
# Report the row count, remove fully duplicated rows, renumber the index
# from zero, then report the row count again.
rows_before = yellow_tripdata_2023.shape[0]
print("Size before: ",rows_before)
deduplicated = yellow_tripdata_2023.drop_duplicates()
yellow_tripdata_2023 = deduplicated.reset_index(drop=True)
print("Size After: ",len(yellow_tripdata_2023))

Size before:  100000
Size After:  100000


In [18]:
# Viewing info of the dataframe
# Prints the index range, column names, non-null counts and dtypes.
yellow_tripdata_2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   VendorID               100000 non-null  int64         
 1   tpep_pickup_datetime   100000 non-null  datetime64[us]
 2   tpep_dropoff_datetime  100000 non-null  datetime64[us]
 3   passenger_count        100000 non-null  int64         
 4   trip_distance          100000 non-null  float64       
 5   RatecodeID             100000 non-null  int64         
 6   store_and_fwd_flag     100000 non-null  object        
 7   PULocationID           100000 non-null  int64         
 8   DOLocationID           100000 non-null  int64         
 9   payment_type           100000 non-null  int64         
 10  fare_amount            100000 non-null  float64       
 11  extra                  100000 non-null  float64       
 12  mta_tax                100000 non-null  float

In [19]:
# Viewing description of the dataframe
# Summary statistics per column; the extreme min/max values are worth
# a look as a check for outliers.
yellow_tripdata_2023.describe()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
count,100000.0,100000,100000,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,1.73432,2023-04-17 02:35:25.601529,2023-04-17 02:52:26.804570,1.33136,3.865234,1.07184,165.57997,163.84454,1.32949,19.244247,1.585316,0.486694,3.493627,0.581394,0.9809,28.149322,2.207175,0.12656
min,1.0,2002-12-31 23:18:49,2002-12-31 23:33:34,0.0,0.0,1.0,1.0,1.0,1.0,-368.0,-7.5,-0.5,-16.11,-32.05,-1.0,-379.05,-2.5,-1.75
25%,1.0,2023-02-25 01:07:20,2023-02-25 01:19:08.250000,1.0,1.07,1.0,132.0,113.0,1.0,9.3,0.0,0.5,1.0,0.0,1.0,15.76,2.5,0.0
50%,2.0,2023-04-18 07:33:11,2023-04-18 07:49:49,1.0,1.8,1.0,162.0,162.0,1.0,13.5,1.0,0.5,2.8,0.0,1.0,20.845,2.5,0.0
75%,2.0,2023-06-06 14:54:04.750000,2023-06-06 15:11:36.250000,1.0,3.45,1.0,234.0,234.0,1.0,21.2,2.5,0.5,4.35,0.0,1.0,30.24,2.5,0.0
max,2.0,2023-07-31 23:57:48,2023-08-01 00:32:17,6.0,22562.67,5.0,265.0,265.0,5.0,400.0,11.75,0.8,99.99,65.0,1.0,479.21,2.5,1.75
std,0.441697,,,0.915166,85.875695,0.380185,63.957093,69.93151,0.794854,18.633857,1.840519,0.104393,3.970871,2.169677,0.191806,23.306779,0.855686,0.436847


In [20]:
# Viewing the first 10 rows of the dataframe after preprocessing
yellow_tripdata_2023.head(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:23:52,2023-01-01 00:41:20,1,3.24,1,N,229,246,1,19.1,1.0,0.5,3.0,0.0,1.0,27.1,2.5,0.0
1,2,2023-01-01 00:46:38,2023-01-01 01:02:30,1,6.99,1,N,24,243,1,30.3,1.0,0.5,6.56,0.0,1.0,39.36,0.0,0.0
2,2,2023-01-01 00:35:14,2023-01-01 00:41:06,1,1.21,1,N,142,239,1,7.9,1.0,0.5,2.58,0.0,1.0,15.48,2.5,0.0
3,2,2023-01-01 00:09:48,2023-01-01 00:24:58,1,4.79,1,N,107,87,1,22.6,1.0,0.5,5.0,0.0,1.0,32.6,2.5,0.0
4,2,2023-01-01 00:21:52,2023-01-01 00:41:57,4,2.54,1,N,246,234,1,19.8,1.0,0.5,2.48,0.0,1.0,27.28,2.5,0.0
5,1,2023-01-01 00:05:27,2023-01-01 00:09:40,2,1.2,1,N,162,141,1,7.9,3.5,0.5,2.55,0.0,1.0,15.45,2.5,0.0
6,1,2023-01-01 00:20:48,2023-01-01 00:23:57,1,0.6,1,N,249,90,1,5.8,3.5,0.5,2.16,0.0,1.0,12.96,2.5,0.0
7,2,2023-01-01 00:26:38,2023-01-01 00:33:36,1,0.82,1,N,114,113,1,7.9,1.0,0.5,2.58,0.0,1.0,15.48,2.5,0.0
8,2,2023-01-01 00:19:57,2023-01-01 00:46:24,2,4.27,1,N,246,151,1,28.2,1.0,0.5,2.0,0.0,1.0,35.2,2.5,0.0
9,2,2023-01-01 00:12:33,2023-01-01 00:47:51,3,1.78,1,N,162,90,1,28.9,1.0,0.5,10.17,0.0,1.0,44.07,2.5,0.0


In [21]:
# Viewing the last 10 rows of the dataframe after preprocessing
yellow_tripdata_2023.tail(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
99990,1,2023-07-31 13:31:01,2023-07-31 13:40:01,0,1.2,1,N,162,236,5,10.0,0.0,0.5,2.1,0.0,1.0,16.1,0.0,0.0
99991,1,2023-07-31 14:21:03,2023-07-31 14:31:51,0,1.5,1,N,163,50,5,10.7,0.0,0.5,2.2,0.0,1.0,16.9,0.0,0.0
99992,2,2023-07-31 14:20:15,2023-07-31 14:47:20,0,3.07,1,N,79,48,5,21.74,0.0,0.5,5.15,0.0,1.0,30.89,0.0,0.0
99993,2,2023-07-31 17:36:19,2023-07-31 17:44:25,0,1.67,1,N,238,166,5,9.68,0.0,0.5,0.0,0.0,1.0,13.68,0.0,0.0
99994,2,2023-07-31 17:29:20,2023-07-31 17:58:48,0,8.74,1,N,75,13,5,3.6,0.0,0.5,0.0,0.0,1.0,7.6,0.0,0.0
99995,2,2023-07-31 19:05:11,2023-07-31 19:12:03,0,1.06,1,N,75,74,5,11.0,0.0,0.5,2.5,0.0,1.0,15.0,0.0,0.0
99996,2,2023-07-31 20:16:18,2023-07-31 20:26:11,0,3.38,1,N,148,229,5,14.74,0.0,0.5,3.75,0.0,1.0,22.49,0.0,0.0
99997,2,2023-07-31 21:59:20,2023-07-31 22:15:01,0,2.77,1,N,230,263,5,17.81,0.0,0.5,4.36,0.0,1.0,26.17,0.0,0.0
99998,2,2023-07-31 22:25:28,2023-07-31 22:43:27,0,4.38,1,N,137,238,5,20.05,0.0,0.5,0.0,0.0,1.0,24.05,0.0,0.0
99999,1,2023-07-31 22:41:31,2023-07-31 23:29:44,0,18.1,1,N,132,237,5,70.0,1.75,0.5,8.23,6.55,1.0,90.53,0.0,0.0


In [22]:
# Writing the dataframe to a parquet file
# Raw string: the Windows path contains backslash sequences (\T, \e, \y) that
# are invalid escapes in a normal string literal and trigger a warning
# (scheduled to become a syntax error). The resulting path is unchanged.
yellow_tripdata_2023.to_parquet(r'D:\TLC-NYC-Data-Analytics-End-To-End\env\yellow_tripdata_2023.parquet.gzip', compression='gzip')
print("Dataset Saved")

Dataset Saved


In [23]:
# Writing the dataframe to a csv file
# Raw string: the Windows path contains backslash sequences (\T, \e, \y) that
# are invalid escapes in a normal string literal and trigger a warning
# (scheduled to become a syntax error). The resulting path is unchanged.
# index=False keeps the positional index out of the file.
yellow_tripdata_2023.to_csv(r'D:\TLC-NYC-Data-Analytics-End-To-End\env\yellow_tripdata_2023.csv', index=False)
print("Dataset Saved")

Dataset Saved
