In [1]:
# Importing libraries
import pandas as pd

In [2]:
import pandas as pd

def process_data(file_paths):
    """Load several parquet files and stack them into a single DataFrame.

    Every path is read with ``pd.read_parquet``. The columns of each frame are
    then renamed *by position* to the column names of the first frame, so the
    frames line up when concatenated even if a file spells a column name
    differently. The shape of each frame, the total row count and the shape of
    the combined frame are printed as a sanity check.

    Parameters
    ----------
    file_paths : iterable of str
        Locations of the parquet files, in the order they should be stacked.
        Must contain at least one entry.

    Returns
    -------
    pandas.DataFrame
        The rows of all files, in input order, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``file_paths`` is empty, or if a file does not have the same number
        of columns as the first file.
    """
    dataframes = [pd.read_parquet(file_path) for file_path in file_paths]
    if not dataframes:
        # Without this guard the lookup of the first frame below would fail
        # with an IndexError that says nothing about the actual problem.
        raise ValueError("file_paths is empty: at least one parquet file is required")

    # Standardizing column names based on the first dataframe.
    # NOTE: the renaming is positional, so it is only meaningful when every
    # file stores its columns in the same order as the first one.
    col_names_std = dataframes[0].columns.tolist()
    for position, df in enumerate(dataframes, start=1):
        if df.shape[1] != len(col_names_std):
            raise ValueError(
                f"dataframe {position} has {df.shape[1]} columns, "
                f"expected {len(col_names_std)} (as in the first dataframe)"
            )
        df.columns = col_names_std

    # Printing size of the dataframes
    for i, df in enumerate(dataframes):
        print(f"Shape of dataframe {i+1}: ", df.shape)
    print("Total Size of all dataframes: ", sum(df.shape[0] for df in dataframes))

    # Concatenating the dataframes
    combined_df = pd.concat(dataframes, ignore_index=True)
    print("Shape of combined dataframe: ", combined_df.shape)

    return combined_df


In [3]:
# Reading the monthly parquet files (months 01 through 07 of 2023)
base_url = 'https://storage.googleapis.com/tlc-nyc-data-analytics/y2023/yellow_tripdata_2023-'
# Zero-padded month number + extension completes each file URL
file_paths = [f'{base_url}{month:02d}.parquet' for month in range(1, 8)]
yellow_tripdata_2023 = process_data(file_paths)

Shape of dataframe 1:  (3066766, 19)
Shape of dataframe 2:  (2913955, 19)
Shape of dataframe 3:  (3403766, 19)
Shape of dataframe 4:  (3288250, 19)
Shape of dataframe 5:  (3513649, 19)
Shape of dataframe 6:  (3307234, 19)
Total Size of all dataframes:  19493620
Shape of combined dataframe:  (19493620, 19)


In [None]:
# Counting the missing values in every column
nan_in_each_column = yellow_tripdata_2023.isnull().sum(axis=0)
print(nan_in_each_column)

In [None]:
# Filling null values: each listed column gets its own default
fill_defaults = {
    'congestion_surcharge': 0,
    'airport_fee': 0,
    'passenger_count': 0,
    'RatecodeID': 1.0,
    'store_and_fwd_flag': 'N',
}
for column, default in fill_defaults.items():
    yellow_tripdata_2023[column] = yellow_tripdata_2023[column].fillna(default)

In [None]:
# Re-counting the missing values per column once the nulls have been filled
nan_in_each_column = yellow_tripdata_2023.isnull().sum(axis=0)
print(nan_in_each_column)

In [None]:
# Checking the datatype of every column (before any type conversion)
yellow_tripdata_2023.dtypes

In [None]:
# Converting datatypes: target dtype per column
target_dtypes = {
    'passenger_count': 'int64',
    'RatecodeID': 'int64',
    'store_and_fwd_flag': str,
}
for column, dtype in target_dtypes.items():
    yellow_tripdata_2023[column] = yellow_tripdata_2023[column].astype(dtype)

In [None]:
# Checking the column datatypes again to confirm the conversions
yellow_tripdata_2023.dtypes

In [None]:
# Flagging rows whose coded columns hold values outside the accepted sets
codes_1_to_6 = [1, 2, 3, 4, 5, 6]
mask1 = ~yellow_tripdata_2023['VendorID'].isin([1, 2])  # VendorID is neither 1 nor 2
mask2 = ~yellow_tripdata_2023['RatecodeID'].isin(codes_1_to_6)  # RatecodeID outside 1-6
mask3 = ~yellow_tripdata_2023['store_and_fwd_flag'].isin(['Y', 'N'])  # flag is neither 'Y' nor 'N'
mask4 = ~yellow_tripdata_2023['payment_type'].isin(codes_1_to_6)  # payment_type outside 1-6
# One count of offending rows per line, in mask order
print(mask1.sum(), mask2.sum(), mask3.sum(), mask4.sum(), sep='\n')

In [None]:
# Dropping the rows flagged as having an invalid VendorID
rows_with_invalid_vendor = yellow_tripdata_2023[mask1].index
yellow_tripdata_2023 = yellow_tripdata_2023.drop(index=rows_with_invalid_vendor)

In [None]:
# Replace the values in RatecodeID column with 1 if they are invalid.
# The mask is rebuilt from the current dataframe: a mask computed before rows
# were dropped is longer than the frame and only works through implicit index
# alignment.
invalid_ratecode = ~yellow_tripdata_2023['RatecodeID'].isin([1, 2, 3, 4, 5, 6])
yellow_tripdata_2023.loc[invalid_ratecode, 'RatecodeID'] = 1

In [None]:
# Replace the values in payment_type column with 5 if they are invalid.
# The mask is rebuilt from the current dataframe: a mask computed before rows
# were dropped is longer than the frame and only works through implicit index
# alignment.
invalid_payment_type = ~yellow_tripdata_2023['payment_type'].isin([1, 2, 3, 4, 5, 6])
yellow_tripdata_2023.loc[invalid_payment_type, 'payment_type'] = 5

In [None]:
# Rebuilding the invalid-value masks to verify the cleanup
codes_1_to_6 = [1, 2, 3, 4, 5, 6]
mask1 = ~yellow_tripdata_2023['VendorID'].isin([1, 2])  # VendorID is neither 1 nor 2
mask2 = ~yellow_tripdata_2023['RatecodeID'].isin(codes_1_to_6)  # RatecodeID outside 1-6
mask3 = ~yellow_tripdata_2023['store_and_fwd_flag'].isin(['Y', 'N'])  # flag is neither 'Y' nor 'N'
mask4 = ~yellow_tripdata_2023['payment_type'].isin(codes_1_to_6)  # payment_type outside 1-6
# One count of offending rows per line, in mask order
print(mask1.sum(), mask2.sum(), mask3.sum(), mask4.sum(), sep='\n')

In [None]:
# Total number of rows left after dropping the rows with an invalid VendorID
yellow_tripdata_2023.shape[0]

In [None]:
# Shortening the dataframe: a seeded random sample of n rows is discarded
# to reduce its size
print("Size before: ", yellow_tripdata_2023.shape[0])
n = 9488525
if len(yellow_tripdata_2023) <= n:
    # Not enough rows to discard n of them; leave the dataframe untouched
    print(f"Your DataFrame has only {len(yellow_tripdata_2023)} rows. Can't drop {n} rows.")
else:
    drop_indices = yellow_tripdata_2023.sample(n=n, random_state=42).index
    yellow_tripdata_2023 = yellow_tripdata_2023.drop(index=drop_indices)
print("Size After: ", yellow_tripdata_2023.shape[0])

In [None]:
# Removing duplicated rows and renumbering the index from 0
print("Size before: ", len(yellow_tripdata_2023))
yellow_tripdata_2023 = yellow_tripdata_2023.drop_duplicates(ignore_index=True)
print("Size After: ", len(yellow_tripdata_2023))

In [None]:
# Viewing the summary info of the preprocessed dataframe
yellow_tripdata_2023.info()

In [None]:
# Viewing descriptive statistics of the preprocessed dataframe
yellow_tripdata_2023.describe()

In [None]:
# Viewing the first 10 rows of the dataframe after preprocessing
yellow_tripdata_2023.head(10)

In [None]:
# Viewing the last 10 rows of the dataframe after preprocessing
yellow_tripdata_2023.tail(10)

In [None]:
# Writing the dataframe to a gzip-compressed parquet file.
# The path is a raw string so the backslashes of the Windows path are taken
# literally instead of being parsed as (invalid) escape sequences.
yellow_tripdata_2023.to_parquet(r'D:\TLC-NYC-Data-Analytics-End-To-End\env\Data\yellow_tripdata_2023.parquet.gzip', compression='gzip')