In [1]:
import os
import pandas as pd
from glob import glob

In [2]:
# from utils.helpers import load_cfg 
import yaml


def load_cfg(cfg_file):
    """
    Load configuration from a YAML config file
    """
    cfg = None
    with open(cfg_file, "r") as f:
        try:
            cfg = yaml.safe_load(f)
        except yaml.YAMLError as exc:
            print(exc)

    return cfg


In [3]:
# from utils.minio_utils import MinIOClient

from minio import Minio


class MinIOClient:
    def __init__(self, endpoint_url, access_key, secret_key):
        self.endpoint_url = endpoint_url
        self.access_key = access_key
        self.secret_key = secret_key

    def create_conn(self):
        client = Minio(
            endpoint=self.endpoint_url,
            access_key=self.access_key,
            secret_key=self.secret_key,
            secure=False,
        )
        return client

    def create_bucket(self, bucket_name):
        client = self.create_conn()

        # Create bucket if not exist
        found = client.bucket_exists(bucket_name=bucket_name)
        if not found:
            client.make_bucket(bucket_name=bucket_name)
            print(f"Bucket {bucket_name} created successfully!")
        else:
            print(f"Bucket {bucket_name} already exists, skip creating!")

    def list_parquet_files(self, bucket_name, prefix=""):
        client = self.create_conn()

        # List all objects in the bucket with the given prefix
        objects = client.list_objects(bucket_name, prefix=prefix, recursive=True)
        # Filter and collect Parquet file names
        parquet_files = [
            obj.object_name for obj in objects if obj.object_name.endswith(".parquet")
        ]

        return parquet_files


In [4]:
__file__ = os.getcwd()
print(__file__) 

e:\BigData\project_bigdata\MyProject\src\batch_processing


In [5]:
project_root = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
project_root

'e:\\BigData\\project_bigdata'

In [6]:
# DATA_PATH = os.path.join(project_root, "data")
DATA_PATH = os.path.join(project_root, "MyProject/data")
DATA_PATH

'e:\\BigData\\project_bigdata\\MyProject/data'

In [10]:
YEARS = ["2023", "2024"]
# TAXI_LOOKUP_PATH = os.path.join(project_root, "data", "taxi_lookup.csv")
TAXI_LOOKUP_PATH = os.path.join(project_root, "MyProject/data", "taxi_lookup.csv")
# CFG_FILE = os.path.join(project_root, "config", "datalake.yaml")
CFG_FILE = os.path.join(project_root, "MyProject/config", "datalake.yaml")

In [11]:
CFG_FILE

'e:\\BigData\\project_bigdata\\MyProject/config\\datalake.yaml'

In [7]:
def drop_column(df, file):
    """
        Drop columns 'store_and_fwd_flag'
    """
    if 'store_and_fwd_flag' in df.columns:
        df.drop(columns=['store_and_fwd_flag'])
        print("Dropped column store_and_fwd_flag from file: " + file)
    else:
        print("Column store_and_fwd_flag not found in file: " + file)
        
    return df

In [8]:

def merge_taxi_zone(df, file):
    """
        Merge dataset with taxi zone lookup
    """
    df_lookup = pd.read_csv(TAXI_LOOKUP_PATH)
    def merge_and_rename(df, location_id, lat_col, long_col):
        df = df.merge(df_lookup, left_on=location_id, right_on='LocationID')
        df = df.drop(columns=['LocationID', "Borough", "zone", "service_zone"])
        df = df.rename(columns={
            "latitude": lat_col,
            "longitude": long_col
        })
        return df
    
    if "pickup_latitude" not in df.columns:
        df = merge_and_rename(df, "pulocationid", "pickup_latitude", "pickup_longitude") 
    if "dropoff_latitude" not in df.columns:
        df = merge_and_rename(df, "dolocationid", "dropoff_latitude", "dropoff_longitude")

    print("Merged file: " + file)
    
    return df

In [12]:
TAXI_LOOKUP_PATH

'e:\\BigData\\project_bigdata\\MyProject/data\\taxi_lookup.csv'

In [13]:
taxi_lookup = pd.read_csv(TAXI_LOOKUP_PATH)
taxi_lookup

Unnamed: 0,LocationID,Borough,zone,service_zone,latitude,longitude
0,1,EWR,Newark Airport,EWR,40.689064,-74.177255
1,2,Queens,Jamaica Bay,Boro Zone,40.603994,-73.835412
2,3,Bronx,Allerton,Boro Zone,39.915319,-87.933215
3,4,Manhattan,Alphabet City,Yellow Zone,40.725102,-73.979583
4,5,Staten Island,Arden Heights,Boro Zone,53.284320,-7.492801
...,...,...,...,...,...,...
260,261,Manhattan,World Trade Center,Yellow Zone,40.711900,-74.012527
261,262,Manhattan,Yorkville East,Yellow Zone,33.465410,-88.358724
262,263,Manhattan,Yorkville West,Yellow Zone,36.099513,-89.118957
263,264,Unknown,NV,,39.515882,-116.853722


In [14]:
def process(df, file):
    """
    Green:
        Rename column: lpep_pickup_datetime, lpep_dropoff_datetime, ehail_fee
        Drop: trip_type
    Yellow:
        Rename column: tpep_pickup_datetime, tpep_dropoff_datetime, airport_fee
    """
    
    if file.startswith("green"):
        # Rename columns
        df.rename(
            columns={
                "lpep_pickup_datetime": "pickup_datetime",
                "lpep_dropoff_datetime": "dropoff_datetime",
                "ehail_fee": "fee,"
            },
            inplace=True
        )
        
        # Drop columns
        if "trip_type" in df.columns:
            df.drop(columns=["trip_type"], inplace=True)
            
    elif file.startswith("yellow"):
        # Rename columns
        df.rename(
            columns={
                "tpep_pickup_datetime": "pickup_datetime",
                "tpep_dropoff_datetime": "dropoff_datetime",
                "airport_fee": "fee"
            },
            inplace=True
        )

    # fix data type in colums "payment_type", "dolocationid", 
    # "pulocationid", "vendorid" to int
    if "payment_type" in df.columns:
        df["payment_type"] = df["payment_type"].fillna(0.).astype(int)
    if "dolocationid" in df.columns:
        df["dolocationid"] = df["dolocationid"].astype(int)
    if "pulocationid" in df.columns:
        df["pulocationid"] = df["pulocationid"].astype(int)
    if "vendorid" in df.columns:
        df["vendorid"] = df["vendorid"].astype(int) 
    
    # drop column "fee"
    
    if "fee" in df.columns:
        df.drop(columns=["fee"], inplace=True)
    
    # Remove missing data
    df = df.dropna()    
    df = df.reindex(sorted(df.columns), axis=1)
    
    print("Transformed file: " + file)
    
    return df
    

In [16]:

def transform_data():
    
    """
        Transform data after loading into Datalake (MinIO)
    """
    import s3fs
    
    cfg = load_cfg(CFG_FILE)
    datalake_cfg = cfg["datalake"]
    
    s3_fs = s3fs.S3FileSystem(
        anon=False,
        key=datalake_cfg["access_key"],
        secret=datalake_cfg["secret_key"],
        client_kwargs={
            "endpoint_url": "".join(["http://", datalake_cfg["endpoint"]])
        }
    )
    
    client = MinIOClient(
        datalake_cfg["endpoint"],
        datalake_cfg["access_key"],
        datalake_cfg["secret_key"]
    )
    
    client.create_bucket(datalake_cfg["bucket_name_2"])
    
    for year in YEARS:
        all_fps = glob(os.path.join(DATA_PATH, year, "*.parquet"))
        for file in all_fps:
            
            file_name = os.path.basename(file)
            print(f"Reading parquet file: {file_name}")
            
            df = pd.read_parquet(file, engine="pyarrow")
            
            df.columns = df.columns.str.lower()
            check_df.append(df)
            df = drop_column(df, file_name)
            check_df.append(df)
            df = merge_taxi_zone(df, file_name)
            check_df.append(df)
            df = process(df, file_name)
            check_df.append(df)
            path = f"s3://{datalake_cfg['bucket_name_2']}/{year}/{file_name}"
            df.to_parquet(path, index=False, filesystem=s3_fs, engine="pyarrow")
            print("Finished transforming data in file: " + path)
            print("=" * 100)

        

In [17]:
check_df = []

In [18]:
check_df[1]

IndexError: list index out of range

In [19]:
if __name__ == "__main__":
    transform_data()

Bucket processed already exists, skip creating!
Reading parquet file: green_tripdata_2023-01.parquet
Dropped column store_and_fwd_flag from file: green_tripdata_2023-01.parquet
Merged file: green_tripdata_2023-01.parquet
Transformed file: green_tripdata_2023-01.parquet
Finished transforming data in file: s3://processed/2023/green_tripdata_2023-01.parquet
Reading parquet file: yellow_tripdata_2023-01.parquet
Dropped column store_and_fwd_flag from file: yellow_tripdata_2023-01.parquet
Merged file: yellow_tripdata_2023-01.parquet
Transformed file: yellow_tripdata_2023-01.parquet
Finished transforming data in file: s3://processed/2023/yellow_tripdata_2023-01.parquet
Reading parquet file: green_tripdata_2024-01.parquet
Dropped column store_and_fwd_flag from file: green_tripdata_2024-01.parquet
Merged file: green_tripdata_2024-01.parquet
Transformed file: green_tripdata_2024-01.parquet
Finished transforming data in file: s3://processed/2024/green_tripdata_2024-01.parquet
Reading parquet file

In [46]:
check_df[0]

Unnamed: 0,vendorid,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,ratecodeid,pulocationid,dolocationid,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2023-01-01 00:26:10,2023-01-01 00:37:11,N,1.0,166,143,1.0,2.58,14.90,1.0,0.5,4.03,0.0,,1.0,24.18,1.0,1.0,2.75
1,2,2023-01-01 00:51:03,2023-01-01 00:57:49,N,1.0,24,43,1.0,1.81,10.70,1.0,0.5,2.64,0.0,,1.0,15.84,1.0,1.0,0.00
2,2,2023-01-01 00:35:12,2023-01-01 00:41:32,N,1.0,223,179,1.0,0.00,7.20,1.0,0.5,1.94,0.0,,1.0,11.64,1.0,1.0,0.00
3,1,2023-01-01 00:13:14,2023-01-01 00:19:03,N,1.0,41,238,1.0,1.30,6.50,0.5,1.5,1.70,0.0,,1.0,10.20,1.0,1.0,0.00
4,1,2023-01-01 00:33:04,2023-01-01 00:39:02,N,1.0,41,74,1.0,1.10,6.00,0.5,1.5,0.00,0.0,,1.0,8.00,1.0,1.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68206,2,2023-01-31 22:29:00,2023-01-31 22:42:00,,,49,62,,4070.82,15.70,0.0,0.0,0.00,0.0,,1.0,16.70,,,
68207,2,2023-01-31 22:40:00,2023-01-31 22:48:00,,,10,205,,2.14,4.41,0.0,0.0,0.00,0.0,,1.0,5.41,,,
68208,2,2023-01-31 23:46:00,2023-02-01 00:02:00,,,66,37,,3.44,16.53,0.0,0.0,3.51,0.0,,1.0,21.04,,,
68209,2,2023-01-31 23:01:00,2023-01-31 23:19:00,,,225,189,,3.03,14.98,0.0,0.0,3.20,0.0,,1.0,19.18,,,


In [48]:
check_df[1]

Unnamed: 0,vendorid,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,ratecodeid,pulocationid,dolocationid,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2023-01-01 00:26:10,2023-01-01 00:37:11,N,1.0,166,143,1.0,2.58,14.90,1.0,0.5,4.03,0.0,,1.0,24.18,1.0,1.0,2.75
1,2,2023-01-01 00:51:03,2023-01-01 00:57:49,N,1.0,24,43,1.0,1.81,10.70,1.0,0.5,2.64,0.0,,1.0,15.84,1.0,1.0,0.00
2,2,2023-01-01 00:35:12,2023-01-01 00:41:32,N,1.0,223,179,1.0,0.00,7.20,1.0,0.5,1.94,0.0,,1.0,11.64,1.0,1.0,0.00
3,1,2023-01-01 00:13:14,2023-01-01 00:19:03,N,1.0,41,238,1.0,1.30,6.50,0.5,1.5,1.70,0.0,,1.0,10.20,1.0,1.0,0.00
4,1,2023-01-01 00:33:04,2023-01-01 00:39:02,N,1.0,41,74,1.0,1.10,6.00,0.5,1.5,0.00,0.0,,1.0,8.00,1.0,1.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68206,2,2023-01-31 22:29:00,2023-01-31 22:42:00,,,49,62,,4070.82,15.70,0.0,0.0,0.00,0.0,,1.0,16.70,,,
68207,2,2023-01-31 22:40:00,2023-01-31 22:48:00,,,10,205,,2.14,4.41,0.0,0.0,0.00,0.0,,1.0,5.41,,,
68208,2,2023-01-31 23:46:00,2023-02-01 00:02:00,,,66,37,,3.44,16.53,0.0,0.0,3.51,0.0,,1.0,21.04,,,
68209,2,2023-01-31 23:01:00,2023-01-31 23:19:00,,,225,189,,3.03,14.98,0.0,0.0,3.20,0.0,,1.0,19.18,,,


In [49]:
check_df[2]

In [42]:
len(check_df)

3

In [36]:
check_df[2]

In [53]:
test_df = pd.read_parquet("../../data/2023/green_tripdata_2023-01.parquet", engine="pyarrow")

In [56]:
test_df.columns = test_df.columns.str.lower()

In [81]:
test_df["payment_type"].unique()

array([ 1.,  2.,  3.,  4.,  5., nan])

In [57]:
test_df[0:5]

Unnamed: 0,vendorid,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,ratecodeid,pulocationid,dolocationid,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2023-01-01 00:26:10,2023-01-01 00:37:11,N,1.0,166,143,1.0,2.58,14.9,1.0,0.5,4.03,0.0,,1.0,24.18,1.0,1.0,2.75
1,2,2023-01-01 00:51:03,2023-01-01 00:57:49,N,1.0,24,43,1.0,1.81,10.7,1.0,0.5,2.64,0.0,,1.0,15.84,1.0,1.0,0.0
2,2,2023-01-01 00:35:12,2023-01-01 00:41:32,N,1.0,223,179,1.0,0.0,7.2,1.0,0.5,1.94,0.0,,1.0,11.64,1.0,1.0,0.0
3,1,2023-01-01 00:13:14,2023-01-01 00:19:03,N,1.0,41,238,1.0,1.3,6.5,0.5,1.5,1.7,0.0,,1.0,10.2,1.0,1.0,0.0
4,1,2023-01-01 00:33:04,2023-01-01 00:39:02,N,1.0,41,74,1.0,1.1,6.0,0.5,1.5,0.0,0.0,,1.0,8.0,1.0,1.0,0.0


In [80]:
process(test_df, "green_tripdata_2023-01.parquet")

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [55]:
test_df["VendorID"].unique()

array([2, 1])

In [64]:
df_lookup = pd.read_csv(TAXI_LOOKUP_PATH)

In [65]:
df_lookup

Unnamed: 0,LocationID,Borough,zone,service_zone,latitude,longitude
0,1,EWR,Newark Airport,EWR,40.689064,-74.177255
1,2,Queens,Jamaica Bay,Boro Zone,40.603994,-73.835412
2,3,Bronx,Allerton,Boro Zone,39.915319,-87.933215
3,4,Manhattan,Alphabet City,Yellow Zone,40.725102,-73.979583
4,5,Staten Island,Arden Heights,Boro Zone,53.284320,-7.492801
...,...,...,...,...,...,...
260,261,Manhattan,World Trade Center,Yellow Zone,40.711900,-74.012527
261,262,Manhattan,Yorkville East,Yellow Zone,33.465410,-88.358724
262,263,Manhattan,Yorkville West,Yellow Zone,36.099513,-89.118957
263,264,Unknown,NV,,39.515882,-116.853722


In [60]:
df_merge = test_df.merge(df_lookup, left_on="pulocationid", right_on='LocationID')

In [67]:
merge_taxi_zone(test_df, "test")

Merged file: test


Unnamed: 0,vendorid,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,ratecodeid,pulocationid,dolocationid,passenger_count,trip_distance,fare_amount,...,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,2,2023-01-01 00:26:10,2023-01-01 00:37:11,N,1.0,166,143,1.0,2.58,14.90,...,,1.0,24.18,1.0,1.0,2.75,40.810000,-73.962500,45.449932,-122.724466
1,2,2023-01-01 00:51:03,2023-01-01 00:57:49,N,1.0,24,43,1.0,1.81,10.70,...,,1.0,15.84,1.0,1.0,0.00,41.957529,-88.080904,40.782773,-73.965363
2,2,2023-01-01 00:35:12,2023-01-01 00:41:32,N,1.0,223,179,1.0,0.00,7.20,...,,1.0,11.64,1.0,1.0,0.00,40.774546,-73.903748,46.188355,-123.825396
3,1,2023-01-01 00:13:14,2023-01-01 00:19:03,N,1.0,41,238,1.0,1.30,6.50,...,,1.0,10.20,1.0,1.0,0.00,22.921100,-83.194037,-35.023508,138.676646
4,1,2023-01-01 00:33:04,2023-01-01 00:39:02,N,1.0,41,74,1.0,1.10,6.00,...,,1.0,8.00,1.0,1.0,0.00,22.921100,-83.194037,41.982343,-87.807073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68206,2,2023-01-31 22:29:00,2023-01-31 22:42:00,,,49,62,,4070.82,15.70,...,,1.0,16.70,,,,40.689722,-73.965278,51.267356,-1.085614
68207,2,2023-01-31 22:40:00,2023-01-31 22:48:00,,,10,205,,2.14,4.41,...,,1.0,5.41,,,,42.203861,-83.173118,51.753051,-0.337967
68208,2,2023-01-31 23:46:00,2023-02-01 00:02:00,,,66,37,,3.44,16.53,...,,1.0,21.04,,,,40.702905,-73.990118,40.694287,-73.925884
68209,2,2023-01-31 23:01:00,2023-01-31 23:19:00,,,225,189,,3.03,14.98,...,,1.0,19.18,,,,42.387863,-73.775751,42.095305,-87.937569


In [62]:
df_merge = df_merge.drop(columns=['LocationID', "Borough", "zone", "service_zone"])


In [63]:
df_merge

Unnamed: 0.1,vendorid,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,ratecodeid,pulocationid,dolocationid,passenger_count,trip_distance,fare_amount,...,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,Unnamed: 0,latitude,longitude
0,2,2023-01-01 00:26:10,2023-01-01 00:37:11,N,1.0,166,143,1.0,2.58,14.90,...,0.0,,1.0,24.18,1.0,1.0,2.75,165,40.810000,-73.962500
1,2,2023-01-01 00:51:03,2023-01-01 00:57:49,N,1.0,24,43,1.0,1.81,10.70,...,0.0,,1.0,15.84,1.0,1.0,0.00,23,41.957529,-88.080904
2,2,2023-01-01 00:35:12,2023-01-01 00:41:32,N,1.0,223,179,1.0,0.00,7.20,...,0.0,,1.0,11.64,1.0,1.0,0.00,222,40.774546,-73.903748
3,1,2023-01-01 00:13:14,2023-01-01 00:19:03,N,1.0,41,238,1.0,1.30,6.50,...,0.0,,1.0,10.20,1.0,1.0,0.00,40,22.921100,-83.194037
4,1,2023-01-01 00:33:04,2023-01-01 00:39:02,N,1.0,41,74,1.0,1.10,6.00,...,0.0,,1.0,8.00,1.0,1.0,0.00,40,22.921100,-83.194037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68206,2,2023-01-31 22:29:00,2023-01-31 22:42:00,,,49,62,,4070.82,15.70,...,0.0,,1.0,16.70,,,,48,40.689722,-73.965278
68207,2,2023-01-31 22:40:00,2023-01-31 22:48:00,,,10,205,,2.14,4.41,...,0.0,,1.0,5.41,,,,9,42.203861,-83.173118
68208,2,2023-01-31 23:46:00,2023-02-01 00:02:00,,,66,37,,3.44,16.53,...,0.0,,1.0,21.04,,,,65,40.702905,-73.990118
68209,2,2023-01-31 23:01:00,2023-01-31 23:19:00,,,225,189,,3.03,14.98,...,0.0,,1.0,19.18,,,,224,42.387863,-73.775751


In [1]:
from kafka import KafkaConsumer
import json
topic_streaming_2 = "streaming.public.green_trip_raw"
consumer_2 = KafkaConsumer(
    topic_streaming_2,
    bootstrap_servers="localhost:9092",
    auto_offset_reset="earliest",  # Hoặc 'latest'
    value_deserializer=lambda x: json.loads(x.decode("utf-8")),  # Giải mã JSON
)

In [2]:
for message in consumer_2:
    print(message.value)
    break

{'schema': {'type': 'struct', 'fields': [{'type': 'struct', 'fields': [{'type': 'int32', 'optional': True, 'field': 'vendorid'}, {'type': 'int64', 'optional': True, 'name': 'io.debezium.time.MicroTimestamp', 'version': 1, 'field': 'lpep_pickup_datetime'}, {'type': 'int64', 'optional': True, 'name': 'io.debezium.time.MicroTimestamp', 'version': 1, 'field': 'lpep_dropoff_datetime'}, {'type': 'string', 'optional': True, 'field': 'store_and_fwd_flag'}, {'type': 'double', 'optional': True, 'field': 'ratecodeid'}, {'type': 'int32', 'optional': True, 'field': 'pulocationid'}, {'type': 'int32', 'optional': True, 'field': 'dolocationid'}, {'type': 'double', 'optional': True, 'field': 'passenger_count'}, {'type': 'double', 'optional': True, 'field': 'trip_distance'}, {'type': 'double', 'optional': True, 'field': 'fare_amount'}, {'type': 'double', 'optional': True, 'field': 'extra'}, {'type': 'double', 'optional': True, 'field': 'mta_tax'}, {'type': 'double', 'optional': True, 'field': 'tip_amoun

In [None]:
df_merge = df_merge.rename(columns={
    "latitude": lat_col,
    "longitude": long_col
})