In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
API_KEY = os.getenv("OPENWEATHER_API_KEY")
ACCESS_KEY = os.getenv("LAKEFS_ACCESS_KEY")
SECRET_KEY = os.getenv("LAKEFS_SECRET_KEY")
lakefs_endpoint = os.getenv("LAKEFS_ENDPOINT", "http://lakefs-dev:8000")


In [2]:
#test read parquet
import pandas as pd
storage_options = {
    "key": ACCESS_KEY,
    "secret": SECRET_KEY,
    "client_kwargs": {
        "endpoint_url": lakefs_endpoint
    }
}

repo = "pollution-data"
branch = "main"
path = "pollution.parquet"
lakefs_s3_path = f"s3a://{repo}/{branch}/{path}"

In [5]:
path_partition = 's3a://pollution-data/main/pollution.parquet/'
# path_partition = 's3a://pollution-data/main/pollution.parquet/year=2025/month=5/day=11/hour=10/5127aa0fd46841dcba3ad95008af1c7d-0.parquet'
df = pd.read_parquet(
    path=path_partition,
    storage_options=storage_options,
    engine="pyarrow"
    )
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180522 entries, 0 to 180521
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype                       
---  ------            --------------   -----                       
 0   timestamp         180522 non-null  datetime64[ns]              
 1   minute            180522 non-null  int64                       
 2   localtime         180522 non-null  datetime64[ns, Asia/Bangkok]
 3   province          180522 non-null  string                      
 4   district          180522 non-null  string                      
 5   district_id       180522 non-null  int64                       
 6   lat               180522 non-null  float64                     
 7   lon               180522 non-null  float64                     
 8   main.aqi          180522 non-null  int64                       
 9   components_co     180522 non-null  float64                     
 10  components_no     180522 non-null  float64              

Unnamed: 0,timestamp,minute,localtime,province,district,district_id,lat,lon,main.aqi,components_co,...,components_o3,components_so2,components_pm2_5,components_pm10,components_nh3,flow_timestamp,year,month,day,hour
0,2025-05-13 10:00:09.082896,0,2025-05-13 17:00:09.082896+07:00,Krabi,Nuea Khlong,8108,8.0747,99.0036,1,109.01,...,39.42,0.02,1.79,2.08,0.0,2025-05-13 10:00:08.202481,2025,5,13,10
1,2025-05-13 10:00:08.950969,0,2025-05-13 17:00:08.950969+07:00,Krabi,Lam Thap,8107,8.0717,99.2917,1,104.79,...,38.1,0.02,1.73,2.02,0.01,2025-05-13 10:00:08.202481,2025,5,13,10
2,2025-05-13 10:00:08.949743,0,2025-05-13 17:00:08.949743+07:00,Krabi,Plai Phraya,8106,8.5333,98.8628,1,125.19,...,41.21,0.1,2.72,3.11,0.04,2025-05-13 10:00:08.202481,2025,5,13,10
3,2025-05-13 10:00:08.872550,0,2025-05-13 17:00:08.872550+07:00,Krabi,Khao Phanom,8102,8.2647,99.0492,1,115.44,...,42.29,0.02,2.19,2.35,0.0,2025-05-13 10:00:08.202481,2025,5,13,10
4,2025-05-13 10:00:09.088553,0,2025-05-13 17:00:09.088553+07:00,Krabi,Ao Luek,8105,8.3775,98.7217,1,125.97,...,43.89,0.05,3.24,3.51,0.0,2025-05-13 10:00:08.202481,2025,5,13,10


In [None]:
# lakefs_s3_path,
# storage_options=storage_options,
# partition_cols=["year", "month", "day", "hour"],


In [3]:
import s3fs
path = 's3a://pollution-data/main/pollution.parquet'
fs = s3fs.S3FileSystem(
    key=ACCESS_KEY,
    secret=SECRET_KEY,
    client_kwargs={'endpoint_url': lakefs_endpoint}
)

data_list = fs.glob(f"{path}/*/*/*/*/*")
len(data_list)

200

In [6]:
import pyarrow.dataset as ds
path = "pollution-data/main/pollution.parquet/"
dataset = ds.dataset(
    path,
    format="parquet",
    partitioning=["year", "month", "day", "hour"],
    filesystem=fs
)

table = dataset.to_table()
df = table.to_pandas()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180522 entries, 0 to 180521
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype                       
---  ------            --------------   -----                       
 0   timestamp         180522 non-null  datetime64[ns]              
 1   minute            180522 non-null  int64                       
 2   localtime         180522 non-null  datetime64[ns, Asia/Bangkok]
 3   province          180522 non-null  string                      
 4   district          180522 non-null  string                      
 5   district_id       180522 non-null  int64                       
 6   lat               180522 non-null  float64                     
 7   lon               180522 non-null  float64                     
 8   main.aqi          180522 non-null  int64                       
 9   components_co     180522 non-null  float64                     
 10  components_no     180522 non-null  float64              

In [15]:
df_filtered = df["hour"] == 14
df_filtered

# filtered = dataset.to_table(filter=(ds.field("hour") == 14))
# df_filtered = filtered.to_pandas()

0       False
1       False
2       False
3       False
4       False
        ...  
1915    False
1916    False
1917    False
1918    False
1919    False
Name: hour, Length: 1920, dtype: bool

In [None]:
from datetime import datetime

filtered = dataset.to_table(
    filter=(
        (ds.field("year") == 2025) &
        (ds.field("month") == 5) &
        (ds.field("day") == 8) &
        (ds.field("hour") == 14)
    )
)
df_filtered = filtered.to_pandas()


In [9]:
print(df_filtered.head())
print(df_filtered['localtime'].min(), df_filtered['localtime'].max())

NameError: name 'df_filtered' is not defined

In [7]:
def load_data(lakefs_path):
    dataset = ds.dataset(
        lakefs_path,
        format="parquet",
        partitioning=["year", "month", "day", "hour"],
        filesystem=fs
    )
    table = dataset.to_table()
    df = table.to_pandas()
    
    return df

pollution_path = 'pollution-data/main/pollution.parquet'
pollution_df = load_data(pollution_path)
# print('Pollution')
pollution_df.info()
pollution_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2781 entries, 0 to 2780
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype                       
---  ------            --------------  -----                       
 0   timestamp         2781 non-null   datetime64[ns]              
 1   minute            2781 non-null   int64                       
 2   localtime         2781 non-null   datetime64[ns, Asia/Bangkok]
 3   province          2781 non-null   string                      
 4   district          2781 non-null   string                      
 5   district_id       2781 non-null   int64                       
 6   lat               2781 non-null   float64                     
 7   lon               2781 non-null   float64                     
 8   main.aqi          2781 non-null   int64                       
 9   components_co     2781 non-null   float64                     
 10  components_no     2781 non-null   float64                     
 11  comp

Unnamed: 0,timestamp,minute,localtime,province,district,district_id,lat,lon,main.aqi,components_co,...,components_o3,components_so2,components_pm2_5,components_pm10,components_nh3,flow_timestamp,year,month,day,hour
0,2025-05-13 04:45:00.643749,45,2025-05-13 11:45:00.643749+07:00,Krabi,Nuea Khlong,8108,8.0747,99.0036,1,104.09,...,37.94,0.03,1.54,1.95,0.01,2025-05-13 04:44:59.753761,year=2025,month=5,day=13,hour=4
1,2025-05-13 04:45:00.780149,45,2025-05-13 11:45:00.780149+07:00,Krabi,Lam Thap,8107,8.0717,99.2917,1,100.87,...,36.42,0.04,1.43,1.81,0.03,2025-05-13 04:44:59.753761,year=2025,month=5,day=13,hour=4
2,2025-05-13 04:45:00.780432,45,2025-05-13 11:45:00.780432+07:00,Krabi,Plai Phraya,8106,8.5333,98.8628,1,113.1,...,37.97,0.12,2.16,2.65,0.04,2025-05-13 04:44:59.753761,year=2025,month=5,day=13,hour=4
3,2025-05-13 04:45:00.612240,45,2025-05-13 11:45:00.612240+07:00,Krabi,Khao Phanom,8102,8.2647,99.0492,1,107.32,...,40.48,0.03,1.81,2.1,0.01,2025-05-13 04:44:59.753761,year=2025,month=5,day=13,hour=4
4,2025-05-13 04:45:00.617396,45,2025-05-13 11:45:00.617396+07:00,Krabi,Ao Luek,8105,8.3775,98.7217,1,119.9,...,42.27,0.08,2.94,3.25,0.01,2025-05-13 04:44:59.753761,year=2025,month=5,day=13,hour=4
