In [31]:
import pandas as pd

In [32]:
df_aqi_prakan = pd.read_csv('../../../../data/raw/dataset-bids/south-bangkok power plant, samut prakan-air-quality.csv', parse_dates=['date'])

In [33]:
df_aqi_prakan

Unnamed: 0,date,pm25,pm10,o3,no2,so2,co
0,2024-10-01,38,11,,,,
1,2024-10-02,28,13,,,,
2,2024-10-03,27,19,,,,
3,2024-10-04,46,32,,,,
4,2024-10-05,66,24,,,,
...,...,...,...,...,...,...,...
2562,2016-10-24,,,,,,
2563,2016-11-06,,,,,,
2564,2015-02-14,,,,,,5
2565,2014-10-23,,,,,,6


In [34]:
years = range(2014, 2024)
months = [str(i).zfill(2) for i in range(1, 13)]

In [35]:
cols = ['Unnamed: 0', 'temp_avg', 'dew_avg', 'hum_avg', 'wind_speed_avg', 'pressure_avg', 'precip', 'month', 'year']

In [36]:
# samut prakan
for year in years:
    for month in months:
            filename = f"datasets/samut-prakan/a_{year}-{month}_weather.csv"
            try:
                # for every month of the year, i want to merge the data into one file
                # so 2014 should have one file with 12 months of data, etc...
                df = pd.read_csv(filename)

                # add column names
                df.columns = cols
                
                # add day column that starts like 01, 02, 03, etc...
                # day should start from 01
                df['day'] = df.index + 1

                df.to_csv(f"datasets/samut-prakan/a_{year}-weather.csv", mode='a', header=None)
            except FileNotFoundError:
                print(f"File {filename} not found")
                continue

In [56]:
cols_2 = ['Unnamed: 0', 'Unnamed: 1', 'temp_avg', 'dew_avg', 'hum_avg', 'wind_speed_avg', 'pressure_avg', 'precip', 'month', 'year', 'day']

In [57]:
def merge_files(year):
    df = pd.read_csv(f"datasets/samut-prakan/a_{year}-weather.csv", header=None)
    df.columns = cols_2
    df.drop(columns=["Unnamed: 0", "Unnamed: 1"], inplace=True)
    return df

In [39]:
prakan_2014 = merge_files(2014)
prakan_2015 = merge_files(2015)
prakan_2016 = merge_files(2016)
prakan_2017 = merge_files(2017)
prakan_2018 = merge_files(2018)
prakan_2019 = merge_files(2019)
prakan_2020 = merge_files(2020)
prakan_2021 = merge_files(2021)
prakan_2022 = merge_files(2022)
prakan_2023 = merge_files(2023)

In [40]:
prakan_2014.head(2)

Unnamed: 0,temp_avg,dew_avg,hum_avg,wind_speed_avg,pressure_avg,precip,month,year,day
0,23.555556,14.388889,57.6,8.85137,1012.53061,0.0,1,2014,1
1,24.111111,14.555556,57.5,8.207634,1012.53061,0.0,1,2014,2


In [41]:
# merge all the dataframes
prakan_skhon = pd.concat([prakan_2014, prakan_2015, prakan_2016, prakan_2017, prakan_2018, prakan_2019, prakan_2020, prakan_2021, prakan_2022, prakan_2023])
prakan_skhon['location'] = 'south-bangkok power plant, samut prakan-air-quality'
prakan_skhon

Unnamed: 0,temp_avg,dew_avg,hum_avg,wind_speed_avg,pressure_avg,precip,month,year,day,location
0,23.555556,14.388889,57.6,8.851370,1012.53061,0.0,1,2014,1,"south-bangkok power plant, samut prakan-air-qu..."
1,24.111111,14.555556,57.5,8.207634,1012.53061,0.0,1,2014,2,"south-bangkok power plant, samut prakan-air-qu..."
2,25.722222,15.555556,57.3,6.437360,1012.53061,0.0,1,2014,3,"south-bangkok power plant, samut prakan-air-qu..."
3,25.944444,17.388889,63.3,5.632690,1012.53061,0.0,1,2014,4,"south-bangkok power plant, samut prakan-air-qu..."
4,26.277778,18.111111,62.8,7.885766,1012.53061,0.0,1,2014,5,"south-bangkok power plant, samut prakan-air-qu..."
...,...,...,...,...,...,...,...,...,...,...
725,26.611111,17.444444,57.7,9.334172,1015.91700,0.0,12,2023,27,"south-bangkok power plant, samut prakan-air-qu..."
726,27.944444,18.166667,56.2,9.334172,1015.91700,0.0,12,2023,28,"south-bangkok power plant, samut prakan-air-qu..."
727,28.888889,18.944444,56.1,8.851370,1012.53061,0.0,12,2023,29,"south-bangkok power plant, samut prakan-air-qu..."
728,29.333333,19.833333,57.2,7.402964,1012.53061,0.0,12,2023,30,"south-bangkok power plant, samut prakan-air-qu..."


In [42]:
prakan_skhon.to_csv("datasets/samut-prakan-processed/samut-prakan-weather.csv", index=False)

In [43]:
df_prakan_processed = pd.read_csv("datasets/samut-prakan-processed/samut-prakan-weather.csv")
df_prakan_processed.head(5)

Unnamed: 0,temp_avg,dew_avg,hum_avg,wind_speed_avg,pressure_avg,precip,month,year,day,location
0,23.555556,14.388889,57.6,8.85137,1012.53061,0.0,1,2014,1,"south-bangkok power plant, samut prakan-air-qu..."
1,24.111111,14.555556,57.5,8.207634,1012.53061,0.0,1,2014,2,"south-bangkok power plant, samut prakan-air-qu..."
2,25.722222,15.555556,57.3,6.43736,1012.53061,0.0,1,2014,3,"south-bangkok power plant, samut prakan-air-qu..."
3,25.944444,17.388889,63.3,5.63269,1012.53061,0.0,1,2014,4,"south-bangkok power plant, samut prakan-air-qu..."
4,26.277778,18.111111,62.8,7.885766,1012.53061,0.0,1,2014,5,"south-bangkok power plant, samut prakan-air-qu..."


In [44]:
# merge month year and day to date
df_prakan_processed['date'] = pd.to_datetime(df_prakan_processed[['year', 'month', 'day']])
df_prakan_processed.drop(columns=['year', 'month', 'day'], inplace=True)
df_prakan_processed.head(5)

Unnamed: 0,temp_avg,dew_avg,hum_avg,wind_speed_avg,pressure_avg,precip,location,date
0,23.555556,14.388889,57.6,8.85137,1012.53061,0.0,"south-bangkok power plant, samut prakan-air-qu...",2014-01-01
1,24.111111,14.555556,57.5,8.207634,1012.53061,0.0,"south-bangkok power plant, samut prakan-air-qu...",2014-01-02
2,25.722222,15.555556,57.3,6.43736,1012.53061,0.0,"south-bangkok power plant, samut prakan-air-qu...",2014-01-03
3,25.944444,17.388889,63.3,5.63269,1012.53061,0.0,"south-bangkok power plant, samut prakan-air-qu...",2014-01-04
4,26.277778,18.111111,62.8,7.885766,1012.53061,0.0,"south-bangkok power plant, samut prakan-air-qu...",2014-01-05


In [50]:
# merge df_aqi_prakan with df_prakan_processed
df_prakan = df_aqi_prakan.merge(df_prakan_processed, on='date', how='left')

In [51]:
df_prakan.head()

Unnamed: 0,date,pm25,pm10,o3,no2,so2,co,temp_avg,dew_avg,hum_avg,wind_speed_avg,pressure_avg,precip,location
0,2024-10-01,38,11,,,,,,,,,,,
1,2024-10-02,28,13,,,,,,,,,,,
2,2024-10-03,27,19,,,,,,,,,,,
3,2024-10-04,46,32,,,,,,,,,,,
4,2024-10-05,66,24,,,,,,,,,,,


In [61]:
# --------------------------------------------

In [None]:
df_aqi_sakhon = pd.read_csv('../../../../data/raw/dataset-bids/highway-district, samut sakhon-air-quality.csv', parse_dates=['date'])

In [55]:
# samut sakhon
cols = ['Unnamed: 0', 'temp_avg', 'dew_avg', 'hum_avg', 'wind_speed_avg', 'pressure_avg', 'precip', 'month', 'year']

for year in years:
    for month in months:
            filename = f"datasets/samut-sakhon/a_{year}-{month}_weather.csv"
            try:
                # for every month of the year, i want to merge the data into one file
                # so 2014 should have one file with 12 months of data, etc...
                df = pd.read_csv(filename)

                # add column names
                df.columns = cols
                
                # add day column that starts like 01, 02, 03, etc...
                # day should start from 01
                df['day'] = df.index + 1

                df.to_csv(f"datasets/samut-sakhon/a_{year}-weather.csv", mode='a', header=None)
            except FileNotFoundError:
                print(f"File {filename} not found")
                continue

In [58]:
def merge_files(year):
    df = pd.read_csv(f"datasets/samut-sakhon/a_{year}-weather.csv", header=None)
    df.columns = cols_2
    df.drop(columns=["Unnamed: 0", "Unnamed: 1"], inplace=True)
    return df

In [59]:
sakhon_2014 = merge_files(2014)
sakhon_2015 = merge_files(2015)
sakhon_2016 = merge_files(2016)
sakhon_2017 = merge_files(2017)
sakhon_2018 = merge_files(2018)
sakhon_2019 = merge_files(2019)
sakhon_2020 = merge_files(2020)
sakhon_2021 = merge_files(2021)
sakhon_2022 = merge_files(2022)
sakhon_2023 = merge_files(2023)

In [60]:
sakhon_2014.head()

Unnamed: 0,temp_avg,dew_avg,hum_avg,wind_speed_avg,pressure_avg,precip,month,year,day
0,23.555556,14.388889,57.6,8.85137,1012.53061,0.0,1,2014,1
1,24.111111,14.555556,57.5,8.207634,1012.53061,0.0,1,2014,2
2,25.722222,15.555556,57.3,6.43736,1012.53061,0.0,1,2014,3
3,25.944444,17.388889,63.3,5.63269,1012.53061,0.0,1,2014,4
4,26.277778,18.111111,62.8,7.885766,1012.53061,0.0,1,2014,5


In [65]:
# merge all the dataframes
samut_skhon = pd.concat([sakhon_2014, sakhon_2015, sakhon_2016, sakhon_2017, sakhon_2018, sakhon_2019, sakhon_2020, sakhon_2021, sakhon_2022, sakhon_2023])
samut_skhon['location'] = 'highway-district, samut sakhon'
samut_skhon

Unnamed: 0,temp_avg,dew_avg,hum_avg,wind_speed_avg,pressure_avg,precip,month,year,day,location
0,23.555556,14.388889,57.6,8.851370,1012.53061,0.0,1,2014,1,"highway-district, samut sakhon"
1,24.111111,14.555556,57.5,8.207634,1012.53061,0.0,1,2014,2,"highway-district, samut sakhon"
2,25.722222,15.555556,57.3,6.437360,1012.53061,0.0,1,2014,3,"highway-district, samut sakhon"
3,25.944444,17.388889,63.3,5.632690,1012.53061,0.0,1,2014,4,"highway-district, samut sakhon"
4,26.277778,18.111111,62.8,7.885766,1012.53061,0.0,1,2014,5,"highway-district, samut sakhon"
...,...,...,...,...,...,...,...,...,...,...
360,26.611111,17.444444,57.7,9.334172,1015.91700,0.0,12,2023,27,"highway-district, samut sakhon"
361,27.944444,18.166667,56.2,9.334172,1015.91700,0.0,12,2023,28,"highway-district, samut sakhon"
362,28.888889,18.944444,56.1,8.851370,1012.53061,0.0,12,2023,29,"highway-district, samut sakhon"
363,29.333333,19.833333,57.2,7.402964,1012.53061,0.0,12,2023,30,"highway-district, samut sakhon"


In [66]:
samut_skhon.to_csv("datasets/samut-sakhon-processed/samut-sakhon-weather.csv", index=False)

In [67]:
df_samut_processed = pd.read_csv("datasets/samut-sakhon-processed/samut-sakhon-weather.csv")
df_samut_processed.head(5)

Unnamed: 0,temp_avg,dew_avg,hum_avg,wind_speed_avg,pressure_avg,precip,month,year,day,location
0,23.555556,14.388889,57.6,8.85137,1012.53061,0.0,1,2014,1,"highway-district, samut sakhon"
1,24.111111,14.555556,57.5,8.207634,1012.53061,0.0,1,2014,2,"highway-district, samut sakhon"
2,25.722222,15.555556,57.3,6.43736,1012.53061,0.0,1,2014,3,"highway-district, samut sakhon"
3,25.944444,17.388889,63.3,5.63269,1012.53061,0.0,1,2014,4,"highway-district, samut sakhon"
4,26.277778,18.111111,62.8,7.885766,1012.53061,0.0,1,2014,5,"highway-district, samut sakhon"


In [69]:
# merge month year and day to date
df_samut_processed['date'] = pd.to_datetime(df_samut_processed[['year', 'month', 'day']])
df_samut_processed.drop(columns=['year', 'month', 'day'], inplace=True)
df_samut_processed.head(5)

Unnamed: 0,temp_avg,dew_avg,hum_avg,wind_speed_avg,pressure_avg,precip,location,date
0,23.555556,14.388889,57.6,8.85137,1012.53061,0.0,"highway-district, samut sakhon",2014-01-01
1,24.111111,14.555556,57.5,8.207634,1012.53061,0.0,"highway-district, samut sakhon",2014-01-02
2,25.722222,15.555556,57.3,6.43736,1012.53061,0.0,"highway-district, samut sakhon",2014-01-03
3,25.944444,17.388889,63.3,5.63269,1012.53061,0.0,"highway-district, samut sakhon",2014-01-04
4,26.277778,18.111111,62.8,7.885766,1012.53061,0.0,"highway-district, samut sakhon",2014-01-05


In [70]:
df_sakhon = df_aqi_sakhon.merge(df_samut_processed, on='date', how='left')
df_sakhon.head()

Unnamed: 0,date,pm25,pm10,o3,no2,so2,co,temp_avg,dew_avg,hum_avg,wind_speed_avg,pressure_avg,precip,location
0,2024-10-01,48,,,,,,,,,,,,
1,2024-10-02,38,,,,,,,,,,,,
2,2024-10-03,37,,,,,,,,,,,,
3,2024-10-04,46,,,,,,,,,,,,
4,2024-10-05,77,,,,,,,,,,,,


In [71]:
df_sakhon[df_aqi_sakhon['date'] == '2014-01-01']

Unnamed: 0,date,pm25,pm10,o3,no2,so2,co,temp_avg,dew_avg,hum_avg,wind_speed_avg,pressure_avg,precip,location
3005,2014-01-01,,706,78,22,14,10,23.555556,14.388889,57.6,8.85137,1012.53061,0.0,"highway-district, samut sakhon"


In [72]:
# ---------------------------------------------------

Sachin