In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning category for the remainder of the session.
# NOTE(review): this blanket filter also hides pandas diagnostics (e.g.
# SettingWithCopyWarning, FutureWarning) - consider narrowing it.
warnings.filterwarnings('ignore')

In [29]:
def process_air_quality_data(file_path, location, start_date='2017-01-01', end_date='2023-12-31'):
    """Load one station's daily air-quality CSV and aggregate it to monthly means.

    Parameters
    ----------
    file_path : str or path-like
        CSV file containing a ``date`` column formatted ``YYYY/MM/DD`` and the
        pollutant columns ``pm25``, ``pm10``, ``o3``, ``no2``, ``so2`` and
        ``co``. Column names may carry surrounding whitespace; it is stripped.
    location : str
        Label written to the ``location`` column of the monthly frame.
    start_date, end_date : str
        Inclusive bounds of the period to keep.

    Returns
    -------
    (df_filtered, df_monthly_avg) : tuple of pandas.DataFrame
        ``df_filtered`` holds the rows whose date lies within the bounds, with
        added ``year`` and ``month`` columns. ``df_monthly_avg`` holds one row
        per (year, month) with the mean of every numeric, boolean and datetime
        column, plus the ``location`` label.

    Raises
    ------
    KeyError
        If ``date`` or any of the pollutant columns is missing from the file.
    """
    pollutant_cols = ['pm25', 'pm10', 'o3', 'no2', 'so2', 'co']

    df = pd.read_csv(file_path)

    # Strip whitespace from column names so the lookups below match.
    df.columns = df.columns.str.strip()

    # Report all missing columns at once, with the file name, instead of
    # failing on the first one with a bare KeyError.
    missing = [col for col in ['date'] + pollutant_cols if col not in df.columns]
    if missing:
        raise KeyError(f"{file_path}: missing expected column(s) {missing}")

    # Non-numeric entries (blanks, placeholders) become NaN and are skipped by mean().
    for col in pollutant_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df['date'] = pd.to_datetime(df['date'], format='%Y/%m/%d')

    # Keep rows inside the inclusive date window. assign() returns a new frame,
    # so year/month are never written onto a slice of `df`.
    in_range = (df['date'] >= start_date) & (df['date'] <= end_date)
    df_filtered = df.loc[in_range].assign(
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
    )

    # Average only columns that have a mean. A stray text column would
    # otherwise make groupby().mean() raise TypeError on pandas >= 2.0.
    averageable = df_filtered.select_dtypes(include=['number', 'bool', 'datetime'])
    df_monthly_avg = averageable.groupby(['year', 'month']).mean().reset_index()

    df_monthly_avg['location'] = location

    return df_filtered, df_monthly_avg

# Example usage (note that `location` is a required argument)
# file_path = '../../../data/raw/aman/chulalongkorn-hospital, bangkok-air-quality.csv'
# df_filtered_hospital, df_monthly_avg_hospital = process_air_quality_data(file_path, location='chulalongkorn-hospital')

In [30]:
# Thonburi power sub-station: daily rows in range plus monthly means.
file_path = '../../../data/raw/ishika/thonburi-power-sub-station.csv'
df_filtered_power, df_monthly_avg_power = process_air_quality_data(
    file_path=file_path,
    location='thonburi-power',
)

In [31]:
# Samut Prakan city hall: daily rows in range plus monthly means.
file_path = '../../../data/raw/aman/city-hall, samut prakan-air-quality.csv'
df_filtered_prakan, df_monthly_avg_prakan = process_air_quality_data(
    file_path=file_path,
    location='samut-prakan',
)

In [32]:
# Chulalongkorn hospital, Bangkok: daily rows in range plus monthly means.
file_path = '../../../data/raw/aman/chulalongkorn-hospital, bangkok-air-quality.csv'
df_filtered_hospital, df_monthly_avg_hospital = process_air_quality_data(
    file_path=file_path,
    location='chulalongkorn-hospital',
)

In [33]:
# Samut Sakhon highway district: daily rows in range plus monthly means.
file_path = '../../../data/raw/dataset-bids/highway-district, samut sakhon-air-quality.csv'
df_filtered_sakhon, df_monthly_avg_sakhon = process_air_quality_data(
    file_path=file_path,
    location='samut-sakhon',
)

In [36]:
# Stack the four per-station monthly tables into one long table.
# ignore_index=True gives the combined frame a fresh 0..N-1 index; without it
# each station's rows keep their own 0-based labels, so labels repeat and
# label-based lookups on final_df (and the index column written below) are
# ambiguous.
final_df = pd.concat(
    [df_monthly_avg_power, df_monthly_avg_prakan, df_monthly_avg_hospital, df_monthly_avg_sakhon],
    ignore_index=True,
)
# The index is still written as the first (unnamed) CSV column, as before.
final_df.to_csv('../../../data/processed/air/air_quality.csv')