**Checking for data completeness**

In [None]:
import pandas as pd
import os

In [None]:
import pandas as pd
# Results table filled in by the step cells below:
# step id -> {"Description": <text>, <file name>: <availability in %>, ...}
data_availability = dict()

In [None]:
# Column holding the channel-A reading; the channel-B column uses the same
# name with a '_b' suffix.
a_column = 'pm1_0_atm'
b_column = f'{a_column}_b'

In [None]:
## Step 1 - total available: a record counts as available when at least one of
## the two channel columns (a_column / b_column) holds a value.
import os
step = '1'

data_availability[step] = {"Description" : "Delete Missing data (Missing value from both channels)"}

# Named `data_dir` (not `dir`) so the builtin dir() is not shadowed.
data_dir = 'merged'

# The expected record count is the same for every file, so compute it once:
# seconds between the two dates divided by 120 (one expected record per 120 s).
# NOTE(review): the end timestamp is 2025-06-30 00:00, so 30 June itself is not
# counted - confirm that this is the intended study period.
total_sec = (pd.to_datetime("2025-06-30", format = "%Y-%m-%d") - pd.to_datetime("2023-07-01", format = "%Y-%m-%d")).total_seconds()
expected_data = total_sec / 120

# os.listdir() returns entries in arbitrary order; sorting makes the order of
# the per-file entries (and of the printed log) reproducible.
files = sorted(f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f)))
for file in files:
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path, parse_dates=['BDDateTime'])

    # Count rows straight from the boolean mask instead of building a filtered
    # copy of the frame; int() keeps the count a plain Python integer.
    has_any_channel = df[a_column].notna() | df[b_column].notna()
    available_data = int(has_any_channel.sum())

    availability = (available_data / expected_data) * 100 if expected_data > 0 else 0
    print(f"File: {file}, Data Availability: {availability:.2f}%")
    data_availability[step][f'{file}'] = availability


In [None]:
## Step 2 - one-channel observations: a record counts as available only when
## both channel columns (a_column and b_column) hold a value.
import os
step = '2'
data_availability[step] = {"Description" : "Delete observation with data from one channel"}

# Named `data_dir` (not `dir`) so the builtin dir() is not shadowed.
data_dir = 'merged'

# The expected record count is the same for every file, so compute it once:
# seconds between the two dates divided by 120 (one expected record per 120 s).
# NOTE(review): the end timestamp is 2025-06-30 00:00, so 30 June itself is not
# counted - confirm that this is the intended study period.
total_sec = (pd.to_datetime("2025-06-30", format = "%Y-%m-%d") - pd.to_datetime("2023-07-01", format = "%Y-%m-%d")).total_seconds()
expected_data = total_sec / 120

# os.listdir() returns entries in arbitrary order; sorting makes the order of
# the per-file entries (and of the printed log) reproducible.
files = sorted(f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f)))
for file in files:
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path, parse_dates=['BDDateTime'])

    # Count rows straight from the boolean mask instead of building a filtered
    # copy of the frame; int() keeps the count a plain Python integer.
    has_both_channels = df[a_column].notna() & df[b_column].notna()
    available_data = int(has_both_channels.sum())

    availability = (available_data / expected_data) * 100 if expected_data > 0 else 0
    print(f"File: {file}, Data Availability: {availability:.2f}%")
    data_availability[step][f'{file}'] = availability

In [None]:
## Step 3 - abnormal temperature / relative humidity: on top of requiring both
## channels, keep only rows whose humidity and temperature fall inside the
## accepted ranges.
import os
step = '3'
data_availability[step] = {"Description" : "Delete observation with abnormal temperature and relative humidity (T<-200 or >1000 F, RH >100% or <0%)"}

# Named `data_dir` (not `dir`) so the builtin dir() is not shadowed.
data_dir = 'merged'

# The expected record count is the same for every file, so compute it once:
# seconds between the two dates divided by 120 (one expected record per 120 s).
# NOTE(review): the end timestamp is 2025-06-30 00:00, so 30 June itself is not
# counted - confirm that this is the intended study period.
total_sec = (pd.to_datetime("2025-06-30", format = "%Y-%m-%d") - pd.to_datetime("2023-07-01", format = "%Y-%m-%d")).total_seconds()
expected_data = total_sec / 120

# os.listdir() returns entries in arbitrary order; sorting makes the order of
# the per-file entries (and of the printed log) reproducible.
files = sorted(f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f)))
for file in files:
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path, parse_dates=['BDDateTime'])

    # Validity masks. Comparisons against NaN are False, so rows with a missing
    # humidity or temperature are excluded as well.
    pm_logic = df[a_column].notna() & df[b_column].notna()
    # NOTE(review): the bounds are strict, so readings exactly at 0 / 100 % RH or
    # -200 / 1000 F are dropped too, although the description only names values
    # beyond those limits - confirm which is intended.
    rh_logic = (df['current_humidity'] < 100) & (df['current_humidity'] > 0)
    temp_logic = (df['current_temp_f'] > -200) & (df['current_temp_f'] < 1000)
    validity_logic = pm_logic & rh_logic & temp_logic

    # Availability check: count rows straight from the boolean mask instead of
    # building a filtered copy of the frame.
    available_data = int(validity_logic.sum())
    availability = (available_data / expected_data) * 100 if expected_data > 0 else 0
    print(f"File: {file}, Data Availability: {availability:.2f}%")
    data_availability[step][f'{file}'] = availability

In [None]:
## Step 4a - channel agreement: with delta = |A - B| and average = (A + B) / 2,
## delete a row if average <= 100 ug/m3 and delta > 10 ug/m3, or if
## average > 100 ug/m3 and delta > 10 % of the average.
import os
step = '4a'
data_availability[step] = {"Description" : "Delete data if pm2.5 < 100 um/m3 and delta > 10 um/m3, or if pm2.5 > 100um/m3  and delta > 10%"}

# Named `data_dir` (not `dir`) so the builtin dir() is not shadowed.
data_dir = 'merged'

# The expected record count is the same for every file, so compute it once:
# seconds between the two dates divided by 120 (one expected record per 120 s).
# NOTE(review): the end timestamp is 2025-06-30 00:00, so 30 June itself is not
# counted - confirm that this is the intended study period.
total_sec = (pd.to_datetime("2025-06-30", format = "%Y-%m-%d") - pd.to_datetime("2023-07-01", format = "%Y-%m-%d")).total_seconds()
expected_data = total_sec / 120

# os.listdir() returns entries in arbitrary order; sorting makes the order of
# the per-file entries (and of the printed log) reproducible.
files = sorted(f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f)))
for file in files:
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path, parse_dates=['BDDateTime'])

    # Validity masks carried over from the earlier steps. Comparisons against
    # NaN are False, so rows with missing inputs are excluded.
    pm_logic = df[a_column].notna() & df[b_column].notna()
    rh_logic = (df['current_humidity'] < 100) & (df['current_humidity'] > 0)
    temp_logic = (df['current_temp_f'] > -200) & (df['current_temp_f'] < 1000)

    # Channel agreement: an absolute tolerance at or below an average of 100,
    # a relative (10 %) tolerance above it.
    delta = (df[a_column] - df[b_column]).abs()
    average = (df[a_column] + df[b_column])/2
    cond1 = (delta <= 10) & (average <= 100)
    cond2 = (delta <= 0.1*average) & (average > 100)
    pm_channel_merge_logic = cond1 | cond2
    validity_logic = pm_logic & rh_logic & temp_logic & pm_channel_merge_logic

    # Availability check: count rows straight from the boolean mask instead of
    # building a filtered copy of the frame.
    available_data = int(validity_logic.sum())
    availability = (available_data / expected_data) * 100 if expected_data > 0 else 0
    print(f"File: {file}, Data Availability: {availability:.2f}%")
    data_availability[step][f'{file}'] = availability

In [None]:
## Step 4b - second channel-agreement rule, applied on top of step 4a:
## delete a row when delta is beyond both 5 ug/m3 and 61 % of the channel-A value.
import os
step = '4b'
data_availability[step] = {"Description" : "Delete data delta > 5ug/m3 and delta > 61% a"}

# Named `data_dir` (not `dir`) so the builtin dir() is not shadowed.
data_dir = 'merged'

# The expected record count is the same for every file, so compute it once:
# seconds between the two dates divided by 120 (one expected record per 120 s).
# NOTE(review): the end timestamp is 2025-06-30 00:00, so 30 June itself is not
# counted - confirm that this is the intended study period.
total_sec = (pd.to_datetime("2025-06-30", format = "%Y-%m-%d") - pd.to_datetime("2023-07-01", format = "%Y-%m-%d")).total_seconds()
expected_data = total_sec / 120

# os.listdir() returns entries in arbitrary order; sorting makes the order of
# the per-file entries (and of the printed log) reproducible.
files = sorted(f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f)))
for file in files:
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path, parse_dates=['BDDateTime'])

    # Validity masks carried over from the earlier steps. Comparisons against
    # NaN are False, so rows with missing inputs are excluded.
    pm_logic = df[a_column].notna() & df[b_column].notna()
    rh_logic = (df['current_humidity'] < 100) & (df['current_humidity'] > 0)
    temp_logic = (df['current_temp_f'] > -200) & (df['current_temp_f'] < 1000)
    delta = (df[a_column] - df[b_column]).abs()
    average = (df[a_column] + df[b_column])/2
    cond1 = (delta <= 10) & (average <= 100)
    cond2 = (delta <= 0.1*average) & (average > 100)
    pm_channel_merge_logic = cond1 | cond2

    # Keep a row when delta is under at least one of the two thresholds.
    # NOTE(review): the comparisons are strict, so delta exactly equal to 5 or
    # to 0.61 * channel A is treated as exceeding that threshold, while the
    # description says "delta > ..." - confirm which is intended.
    pm_channel_merge_logic2 = (delta < 5) | (delta < 0.61 * df[a_column])
    validity_logic = pm_logic & rh_logic & temp_logic & pm_channel_merge_logic & pm_channel_merge_logic2

    # Availability check: count rows straight from the boolean mask instead of
    # building a filtered copy of the frame.
    available_data = int(validity_logic.sum())
    availability = (available_data / expected_data) * 100 if expected_data > 0 else 0
    print(f"File: {file}, Data Availability: {availability:.2f}%")
    data_availability[step][f'{file}'] = availability

In [None]:
## Step 5 - value range: on top of all earlier rules, keep only rows where both
## channels lie within 0..500 (inclusive), i.e. delete negative values and
## concentrations above 500 ug/m3.
import os
step = '5'
data_availability[step] = {"Description" : "Delete data with concentration more than 500 ug/m3 and with negative values"}

# Named `data_dir` (not `dir`) so the builtin dir() is not shadowed.
data_dir = 'merged'

# The expected record count is the same for every file, so compute it once:
# seconds between the two dates divided by 120 (one expected record per 120 s).
# NOTE(review): the end timestamp is 2025-06-30 00:00, so 30 June itself is not
# counted - confirm that this is the intended study period.
total_sec = (pd.to_datetime("2025-06-30", format = "%Y-%m-%d") - pd.to_datetime("2023-07-01", format = "%Y-%m-%d")).total_seconds()
expected_data = total_sec / 120

# os.listdir() returns entries in arbitrary order; sorting makes the order of
# the per-file entries (and of the printed log) reproducible.
files = sorted(f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f)))
for file in files:
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path, parse_dates=['BDDateTime'])

    # Validity masks carried over from the earlier steps. Comparisons against
    # NaN are False, so rows with missing inputs are excluded.
    pm_logic = df[a_column].notna() & df[b_column].notna()
    rh_logic = (df['current_humidity'] < 100) & (df['current_humidity'] > 0)
    temp_logic = (df['current_temp_f'] > -200) & (df['current_temp_f'] < 1000)
    delta = (df[a_column] - df[b_column]).abs()
    average = (df[a_column] + df[b_column])/2
    cond1 = (delta <= 10) & (average <= 100)
    cond2 = (delta <= 0.1*average) & (average > 100)
    pm_channel_merge_logic = cond1 | cond2
    pm_channel_merge_logic2 = (delta < 5) | (delta < 0.61 * df[a_column])

    # New in this step: both channels must be non-negative and at most 500.
    data_range_logic = (df[a_column] >= 0) & (df[a_column] <= 500) & (df[b_column] >= 0) & (df[b_column] <= 500)
    validity_logic = pm_logic & rh_logic & temp_logic & pm_channel_merge_logic & pm_channel_merge_logic2 & data_range_logic

    # Availability check: count rows straight from the boolean mask instead of
    # building a filtered copy of the frame.
    available_data = int(validity_logic.sum())
    availability = (available_data / expected_data) * 100 if expected_data > 0 else 0
    print(f"File: {file}, Data Availability: {availability:.2f}%")
    data_availability[step][f'{file}'] = availability

In [None]:
# Build the summary table: with orient='index' each outer key of
# data_availability becomes a row and the inner-dict keys become the columns.
availability_df = pd.DataFrame.from_dict(data_availability, orient='index')

In [None]:
# Bare expression so the notebook renders the table as the cell output.
availability_df

In [None]:
# Copy the table to the system clipboard (e.g. for pasting into a spreadsheet).
availability_df.to_clipboard()

In [None]:
# Write the table, including its index, to availability.csv in the working directory.
availability_df.to_csv("availability.csv")

## Table a.2

##### Station wise

In [None]:
# Per-sensor summary filled in by the next cell:
# file name -> {<statistic label>: value, ...}
hourly_availability = dict()

In [None]:
## Station-wise hourly summary: for every file in 'hourly_averages', record per
## PM column the number of non-missing hours, mean ± SD, geometric mean, IQR and
## hourly data completeness, plus mean ± SD of relative humidity and temperature.
import os
import numpy as np

# Named `data_dir` (not `dir`) so the builtin dir() is not shadowed.
data_dir = 'hourly_averages'
columns = ['pm1_0_atm', 'pm2_5_atm', 'pm10_0_atm']

# The expected number of hourly records is the same for every file, so compute
# it once: seconds between the two dates divided by 3600.
# NOTE(review): the end timestamp is 2025-06-30 00:00, so 30 June itself is not
# counted - confirm that this is the intended study period.
total_sec = (pd.to_datetime("2025-06-30", format = "%Y-%m-%d") - pd.to_datetime("2023-07-01", format = "%Y-%m-%d")).total_seconds()
expected_data = total_sec / 3600

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the summary table reproducible.
files = sorted(f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f)))
for file in files:
    sensor = f'{file}'
    hourly_availability[sensor] = {}
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path, parse_dates=['time'])

    for column in columns:
        # Non-missing values of this column; their count is the number of
        # available hours.
        values = df[column].dropna().to_numpy()
        available_data = values.shape[0]
        availability = (available_data / expected_data) * 100 if expected_data > 0 else 0

        pm_mean = df[column].mean()
        pm_sd = df[column].std()
        if available_data:
            # Geometric mean via the mean of logs. A zero value makes the result
            # 0 and a negative value makes it NaN (numpy emits a RuntimeWarning).
            pm_gm = np.exp(np.mean(np.log(values)))
            q75, q25 = np.percentile(values, [75, 25])
            pm_iqr = q75 - q25
        else:
            # Column has no data at all: report NaN instead of handing an empty
            # array to np.mean / np.percentile.
            pm_gm = np.nan
            pm_iqr = np.nan

        # e.g. 'pm2_5_atm' -> 'PM2.5'
        column_id = column.upper().replace('_ATM', '').replace('_', '.')

        hourly_availability[sensor].update({
            f'{column_id} Number of hours of data' : available_data,
            f'{column_id} mean ± SD' : f"{round(pm_mean, 2)} ± {round(pm_sd, 2)}",
            f'{column_id} GM' : pm_gm,
            f'{column_id} IQR' : pm_iqr,
            f'{column_id} Hourly data completeness' : availability
        })

    rh_mean = df['current_humidity'].mean()
    rh_sd = df['current_humidity'].std()

    temp_mean = df['current_temp_f'].mean()
    temp_sd = df['current_temp_f'].std()

    hourly_availability[sensor].update({
            'RH (mean ± sd)' : f"{round(rh_mean, 2)} ± {round(rh_sd, 2)}",
            'Temp (mean ± sd)' : f"{round(temp_mean, 2)} ± {round(temp_sd, 2)}",
    })

    # Progress log. `availability` here is the value left over from the last
    # column in `columns` (pm10_0_atm).
    print(f"File: {file}, Data Availability: {availability:.2f}%")

In [None]:
# Build the station-wise table: with orient='index' each outer key of
# hourly_availability becomes a row and the statistic labels become the columns.
hourly_availability_df = pd.DataFrame.from_dict(hourly_availability, orient='index')

In [None]:
# Bare expression so the notebook renders the table as the cell output.
hourly_availability_df

In [None]:
# Copy the table to the system clipboard (e.g. for pasting into a spreadsheet).
hourly_availability_df.to_clipboard()

In [None]:
# Write the table, including its index, to hourly_availability.csv in the working directory.
hourly_availability_df.to_csv("hourly_availability.csv")

##### Zone wise

In [None]:
# Per-zone summary filled in by the next cell:
# zone name -> {<statistic label>: value, ...}
zone_wise_hourly = dict()

In [None]:
## Zone-wise hourly summary: pool the hourly files of each zone (rural, suburban,
## urban) and record per PM column the number of non-missing hours, mean ± SD,
## geometric mean, IQR and hourly data completeness, plus mean ± SD of relative
## humidity and temperature.
import os
import numpy as np

# Named `data_dir` (not `dir`) so the builtin dir() is not shadowed.
data_dir = 'hourly_averages'

urban = ['U1.csv', 'U2.csv']
suburb = ['SU1.csv', 'SU2.csv']
rural = ['R1.csv', 'R2.csv', 'R3.csv', 'R4.csv']

# (file list, zone label, offset); the third element is not used in this cell.
categories = [
    (rural, "Rural", 1),
    (suburb, "Suburban", 1), 
    (urban, "Urban", 3), 
    ]

columns = ['pm1_0_atm', 'pm2_5_atm', 'pm10_0_atm']

# Seconds between the two dates; identical for every zone, so computed once.
# NOTE(review): the end timestamp is 2025-06-30 00:00, so 30 June itself is not
# counted - confirm that this is the intended study period.
period_sec = (pd.to_datetime("2025-06-30", format = "%Y-%m-%d") - pd.to_datetime("2023-07-01", format = "%Y-%m-%d")).total_seconds()

for category, cat_name, _offset in categories:
    zone_wise_hourly[cat_name] = {}
    dfs = [pd.read_csv(os.path.join(data_dir, file), parse_dates=['time']) for file in category]
    df = pd.concat(dfs, ignore_index=True)

    # Every station in the zone is expected to contribute a full record, so the
    # expected number of hours scales with the number of files pooled.
    total_sec = period_sec * len(dfs)
    expected_data = total_sec / 3600

    for column in columns:
        # Non-missing values of this column across the pooled stations; their
        # count is the number of available hours.
        values = df[column].dropna().to_numpy()
        available_data = values.shape[0]
        availability = (available_data / expected_data) * 100 if expected_data > 0 else 0

        pm_mean = df[column].mean()
        pm_sd = df[column].std()
        if available_data:
            # Geometric mean via the mean of logs. A zero value makes the result
            # 0 and a negative value makes it NaN (numpy emits a RuntimeWarning).
            pm_gm = np.exp(np.mean(np.log(values)))
            q75, q25 = np.percentile(values, [75, 25])
            pm_iqr = q75 - q25
        else:
            # Column has no data at all: report NaN instead of handing an empty
            # array to np.mean / np.percentile.
            pm_gm = np.nan
            pm_iqr = np.nan

        # e.g. 'pm2_5_atm' -> 'PM2.5'
        column_id = column.upper().replace('_ATM', '').replace('_', '.')

        zone_wise_hourly[cat_name].update({
            f'{column_id} Number of hours of data' : available_data,
            f'{column_id} mean ± SD' : f"{round(pm_mean, 2)} ± {round(pm_sd, 2)}",
            f'{column_id} GM' : pm_gm,
            f'{column_id} IQR' : pm_iqr,
            f'{column_id} Hourly data completeness' : availability
        })

    rh_mean = df['current_humidity'].mean()
    rh_sd = df['current_humidity'].std()

    temp_mean = df['current_temp_f'].mean()
    temp_sd = df['current_temp_f'].std()

    zone_wise_hourly[cat_name].update({
            'RH (mean ± sd)' : f"{round(rh_mean, 2)} ± {round(rh_sd, 2)}",
            'Temp (mean ± sd)' : f"{round(temp_mean, 2)} ± {round(temp_sd, 2)}",
    })

    # Progress log, labelled with the zone name (the earlier version printed the
    # last file name of the zone). `availability` here is the value left over
    # from the last column in `columns` (pm10_0_atm).
    print(f"Zone: {cat_name}, Data Availability: {availability:.2f}%")

In [None]:
# Build the zone-wise table: with orient='index' each outer key of
# zone_wise_hourly becomes a row and the statistic labels become the columns.
zone_wise_hourly_df = pd.DataFrame.from_dict(zone_wise_hourly, orient='index')
# Bare expression so the notebook renders the table as the cell output.
zone_wise_hourly_df

In [None]:
# Copy the table to the system clipboard (e.g. for pasting into a spreadsheet).
zone_wise_hourly_df.to_clipboard()