#### This marks the beginning of generating a dataset sampled every minute.

In [1]:
import os
import pandas as pd
from dateutil import parser

#### Find the data files (CSV and Excel) that share the same name across all collection dates.


| **File Name**             | **Description**                                                                 |
|---------------------------|---------------------------------------------------------------------------------|
| **dataset.csv**            | All sensor data collected from PMS5003 Particulate Matter and Sensirion SCD30 Sensors |
| **no2dataset.csv**         | NO2 sensor data collected from ULPSM-NO2 968-047                                |
| **pm_aeroequal_data.csv**  | PM2.5 and PM10 sensor data collected from Aeroqual S500 Portable Air Quality Monitor |
| **no2_aeroequal_data.csv** | NO2 sensor data collected from Aeroqual S500 Portable Air Quality Monitor        |
| **iAnemometer.xls**        | Windspeed and Temperature collected from BTMETER BT-100 Anemometer               |


In [2]:
def find_csv_files(directory):
    """Recursively collect the per-day data files under ``directory``, grouped by source.

    Walks ``directory`` with ``os.walk`` and puts every file whose name is
    exactly one of the five known file names into the matching list. Paths
    are normalised to forward slashes so they look the same on Windows and
    POSIX.

    Parameters
    ----------
    directory : str
        Root folder to search.

    Returns
    -------
    tuple of list of str
        ``(pm_sensor_files, no2_sensor_files, pm_aeroequal_files,
        no2_aeroequal_files, anemometer_files)``, each in ``os.walk`` order.
    """
    # Exact file name -> result list. Names are matched exactly (not by
    # prefix) so that leftovers such as "dataset.csv.bak" or
    # "iAnemometer.xlsx" are not picked up; create_master_table() only knows
    # how to read ".csv" and ".xls" files.
    files_by_name = {
        "dataset.csv": [],
        "no2dataset.csv": [],
        "pm_aeroequal_data.csv": [],
        "no2_aeroequal_data.csv": [],
        "iAnemometer.xls": [],
    }

    for root, _dirs, files in os.walk(directory):
        for file in files:
            matches = files_by_name.get(file)
            if matches is not None:
                matches.append(os.path.join(root, file).replace("\\", "/"))

    return (
        files_by_name["dataset.csv"],
        files_by_name["no2dataset.csv"],
        files_by_name["pm_aeroequal_data.csv"],
        files_by_name["no2_aeroequal_data.csv"],
        files_by_name["iAnemometer.xls"],
    )

##### 1. Sort the files in increasing order of date
##### 2. The Anemometer files are in Excel, while the others are in CSV.
##### 3. The logic merges a particular sensor file for all dates into one master file.

In [3]:
def create_master_table(files):
    """Merge one source's per-day files into a single table with ``Date`` and ``Time`` first.

    Each path must look like ``.../<Month>_<day>_<year>/<file>``; the parent
    folder name supplies the chronological order (and, for ``.xls`` files,
    the date itself).

    Parameters
    ----------
    files : list of str
        Forward-slash paths to ``.csv`` or ``.xls`` files. The list is sorted
        in place by folder date.

    Returns
    -------
    pandas.DataFrame
        All files concatenated in date order with a fresh index. ``Date`` is
        formatted ``%d-%m-%Y`` and ``Time`` ``%H:%M:%S``.

    Raises
    ------
    ValueError
        If ``files`` is empty or contains a file that is neither ``.csv``
        nor ``.xls``.
    """

    def _folder_date(path):
        # Parent folder is named like "August_15_2024". Sorting on the full
        # date (not only the day number) keeps folders from different months
        # or years in chronological order.
        return parser.parse(path.split('/')[-2].replace('_', '-'))

    if not files:
        raise ValueError("create_master_table() needs at least one file to merge")

    files.sort(key=_folder_date)

    data_frames = []

    for f in files:

        if f.endswith('.csv'):
            df = pd.read_csv(f)
            df.columns = df.columns.str.strip()
            if 'Date Time' in df.columns:
                # Parse each combined timestamp once, then split it into the
                # two formatted columns.
                stamps = [parser.parse(x) for x in df['Date Time']]
                df['Date'] = [s.strftime('%d-%m-%Y') for s in stamps]
                df['Time'] = [s.strftime('%H:%M:%S') for s in stamps]
                df = df.drop(columns=['Date Time'])
            else:
                # ':' in the date is swapped for '/' before parsing
                # (presumably some files write dates as YYYY:MM:DD — confirm).
                df['Date'] = df['Date'].apply(lambda x: parser.parse(x.replace(":", "/")).strftime('%d-%m-%Y'))

        elif f.endswith('.xls'):
            # header=1: the column names are taken from the second row.
            df = pd.read_excel(f, header=1)
            # The date for these files is taken from the folder name.
            df['Date'] = _folder_date(f).strftime('%d-%m-%Y')
            df = df.drop(columns=['ID'], errors='ignore')

        else:
            # Without this branch `df` would still hold the previous file's
            # frame and that data would be appended a second time.
            raise ValueError(f"Unsupported file type (expected .csv or .xls): {f}")

        df['Time'] = df['Time'].apply(lambda x: parser.parse(x).strftime('%H:%M:%S'))
        df = df[['Date', 'Time'] + [col for col in df.columns if col not in ['Date', 'Time']]]
        data_frames.append(df)

    return pd.concat(data_frames, ignore_index=True)

In [4]:
# Root folder that is searched recursively for the per-day data files.
directory_path = 'E:/Final Dissertation/'
# One list of file paths per data source, in the order returned by find_csv_files().
pm_sensor_files, no2_sensor_files, pm_aeroequal_files, no2_aeroequal_files, anemometer_files  = find_csv_files(directory_path)

In [5]:
# List every PM sensor file that was found, one path per line.
for item in pm_sensor_files:
    print("PM Sensor File:", item)

PM Sensor File: E:/Final Dissertation/Collected_Data/August_15_2024/dataset.csv
PM Sensor File: E:/Final Dissertation/Collected_Data/August_16_2024/dataset.csv
PM Sensor File: E:/Final Dissertation/Collected_Data/August_18_2024/dataset.csv
PM Sensor File: E:/Final Dissertation/Collected_Data/August_19_2024/dataset.csv
PM Sensor File: E:/Final Dissertation/Collected_Data/August_20_2024/dataset.csv
PM Sensor File: E:/Final Dissertation/Collected_Data/August_21_2024/dataset.csv
PM Sensor File: E:/Final Dissertation/Collected_Data/August_22_2024/dataset.csv


In [6]:
# List every NO2 sensor file that was found, one path per line.
for item in no2_sensor_files:
    print("NO2 Sensor File:", item)

NO2 Sensor File: E:/Final Dissertation/Collected_Data/August_15_2024/no2dataset.csv
NO2 Sensor File: E:/Final Dissertation/Collected_Data/August_16_2024/no2dataset.csv
NO2 Sensor File: E:/Final Dissertation/Collected_Data/August_18_2024/no2dataset.csv
NO2 Sensor File: E:/Final Dissertation/Collected_Data/August_19_2024/no2dataset.csv
NO2 Sensor File: E:/Final Dissertation/Collected_Data/August_20_2024/no2dataset.csv
NO2 Sensor File: E:/Final Dissertation/Collected_Data/August_21_2024/no2dataset.csv
NO2 Sensor File: E:/Final Dissertation/Collected_Data/August_22_2024/no2dataset.csv


In [7]:
# List every PM Aeroqual file that was found, one path per line.
for item in pm_aeroequal_files:
    print("PM Aeroequal File:", item)

PM Aeroequal File: E:/Final Dissertation/Collected_Data/August_15_2024/pm_aeroequal_data.csv
PM Aeroequal File: E:/Final Dissertation/Collected_Data/August_16_2024/pm_aeroequal_data.csv
PM Aeroequal File: E:/Final Dissertation/Collected_Data/August_18_2024/pm_aeroequal_data.csv
PM Aeroequal File: E:/Final Dissertation/Collected_Data/August_19_2024/pm_aeroequal_data.csv
PM Aeroequal File: E:/Final Dissertation/Collected_Data/August_20_2024/pm_aeroequal_data.csv
PM Aeroequal File: E:/Final Dissertation/Collected_Data/August_21_2024/pm_aeroequal_data.csv
PM Aeroequal File: E:/Final Dissertation/Collected_Data/August_22_2024/pm_aeroequal_data.csv


In [8]:
# List every NO2 Aeroqual file that was found, one path per line.
for item in no2_aeroequal_files:
    print("NO2 Aeroequal File:", item)

NO2 Aeroequal File: E:/Final Dissertation/Collected_Data/August_15_2024/no2_aeroequal_data.csv
NO2 Aeroequal File: E:/Final Dissertation/Collected_Data/August_16_2024/no2_aeroequal_data.csv
NO2 Aeroequal File: E:/Final Dissertation/Collected_Data/August_18_2024/no2_aeroequal_data.csv
NO2 Aeroequal File: E:/Final Dissertation/Collected_Data/August_19_2024/no2_aeroequal_data.csv
NO2 Aeroequal File: E:/Final Dissertation/Collected_Data/August_20_2024/no2_aeroequal_data.csv
NO2 Aeroequal File: E:/Final Dissertation/Collected_Data/August_21_2024/no2_aeroequal_data.csv
NO2 Aeroequal File: E:/Final Dissertation/Collected_Data/August_22_2024/no2_aeroequal_data.csv


In [9]:
# List every anemometer file that was found, one path per line.
for item in anemometer_files:
    print("Anemometer File:", item)

Anemometer File: E:/Final Dissertation/Collected_Data/August_15_2024/iAnemometer.xls
Anemometer File: E:/Final Dissertation/Collected_Data/August_16_2024/iAnemometer.xls
Anemometer File: E:/Final Dissertation/Collected_Data/August_18_2024/iAnemometer.xls
Anemometer File: E:/Final Dissertation/Collected_Data/August_19_2024/iAnemometer.xls
Anemometer File: E:/Final Dissertation/Collected_Data/August_20_2024/iAnemometer.xls
Anemometer File: E:/Final Dissertation/Collected_Data/August_21_2024/iAnemometer.xls
Anemometer File: E:/Final Dissertation/Collected_Data/August_22_2024/iAnemometer.xls


In [10]:
# Build one master table per data source by merging that source's files
# from every collection day (create_master_table sorts each list in place).
pm_sensor_master_table = create_master_table(pm_sensor_files)
no2_sensor_master_table = create_master_table(no2_sensor_files)
pm_aeroequal_master_table = create_master_table(pm_aeroequal_files)
no2_aeroequal_master_table = create_master_table(no2_aeroequal_files)
anemometer_master_table = create_master_table(anemometer_files)

In [11]:
# Display the merged PM sensor master table.
pm_sensor_master_table

Unnamed: 0,Date,Time,PM1.0 ug/m3 (ultrafine particles),"PM2.5 ug/m3 (combustion particles, organic compounds, metals)","PM10 ug/m3 (dust, pollen, mould spores)",PM1.0 ug/m3 (atmos env),PM2.5 ug/m3 (atmos env),PM10 ug/m3 (atmos env),>0.3um in 0.1L air,>0.5um in 0.1L air,>1.0um in 0.1L air,>2.5um in 0.1L air,>5.0um in 0.1L air,>10um in 0.1L air,CO2 in ppm,Temperature in C,Relative Humidity
0,15-08-2024,14:38:00,3,4,4,3,4,4,516,167,30,0,0,0,544.83,30.98,48.65
1,15-08-2024,14:39:00,3,4,4,3,4,4,546,174,34,0,0,0,511.20,30.94,49.05
2,15-08-2024,14:40:00,3,4,4,3,4,4,567,179,36,0,0,0,577.91,30.77,49.73
3,15-08-2024,14:41:00,3,4,4,3,4,4,597,184,40,0,0,0,477.55,28.52,53.64
4,15-08-2024,14:42:00,3,4,6,3,4,6,597,184,40,2,2,2,487.44,27.68,55.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2962,22-08-2024,19:11:09,4,11,13,4,11,13,975,281,67,20,2,2,406.02,24.50,64.77
2963,22-08-2024,19:12:09,4,12,13,4,12,13,1017,290,79,18,2,2,405.49,24.36,65.31
2964,22-08-2024,19:13:09,3,11,12,3,11,12,1017,290,79,18,2,2,483.23,24.47,65.23
2965,22-08-2024,19:14:09,3,11,12,3,11,12,1017,290,79,18,2,2,478.44,24.44,65.13


In [12]:
# Display the merged NO2 sensor master table.
no2_sensor_master_table

Unnamed: 0,Date,Time,NO2 ppb,NO2 µg/m³
0,15-08-2024,14:33:05,0.216462,0.413983
1,15-08-2024,14:34:11,-0.239692,-0.458411
2,15-08-2024,14:35:16,0.071096,0.135970
3,15-08-2024,14:36:21,0.972681,1.860253
4,15-08-2024,14:37:27,0.096102,0.183795
...,...,...,...,...
2631,22-08-2024,19:09:58,1.146366,2.192424
2632,22-08-2024,19:11:03,1.287635,2.462603
2633,22-08-2024,19:12:09,1.029305,1.968546
2634,22-08-2024,19:13:14,1.104797,2.112924


In [13]:
# Display the merged PM Aeroqual master table.
pm_aeroequal_master_table

Unnamed: 0,Date,Time,Monitor ID,Location ID,PM10(ppm),PM2.5(ppm)
0,15-08-2024,14:05:00,1,1,0.001,0.001
1,15-08-2024,14:06:00,1,1,0.001,0.001
2,15-08-2024,14:07:00,1,1,0.000,0.000
3,15-08-2024,14:08:00,1,1,0.001,0.001
4,15-08-2024,14:09:00,1,1,0.001,0.001
...,...,...,...,...,...,...
3042,22-08-2024,19:12:00,1,1,0.000,0.000
3043,22-08-2024,19:13:00,1,1,0.004,0.001
3044,22-08-2024,19:14:00,1,1,0.002,0.001
3045,22-08-2024,19:15:00,1,1,0.002,0.001


In [14]:
# Display the merged NO2 Aeroqual master table.
no2_aeroequal_master_table

Unnamed: 0,Date,Time,Monitor ID,Location ID,NO2(ppm)
0,15-08-2024,14:05:00,2,1,0.000
1,15-08-2024,14:06:00,2,1,0.000
2,15-08-2024,14:07:00,2,1,0.000
3,15-08-2024,14:08:00,2,1,0.000
4,15-08-2024,14:09:00,2,1,0.000
...,...,...,...,...,...
3028,22-08-2024,19:12:00,2,1,0.004
3029,22-08-2024,19:13:00,2,1,0.006
3030,22-08-2024,19:14:00,2,1,0.005
3031,22-08-2024,19:15:00,2,1,0.006


In [15]:
# Display the merged anemometer master table.
anemometer_master_table

Unnamed: 0,Date,Time,Wind Value(km/h),Temp Value(°C)
0,15-08-2024,15:13:29,0.0,25.6
1,15-08-2024,15:14:29,0.0,26.0
2,15-08-2024,15:15:29,0.0,25.3
3,15-08-2024,15:16:29,0.0,25.4
4,15-08-2024,15:17:29,1.8,25.4
...,...,...,...,...
2718,22-08-2024,19:12:07,0.3,21.9
2719,22-08-2024,19:13:07,0.0,21.9
2720,22-08-2024,19:14:07,0.0,21.9
2721,22-08-2024,19:15:07,0.0,21.8


#### Save the master tables as CSV files in the `Master_Data` output directory.

In [16]:
# Output folder for the master CSV files.
# NOTE(review): to_csv does not create missing folders — presumably Master_Data already exists.
master_location = "E:/Final Dissertation/Master_Data/"

# index=False: the row index is not written, so the files start with Date, Time.
pm_sensor_master_table.to_csv(master_location + "pm_sensor_master_table.csv", index=False)
no2_sensor_master_table.to_csv(master_location + "no2_sensor_master_table.csv", index=False)
pm_aeroequal_master_table.to_csv(master_location + "pm_aeroequal_master_table.csv", index=False)
no2_aeroequal_master_table.to_csv(master_location + "no2_aeroequal_master_table.csv", index=False)
anemometer_master_table.to_csv(master_location + "anemometer_master_table.csv", index=False)

#### Rename the columns so that each name shows whether the data came from the sensors, the Aeroqual monitor, or the council, and define the final column order.

In [17]:
# Maps raw column headers to the names used in the final datasets.
# Where the same pollutant comes from several sources, the suffix of the new
# name ("_sensor", "_aeroequal", "_council") records the source.
# The insertion order of this dict also defines the column order of
# final_columns below.
column_mapping = {
    # NOTE(review): the "(ppm)" source headers are renamed to "mg/m³" for the
    # Aeroqual PM columns — confirm which unit the instrument actually logs.
    'PM2.5 ug/m3 (combustion particles, organic compounds, metals)': 'PM2.5_µg/m³_combustion_particles_organic_compounds_metals_sensor',
    'PM2.5(ppm)': 'PM2.5_mg/m³_aeroequal',
    'PM25': 'PM2.5_µg/m³_council',
    # This key contains two spaces after "ug/m3"; it must match the source header exactly.
    'PM10 ug/m3  (dust, pollen, mould spores)': 'PM10_µg/m³_dust_pollen_mould_spores_sensor',
    'PM10(ppm)': 'PM10_mg/m³_aeroequal',
    'PM10': 'PM10_µg/m³_council',
    'NO2 ppb': 'NO2_ppb_sensor',
    'NO2(ppm)': 'NO2_ppm_aeroequal',
    'NO2': 'NO2_µg/m³_council',
    'Wind Value(km/h)': 'Wind_value_km/h',
    'PM1.0 ug/m3 (ultrafine particles)': 'PM1.0_µg/m³_ultrafine_particles_sensor',
    'PM1.0 ug/m3 (atmos env)': 'PM1.0_µg/m³_atmos_env',
    'PM2.5 ug/m3 (atmos env)': 'PM2.5_µg/m³_atmos_env',
    'PM10 ug/m3 (atmos env)': 'PM10_µg/m³_atmos_env',
    '>0.3um in 0.1L air': 'GT_0.3um_in_0.1l_air',
    '>0.5um in 0.1L air': 'GT_0.5um_in_0.1l_air',
    '>1.0um in 0.1L air': 'GT_1.0um_in_0.1l_air',
    '>2.5um in 0.1L air': 'GT_2.5um_in_0.1l_air',
    '>5.0um in 0.1L air': 'GT_5.0um_in_0.1l_air',
    '>10um in 0.1L air': 'GT_10um_in_0.1l_air',
    'CO2 in ppm': 'CO2_in_ppm',
    'Temperature in C': 'Temperature_in_c',
    'Relative Humidity': 'Relative_humidity',
    'NO2 µg/m³': 'NO2_µg/m³_sensor',
    'Temp Value(°C)': 'Temp_value_°c',
    'NO': 'NO_µg/m³_council',
    'NOXasNO2': 'NOx_as_NO2_µg/m³_council',
    'NV10': 'NV10_µg/m³_council',
    'NV25': 'NV2.5_µg/m³_council',
    'V10': 'V10_µg/m³_council',
    'V25': 'V2.5_µg/m³_council'
}

# Column order of the hourly dataset: Date, Time, then every renamed column
# in the order listed above (used by perform_merge_sensor_aeroequal_council).
final_columns = ['Date', 'Time'] + list(column_mapping.values())

In [18]:
def load_and_preprocess(file):
    """Read a master CSV and return it ordered by a combined ``DateTime`` column.

    Header names are stripped of surrounding whitespace. The ``Date``
    (``%d-%m-%Y``) and ``Time`` (``%H:%M:%S``) columns are removed and
    replaced by a single ``DateTime`` column appended at the end. Rows are
    sorted by ``DateTime``, which ``pd.merge_asof`` requires of its inputs.

    Parameters
    ----------
    file : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file's remaining columns plus ``DateTime``, sorted ascending.
    """
    table = pd.read_csv(file)
    table = table.rename(columns=str.strip)

    date_text = table.pop('Date')
    time_text = table.pop('Time')
    table['DateTime'] = pd.to_datetime(date_text + ' ' + time_text, format='%d-%m-%Y %H:%M:%S')

    return table.sort_values(by='DateTime')

#### The master files are merged with a tolerance of 29 seconds, choosing the nearest available value
#### The DateTime is selected from the file with the second lowest number of records.

In [19]:
def perform_merge_sensor_aeroequal(tolerance_seconds, master_files):
    """Merge the master files on nearest timestamp into one minute-level table.

    The first file in ``master_files`` is the left frame of every merge, so
    its timestamps define the output rows. Each further file contributes the
    row whose ``DateTime`` is nearest, provided it lies within
    ``tolerance_seconds``; otherwise the cells are NaN and are then filled by
    linear interpolation (numeric columns only).

    Parameters
    ----------
    tolerance_seconds : int or float
        Largest allowed time difference, in seconds, for a match.
    master_files : list of str
        Paths of the master CSV files, read with ``load_and_preprocess``.

    Returns
    -------
    pandas.DataFrame
        ``Date``, ``Time``, the primary pollutant/wind columns, then the
        remaining columns, all renamed through ``column_mapping``.
    """
    frames = [load_and_preprocess(path) for path in master_files]
    max_gap = pd.Timedelta(seconds=tolerance_seconds)

    combined = frames[0]
    for other in frames[1:]:
        combined = pd.merge_asof(combined, other, on='DateTime', tolerance=max_gap, direction='nearest')

    # Identifier columns (including the _x/_y variants created by the merges)
    # carry no measurement and are removed.
    id_columns = [name for name in combined if 'Location ID' in name or 'Monitor ID' in name]
    combined = combined.drop(columns=id_columns)

    # Split the merge key back into separate Date and Time columns.
    stamps = combined.pop('DateTime')
    combined['Date'] = stamps.dt.date
    combined['Time'] = stamps.dt.time

    leading = [
        'Date', 'Time',
        'PM2.5 ug/m3 (combustion particles, organic compounds, metals)',
        'PM2.5(ppm)',
        'PM10 ug/m3  (dust, pollen, mould spores)',
        'PM10(ppm)',
        'NO2 ppb',
        'NO2(ppm)',
        'Wind Value(km/h)'
    ]
    trailing = [name for name in combined if name not in leading]
    ordered = combined[leading + trailing]

    def _fill_gaps(column):
        # Only numeric-kind columns are interpolated; Date/Time pass through.
        if column.dtype.kind in 'biufc':
            return column.interpolate(method='linear')
        return column

    filled = ordered.apply(_fill_gaps)
    return filled.rename(columns=column_mapping)

In [20]:
# Master tables to merge. Order matters: the first file (anemometer) is the
# left frame of every merge_asof in perform_merge_sensor_aeroequal, so its
# timestamps define the rows of the merged dataset.
directory_path = 'E:/Final Dissertation/Master_Data/'
master_files = [os.path.join(directory_path, file) for file in [
    'anemometer_master_table.csv',
    'no2_aeroequal_master_table.csv',
    'no2_sensor_master_table.csv',
    'pm_aeroequal_master_table.csv',
    'pm_sensor_master_table.csv'
]]

#### Why tolerance = 29 seconds?
#### 1. With a tolerance below 29 seconds, some of the 2723 merged records contained NaN cells, meaning no value lay close enough in time to be matched.
#### 2. A tolerance above 29 seconds made no further difference.

In [21]:
# Matching tolerance in seconds (converted with pd.Timedelta(seconds=...) inside the merge function).
tolerance = 29
final_df = perform_merge_sensor_aeroequal(tolerance, master_files)
# Write the minute-level dataset without the row index.
final_df.to_csv('E:/Final Dissertation/Final_Dataset/Final_Dataset.csv', index=False)

#### This marks the end of generating a dataset sampled every minute.
#### Although not utilized for the current problem statement, this Dataset will be helpful for future research.

#### This marks the beginning of generating a dataset sampled every hour, matching the council dataset.
#### The Date Time column from the council dataset is taken as the reference, using the same logic as above but with a tolerance of 1 hour.

In [22]:
# Load the council export and turn its "End Date" / "End Time" columns into
# an hourly DateTime-indexed table of numeric readings.
council_df = pd.read_csv('E:/Final Dissertation/Final_Dataset/2024-08-14-240826181306.csv')
# Blank End Date cells are forward-filled from the row above before parsing;
# values that do not match %d-%m-%Y become NaT (errors='coerce').
council_df['End Date'] = pd.to_datetime(council_df['End Date'].ffill(), format='%d-%m-%Y', errors='coerce')
# "24:00:00" cannot be parsed with %H, so it is rewritten as "00:00:00" ...
council_df['End Time'] = council_df['End Time'].replace('24:00:00', '00:00:00')
# ... and those rows are moved to the following day.
# NOTE(review): this also shifts rows whose End Time was already "00:00:00"
# in the file — confirm the council export never uses that value.
council_df.loc[council_df['End Time'] == '00:00:00', 'End Date'] += pd.DateOffset(days=1)
council_df['DateTime'] = pd.to_datetime(council_df['End Date'].dt.strftime('%d-%m-%Y') + ' ' + council_df['End Time'], format='%d-%m-%Y %H:%M:%S')
# Keep only the numeric columns and average them per hour.
numeric_cols = council_df.select_dtypes(include='number').columns
council_df = council_df.set_index('DateTime')[numeric_cols].resample('h').mean().reset_index()

In [23]:
def preprocess_master_df(file):
    """Read a master CSV and return its hourly means.

    Header names are stripped of surrounding whitespace, matching
    ``load_and_preprocess`` and ``generate_statistics``. Without this a
    header such as ``" Date"`` would make the ``pop`` below fail, and the
    later rename through ``column_mapping`` would miss padded names.
    ``Date`` (``%d-%m-%Y``) and ``Time`` (``%H:%M:%S``) are combined into
    ``DateTime``, which is used to resample to one row per hour.

    Parameters
    ----------
    file : str
        Path of the master CSV file.

    Returns
    -------
    pandas.DataFrame
        ``DateTime`` (start of each hour) followed by the mean of every
        remaining column over that hour. Hours without data are kept, with
        NaN values.
    """
    df = pd.read_csv(file).rename(columns=str.strip)
    df['DateTime'] = pd.to_datetime(df.pop('Date') + ' ' + df.pop('Time'), format='%d-%m-%Y %H:%M:%S')
    return df.set_index('DateTime').resample('h').mean().reset_index()

In [24]:
def perform_merge_sensor_aeroequal_council(master_dir='E:/Final Dissertation/Master_Data/', council=None):
    """Build the hourly dataset from the five master tables and the council data.

    Every master table is resampled to hourly means with
    ``preprocess_master_df`` and merged onto the anemometer table by nearest
    ``DateTime`` (tolerance 1 hour); the council frame is merged last.

    Parameters
    ----------
    master_dir : str, optional
        Folder containing the five ``*_master_table.csv`` files. Defaults to
        the location the notebook writes them to.
    council : pandas.DataFrame, optional
        Hourly council frame with a ``DateTime`` column. Defaults to the
        notebook-level ``council_df``.

    Returns
    -------
    pandas.DataFrame
        The columns listed in ``final_columns``; rows with any missing value
        are dropped.
    """
    if council is None:
        council = council_df

    # The anemometer table comes first: it is the left frame of every merge.
    master_names = [
        'anemometer_master_table.csv',
        'no2_aeroequal_master_table.csv',
        'no2_sensor_master_table.csv',
        'pm_aeroequal_master_table.csv',
        'pm_sensor_master_table.csv',
    ]
    hourly_frames = [preprocess_master_df(os.path.join(master_dir, name)) for name in master_names]

    merged_df = hourly_frames[0]
    for df in hourly_frames[1:] + [council]:
        merged_df = pd.merge_asof(merged_df, df, on='DateTime', tolerance=pd.Timedelta(hours=1), direction='nearest')

    merged_df = merged_df.rename(columns=column_mapping)

    # Drop hours in which no sensor/Aeroqual column has a value. 'DateTime'
    # is left out of the subset: it is never null, so including it meant this
    # how='all' filter could not remove any row.
    measured_columns = [col for col in merged_df.columns if '_council' not in col and col != 'DateTime']
    if measured_columns:
        merged_df = merged_df.dropna(subset=measured_columns, how='all')

    # Split the merge key back into separate Date and Time columns.
    stamps = merged_df.pop('DateTime')
    merged_df['Date'] = stamps.dt.date
    merged_df['Time'] = stamps.dt.time

    return merged_df[final_columns].dropna()

In [25]:
# Build and save the hourly dataset.
# Note: this rebinds final_df, which previously held the minute-level dataset.
final_df = perform_merge_sensor_aeroequal_council()
final_df.to_csv('E:/Final Dissertation/Final_Dataset/Final_Hourly_Dataset.csv', index=False)

In [26]:
# All renamed column names, in column_mapping order; read as a global by generate_statistics().
expected_columns = list(column_mapping.values())

In [27]:
def generate_statistics(final_df, master_files):
    """Compute descriptive statistics for a final dataset and for the master files.

    Parameters
    ----------
    final_df : pandas.DataFrame
        A final (merged) dataset whose columns already use the renamed
        headers. It is not modified.
    master_files : list of str
        Paths of the master CSV files; their headers are stripped and renamed
        through ``column_mapping`` before the statistics are computed.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(final_stats, master_stats_combined)``. Both have one row per
        column and the columns ``count, mean, std, min, 25%, 50%, 75%, max``.
        ``master_stats_combined`` is indexed by ``expected_columns``; columns
        absent from every master file are all-NaN rows.
    """
    stat_names = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']

    # Work on a copy so the caller's frame does not gain placeholder columns.
    frame = final_df.copy()
    for col in expected_columns:
        if col not in frame.columns:
            # Object-dtype placeholder: describe() skips it as long as the
            # frame has at least one numeric column.
            frame[col] = pd.NA

    # After the loop above every expected column exists in `frame`.
    present_columns = [col for col in expected_columns if col in frame.columns]

    final_stats = frame[present_columns].describe().T[stat_names]

    # Collect the per-file statistics and concatenate once at the end.
    per_file_stats = []
    for file_path in master_files:
        df = pd.read_csv(file_path).rename(columns=str.strip).rename(columns=column_mapping)

        common_columns = [col for col in present_columns if col in df.columns]
        if common_columns:
            per_file_stats.append(df[common_columns].describe().T[stat_names])

    master_stats_combined = pd.concat(per_file_stats) if per_file_stats else pd.DataFrame()
    master_stats_combined = master_stats_combined.reindex(present_columns)

    return final_stats, master_stats_combined

In [28]:
# Same master file list as used for the minute-level merge.
directory_path = 'E:/Final Dissertation/Master_Data/'

master_files = [os.path.join(directory_path, file) for file in [
    'anemometer_master_table.csv',
    'no2_aeroequal_master_table.csv',
    'no2_sensor_master_table.csv',
    'pm_aeroequal_master_table.csv',
    'pm_sensor_master_table.csv'
]]

# Reload both final datasets from disk (final_df is rebound to the minute-level file).
final_df = pd.read_csv('E:/Final Dissertation/Final_Dataset/Final_Dataset.csv')
final_hourly_df = pd.read_csv('E:/Final Dissertation/Final_Dataset/Final_Hourly_Dataset.csv')
final_stats, master_stats_combined = generate_statistics(final_df, master_files)
# The master-file statistics are identical for both calls, so the second copy is discarded.
final_hourly_stats, _ = generate_statistics(final_hourly_df, master_files)

#### This marks the end of generating a dataset sampled every hour.

##### 1. Statistics to see how the merged files are different from the master files combined.
##### 2. NaN here means the column is not available in that file.
##### 3. Council columns are not present in the master file and Final Dataset sampled every minute
##### 4. Some Sensor Columns are not present in the council file

In [29]:
# Non-null counts per column: minute-level dataset, hourly dataset, master files.
comparison_count = pd.DataFrame({
    'Final Dataset Count': final_stats['count'],
    'Final Hourly Dataset Count': final_hourly_stats['count'],
    'Master Files Combined Count': master_stats_combined['count']
})
# Show the rows in expected_columns order.
comparison_count.reindex(expected_columns)

Unnamed: 0,Final Dataset Count,Final Hourly Dataset Count,Master Files Combined Count
PM2.5_µg/m³_combustion_particles_organic_compounds_metals_sensor,2723.0,55.0,2967.0
PM2.5_mg/m³_aeroequal,2723.0,55.0,3047.0
PM2.5_µg/m³_council,,55.0,
PM10_µg/m³_dust_pollen_mould_spores_sensor,2723.0,55.0,2967.0
PM10_mg/m³_aeroequal,2723.0,55.0,3047.0
PM10_µg/m³_council,,55.0,
NO2_ppb_sensor,2723.0,55.0,2636.0
NO2_ppm_aeroequal,2723.0,55.0,3033.0
NO2_µg/m³_council,,55.0,
Wind_value_km/h,2723.0,55.0,2723.0


In [30]:
# Mean per column: minute-level dataset, hourly dataset, master files.
comparison_mean = pd.DataFrame({
    'Final Dataset Mean': final_stats['mean'],
    'Final Hourly Dataset Mean': final_hourly_stats['mean'],
    'Master Files Combined Mean': master_stats_combined['mean']
})
# Show the rows in expected_columns order.
comparison_mean.reindex(expected_columns)

Unnamed: 0,Final Dataset Mean,Final Hourly Dataset Mean,Master Files Combined Mean
PM2.5_µg/m³_combustion_particles_organic_compounds_metals_sensor,13.278186,12.738731,12.520728
PM2.5_mg/m³_aeroequal,0.001366,0.001292,0.001271
PM2.5_µg/m³_council,,8.789091,
PM10_µg/m³_dust_pollen_mould_spores_sensor,15.506243,14.935854,14.752949
PM10_mg/m³_aeroequal,0.003429,0.003314,0.003208
PM10_µg/m³_council,,17.118182,
NO2_ppb_sensor,-0.400421,-0.48583,-0.538546
NO2_ppm_aeroequal,0.001492,0.00195,0.001386
NO2_µg/m³_council,,14.023581,
Wind_value_km/h,0.269152,0.275688,0.269152


In [31]:
# Standard deviation per column: minute-level dataset, hourly dataset, master files.
comparison_std = pd.DataFrame({
    'Final Dataset Std Dev': final_stats['std'],
    'Final Hourly Dataset Std Dev': final_hourly_stats['std'],
    'Master Files Combined Std Dev': master_stats_combined['std']
}).reindex(expected_columns)
# NOTE(review): the frame was already reindexed on the line above, so this
# second reindex is redundant (harmless); the sibling cells reindex only once.
comparison_std.reindex(expected_columns)

Unnamed: 0,Final Dataset Std Dev,Final Hourly Dataset Std Dev,Master Files Combined Std Dev
PM2.5_µg/m³_combustion_particles_organic_compounds_metals_sensor,17.3755,16.939382,16.736268
PM2.5_mg/m³_aeroequal,0.001564,0.001386,0.001514
PM2.5_µg/m³_council,,7.65145,
PM10_µg/m³_dust_pollen_mould_spores_sensor,19.040585,18.538678,18.295856
PM10_mg/m³_aeroequal,0.005312,0.002926,0.005124
PM10_µg/m³_council,,9.930725,
NO2_ppb_sensor,2.4273,1.86048,2.528475
NO2_ppm_aeroequal,0.002703,0.003165,0.002795
NO2_µg/m³_council,,4.581423,
Wind_value_km/h,0.576094,0.263145,0.576094


In [32]:
# Minimum per column: minute-level dataset, hourly dataset, master files.
comparison_min = pd.DataFrame({
    'Final Dataset Min': final_stats['min'],
    'Final Hourly Dataset Min': final_hourly_stats['min'],
    'Master Files Combined Min': master_stats_combined['min']
})
# Show the rows in expected_columns order.
comparison_min.reindex(expected_columns)

Unnamed: 0,Final Dataset Min,Final Hourly Dataset Min,Master Files Combined Min
PM2.5_µg/m³_combustion_particles_organic_compounds_metals_sensor,0.0,0.75,0.0
PM2.5_mg/m³_aeroequal,0.0,8.3e-05,0.0
PM2.5_µg/m³_council,,-1.0,
PM10_µg/m³_dust_pollen_mould_spores_sensor,0.0,1.6,0.0
PM10_mg/m³_aeroequal,0.0,0.000485,0.0
PM10_µg/m³_council,,0.0,
NO2_ppb_sensor,-63.952923,-5.703395,-63.952923
NO2_ppm_aeroequal,0.0,0.0,0.0
NO2_µg/m³_council,,7.97513,
Wind_value_km/h,0.0,0.0,0.0


In [33]:
# 25th percentile per column: minute-level dataset, hourly dataset, master files.
comparison_25 = pd.DataFrame({
    'Final Dataset 25%': final_stats['25%'],
    'Final Hourly Dataset 25%': final_hourly_stats['25%'],
    'Master Files Combined 25%': master_stats_combined['25%']
})
# Show the rows in expected_columns order.
comparison_25.reindex(expected_columns)

Unnamed: 0,Final Dataset 25%,Final Hourly Dataset 25%,Master Files Combined 25%
PM2.5_µg/m³_combustion_particles_organic_compounds_metals_sensor,4.0,4.608333,4.0
PM2.5_mg/m³_aeroequal,0.0,0.000442,0.0
PM2.5_µg/m³_council,,4.15,
PM10_µg/m³_dust_pollen_mould_spores_sensor,5.0,5.798023,5.0
PM10_mg/m³_aeroequal,0.0,0.001255,0.0
PM10_µg/m³_council,,10.8,
NO2_ppb_sensor,-1.607338,-1.661391,-2.137721
NO2_ppm_aeroequal,0.0,0.0,0.0
NO2_µg/m³_council,,10.87017,
Wind_value_km/h,0.0,0.081667,0.0


In [34]:
# Median (50th percentile) per column: minute-level dataset, hourly dataset, master files.
comparison_50 = pd.DataFrame({
    'Final Dataset 50%': final_stats['50%'],
    'Final Hourly Dataset 50%': final_hourly_stats['50%'],
    'Master Files Combined 50%': master_stats_combined['50%']
})
# Show the rows in expected_columns order.
comparison_50.reindex(expected_columns)

Unnamed: 0,Final Dataset 50%,Final Hourly Dataset 50%,Master Files Combined 50%
PM2.5_µg/m³_combustion_particles_organic_compounds_metals_sensor,7.0,7.133333,7.0
PM2.5_mg/m³_aeroequal,0.001,0.0007,0.001
PM2.5_µg/m³_council,,5.9,
PM10_µg/m³_dust_pollen_mould_spores_sensor,9.0,8.25,9.0
PM10_mg/m³_aeroequal,0.001,0.0022,0.001
PM10_µg/m³_council,,14.1,
NO2_ppb_sensor,0.022329,-0.005002,-0.024992
NO2_ppm_aeroequal,0.0,0.000217,0.0
NO2_µg/m³_council,,12.76116,
Wind_value_km/h,0.0,0.181667,0.0


In [35]:
# 75th percentile per column: minute-level dataset, hourly dataset, master files.
comparison_75 = pd.DataFrame({
    'Final Dataset 75%': final_stats['75%'],
    'Final Hourly Dataset 75%': final_hourly_stats['75%'],
    'Master Files Combined 75%': master_stats_combined['75%']
})
# Show the rows in expected_columns order.
comparison_75.reindex(expected_columns)

Unnamed: 0,Final Dataset 75%,Final Hourly Dataset 75%,Master Files Combined 75%
PM2.5_µg/m³_combustion_particles_organic_compounds_metals_sensor,11.0,9.383333,10.0
PM2.5_mg/m³_aeroequal,0.002,0.001608,0.002
PM2.5_µg/m³_council,,11.1,
PM10_µg/m³_dust_pollen_mould_spores_sensor,14.0,11.641667,12.0
PM10_mg/m³_aeroequal,0.004,0.00385,0.004
PM10_µg/m³_council,,22.1,
NO2_ppb_sensor,0.743057,0.759264,0.725745
NO2_ppm_aeroequal,0.00275,0.003613,0.002
NO2_µg/m³_council,,15.347815,
Wind_value_km/h,0.3,0.459167,0.3


In [36]:
# Maximum per column: minute-level dataset, hourly dataset, master files.
comparison_max = pd.DataFrame({
    'Final Dataset Max': final_stats['max'],
    'Final Hourly Dataset Max': final_hourly_stats['max'],
    'Master Files Combined Max': master_stats_combined['max']
})
# Show the rows in expected_columns order.
comparison_max.reindex(expected_columns)

Unnamed: 0,Final Dataset Max,Final Hourly Dataset Max,Master Files Combined Max
PM2.5_µg/m³_combustion_particles_organic_compounds_metals_sensor,70.0,62.8,70.0
PM2.5_mg/m³_aeroequal,0.008,0.005183,0.008
PM2.5_µg/m³_council,,30.0,
PM10_µg/m³_dust_pollen_mould_spores_sensor,80.0,70.75,80.0
PM10_mg/m³_aeroequal,0.097,0.011383,0.097
PM10_µg/m³_council,,49.7,
NO2_ppb_sensor,12.758244,4.027257,12.758244
NO2_ppm_aeroequal,0.03,0.01695,0.031
NO2_µg/m³_council,,28.12809,
Wind_value_km/h,4.3,0.95625,4.3
