In [1]:
import os
import math
from dotenv import load_dotenv

# Explicitly providing path to '.env'
from pathlib import Path  # Python 3.6+ only
# Load .env variables
_ = load_dotenv(dotenv_path=f"{Path().resolve().parents[1]}/standalone/.env")

# with the new api
from importnb import imports
with imports("ipynb"):
    from utils import df_from_csv, to_be_checked, is_corr_ok, plot_hist, plot_boxes, plot_lines, df_to_csv, drop_pm25_outliers

PM2.5: 35.9, AQI: 102
PM2.5: 35.9, Measure Level: MeasureLevels.UNHEALTHY_FOR_SENSITIVE_GROUPS, Range Values: Min: 35.5, Max: 55.4
AQI: 102, Measure Level: MeasureLevels.UNHEALTHY_FOR_SENSITIVE_GROUPS, Range Values: Min: 101, Max: 150


## PM2.5 Clean Data

In [2]:
# Load Raw Data
df_pm25_raw = df_from_csv("pm25_raw.csv")

# Check Data Types
print(df_pm25_raw.dtypes)

df_pm25_raw.head()

TANGARA_260A    float64
TANGARA_2BDE    float64
TANGARA_39D6    float64
TANGARA_3B7E    float64
TANGARA_3BEA    float64
TANGARA_421A    float64
TANGARA_5D62    float64
TANGARA_5636    float64
TANGARA_D282    float64
TANGARA_4B1A    float64
TANGARA_14D6    float64
TANGARA_2B42    float64
TANGARA_2E9A    float64
TANGARA_2FF6    float64
TANGARA_307A    float64
TANGARA_48C6    float64
TANGARA_F1AE    float64
TANGARA_06BE    float64
dtype: object


Unnamed: 0_level_0,TANGARA_260A,TANGARA_2BDE,TANGARA_39D6,TANGARA_3B7E,TANGARA_3BEA,TANGARA_421A,TANGARA_5D62,TANGARA_5636,TANGARA_D282,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2023-11-17 00:00:00-05:00,,,,,,,,,,,,10.0,,,,,8.0,
2023-11-17 00:00:30-05:00,11.0,,,,,,,,,4.0,7.0,10.0,5.0,9.0,10.0,4.0,7.0,9.0
2023-11-17 00:01:00-05:00,11.0,,,,,,,,,4.0,9.0,10.0,7.0,8.0,10.0,8.0,7.0,9.0
2023-11-17 00:01:30-05:00,9.0,,,,,,,,,4.0,10.0,11.0,11.0,7.0,10.0,6.0,8.0,8.0
2023-11-17 00:02:00-05:00,9.0,,,,,,,,,5.0,9.0,11.0,10.0,8.0,10.0,5.0,7.0,8.0


## Descriptive Statistics

In [3]:
# Describe Data
df_pm25_raw.describe()

Unnamed: 0,TANGARA_260A,TANGARA_2BDE,TANGARA_39D6,TANGARA_3B7E,TANGARA_3BEA,TANGARA_421A,TANGARA_5D62,TANGARA_5636,TANGARA_D282,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
count,5727.0,1359.0,1.0,1292.0,1376.0,1425.0,2175.0,1304.0,1435.0,5726.0,5665.0,4978.0,4810.0,5718.0,5711.0,5733.0,5739.0,5715.0
mean,8.64938,5.729213,19.0,4.528638,4.756541,4.750877,5.322299,6.046779,5.514286,11.699616,9.561342,8.166533,7.926403,7.883176,9.531956,7.238095,8.528664,9.628696
std,5.12304,3.547318,,2.849156,3.045765,3.036131,4.110204,3.497411,3.319195,16.372879,5.945327,5.15153,5.814738,4.034657,13.733579,7.289115,7.589274,4.943717
min,1.0,1.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
25%,5.0,3.0,19.0,2.0,3.0,3.0,3.0,3.0,3.0,4.0,5.0,5.0,3.0,5.0,5.0,3.0,4.0,6.0
50%,8.0,5.0,19.0,4.0,4.0,4.0,4.0,5.0,5.0,8.0,9.0,8.0,7.0,8.0,8.0,6.0,7.0,9.0
75%,11.0,7.0,19.0,6.0,6.0,6.0,6.0,8.0,7.0,15.0,13.0,10.0,11.0,10.0,10.0,9.0,11.0,12.0
max,33.0,18.0,19.0,18.0,18.0,17.0,63.0,17.0,18.0,310.0,57.0,95.0,37.0,22.0,339.0,195.0,264.0,61.0


## Missing Data

In [4]:
import warnings
warnings.filterwarnings('ignore')

# Missing Data Threshold
threshold = 90
print(f'Threshold: {threshold}%')

# For each Tangara sensor
for id_tangara_sensor in df_pm25_raw.columns:
    df_pm25_sensor = df_pm25_raw[[id_tangara_sensor]]
    # To be checked
    is_ok, data_percent, missing_data_percent = to_be_checked(df_pm25_sensor, threshold)
    if not is_ok:
        print(f"Tangara Sensor: {id_tangara_sensor}, Data: {data_percent}%, Missing: {missing_data_percent}%, To be checked")

Threshold: 90%
Tangara Sensor: TANGARA_2BDE, Data: 24%, Missing: 76%, To be checked
Tangara Sensor: TANGARA_39D6, Data: 0%, Missing: 100%, To be checked
Tangara Sensor: TANGARA_3B7E, Data: 22%, Missing: 78%, To be checked
Tangara Sensor: TANGARA_3BEA, Data: 24%, Missing: 76%, To be checked
Tangara Sensor: TANGARA_421A, Data: 25%, Missing: 75%, To be checked
Tangara Sensor: TANGARA_5D62, Data: 38%, Missing: 62%, To be checked
Tangara Sensor: TANGARA_5636, Data: 23%, Missing: 77%, To be checked
Tangara Sensor: TANGARA_D282, Data: 25%, Missing: 75%, To be checked
Tangara Sensor: TANGARA_2B42, Data: 86%, Missing: 14%, To be checked
Tangara Sensor: TANGARA_2E9A, Data: 84%, Missing: 16%, To be checked


## Data Correlation

In [5]:
# Data Correlation Threshold
threshold = 0.9
print(f'Threshold: {threshold}%')

ID_REFE_TANGARA_SENSOR='TANGARA_06BE'
ID_TARG_TANGARA_SENSOR='TANGARA_2FF6'
# Reference Tangara Sensor
df_reference_sensor = df_pm25_raw[ID_REFE_TANGARA_SENSOR]
# Target Tangara Sensor
df_target_sensor = df_pm25_raw[ID_TARG_TANGARA_SENSOR]

# To be checked
if not math.isnan(df_reference_sensor.std()):
    is_ok, corr = is_corr_ok(df_reference_sensor, df_target_sensor, threshold)
    if not is_ok:
        print(f"Reference Tangara Sensor: {ID_REFE_TANGARA_SENSOR}, Target Tangara Sensor: {ID_TARG_TANGARA_SENSOR}, Correlation: {corr}, To be checked")

# For each Tangara sensor
# for id_tangara_sensor in df_pm25_raw.columns:
#     # Target Tangara Sensor
#     df_target_sensor = df_pm25_raw[id_tangara_sensor]
#     # To be checked
#     is_ok, corr = is_corr_ok(df_reference_sensor, df_target_sensor, threshold)
#     if not is_ok:
#         print(f"Reference Tangara Sensor: {ID_REFE_TANGARA_SENSOR}, Target Tangara Sensor: {id_tangara_sensor}, Correlation: {corr}, To be checked")

Threshold: 0.9%
Reference Tangara Sensor: TANGARA_06BE, Target Tangara Sensor: TANGARA_2FF6, Correlation: 0.0, To be checked


## Histograms

In [6]:
# Plot Histograms
plot_hist(df_pm25_raw) if bool(os.getenv("PLOT_CHARTS", None)) else print("PLOT_CHARTS:", False, "Plot charts were ignored")

PLOT_CHARTS: False Plot charts were ignored


## Boxplots

In [7]:
# Plot Boxplots
plot_boxes(df_pm25_raw) if bool(os.getenv("PLOT_CHARTS", None)) else print("PLOT_CHARTS:", False, "Plot charts were ignored")

PLOT_CHARTS: False Plot charts were ignored


## Lineplots

In [8]:
# Plot Lineplots
plot_lines(df_pm25_raw) if bool(os.getenv("PLOT_CHARTS", None)) else print("PLOT_CHARTS:", False, "Plot charts were ignored")

PLOT_CHARTS: False Plot charts were ignored


## Drop PM2.5 Outliers

In [9]:
# Drop PM2.5 Outliers
df_pm25_clean, resume = drop_pm25_outliers(df_pm25_raw)

In [10]:
# Describe Data
df_pm25_raw.describe()

Unnamed: 0,TANGARA_260A,TANGARA_2BDE,TANGARA_39D6,TANGARA_3B7E,TANGARA_3BEA,TANGARA_421A,TANGARA_5D62,TANGARA_5636,TANGARA_D282,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
count,5592.0,1288.0,1.0,1267.0,1285.0,1314.0,2045.0,1294.0,1400.0,5544.0,5557.0,4683.0,4742.0,5585.0,5435.0,5527.0,5582.0,5562.0
mean,8.286302,5.20264,19.0,4.321231,4.17821,4.08828,4.605379,5.967543,5.275,9.75974,9.175814,7.331839,7.671447,7.623456,7.764121,6.363307,7.907381,9.238403
std,4.591809,2.810253,,2.447919,2.149076,2.051834,2.018086,3.391979,2.983529,6.905015,5.26988,3.873654,5.43897,3.707354,3.385044,4.104509,4.585627,4.35132
min,1.0,1.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
25%,5.0,3.0,19.0,2.0,3.0,3.0,3.0,3.0,3.0,4.0,5.0,5.0,3.0,5.0,5.0,3.0,4.0,6.0
50%,8.0,4.0,19.0,4.0,4.0,3.0,4.0,5.0,4.0,8.0,9.0,8.0,7.0,8.0,8.0,6.0,7.0,9.0
75%,11.0,7.0,19.0,6.0,5.0,5.0,6.0,8.0,7.0,14.0,12.0,10.0,10.0,10.0,10.0,9.0,11.0,12.0
max,20.0,13.0,19.0,12.0,10.0,10.0,10.0,15.0,13.0,31.0,25.0,17.0,23.0,17.0,17.0,18.0,21.0,21.0


In [11]:
# Describe Data
df_pm25_clean.describe()

Unnamed: 0,TANGARA_260A,TANGARA_2BDE,TANGARA_39D6,TANGARA_3B7E,TANGARA_3BEA,TANGARA_421A,TANGARA_5D62,TANGARA_5636,TANGARA_D282,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
count,5592.0,1288.0,1.0,1267.0,1285.0,1314.0,2045.0,1294.0,1400.0,5544.0,5557.0,4683.0,4742.0,5585.0,5435.0,5527.0,5582.0,5562.0
mean,8.286302,5.20264,19.0,4.321231,4.17821,4.08828,4.605379,5.967543,5.275,9.75974,9.175814,7.331839,7.671447,7.623456,7.764121,6.363307,7.907381,9.238403
std,4.591809,2.810253,,2.447919,2.149076,2.051834,2.018086,3.391979,2.983529,6.905015,5.26988,3.873654,5.43897,3.707354,3.385044,4.104509,4.585627,4.35132
min,1.0,1.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
25%,5.0,3.0,19.0,2.0,3.0,3.0,3.0,3.0,3.0,4.0,5.0,5.0,3.0,5.0,5.0,3.0,4.0,6.0
50%,8.0,4.0,19.0,4.0,4.0,3.0,4.0,5.0,4.0,8.0,9.0,8.0,7.0,8.0,8.0,6.0,7.0,9.0
75%,11.0,7.0,19.0,6.0,5.0,5.0,6.0,8.0,7.0,14.0,12.0,10.0,10.0,10.0,10.0,9.0,11.0,12.0
max,20.0,13.0,19.0,12.0,10.0,10.0,10.0,15.0,13.0,31.0,25.0,17.0,23.0,17.0,17.0,18.0,21.0,21.0


## Histograms

In [12]:
# Plot Histograms
plot_hist(df_pm25_clean) if bool(os.getenv("PLOT_CHARTS", None)) else print("PLOT_CHARTS:", False, "Plot charts were ignored")

PLOT_CHARTS: False Plot charts were ignored


## Boxplots

In [13]:
# Plot Boxplots
plot_boxes(df_pm25_clean) if bool(os.getenv("PLOT_CHARTS", None)) else print("PLOT_CHARTS:", False, "Plot charts were ignored")

PLOT_CHARTS: False Plot charts were ignored


## Lineplots

In [14]:
# Plot Lineplots
plot_lines(df_pm25_clean) if bool(os.getenv("PLOT_CHARTS", None)) else print("PLOT_CHARTS:", False, "Plot charts were ignored")

PLOT_CHARTS: False Plot charts were ignored


In [15]:
# Save PM2.5 Data Frame Sensors into CSV file
df_to_csv(df_pm25_clean, "pm25_clean.csv", datafolder='1_clean')