In [1]:
import os
import math
from dotenv import load_dotenv

# Explicitly providing path to '.env'
from pathlib import Path  # Python 3.6+ only
# Load .env variables
_ = load_dotenv(dotenv_path=f"{Path().resolve().parents[1]}/standalone/.env")

# with the new api
from importnb import imports
with imports("ipynb"):
    from utils import df_from_csv, to_be_checked, is_corr_ok, plot_hist, plot_boxes, plot_lines, df_to_csv, drop_pm25_outliers

PM2.5: 35.9, AQI: 102
PM2.5: 35.9, Measure Level: MeasureLevels.UNHEALTHY_FOR_SENSITIVE_GROUPS, Range Values: Min: 35.5, Max: 55.4
AQI: 102, Measure Level: MeasureLevels.UNHEALTHY_FOR_SENSITIVE_GROUPS, Range Values: Min: 101, Max: 150


## PM2.5 Clean Data

In [2]:
# Load Raw Data
df_pm25_raw = df_from_csv("pm25_raw.csv")

# Check Data Types
print(df_pm25_raw.dtypes)

df_pm25_raw.head()

TANGARA_260A    float64
TANGARA_4B1A    float64
TANGARA_14D6    float64
TANGARA_2B42    float64
TANGARA_2E9A    float64
TANGARA_2FF6    float64
TANGARA_307A    float64
TANGARA_48C6    float64
TANGARA_F1AE    float64
TANGARA_06BE    float64
dtype: object


Unnamed: 0_level_0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-11-10 00:00:00-05:00,,,,,,,,,6.0,
2023-11-10 00:00:30-05:00,8.0,6.0,6.0,,1.0,9.0,9.0,1.0,6.0,9.0
2023-11-10 00:01:00-05:00,7.0,4.0,6.0,,1.0,8.0,9.0,1.0,5.0,9.0
2023-11-10 00:01:30-05:00,8.0,5.0,7.0,,0.0,8.0,9.0,2.0,5.0,9.0
2023-11-10 00:02:00-05:00,7.0,3.0,6.0,,0.0,8.0,8.0,2.0,5.0,9.0


## Descriptive Statistics

In [3]:
# Describe Data
df_pm25_raw.describe()

Unnamed: 0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
count,5615.0,5663.0,1039.0,4226.0,5679.0,5692.0,5599.0,5511.0,5734.0,5388.0
mean,8.179163,9.315027,9.115496,6.977047,6.701004,7.5,8.666548,6.022319,7.931287,9.582962
std,4.535844,4.566012,3.431429,5.910071,6.001325,4.51902,5.084152,3.974745,4.72142,5.822508
min,1.0,1.0,3.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
25%,4.0,7.0,6.0,2.0,2.0,5.0,7.0,3.0,5.0,6.0
50%,8.0,9.0,9.0,7.0,5.0,7.0,8.0,6.0,7.0,9.0
75%,11.0,11.0,12.0,10.0,10.0,9.25,10.0,8.0,9.0,12.0
max,27.0,68.0,23.0,32.0,48.0,46.0,173.0,32.0,73.0,94.0


## Missing Data

In [4]:
import warnings
warnings.filterwarnings('ignore')

# Missing Data Threshold
threshold = 90
print(f'Threshold: {threshold}%')

# For each Tangara sensor
for id_tangara_sensor in df_pm25_raw.columns:
    df_pm25_sensor = df_pm25_raw[[id_tangara_sensor]]
    # To be checked
    is_ok, data_percent, missing_data_percent = to_be_checked(df_pm25_sensor, threshold)
    if not is_ok:
        print(f"Tangara Sensor: {id_tangara_sensor}, Data: {data_percent}%, Missing: {missing_data_percent}%, To be checked")

Threshold: 90%
Tangara Sensor: TANGARA_14D6, Data: 18%, Missing: 82%, To be checked
Tangara Sensor: TANGARA_2B42, Data: 73%, Missing: 27%, To be checked


## Data Correlation

In [5]:
# Data Correlation Threshold
threshold = 0.9
print(f'Threshold: {threshold}%')

ID_REFE_TANGARA_SENSOR='TANGARA_06BE'
ID_TARG_TANGARA_SENSOR='TANGARA_2FF6'
# Reference Tangara Sensor
df_reference_sensor = df_pm25_raw[ID_REFE_TANGARA_SENSOR]
# Target Tangara Sensor
df_target_sensor = df_pm25_raw[ID_TARG_TANGARA_SENSOR]

# To be checked
if not math.isnan(df_reference_sensor.std()):
    is_ok, corr = is_corr_ok(df_reference_sensor, df_target_sensor, threshold)
    if not is_ok:
        print(f"Reference Tangara Sensor: {ID_REFE_TANGARA_SENSOR}, Target Tangara Sensor: {ID_TARG_TANGARA_SENSOR}, Correlation: {corr}, To be checked")

# For each Tangara sensor
# for id_tangara_sensor in df_pm25_raw.columns:
#     # Target Tangara Sensor
#     df_target_sensor = df_pm25_raw[id_tangara_sensor]
#     # To be checked
#     is_ok, corr = is_corr_ok(df_reference_sensor, df_target_sensor, threshold)
#     if not is_ok:
#         print(f"Reference Tangara Sensor: {ID_REFE_TANGARA_SENSOR}, Target Tangara Sensor: {id_tangara_sensor}, Correlation: {corr}, To be checked")

Threshold: 0.9%
Reference Tangara Sensor: TANGARA_06BE, Target Tangara Sensor: TANGARA_2FF6, Correlation: 0.0, To be checked


## Histograms

In [6]:
# Plot Histograms
plot_hist(df_pm25_raw) if bool(os.getenv("PLOT_CHARTS", None)) else print("PLOT_CHARTS:", False, "Plot charts were ignored")

PLOT_CHARTS: False Plot charts were ignored


## Boxplots

In [7]:
# Plot Boxplots
plot_boxes(df_pm25_raw) if bool(os.getenv("PLOT_CHARTS", None)) else print("PLOT_CHARTS:", False, "Plot charts were ignored")

PLOT_CHARTS: False Plot charts were ignored


## Lineplots

In [8]:
# Plot Lineplots
plot_lines(df_pm25_raw) if bool(os.getenv("PLOT_CHARTS", None)) else print("PLOT_CHARTS:", False, "Plot charts were ignored")

PLOT_CHARTS: False Plot charts were ignored


## Drop PM2.5 Outliers

In [9]:
# Drop PM2.5 Outliers
df_pm25_clean, resume = drop_pm25_outliers(df_pm25_raw)

In [10]:
# Describe Data
df_pm25_raw.describe()

Unnamed: 0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
count,5541.0,5427.0,1038.0,4079.0,5467.0,5340.0,5211.0,5323.0,5355.0,5127.0
mean,7.969681,8.674774,9.102119,6.257906,5.95043,6.739326,7.794089,5.600413,7.01718,8.779598
std,4.182716,2.988458,3.405874,4.583022,4.692591,3.42848,2.66822,3.309623,2.961981,4.429641
min,1.0,1.0,3.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
25%,4.0,7.0,6.0,1.0,2.0,5.0,6.0,3.0,5.0,6.0
50%,8.0,9.0,9.0,7.0,5.0,7.0,8.0,5.0,7.0,9.0
75%,10.0,10.0,12.0,9.0,9.0,9.0,9.0,8.0,9.0,11.0
max,21.0,17.0,20.0,22.0,22.0,15.0,14.0,15.0,15.0,21.0


In [11]:
# Describe Data
df_pm25_clean.describe()

Unnamed: 0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
count,5541.0,5427.0,1038.0,4079.0,5467.0,5340.0,5211.0,5323.0,5355.0,5127.0
mean,7.969681,8.674774,9.102119,6.257906,5.95043,6.739326,7.794089,5.600413,7.01718,8.779598
std,4.182716,2.988458,3.405874,4.583022,4.692591,3.42848,2.66822,3.309623,2.961981,4.429641
min,1.0,1.0,3.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
25%,4.0,7.0,6.0,1.0,2.0,5.0,6.0,3.0,5.0,6.0
50%,8.0,9.0,9.0,7.0,5.0,7.0,8.0,5.0,7.0,9.0
75%,10.0,10.0,12.0,9.0,9.0,9.0,9.0,8.0,9.0,11.0
max,21.0,17.0,20.0,22.0,22.0,15.0,14.0,15.0,15.0,21.0


## Histograms

In [12]:
# Plot Histograms
plot_hist(df_pm25_clean) if bool(os.getenv("PLOT_CHARTS", None)) else print("PLOT_CHARTS:", False, "Plot charts were ignored")

PLOT_CHARTS: False Plot charts were ignored


## Boxplots

In [13]:
# Plot Boxplots
plot_boxes(df_pm25_clean) if bool(os.getenv("PLOT_CHARTS", None)) else print("PLOT_CHARTS:", False, "Plot charts were ignored")

PLOT_CHARTS: False Plot charts were ignored


## Lineplots

In [14]:
# Plot Lineplots
plot_lines(df_pm25_clean) if bool(os.getenv("PLOT_CHARTS", None)) else print("PLOT_CHARTS:", False, "Plot charts were ignored")

PLOT_CHARTS: False Plot charts were ignored


In [15]:
# Save PM2.5 Data Frame Sensors into CSV file
df_to_csv(df_pm25_clean, "pm25_clean.csv", datafolder='1_clean')