In [1]:
import os
import pandas as pd
from dotenv import load_dotenv

# Explicitly providing path to '.env'
from pathlib import Path  # Python 3.6+ only
# Load .env variables
_ = load_dotenv(dotenv_path=f"{Path().resolve().parents[1]}/standalone/.env")

# with the new api
from importnb import imports
with imports("ipynb"):
    from utils import df_from_csv, df_pm25_to_df_aqi, plot_hist, plot_boxes, plot_lines, df_to_csv

PM2.5: 35.9, AQI: 102
PM2.5: 35.9, Measure Level: MeasureLevels.UNHEALTHY_FOR_SENSITIVE_GROUPS, Range Values: Min: 35.5, Max: 55.4
AQI: 102, Measure Level: MeasureLevels.UNHEALTHY_FOR_SENSITIVE_GROUPS, Range Values: Min: 101, Max: 150


## PM2.5 to AQI

In [2]:
# Load Raw Data
df_pm25_clean = df_from_csv("pm25_clean.csv", datafolder="1_clean")

# Check Data Types
print(df_pm25_clean.dtypes)

# Sorting by Descending
df_pm25_clean = df_pm25_clean.sort_values(by='DATETIME', ascending=False)

df_pm25_clean.head()

TANGARA_260A    float64
TANGARA_4B1A    float64
TANGARA_14D6    float64
TANGARA_2B42    float64
TANGARA_2E9A    float64
TANGARA_2FF6    float64
TANGARA_307A    float64
TANGARA_48C6    float64
TANGARA_F1AE    float64
TANGARA_06BE    float64
dtype: object


Unnamed: 0_level_0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-11-10 23:59:30-05:00,,,,8.0,,,,,7.0,
2023-11-10 23:59:00-05:00,,,,7.0,,,,,7.0,
2023-11-10 23:58:30-05:00,6.0,6.0,,7.0,3.0,6.0,7.0,,6.0,6.0
2023-11-10 23:58:00-05:00,6.0,6.0,,6.0,3.0,5.0,7.0,7.0,7.0,7.0
2023-11-10 23:57:30-05:00,6.0,6.0,,7.0,2.0,5.0,6.0,6.0,6.0,7.0


## Descriptive Statistics

In [3]:
# Describe Data
df_pm25_clean.describe()

Unnamed: 0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
count,5490.0,5037.0,3651.0,4175.0,5550.0,5679.0,3142.0,5266.0,5616.0,5434.0
mean,9.08306,9.471511,11.036154,7.267784,6.025946,7.367142,7.475812,5.734333,9.244302,9.391976
std,4.279498,3.08778,4.265543,4.366055,3.948631,3.109358,2.447387,3.013592,4.25929,3.870799
min,1.0,2.0,2.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
25%,6.0,7.0,8.0,3.0,3.0,5.0,6.0,4.0,6.0,6.0
50%,9.0,9.0,11.0,8.0,6.0,8.0,8.0,6.0,8.0,10.0
75%,12.0,11.0,13.0,10.0,9.0,9.0,9.0,7.0,11.0,12.0
max,21.0,18.0,23.0,20.0,18.0,15.0,13.0,14.0,21.0,21.0


## Group by hour on DATETIME and resample by mean

In [4]:
# Resample mean grouping by hour on DATETIME
df_pm25_clean = df_pm25_clean.resample('H').mean()

# Sorting by Descending
df_pm25_clean = df_pm25_clean.sort_values(by='DATETIME', ascending=False)

df_pm25_clean.head()

Unnamed: 0_level_0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-11-10 23:00:00-05:00,3.974359,8.061404,,6.196581,2.771186,6.568966,6.457627,5.149123,8.05,6.632653
2023-11-10 22:00:00-05:00,6.983333,9.376147,,8.152542,1.403361,6.596639,6.091667,5.45614,6.983193,7.65
2023-11-10 21:00:00-05:00,8.183333,8.289474,,8.691667,4.210084,8.621849,8.87395,6.025862,7.042373,10.596774
2023-11-10 20:00:00-05:00,2.815126,6.808696,,4.208333,1.621849,5.470588,7.394958,1.956897,4.428571,6.339286
2023-11-10 19:00:00-05:00,2.737288,6.788136,,0.983333,1.655172,1.271186,4.330435,1.964912,5.083333,4.181034


## Descriptive Statistics

In [5]:
# Describe Data
df_pm25_clean.describe()

Unnamed: 0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
count,48.0,48.0,33.0,36.0,48.0,48.0,28.0,48.0,48.0,48.0
mean,9.213977,9.751358,11.269893,7.317851,6.17368,7.36869,7.536832,6.039377,9.320803,9.444804
std,3.973722,2.385494,3.840427,4.113254,3.876135,2.919606,2.180317,2.954708,3.724707,3.550128
min,2.737288,4.713043,4.598291,0.983333,1.043478,1.233333,2.65,1.241071,3.7,2.444444
25%,6.053099,8.258418,8.321739,3.032372,2.727962,5.768124,6.366137,4.15,6.405983,6.606308
50%,9.484249,9.316444,11.08547,8.811351,5.547322,8.246342,7.807959,6.025975,8.645833,10.098095
75%,12.141725,11.100932,12.854701,10.1625,8.695431,9.083333,9.460644,7.228261,11.402331,11.440395
max,18.847826,16.393939,20.770833,15.387931,14.711864,12.610169,10.408163,13.666667,19.468468,16.421053


## Movil 24h, Rolling Function in Pandas

In [6]:
# Movil 24h, Rolling Function in Pandas
df_pm25_clean = df_pm25_clean.rolling(window=24, min_periods=1).mean()

df_pm25_clean.head()

Unnamed: 0_level_0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-11-10 23:00:00-05:00,3.974359,8.061404,,6.196581,2.771186,6.568966,6.457627,5.149123,8.05,6.632653
2023-11-10 22:00:00-05:00,5.478846,8.718775,,7.174562,2.087274,6.582802,6.274647,5.302632,7.516597,7.141327
2023-11-10 21:00:00-05:00,6.380342,8.575675,,7.680263,2.794877,7.262484,7.141081,5.543708,7.358522,8.293142
2023-11-10 20:00:00-05:00,5.489038,8.13393,,6.812281,2.50162,6.81451,7.20455,4.647005,6.626034,7.804678
2023-11-10 19:00:00-05:00,4.938688,7.864771,,5.646491,2.332331,5.705846,6.629727,4.110587,6.317494,7.079949


## Descriptive Statistics

In [7]:
# Describe Data
df_pm25_clean.describe()

Unnamed: 0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
count,48.0,48.0,33.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0
mean,7.677416,9.123008,10.432024,6.141946,5.151956,6.546777,7.360895,5.221846,8.276677,8.360478
std,1.812189,0.561562,1.165019,1.163092,1.589437,1.079799,0.766682,0.878616,1.220066,1.343623
min,3.974359,7.864771,8.912329,3.717944,2.087274,4.018218,5.368686,3.387599,5.737711,5.471387
25%,6.374741,8.872082,9.397904,5.68435,4.077021,6.292905,6.988789,4.968556,7.645941,7.554775
50%,7.891048,9.09242,10.118687,5.801493,6.062164,6.622392,7.497641,5.250698,8.402952,8.488849
75%,8.919722,9.321261,11.370437,6.802966,6.173943,7.306791,7.915452,5.521654,8.907168,9.342155
max,10.769881,10.587079,12.552707,8.241135,6.578755,8.353206,8.40409,6.890537,10.549598,10.729182


## Filter just the last 24 hours

In [8]:
# Sorting by Ascending
df_pm25_clean = df_pm25_clean.sort_values(by='DATETIME', ascending=True)

# Filter just the las 24 hours
df_pm25_clean = df_pm25_clean.loc[df_pm25_clean.index >= df_pm25_clean.index[0] + pd.Timedelta(hours=24)]

df_pm25_clean.head()

Unnamed: 0_level_0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-11-10 00:00:00-05:00,7.658073,8.915637,9.154996,5.68435,5.768606,6.384174,7.39468,5.188217,8.092009,8.160426
2023-11-10 01:00:00-05:00,7.75245,9.098359,9.297234,5.68435,5.958034,6.381348,7.432788,5.169558,8.148545,8.219283
2023-11-10 02:00:00-05:00,7.865441,9.086482,9.968512,5.68435,6.055368,6.437476,7.50105,5.21679,8.350752,8.31675
2023-11-10 03:00:00-05:00,7.916656,9.108252,10.557897,5.68435,6.105624,6.46471,7.57874,5.242021,8.455153,8.401963
2023-11-10 04:00:00-05:00,8.000406,9.066273,11.303664,5.68435,6.193678,6.50659,7.549918,5.296296,8.593165,8.480395


In [9]:
df_pm25_clean.tail()

Unnamed: 0_level_0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-11-10 19:00:00-05:00,4.938688,7.864771,,5.646491,2.332331,5.705846,6.629727,4.110587,6.317494,7.079949
2023-11-10 20:00:00-05:00,5.489038,8.13393,,6.812281,2.50162,6.81451,7.20455,4.647005,6.626034,7.804678
2023-11-10 21:00:00-05:00,6.380342,8.575675,,7.680263,2.794877,7.262484,7.141081,5.543708,7.358522,8.293142
2023-11-10 22:00:00-05:00,5.478846,8.718775,,7.174562,2.087274,6.582802,6.274647,5.302632,7.516597,7.141327
2023-11-10 23:00:00-05:00,3.974359,8.061404,,6.196581,2.771186,6.568966,6.457627,5.149123,8.05,6.632653


## Descriptive Statistics

In [10]:
# Describe Data
df_pm25_clean.describe()

Unnamed: 0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
count,24.0,24.0,9.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0
mean,6.301586,8.78307,11.057404,5.49301,4.127104,5.778728,6.812579,4.651466,7.428023,7.328684
std,1.374652,0.390634,1.351694,0.974263,1.712088,0.960349,0.666732,0.701735,0.979198,1.02503
min,3.974359,7.864771,9.154996,3.717944,2.087274,4.018218,5.368686,3.387599,5.737711,5.471387
25%,4.93856,8.664848,9.968512,5.005208,2.353785,5.005458,6.411882,4.095744,6.548899,6.551331
50%,6.36914,8.900934,11.303664,5.68435,3.929954,6.25087,6.930516,4.939917,7.6325,7.452665
75%,7.681667,9.089451,12.230769,5.68435,5.971742,6.489395,7.41495,5.223098,8.180407,8.293653
max,8.04057,9.296463,12.552707,7.680263,6.245353,7.262484,7.57874,5.543708,8.593165,8.525531


## PM2.5 to AQI

In [11]:
# AQI DataFrame Sensors
df_aqi = df_pm25_to_df_aqi(df_pm25_clean)
df_aqi = df_aqi.astype('float')
df_aqi.head()

Unnamed: 0_level_0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-11-10 00:00:00-05:00,33.0,38.0,39.0,24.0,25.0,27.0,31.0,22.0,34.0,35.0
2023-11-10 01:00:00-05:00,33.0,38.0,39.0,24.0,25.0,27.0,31.0,22.0,34.0,35.0
2023-11-10 02:00:00-05:00,33.0,38.0,42.0,24.0,26.0,27.0,32.0,22.0,36.0,35.0
2023-11-10 03:00:00-05:00,33.0,38.0,45.0,24.0,26.0,28.0,32.0,22.0,36.0,36.0
2023-11-10 04:00:00-05:00,34.0,38.0,48.0,24.0,26.0,28.0,32.0,23.0,36.0,36.0


## Describe Statistics

In [12]:
# Describe Data
df_aqi.describe()

Unnamed: 0,TANGARA_260A,TANGARA_4B1A,TANGARA_14D6,TANGARA_2B42,TANGARA_2E9A,TANGARA_2FF6,TANGARA_307A,TANGARA_48C6,TANGARA_F1AE,TANGARA_06BE
count,24.0,24.0,9.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0
mean,26.708333,36.958333,46.666667,23.25,17.666667,24.625,29.0,19.958333,31.416667,31.125
std,5.78964,1.627993,5.612486,4.234948,7.093577,4.084036,2.68652,2.881412,4.179834,4.347038
min,17.0,33.0,39.0,16.0,9.0,17.0,23.0,15.0,24.0,23.0
25%,21.0,36.75,42.0,21.0,10.0,21.0,27.75,18.0,27.75,27.75
50%,27.0,38.0,48.0,24.0,17.0,26.5,29.5,21.0,32.5,32.0
75%,33.0,38.0,52.0,24.0,25.0,28.0,31.0,22.0,34.25,35.0
max,34.0,39.0,53.0,33.0,26.0,31.0,32.0,23.0,36.0,36.0


## Histograms

In [13]:
# Plot Histograms
plot_hist(df_aqi) if bool(os.getenv("PLOT_CHARTS", None)) else print("PLOT_CHARTS:", False, "Plot charts were ignored")

PLOT_CHARTS: False Plot charts were ignored


## Boxplots

In [14]:
# Plot Boxplots
plot_boxes(df_aqi) if bool(os.getenv("PLOT_CHARTS", None)) else print("PLOT_CHARTS:", False, "Plot charts were ignored")

PLOT_CHARTS: False Plot charts were ignored


## Lineplots

In [15]:
# Plot Lineplots
plot_lines(df_aqi) if bool(os.getenv("PLOT_CHARTS", None)) else print("PLOT_CHARTS:", False, "Plot charts were ignored")

PLOT_CHARTS: False Plot charts were ignored


In [16]:
# Save AQI Data Frame Sensors into CSV file
df_to_csv(df_aqi, "aqi.csv", datafolder='2_features')