In [1]:
import os
import pandas as pd
from babel.dates import format_datetime
import holidays
from dotenv import load_dotenv

# Explicitly providing path to '.env': it is looked up in the parent of the
# current working directory (Path().resolve().parents[0]).
from pathlib import Path  # Python 3.6+ only
# Load .env variables; the result is bound to `_` so the cell shows no output.
_ = load_dotenv(dotenv_path=f"{Path().resolve().parents[0]}/.env")

# with the new api
from importnb import imports
with imports("ipynb"):
    from utils import df_to_csv, df_from_csv, to_file_name

In [2]:
# Year of the extract; None when the variable is not defined
YEAR = os.environ.get("YEAR")
# Start of the period, ISO 8601 date-time, TZ='America/Bogota' -05:00
START_ISO8601_DATETIME = os.environ.get("START_ISO8601_DATETIME")
# End of the period, ISO 8601 date-time, TZ='America/Bogota' -05:00
END_ISO8601_DATETIME = os.environ.get("END_ISO8601_DATETIME")

# Datasets

## Tangara Sensors

In [3]:
# Tangaras from CSV: build the file name for the "tangaras" dataset over the
# configured start/end period, then load it.
file_name = to_file_name("tangaras", START_ISO8601_DATETIME, END_ISO8601_DATETIME)
# NOTE(review): dtindex=False presumably keeps a plain integer index rather than a datetime index — confirm in utils.
df_tangaras = df_from_csv(file_name, dtindex=False)
# Quick overview: total number of cells (rows x columns), schema, summary statistics.
print(f"Size: {df_tangaras.size}")
df_tangaras.info()
df_tangaras.describe()

[32m2024-09-20 18:26:27.493[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mto_file_name[0m:[36m448[0m - [34m[1mRun to_file_name:[0m
[32m2024-09-20 18:26:27.495[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mto_file_name[0m:[36m449[0m - [34m[1mFile Name: tangaras_2024-08-01_00-00-00_UTC-0500_2024-08-31_23-59-59_UTC-0500.csv[0m
[32m2024-09-20 18:26:27.497[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mdf_from_csv[0m:[36m263[0m - [34m[1mRun df_from_csv:[0m
[32m2024-09-20 18:26:27.498[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mdf_from_csv[0m:[36m264[0m - [34m[1mLoad DataFrame: /home/sebaxtian/Workspaces/Tangara/tangara-mlds-unal-2024/src/tangara/data/0_raw/tangaras_2024-08-01_00-00-00_UTC-0500_2024-08-31_23-59-59_UTC-0500.csv[0m


Size: 228
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ID           38 non-null     object 
 1   GEOHASH      38 non-null     object 
 2   MAC          38 non-null     object 
 3   GEOLOCATION  38 non-null     object 
 4   LATITUDE     38 non-null     float64
 5   LONGITUDE    38 non-null     float64
dtypes: float64(2), object(4)
memory usage: 1.9+ KB


Unnamed: 0,LATITUDE,LONGITUDE
count,38.0,38.0
mean,3.415194,-76.518583
std,0.036525,0.015978
min,3.335037,-76.541061
25%,3.398209,-76.525955
50%,3.398209,-76.525955
75%,3.439407,-76.514282
max,3.497086,-76.464157


In [4]:
# Peek at one randomly chosen sensor row (no random_state given, so the row varies between runs).
df_tangaras.sample(1)

Unnamed: 0,ID,GEOHASH,MAC,GEOLOCATION,LATITUDE,LONGITUDE
19,TANGARA_D282,d29e6fj,D29ESP32DE8D282,3.3954620361328125 -76.51222229003906,3.395462,-76.512222


## PM2.5

In [5]:
# PM2.5 from CSV: build the file name for the "pm25_raw" dataset over the
# configured start/end period, then load it.
file_name = to_file_name("pm25_raw", START_ISO8601_DATETIME, END_ISO8601_DATETIME)
# NOTE(review): dtindex=False presumably keeps DATETIME as a regular column instead of the index — confirm in utils.
df_pm25_raw = df_from_csv(file_name, dtindex=False)
# Quick overview: total number of cells (rows x columns), schema, summary statistics.
print(f"Size: {df_pm25_raw.size}")
df_pm25_raw.info()
df_pm25_raw.describe()

[32m2024-09-20 18:26:27.648[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mto_file_name[0m:[36m448[0m - [34m[1mRun to_file_name:[0m
[32m2024-09-20 18:26:27.650[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mto_file_name[0m:[36m449[0m - [34m[1mFile Name: pm25_raw_2024-08-01_00-00-00_UTC-0500_2024-08-31_23-59-59_UTC-0500.csv[0m
[32m2024-09-20 18:26:27.743[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mdf_from_csv[0m:[36m263[0m - [34m[1mRun df_from_csv:[0m
[32m2024-09-20 18:26:27.744[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mdf_from_csv[0m:[36m264[0m - [34m[1mLoad DataFrame: /home/sebaxtian/Workspaces/Tangara/tangara-mlds-unal-2024/src/tangara/data/0_raw/pm25_raw_2024-08-01_00-00-00_UTC-0500_2024-08-31_23-59-59_UTC-0500.csv[0m


Size: 2410560
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89280 entries, 0 to 89279
Data columns (total 27 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   DATETIME      89280 non-null  object 
 1   TANGARA_B7BE  67718 non-null  float64
 2   TANGARA_C752  48966 non-null  float64
 3   TANGARA_A682  85089 non-null  float64
 4   TANGARA_ADD6  64851 non-null  float64
 5   TANGARA_B9CA  72173 non-null  float64
 6   TANGARA_BC5A  88271 non-null  float64
 7   TANGARA_BCC6  72479 non-null  float64
 8   TANGARA_BD5E  48948 non-null  float64
 9   TANGARA_BE12  48922 non-null  float64
 10  TANGARA_2BDE  85978 non-null  float64
 11  TANGARA_3B7E  47921 non-null  float64
 12  TANGARA_421A  34281 non-null  float64
 13  TANGARA_79CA  62526 non-null  float64
 14  TANGARA_1402  72524 non-null  float64
 15  TANGARA_157E  65778 non-null  float64
 16  TANGARA_1DE2  72288 non-null  float64
 17  TANGARA_298A  88393 non-null  float64
 18  TANGARA_3376

Unnamed: 0,TANGARA_B7BE,TANGARA_C752,TANGARA_A682,TANGARA_ADD6,TANGARA_B9CA,TANGARA_BC5A,TANGARA_BCC6,TANGARA_BD5E,TANGARA_BE12,TANGARA_2BDE,...,TANGARA_298A,TANGARA_3376,TANGARA_5636,TANGARA_D282,TANGARA_4B1A,TANGARA_1106,TANGARA_1282,TANGARA_14D6,TANGARA_1A1E,TANGARA_2492
count,67718.0,48966.0,85089.0,64851.0,72173.0,88271.0,72479.0,48948.0,48922.0,85978.0,...,88393.0,88215.0,67862.0,39338.0,18174.0,87592.0,54523.0,85874.0,87122.0,69703.0
mean,11.615331,15.046747,12.198827,13.658047,11.915689,21.498646,11.427641,15.547479,16.062058,13.95621,...,11.839478,13.329638,18.01674,11.834664,10.48806,10.504041,14.839536,11.004262,19.009447,11.192287
std,24.802586,13.608927,24.323147,55.037732,21.027937,188.435675,17.67803,13.710971,13.822078,12.594924,...,18.088591,20.372531,104.790482,10.868485,9.612211,7.326448,30.336486,7.899539,40.228386,10.825562
min,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
25%,7.0,9.0,6.0,7.0,7.0,7.0,6.0,9.0,10.0,7.0,...,6.0,7.0,7.0,7.0,5.0,6.0,8.0,6.0,9.0,6.0
50%,10.0,13.0,10.0,11.0,10.0,11.0,9.0,13.0,14.0,12.0,...,10.0,11.0,10.0,11.0,8.0,9.0,12.0,9.0,14.0,10.0
75%,14.0,17.0,15.0,16.0,14.0,16.0,13.0,18.0,19.0,18.0,...,14.0,16.0,15.0,15.0,14.0,14.0,17.0,14.0,21.0,14.0
max,3394.0,606.0,3416.0,8106.0,2661.0,10670.0,1970.0,730.0,472.0,1071.0,...,2093.0,2189.0,12962.0,521.0,117.0,235.0,5444.0,171.0,5061.0,943.0


In [6]:
# Peek at one randomly chosen PM2.5 row (no random_state given, so the row varies between runs).
df_pm25_raw.sample(1)

Unnamed: 0,DATETIME,TANGARA_B7BE,TANGARA_C752,TANGARA_A682,TANGARA_ADD6,TANGARA_B9CA,TANGARA_BC5A,TANGARA_BCC6,TANGARA_BD5E,TANGARA_BE12,...,TANGARA_298A,TANGARA_3376,TANGARA_5636,TANGARA_D282,TANGARA_4B1A,TANGARA_1106,TANGARA_1282,TANGARA_14D6,TANGARA_1A1E,TANGARA_2492
74013,2024-08-26 16:46:30-05:00,18.0,17.0,15.0,19.0,15.0,18.0,14.0,18.0,16.0,...,16.0,16.0,12.0,,,13.0,,12.0,16.0,


## Temperature

In [7]:
# Temperature from CSV: build the file name for the "temp_raw" dataset over the
# configured start/end period, then load it.
file_name = to_file_name("temp_raw", START_ISO8601_DATETIME, END_ISO8601_DATETIME)
# NOTE(review): dtindex=False presumably keeps DATETIME as a regular column instead of the index — confirm in utils.
df_temp_raw = df_from_csv(file_name, dtindex=False)
# Quick overview: total number of cells (rows x columns), schema, summary statistics.
print(f"Size: {df_temp_raw.size}")
df_temp_raw.info()
df_temp_raw.describe()

[32m2024-09-20 18:26:28.070[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mto_file_name[0m:[36m448[0m - [34m[1mRun to_file_name:[0m
[32m2024-09-20 18:26:28.071[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mto_file_name[0m:[36m449[0m - [34m[1mFile Name: temp_raw_2024-08-01_00-00-00_UTC-0500_2024-08-31_23-59-59_UTC-0500.csv[0m
[32m2024-09-20 18:26:28.176[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mdf_from_csv[0m:[36m263[0m - [34m[1mRun df_from_csv:[0m
[32m2024-09-20 18:26:28.177[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mdf_from_csv[0m:[36m264[0m - [34m[1mLoad DataFrame: /home/sebaxtian/Workspaces/Tangara/tangara-mlds-unal-2024/src/tangara/data/0_raw/temp_raw_2024-08-01_00-00-00_UTC-0500_2024-08-31_23-59-59_UTC-0500.csv[0m


Size: 2410560
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89280 entries, 0 to 89279
Data columns (total 27 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   DATETIME      89280 non-null  object 
 1   TANGARA_B7BE  67718 non-null  float64
 2   TANGARA_C752  48966 non-null  float64
 3   TANGARA_A682  85089 non-null  float64
 4   TANGARA_ADD6  64851 non-null  float64
 5   TANGARA_B9CA  72173 non-null  float64
 6   TANGARA_BC5A  88271 non-null  float64
 7   TANGARA_BCC6  72479 non-null  float64
 8   TANGARA_BD5E  48948 non-null  float64
 9   TANGARA_BE12  48922 non-null  float64
 10  TANGARA_2BDE  85978 non-null  float64
 11  TANGARA_3B7E  47921 non-null  float64
 12  TANGARA_421A  34281 non-null  float64
 13  TANGARA_79CA  62526 non-null  float64
 14  TANGARA_1402  72524 non-null  float64
 15  TANGARA_157E  65778 non-null  float64
 16  TANGARA_1DE2  72288 non-null  float64
 17  TANGARA_298A  88393 non-null  float64
 18  TANGARA_3376

Unnamed: 0,TANGARA_B7BE,TANGARA_C752,TANGARA_A682,TANGARA_ADD6,TANGARA_B9CA,TANGARA_BC5A,TANGARA_BCC6,TANGARA_BD5E,TANGARA_BE12,TANGARA_2BDE,...,TANGARA_298A,TANGARA_3376,TANGARA_5636,TANGARA_D282,TANGARA_4B1A,TANGARA_1106,TANGARA_1282,TANGARA_14D6,TANGARA_1A1E,TANGARA_2492
count,67718.0,48966.0,85089.0,64851.0,72173.0,88271.0,72479.0,48948.0,48922.0,85978.0,...,88393.0,88215.0,67862.0,39338.0,18174.0,87592.0,54523.0,85874.0,87122.0,69703.0
mean,29.57366,31.748919,29.456857,29.289646,33.792593,31.553092,32.639464,31.015593,31.193546,29.434215,...,33.043649,30.330012,31.001245,31.090777,30.575697,31.293741,29.768692,31.254932,29.565707,33.202716
std,3.079509,1.201243,3.074971,2.40207,1.285908,1.176026,1.164331,1.321413,1.126423,2.360606,...,1.18158,1.258097,4.160348,3.836545,5.294499,3.526739,3.57203,4.613316,3.735069,3.799593
min,21.72,28.72,22.18,22.91,29.56,28.44,29.42,27.79,28.31,23.53,...,29.61,26.52,23.4,24.62,23.61,24.61,23.61,23.72,23.0,24.65
25%,28.27,30.88,27.1,28.02,32.86,30.69,31.84,30.08,30.4,27.48,...,32.21,29.48,27.58,27.79,26.17,28.52,27.0,27.32,26.31,29.96
50%,30.07,31.77,29.71,29.9,33.86,31.53,32.66,31.07,31.21,29.45,...,33.22,30.36,29.95,30.12,28.48,30.52,28.93,30.09,29.01,32.44
75%,31.65,32.73,31.52,31.06,34.77,32.45,33.51,32.13,32.1,31.41,...,33.92,31.3,34.76,34.62,35.28,33.54,31.85,35.17,33.02,36.76
max,37.84,34.11,38.75,33.37,36.52,34.24,35.46,33.5,33.23,34.91,...,35.58,33.48,41.03,41.74,44.04,41.54,42.05,42.15,37.59,43.31


In [8]:
# Peek at one randomly chosen temperature row (no random_state given, so the row varies between runs).
df_temp_raw.sample(1)

Unnamed: 0,DATETIME,TANGARA_B7BE,TANGARA_C752,TANGARA_A682,TANGARA_ADD6,TANGARA_B9CA,TANGARA_BC5A,TANGARA_BCC6,TANGARA_BD5E,TANGARA_BE12,...,TANGARA_298A,TANGARA_3376,TANGARA_5636,TANGARA_D282,TANGARA_4B1A,TANGARA_1106,TANGARA_1282,TANGARA_14D6,TANGARA_1A1E,TANGARA_2492
18171,2024-08-07 07:25:30-05:00,28.65,,29.11,28.94,30.96,29.77,31.33,,,...,,28.99,26.63,26.43,24.99,26.4,26.42,25.37,24.16,


## Humidity

In [9]:
# Humidity from CSV: build the file name for the "hum_raw" dataset over the
# configured start/end period, then load it.
file_name = to_file_name("hum_raw", START_ISO8601_DATETIME, END_ISO8601_DATETIME)
# NOTE(review): dtindex=False presumably keeps DATETIME as a regular column instead of the index — confirm in utils.
df_hum_raw = df_from_csv(file_name, dtindex=False)
# Quick overview: total number of cells (rows x columns), schema, summary statistics.
print(f"Size: {df_hum_raw.size}")
df_hum_raw.info()
df_hum_raw.describe()

[32m2024-09-20 18:26:28.506[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mto_file_name[0m:[36m448[0m - [34m[1mRun to_file_name:[0m
[32m2024-09-20 18:26:28.508[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mto_file_name[0m:[36m449[0m - [34m[1mFile Name: hum_raw_2024-08-01_00-00-00_UTC-0500_2024-08-31_23-59-59_UTC-0500.csv[0m
[32m2024-09-20 18:26:28.602[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mdf_from_csv[0m:[36m263[0m - [34m[1mRun df_from_csv:[0m
[32m2024-09-20 18:26:28.603[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mdf_from_csv[0m:[36m264[0m - [34m[1mLoad DataFrame: /home/sebaxtian/Workspaces/Tangara/tangara-mlds-unal-2024/src/tangara/data/0_raw/hum_raw_2024-08-01_00-00-00_UTC-0500_2024-08-31_23-59-59_UTC-0500.csv[0m


Size: 2410560
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89280 entries, 0 to 89279
Data columns (total 27 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   DATETIME      89280 non-null  object 
 1   TANGARA_B7BE  67718 non-null  float64
 2   TANGARA_C752  48966 non-null  float64
 3   TANGARA_A682  85089 non-null  float64
 4   TANGARA_ADD6  64851 non-null  float64
 5   TANGARA_B9CA  72173 non-null  float64
 6   TANGARA_BC5A  88271 non-null  float64
 7   TANGARA_BCC6  72479 non-null  float64
 8   TANGARA_BD5E  48948 non-null  float64
 9   TANGARA_BE12  48922 non-null  float64
 10  TANGARA_2BDE  85978 non-null  float64
 11  TANGARA_3B7E  47921 non-null  float64
 12  TANGARA_421A  34281 non-null  float64
 13  TANGARA_79CA  62526 non-null  float64
 14  TANGARA_1402  72524 non-null  float64
 15  TANGARA_157E  65778 non-null  float64
 16  TANGARA_1DE2  72288 non-null  float64
 17  TANGARA_298A  88393 non-null  float64
 18  TANGARA_3376

Unnamed: 0,TANGARA_B7BE,TANGARA_C752,TANGARA_A682,TANGARA_ADD6,TANGARA_B9CA,TANGARA_BC5A,TANGARA_BCC6,TANGARA_BD5E,TANGARA_BE12,TANGARA_2BDE,...,TANGARA_298A,TANGARA_3376,TANGARA_5636,TANGARA_D282,TANGARA_4B1A,TANGARA_1106,TANGARA_1282,TANGARA_14D6,TANGARA_1A1E,TANGARA_2492
count,67718.0,48966.0,85089.0,64851.0,72173.0,88271.0,72479.0,48948.0,48922.0,85978.0,...,88393.0,88215.0,67862.0,39338.0,18174.0,87592.0,54523.0,85874.0,87122.0,69703.0
mean,53.417248,43.625148,52.330193,51.653218,40.785404,47.81059,44.359093,47.573522,47.531564,55.10358,...,43.807749,49.706579,54.739585,51.74345,62.1529,50.84771,62.009522,56.561325,70.297415,46.547423
std,8.392236,3.703125,8.526505,5.967868,4.543922,4.752289,3.968365,3.848655,4.203994,8.529958,...,3.607454,4.360281,11.146431,11.013942,12.185364,9.902086,10.14799,12.098309,10.470867,8.515772
min,26.92,34.22,23.38,34.32,31.41,35.39,31.36,39.01,33.16,32.66,...,32.87,40.24,26.87,27.12,35.06,23.5,30.3,28.6,39.96,25.32
25%,47.53,40.71,46.85,47.595,37.59,44.09,41.29,44.62,44.27,47.85,...,41.12,46.23,45.19,41.94,50.8,43.68,54.95,45.98,62.03,39.395
50%,53.21,43.21,53.26,51.47,39.91,47.51,44.16,47.22,47.17,54.96,...,43.74,49.51,56.48,52.32,65.39,51.76,63.33,58.37,71.06,46.67
75%,58.77,46.37,58.2,55.21,43.13,51.02,47.17,50.36,50.65,61.57,...,46.22,53.02,63.56,61.58,70.97,58.27,69.66,66.3,78.63,53.75
max,78.07,53.46,75.51,67.19,55.55,62.62,57.86,57.56,59.51,75.68,...,56.68,60.74,77.63,71.67,83.26,73.21,84.55,81.29,91.96,69.89


In [10]:
# Peek at one randomly chosen humidity row (no random_state given, so the row varies between runs).
df_hum_raw.sample(1)

Unnamed: 0,DATETIME,TANGARA_B7BE,TANGARA_C752,TANGARA_A682,TANGARA_ADD6,TANGARA_B9CA,TANGARA_BC5A,TANGARA_BCC6,TANGARA_BD5E,TANGARA_BE12,...,TANGARA_298A,TANGARA_3376,TANGARA_5636,TANGARA_D282,TANGARA_4B1A,TANGARA_1106,TANGARA_1282,TANGARA_14D6,TANGARA_1A1E,TANGARA_2492
76118,2024-08-27 10:19:00-05:00,,45.57,47.35,51.53,40.23,47.97,46.42,50.37,49.57,...,46.17,53.2,,,,46.98,,45.91,58.72,42.72


## Tangara Dataset

In [11]:
# Declare an empty DataFrame that fixes the column layout of the final dataset
df_tangara_dataset = pd.DataFrame(
    columns=[
        'DATETIME',
        'YEAR',
        'MONTH',
        'DAY_OF_WEEK',
        'IS_WEEKEND',
        'IS_HOLIDAY',
        'SENSOR_ID',
        'PM25',
        'TEMPERATURE',
        'HUMIDITY',
        'GEOLOCATION',
        'CARDINAL_POINT',
    ]
)

# Overview of the (still empty) frame: cell count, schema, summary
print(f"Size: {df_tangara_dataset.size}")
df_tangara_dataset.info()
df_tangara_dataset.describe()


Size: 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   DATETIME        0 non-null      object
 1   YEAR            0 non-null      object
 2   MONTH           0 non-null      object
 3   DAY_OF_WEEK     0 non-null      object
 4   IS_WEEKEND      0 non-null      object
 5   IS_HOLIDAY      0 non-null      object
 6   SENSOR_ID       0 non-null      object
 7   PM25            0 non-null      object
 8   TEMPERATURE     0 non-null      object
 9   HUMIDITY        0 non-null      object
 10  GEOLOCATION     0 non-null      object
 11  CARDINAL_POINT  0 non-null      object
dtypes: object(12)
memory usage: 124.0+ bytes


Unnamed: 0,DATETIME,YEAR,MONTH,DAY_OF_WEEK,IS_WEEKEND,IS_HOLIDAY,SENSOR_ID,PM25,TEMPERATURE,HUMIDITY,GEOLOCATION,CARDINAL_POINT
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unique,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
top,,,,,,,,,,,,
freq,,,,,,,,,,,,


In [12]:
# Distinct column labels found across the PM2.5, temperature and humidity frames;
# the sensors that reported data appear among them
ids_tangaras = list(
    set(df_pm25_raw.columns)
    .union(df_temp_raw.columns)
    .union(df_hum_raw.columns)
)
# Keep only the Tangara sensors whose ID is one of those labels
df_tangaras = df_tangaras.loc[df_tangaras['ID'].isin(ids_tangaras)]
print(len(df_tangaras['ID'].values))

26


In [13]:
# Determine the geographic zone from latitude and longitude
def determinar_zona_geografica(
    lat: float,
    lon: float,
    centro_lat: float = 3.440000,
    centro_lon: float = -76.519722,
) -> str:
    """Classify a coordinate into a quadrant relative to a reference centre.

    Args:
        lat: Latitude of the point, in decimal degrees.
        lon: Longitude of the point, in decimal degrees.
        centro_lat: Latitude of the reference centre. Defaults to the value
            this notebook uses as the geographic centre of Cali.
        centro_lon: Longitude of the reference centre. Defaults to the value
            this notebook uses as the geographic centre of Cali.

    Returns:
        A label of the form "<NORTE|SUR>-<ORIENTE|OCCIDENTE>".
    """
    # Strict comparisons: a point exactly on the centre latitude is 'SUR' and
    # a point exactly on the centre longitude is 'OCCIDENTE'.
    zona_lat = 'NORTE' if lat > centro_lat else 'SUR'
    zona_lon = 'ORIENTE' if lon > centro_lon else 'OCCIDENTE'

    return f"{zona_lat}-{zona_lon}"

In [14]:
# Spanish month names, indexed by (month number - 1)
months = ['ENERO','FEBRERO','MARZO','ABRIL','MAYO','JUNIO','JULIO', 'AGOSTO', 'SEPTIEMBRE','OCTUBRE','NOVIEMBRE','DICIEMBRE']
# Colombian holiday calendar. YEAR is read from the environment and may be unset
# or empty, in which case int(YEAR) would raise; fall back to a calendar with no
# pre-populated years (the holidays library adds years on lookup by default).
holidays_colombia = holidays.Colombia(years=int(YEAR)) if YEAR else holidays.Colombia()

# Column layout of the target dataset. It is read from the existing frame rather
# than concatenating onto that frame, so re-running this cell rebuilds the dataset
# instead of appending every sensor a second time.
dataset_columns = list(df_tangara_dataset.columns)

# Build one long-format frame per sensor that has a column in all three raw frames.
# NOTE(review): PM2.5, temperature and humidity are combined by row index, not by
# DATETIME value — this assumes the three raw frames share the same row order; confirm.
sensor_frames = []
for sensor_id in df_tangaras['ID'].unique():
    has_all_measurements = (
        sensor_id in df_pm25_raw.columns
        and sensor_id in df_temp_raw.columns
        and sensor_id in df_hum_raw.columns
    )
    if not has_all_measurements:
        continue
    sensor_frames.append(pd.DataFrame({
        'DATETIME': df_pm25_raw['DATETIME'],
        'SENSOR_ID': sensor_id,
        'PM25': df_pm25_raw[sensor_id],
        'TEMPERATURE': df_temp_raw[sensor_id],
        'HUMIDITY': df_hum_raw[sensor_id],
        'GEOLOCATION': df_tangaras.loc[df_tangaras['ID'] == sensor_id, 'GEOLOCATION'].values[0]
    }))

# Concatenate once (instead of growing the dataset inside the loop) and restore the
# declared column order; columns not filled yet are created empty and assigned below.
if sensor_frames:
    df_tangara_dataset = pd.concat(sensor_frames, ignore_index=True).reindex(columns=dataset_columns)
else:
    # pd.concat rejects an empty list; keep an empty frame with the declared layout
    df_tangara_dataset = pd.DataFrame(columns=dataset_columns)

# Derived calendar and location features.
# Parse the DATETIME strings a single time and reuse the result for every feature.
timestamps = pd.to_datetime(df_tangara_dataset['DATETIME'])
df_tangara_dataset['YEAR'] = timestamps.dt.year
df_tangara_dataset['MONTH'] = timestamps.dt.month.apply(lambda x: months[x-1])
# Upper-cased Spanish weekday name
df_tangara_dataset['DAY_OF_WEEK'] = timestamps.apply(lambda x: format_datetime(x, 'EEEE', locale='es').upper())
# weekday() is 0 for Monday, so 5 and 6 are Saturday and Sunday
df_tangara_dataset['IS_WEEKEND'] = timestamps.dt.weekday.apply(lambda x: x >= 5)
df_tangara_dataset['IS_HOLIDAY'] = timestamps.apply(lambda x: x in holidays_colombia)
# GEOLOCATION is a "<latitude> <longitude>" string separated by a single space
df_tangara_dataset['CARDINAL_POINT'] = df_tangara_dataset['GEOLOCATION'].apply(lambda x: determinar_zona_geografica(float(x.split(' ')[0]),float(x.split(' ')[1])))


  df_tangara_dataset = pd.concat([df_tangara_dataset, temp_df], ignore_index=True)


In [15]:
# Reset the index to a fresh 0..n-1 range, discarding the previous index.
# Reassignment is used instead of inplace=True so the cell reads as a plain
# input -> output step.
df_tangara_dataset = df_tangara_dataset.reset_index(drop=True)

In [16]:
# Overview of the assembled dataset: total number of cells (rows x columns),
# schema, and the frame itself as the cell's displayed value.
print(f"Size: {df_tangara_dataset.size}")
df_tangara_dataset.info()
df_tangara_dataset

Size: 27855360
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2321280 entries, 0 to 2321279
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   DATETIME        object 
 1   YEAR            int32  
 2   MONTH           object 
 3   DAY_OF_WEEK     object 
 4   IS_WEEKEND      bool   
 5   IS_HOLIDAY      bool   
 6   SENSOR_ID       object 
 7   PM25            float64
 8   TEMPERATURE     float64
 9   HUMIDITY        float64
 10  GEOLOCATION     object 
 11  CARDINAL_POINT  object 
dtypes: bool(2), float64(3), int32(1), object(6)
memory usage: 172.7+ MB


Unnamed: 0,DATETIME,YEAR,MONTH,DAY_OF_WEEK,IS_WEEKEND,IS_HOLIDAY,SENSOR_ID,PM25,TEMPERATURE,HUMIDITY,GEOLOCATION,CARDINAL_POINT
0,2024-08-01 00:00:00-05:00,2024,AGOSTO,JUEVES,False,False,TANGARA_B7BE,13.0,31.39,43.16,3.3982086181640625 -76.52595520019531,SUR-OCCIDENTE
1,2024-08-01 00:00:30-05:00,2024,AGOSTO,JUEVES,False,False,TANGARA_B7BE,13.0,31.38,43.30,3.3982086181640625 -76.52595520019531,SUR-OCCIDENTE
2,2024-08-01 00:01:00-05:00,2024,AGOSTO,JUEVES,False,False,TANGARA_B7BE,13.0,31.44,43.16,3.3982086181640625 -76.52595520019531,SUR-OCCIDENTE
3,2024-08-01 00:01:30-05:00,2024,AGOSTO,JUEVES,False,False,TANGARA_B7BE,13.0,31.45,42.99,3.3982086181640625 -76.52595520019531,SUR-OCCIDENTE
4,2024-08-01 00:02:00-05:00,2024,AGOSTO,JUEVES,False,False,TANGARA_B7BE,14.0,31.44,42.99,3.3982086181640625 -76.52595520019531,SUR-OCCIDENTE
...,...,...,...,...,...,...,...,...,...,...,...,...
2321275,2024-08-31 23:57:30-05:00,2024,AGOSTO,SÁBADO,True,False,TANGARA_2492,27.0,29.85,52.46,3.3734893798828125 -76.51634216308594,SUR-ORIENTE
2321276,2024-08-31 23:58:00-05:00,2024,AGOSTO,SÁBADO,True,False,TANGARA_2492,28.0,29.86,52.47,3.3734893798828125 -76.51634216308594,SUR-ORIENTE
2321277,2024-08-31 23:58:30-05:00,2024,AGOSTO,SÁBADO,True,False,TANGARA_2492,26.0,29.86,52.32,3.3734893798828125 -76.51634216308594,SUR-ORIENTE
2321278,2024-08-31 23:59:00-05:00,2024,AGOSTO,SÁBADO,True,False,TANGARA_2492,,,,3.3734893798828125 -76.51634216308594,SUR-ORIENTE


In [17]:
# Alternative spot check (disabled): rows flagged as holiday whose weekday is 'LUNES'.
#df_tangara_dataset[(df_tangara_dataset['IS_HOLIDAY']) & (df_tangara_dataset['DAY_OF_WEEK'] == 'LUNES')]
# Spot check: all rows that belong to a single sensor.
df_tangara_dataset[df_tangara_dataset['SENSOR_ID'] == 'TANGARA_2492']

Unnamed: 0,DATETIME,YEAR,MONTH,DAY_OF_WEEK,IS_WEEKEND,IS_HOLIDAY,SENSOR_ID,PM25,TEMPERATURE,HUMIDITY,GEOLOCATION,CARDINAL_POINT
2232000,2024-08-01 00:00:00-05:00,2024,AGOSTO,JUEVES,False,False,TANGARA_2492,,,,3.3734893798828125 -76.51634216308594,SUR-ORIENTE
2232001,2024-08-01 00:00:30-05:00,2024,AGOSTO,JUEVES,False,False,TANGARA_2492,15.0,31.24,46.48,3.3734893798828125 -76.51634216308594,SUR-ORIENTE
2232002,2024-08-01 00:01:00-05:00,2024,AGOSTO,JUEVES,False,False,TANGARA_2492,16.0,31.26,46.43,3.3734893798828125 -76.51634216308594,SUR-ORIENTE
2232003,2024-08-01 00:01:30-05:00,2024,AGOSTO,JUEVES,False,False,TANGARA_2492,15.0,31.26,46.59,3.3734893798828125 -76.51634216308594,SUR-ORIENTE
2232004,2024-08-01 00:02:00-05:00,2024,AGOSTO,JUEVES,False,False,TANGARA_2492,15.0,31.26,46.55,3.3734893798828125 -76.51634216308594,SUR-ORIENTE
...,...,...,...,...,...,...,...,...,...,...,...,...
2321275,2024-08-31 23:57:30-05:00,2024,AGOSTO,SÁBADO,True,False,TANGARA_2492,27.0,29.85,52.46,3.3734893798828125 -76.51634216308594,SUR-ORIENTE
2321276,2024-08-31 23:58:00-05:00,2024,AGOSTO,SÁBADO,True,False,TANGARA_2492,28.0,29.86,52.47,3.3734893798828125 -76.51634216308594,SUR-ORIENTE
2321277,2024-08-31 23:58:30-05:00,2024,AGOSTO,SÁBADO,True,False,TANGARA_2492,26.0,29.86,52.32,3.3734893798828125 -76.51634216308594,SUR-ORIENTE
2321278,2024-08-31 23:59:00-05:00,2024,AGOSTO,SÁBADO,True,False,TANGARA_2492,,,,3.3734893798828125 -76.51634216308594,SUR-ORIENTE


In [18]:
# Persist the assembled dataset: build the file name for "tangara_dataset_raw"
# over the configured start/end period and write the DataFrame to CSV.
file_name = to_file_name("tangara_dataset_raw", START_ISO8601_DATETIME, END_ISO8601_DATETIME)
df_to_csv(df_tangara_dataset, file_name)

[32m2024-09-20 18:28:12.578[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mto_file_name[0m:[36m448[0m - [34m[1mRun to_file_name:[0m
[32m2024-09-20 18:28:12.580[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mto_file_name[0m:[36m449[0m - [34m[1mFile Name: tangara_dataset_raw_2024-08-01_00-00-00_UTC-0500_2024-08-31_23-59-59_UTC-0500.csv[0m
[32m2024-09-20 18:28:18.057[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mdf_to_csv[0m:[36m228[0m - [34m[1mRun df_to_csv:[0m
[32m2024-09-20 18:28:18.058[0m | [34m[1mDEBUG   [0m | [36mutils[0m:[36mdf_to_csv[0m:[36m229[0m - [34m[1mSave DataFrame: /home/sebaxtian/Workspaces/Tangara/tangara-mlds-unal-2024/src/tangara/data/0_raw/tangara_dataset_raw_2024-08-01_00-00-00_UTC-0500_2024-08-31_23-59-59_UTC-0500.csv[0m
