In [1]:
import os
import json
import base64
import pathlib
import polars as pl
import boto3
import pyarrow
import pandas as pd
import plotly.express as px
from datetime import datetime, timedelta
from functools import partial
#import mlflow

In [2]:
os.getcwd()

'/Users/samlafell/Desktop/mlops_zoomcamp_sam/07-project/notebooks'

In [3]:
class GetPaths():
    def __init__(self):
        self.project_root_path = pathlib.Path().absolute().parent
    
    def get_data_path(self):
        return self.project_root_path / 'data'
    
    def get_models_path(self):
        return self.project_root_path / 'models'
        
path = GetPaths()
data_path = path.get_data_path()
model_path = path.get_models_path()

In [4]:
os.listdir(data_path / 'raw')

['.gitkeep',
 'submission_format.csv',
 'dengue_labels_train.csv',
 'dengue_features_train.csv',
 'dengue_features_test.csv']

In [45]:
def import_data(df_path: pathlib.Path) -> pl.DataFrame():
    """Import Data

    Args:
        train_df (pathlib.Path): Give path of data

    Returns:
        df (polars.DataFrame): Return Polars Dataframe
    """
    return pl.read_csv(df_path)


def str_to_date(date_str, date_format = '%m/%d/%y'):
    """Convert String to a date

    Args:
        date_str (_type_): _description_

    Returns:
        _type_: _description_
    """
    return datetime.strptime(date_str, date_format).date()

In [59]:
training_features = import_data(df_path = data_path / 'raw' / 'dengue_features_train.csv')
training_labels = import_data(data_path / 'raw' / 'dengue_labels_train.csv')

In [60]:
iquitos_denguesite = import_data(data_path / 'raw' / 'iquitos_target.csv') \
    .select('season_week', 'week_start_date', 'total_cases') \
    .rename({'total_cases':'total_cases_v2'}) \
    .with_columns(pl.col("week_start_date").apply(str_to_date, return_dtype=pl.Date).alias("week")) \
    .with_columns(pl.lit('iq').alias('city')) \
    .drop('week_start_date')
    

four_digit_strtodate = partial(str_to_date, date_format = "%Y-%m-%d")
training_features = training_features \
    .with_columns(pl.col("week_start_date").apply(four_digit_strtodate, return_dtype=pl.Date).alias("week"))
    

season_week,total_cases_v2,week,city
i64,i64,date,str
1,0,2000-07-01,"""iq"""
2,0,2000-07-08,"""iq"""
3,0,2000-07-15,"""iq"""
4,0,2000-07-22,"""iq"""
5,0,2000-07-29,"""iq"""
6,0,2000-08-05,"""iq"""
7,0,2000-08-12,"""iq"""
8,0,2000-08-19,"""iq"""
9,0,2000-08-26,"""iq"""
10,0,2000-09-02,"""iq"""


In [62]:
from functools import partial
four_digit_strtodate = partial(str_to_date, date_format = "%Y-%m-%d")

training_features = training_features \
    .with_columns(pl.col("week_start_date").apply(four_digit_strtodate, return_dtype=pl.Date).alias("week"))
    
training_features.head(10)

city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,week
str,i64,i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,date
"""sj""",1990,18,"""1990-04-30""",0.1226,0.103725,0.1984833,0.1776167,12.42,297.572857,297.742857,292.414286,299.8,295.9,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0,1990-04-30
"""sj""",1990,19,"""1990-05-07""",0.1699,0.142175,0.1623571,0.1554857,22.82,298.211429,298.442857,293.951429,300.9,296.4,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,1990-05-07
"""sj""",1990,20,"""1990-05-14""",0.03225,0.1729667,0.1572,0.1708429,34.54,298.781429,298.878571,295.434286,300.5,297.3,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4,1990-05-14
"""sj""",1990,21,"""1990-05-21""",0.1286333,0.2450667,0.2275571,0.2358857,15.36,298.987143,299.228571,295.31,301.4,297.0,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0,1990-05-21
"""sj""",1990,22,"""1990-05-28""",0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,295.821429,301.9,297.5,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8,1990-05-28
"""sj""",1990,23,"""1990-06-04""",,0.17485,0.2543143,0.1817429,9.58,299.63,299.764286,295.851429,302.4,298.1,26.49,79.891429,9.58,17.212857,2.1,28.114286,6.942857,34.4,23.9,39.1,1990-06-04
"""sj""",1990,24,"""1990-06-11""",0.1129,0.0928,0.2050714,0.2102714,3.48,299.207143,299.221429,295.865714,301.3,297.7,38.6,82.0,3.48,17.234286,2.042857,27.414286,6.771429,32.2,23.3,29.7,1990-06-11
"""sj""",1990,25,"""1990-06-18""",0.0725,0.0725,0.1514714,0.1330286,151.12,299.591429,299.528571,296.531429,300.6,298.4,30.0,83.375714,151.12,17.977143,1.571429,28.371429,7.685714,33.9,22.8,21.1,1990-06-18
"""sj""",1990,26,"""1990-06-25""",0.10245,0.146175,0.1255714,0.1236,19.32,299.578571,299.557143,296.378571,302.1,297.7,37.51,82.768571,19.32,17.79,1.885714,28.328571,7.385714,33.9,22.8,21.1,1990-06-25
"""sj""",1990,27,"""1990-07-02""",,0.12155,0.1606833,0.2025667,14.41,300.154286,300.278571,296.651429,302.3,298.7,28.4,81.281429,14.41,18.071429,2.014286,28.328571,6.514286,33.9,24.4,1.1,1990-07-02


In [7]:
print(f"San Juan, PR with {len(training_features.filter(pl.col('city') == 'sj'))} rows")
print(f"Iquitos, Peru with {len(training_features.filter(pl.col('city') == 'iq'))} rows")

San Juan, PR with 936 rows
Iquitos, Peru with 520 rows


In [8]:
training_labels.filter(pl.col('city') == 'sj')


city,year,weekofyear,total_cases
str,i64,i64,i64
"""sj""",1990,18,4
"""sj""",1990,19,5
"""sj""",1990,20,4
"""sj""",1990,21,3
"""sj""",1990,22,6
"""sj""",1990,23,2
"""sj""",1990,24,4
"""sj""",1990,25,5
"""sj""",1990,26,10
"""sj""",1990,27,6


In [18]:
# Import necessary modules
# Assume df is your DataFrame, which has been created using Polars.

# First, we need to convert the Polars DataFrame to Pandas DataFrame 
# as Plotly works with Pandas DataFrame
city = 'iq'
city_dengue = training_labels.filter(pl.col('city') == city)
df_pandas = city_dengue.to_pandas()

# Now, we can use Plotly Express to create the line graph
fig = px.line(df_pandas, x="weekofyear", y="total_cases", color='year', 
              title='Line Graph: Target Variable by Week of Year',
              labels={'week_of_year': 'Week of the Year', 
                      'total_cases': f'Dengue Cases By Year in {"San Juan, PR" if city == "sj" else "Iquitos, Perú"}'})

# Display the figure
fig.show()


In [23]:
# Assume df is your DataFrame, which has been created using Polars.

# Convert the Polars DataFrame to a Pandas DataFrame
city = 'iq'
city_dengue = training_labels.filter(pl.col('city') == city).unique()
df_pandas = city_dengue.to_pandas()

# Define a function to convert year and week of the year to a date
def weekyear_to_date(year, week):
    d = datetime(year, 1, 1)
    if d.weekday() <= 3:
        d = d - timedelta(d.weekday())
    else:
        d = d + timedelta(7 - d.weekday())
    d = d + timedelta(days=(week-1)*7)
    return d

# Create a new 'date' column using the function
df_pandas['date'] = df_pandas.apply(lambda row: weekyear_to_date(row['year'], row['weekofyear']), axis=1)
df_pandas

# Now, you can use Plotly Express to create the line graph
fig = px.line(df_pandas, x="date", y="total_cases", title='Line Graph: Target Variable by Date',
              labels={'date': 'Date', 
                      'total_cases': f'Dengue Cases By Year in {"San Juan, PR" if city == "sj" else "Iquitos, Perú"}'})

# Display the figure
fig.show()


Unnamed: 0,city,year,weekofyear,total_cases,date
0,iq,2000,49,0,2000-12-04
1,iq,2000,50,1,2000-12-11
2,iq,2001,2,0,2001-01-08
3,iq,2001,5,0,2001-01-29
4,iq,2001,6,0,2001-02-05
...,...,...,...,...,...
515,iq,2009,45,3,2009-11-02
516,iq,2009,46,9,2009-11-09
517,iq,2010,11,16,2010-03-15
518,iq,2010,15,6,2010-04-12


In [None]:
## I found the data on the Website, downloading the data