In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load dataset
df = pd.read_csv("bolivia_dataset.csv")  # Replace with actual raw dataset file path

# Data Cleaning
# Dropping irrelevant columns
drop_columns = ["casos acumulados", "incidencia_x_100milHab", "casos_estandarizados", "población", "department", "estacion_met"]
df_cleaned = df.drop(columns=drop_columns, errors='ignore')

# Dropping rows with missing values in crucial columns
df_cleaned = df_cleaned.dropna()

# Filling missing values in NDVI and NINO_3_4 with their respective column means
df_cleaned['NDVI'] = df_cleaned['NDVI'].fillna(df_cleaned['NDVI'].mean())
df_cleaned['NINO_3_4'] = df_cleaned['NINO_3_4'].fillna(df_cleaned['NINO_3_4'].mean())

# Adding lagged case columns to capture temporal patterns
for lag in range(1, 3):  # Adding 2 previous weeks' cases
    df_cleaned[f'cases_lag_{lag}'] = df_cleaned['cases'].shift(lag)

# Dropping rows with NaN values introduced by lagging
df_cleaned = df_cleaned.dropna().reset_index(drop=True)

# Save cleaned dataset
df_cleaned.to_csv("cleaned_dengue_dataset.csv", index=False)

# Defining features (X) and target variable (y)
X = df_cleaned.drop(columns=['cases'])  # Independent variables
y = df_cleaned['cases']  # Target variable

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

# Standardizing numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initializing and training the regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Making predictions
y_pred = model.predict(X_test_scaled)

# Evaluating the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Display evaluation metrics
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

# Function to predict dengue cases for a given week's weather conditions
def predict_dengue_cases(input_data):
    input_scaled = scaler.transform([input_data])  # Scale input
    prediction = model.predict(input_scaled)[0]  # Predict cases
    return round(prediction, 2)

# Example usage:
# Provide sample input as [year, se, Precipitación, Temperatura_Máxima, Temperatura_Mínima, NDVI, NINO_3_4, cases_lag_1, cases_lag_2]
sample_input = [2025, 15, 10.5, 32.0, 22.5, 0.72, 26.0, 400, 380]  # Including previous cases
print(f"Predicted dengue cases: {predict_dengue_cases(sample_input)}")


Mean Absolute Error (MAE): 19.779769708066027
Mean Squared Error (MSE): 1793.995536499896
Root Mean Squared Error (RMSE): 42.35558447831757
R-squared (R2): 0.9496025594116615
Predicted dengue cases: 407.45


