# Importar librerías necesarias

In [24]:
# Librerías generales
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

# Statsmodels y SciPy
from scipy.stats import pearsonr

# Scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (mean_squared_error, r2_score,accuracy_score, precision_score, recall_score, f1_score)

# Imbalanced-learn
from imblearn.over_sampling import SMOTE

# Tensorflow
from tensorflow import keras

# Scikit-learn y Tensorflow (repetidos, puedes eliminar uno de cada par)
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neural_network import MLPRegressor

# Funciones generales
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score)

# Guardar y cargar modelos
import joblib

# Importar y procesar datos para el entrenamiento y prueba de los modelos

## Importar el dataset

In [25]:
weatherAUS_df = pd.read_csv('weatherAUS.csv')

## Procesar los datos


In [26]:
# Función de procesamiento de datos
def process_data(df):

    # Eliminar columnas innecesarias ----------------------------------------------------------------------------------------------------------------
    # Eliminar la columna 'Unnamed: 0'
    df = df.drop(columns=['Unnamed: 0'])

    # Eliminar registros con datos nulos en las variables respuesta 'RainTomorrow' y 'RainfallTomorrow'
    df.dropna(subset=['RainTomorrow', 'RainfallTomorrow'], inplace=True)

    # Lista de ubicaciones de interés
    ubicaciones_interes = ["Sydney", "SydneyAirport", "Canberra", "Melbourne", "MelbourneAirport"]

    # Filtrar las filas en base a las ubicaciones interes
    df = df[df['Location'].isin(ubicaciones_interes)]

    # Eliminar direcciones del viento
    columns_to_exclude = [col for col in df.columns if "Dir" in col]
    df.drop(columns=columns_to_exclude, inplace = True)

    # Utilizar replace para cambiar el valor 9 por NaN en 'Cloud9am' --------------------------------------------------------------------------------
    df['Cloud9am'] = df['Cloud9am'].replace(9, np.nan)

    # Codificar la fecha y otras variables categóricas ---------------------------------------------------------------------------------------------
    # Crear una variable wet_month para saber si la fecha corresponde a un mes lluvioso o no
    df['Date'] = pd.to_datetime(df['Date'])
    df['wet_month'] = df['Date'].apply(lambda x: 1.0 if 5 <= x.month <= 10 else 0.0)
    df.drop(columns=['Date'], inplace=True)

    # Codificar las variables categóricas de si llovió en la fecha y la localización
    columns_to_encode = ['Location', 'RainToday', 'RainTomorrow']
    df = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)
    new_columns_names = ['Location_Melbourne',	'Location_MelbourneAirport',	'Location_Sydney',	'Location_SydneyAirport', 'RainToday_Yes',	'RainTomorrow_Yes'] 
    df[new_columns_names] = df[new_columns_names].astype(float)

    return df

In [27]:
x_train_processed = process_data(weatherAUS_df)

In [28]:
x_train_processed

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,Temp9am,Temp3pm,RainfallTomorrow,wet_month,Location_Melbourne,Location_MelbourneAirport,Location_Sydney,Location_SydneyAirport,RainToday_Yes,RainTomorrow_Yes
30167,19.5,22.4,15.6,6.2,0.0,,17.0,20.0,92.0,84.0,...,20.7,20.9,6.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
30168,19.5,25.6,6.0,3.4,2.7,,9.0,13.0,83.0,73.0,...,22.4,24.8,6.6,0.0,0.0,0.0,1.0,0.0,1.0,1.0
30169,21.6,24.5,6.6,2.4,0.1,,17.0,2.0,88.0,86.0,...,23.5,23.0,18.8,0.0,0.0,0.0,1.0,0.0,1.0,1.0
30170,20.2,22.8,18.8,2.2,0.0,,22.0,20.0,83.0,90.0,...,21.4,20.9,77.4,0.0,0.0,0.0,1.0,0.0,1.0,1.0
30171,19.7,25.7,77.4,,0.0,,11.0,6.0,88.0,74.0,...,22.5,25.5,1.6,0.0,0.0,0.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70366,6.4,15.8,0.0,2.4,8.8,39.0,11.0,17.0,84.0,56.0,...,7.9,15.6,0.6,1.0,1.0,0.0,0.0,0.0,0.0,0.0
70367,7.8,13.5,0.6,0.6,0.0,24.0,2.0,9.0,100.0,80.0,...,11.2,12.7,0.2,1.0,1.0,0.0,0.0,0.0,0.0,0.0
70368,6.7,14.1,0.2,0.6,0.0,24.0,9.0,11.0,93.0,56.0,...,7.5,13.5,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
70369,7.4,14.9,0.0,1.6,6.0,44.0,17.0,19.0,77.0,63.0,...,9.1,12.5,1.4,1.0,1.0,0.0,0.0,0.0,0.0,1.0


In [29]:
x_train_processed.iloc[[2]]

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,Temp9am,Temp3pm,RainfallTomorrow,wet_month,Location_Melbourne,Location_MelbourneAirport,Location_Sydney,Location_SydneyAirport,RainToday_Yes,RainTomorrow_Yes
30169,21.6,24.5,6.6,2.4,0.1,,17.0,2.0,88.0,86.0,...,23.5,23.0,18.8,0.0,0.0,0.0,1.0,0.0,1.0,1.0


## Separar conjuntos de entrenamiento y de prueba

In [30]:
def split_datasets(df):
    # Definir las columnas explicativas (X) y las variables de respuesta (y) para regresión y clasificación
    features = df.drop(columns=['RainfallTomorrow', 'RainTomorrow_Yes'])
    target_reg = df['RainfallTomorrow']
    target_class = df['RainTomorrow_Yes']

    # Separar el conjunto de entrenamiento y prueba
    x_train, x_test, y_train_reg, y_test_reg, y_train_class, y_test_class = train_test_split(
        features, target_reg, target_class, test_size=0.2, random_state=7
    )

    return x_train, x_test, y_train_reg, y_test_reg, y_train_class, y_test_class

In [31]:
# Uso de la función con tu DataFrame
x_train, x_test, y_train_reg, y_test_reg, y_train_class, y_test_class = split_datasets(x_train_processed)

In [32]:
x_test.iloc[[2]]

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,Cloud9am,Cloud3pm,Temp9am,Temp3pm,wet_month,Location_Melbourne,Location_MelbourneAirport,Location_Sydney,Location_SydneyAirport,RainToday_Yes
36093,13.9,23.2,0.0,2.8,9.6,31.0,9.0,22.0,68.0,59.0,...,3.0,3.0,20.1,22.1,0.0,0.0,0.0,0.0,1.0,0.0


# Procesamiento de los datos

In [33]:
class DataProcessor(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.scaler = StandardScaler()
        self.knn_imputer = KNNImputer(n_neighbors=5)

    def impute_RainToday(self, df):
        # Imputar datos de RainToday
        df['RainToday_Yes'] = df.apply(lambda row: 1 if row['Rainfall'] >= 1.2 else 0, axis=1)
        return df

    def fit(self, X, y=None):
        # Obtener columnas númericas
        columnas_numericas = list(X.select_dtypes(include=['float64']).columns)

        # Ajustar el imputador K-NN
        self.knn_imputer.fit(X[columnas_numericas])

        # Ajustar el escalador
        self.scaler.fit(X[columnas_numericas])

        return self

    def transform(self, X):
        # Obtener columnas númericas
        columnas_numericas = list(X.select_dtypes(include=['float64']).columns)

        # Aplica el imputador a las columnas seleccionadas
        X[columnas_numericas] = self.knn_imputer.transform(X[columnas_numericas])

        # Llamar a la función impute_RainToday
        X = self.impute_RainToday(X)

        # Escalar características
        X[columnas_numericas] = self.scaler.transform(X[columnas_numericas])

        return X

# Modelos

## Regresión

In [34]:
class NeuralNetworkPipeline(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.num_layers = 1
        self.num_neurons = 32
        self.epochs = 70
        self.batch_size = 32
        self.model = None
        self.scaler = None
        self.knn_imputer = None
        self.best_params = None

    def fit(self, X, y):
        # Construir y compilar el modelo de la red neuronal
        self.model = self.build_model(X, self.num_layers, self.num_neurons)

        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self

    def predict(self, X):
        # Realizar predicciones
        predictions = self.model.predict(X)
        return predictions

    def score(self, X, y, metric='r2'):
        # Calcular la métrica especificada (predicciones vs. valores reales)
        predictions = self.predict(X)

        if metric == 'r2':
            score = r2_score(y, predictions)
        elif metric == 'rmse':
            mse = mean_squared_error(y, predictions)
            score = np.sqrt(mse)
        else:
            raise ValueError("Métrica no válida. Use 'r2' o 'rmse'.")
        return score

    def build_model(self, X, num_layers, num_neurons):
        model = keras.Sequential()
        model.add(keras.layers.Dense(num_neurons, activation='relu', input_shape=(X.shape[1],)))

        for _ in range(num_layers - 1):
            model.add(keras.layers.Dense(num_neurons, activation='relu'))

        model.add(keras.layers.Dense(1, activation='linear'))
        model.compile(optimizer='adam', loss='mean_squared_error')
        return model

## Clasificación

In [35]:
class ClassificationPipeline(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.smote_params = {'sampling_strategy': 'auto', 'random_state': 42}
        self.lr_params = {'solver': 'liblinear', 'max_iter': 200, 'class_weight': 'balanced', 'C': 0.007}
        self.smote = None
        self.lr_model = None
        self.y_test_pred = None

    def fit(self, X, y):
        # Aplicar SMOTE al conjunto de entrenamiento
        if self.smote_params:
            self.smote = SMOTE(**self.smote_params)
            X_resampled, y_resampled = self.smote.fit_resample(X, y)
        else:
            X_resampled, y_resampled = X, y

        # Entrenar el modelo de regresión logística
        if self.lr_params:
            self.lr_model = LogisticRegression(**self.lr_params)
            self.lr_model.fit(X_resampled, y_resampled)
        else:
            raise ValueError("Se requieren parámetros para el modelo de regresión logística.")

        return self

    def transform(self, X):
        # No es necesario realizar transformaciones específicas
        return X

    def predict(self, X):
        # Predecir utilizando el modelo de regresión logística entrenado
        if self.lr_model:
            self.y_test_pred = self.lr_model.predict(X)
            return self.y_test_pred
        else:
            raise ValueError("El modelo de regresión logística no ha sido entrenado.")

    def calculate_metrics(self, y_true=None):
        if y_true is None:
            raise ValueError("Se requiere el conjunto de etiquetas verdaderas (y_true) para calcular las métricas.")

        if self.y_test_pred is None:
            raise ValueError("Primero debes realizar predicciones en los datos de prueba.")

        accuracy = accuracy_score(y_true, self.y_test_pred)
        precision = precision_score(y_true, self.y_test_pred)
        recall = recall_score(y_true, self.y_test_pred)
        f1 = f1_score(y_true, self.y_test_pred)

        metrics = {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1
        }

        return metrics


# Pipelines

## Regresión

### Crear el pipeline

In [36]:
# Crear instancias de DataProcessor y NeuralNetworkPipeline
data_processor = DataProcessor()
neural_network_pipeline = NeuralNetworkPipeline()

# Crear el pipeline
regression_pipeline = Pipeline([
    ('data_processor', data_processor),
    ('neural_network', neural_network_pipeline)
])
regression_pipeline

### Entrenar el pipeline y obtener métricas

In [37]:
x_train.Location_SydneyAirport

68719    0.0
48341    0.0
34739    1.0
46217    0.0
47306    0.0
        ... 
35873    1.0
65156    0.0
30704    0.0
48661    0.0
66877    0.0
Name: Location_SydneyAirport, Length: 12159, dtype: float64

In [38]:
# Entrenar el pipeline con los datos de entrenamiento
regression_pipeline.fit(x_train, y_train_reg)

# Hacer predicciones en los datos de prueba
predictions = regression_pipeline.predict(x_test)

# Calcular la métrica R^2 en los datos de prueba
r2 = regression_pipeline[1].score(x_test, y_test_reg, metric='r2')

# Calcular la métrica RMSE en los datos de prueba
rmse = regression_pipeline[1].score(x_test, y_test_reg, metric='rmse')

# Imprimir resultados
print('\n-------------------------------------------------------------')
print(f'R^2 en datos de prueba: {r2}')
print(f'RMSE en datos de prueba: {rmse}')


-------------------------------------------------------------
R^2 en datos de prueba: 0.33146018241800823
RMSE en datos de prueba: 6.07515316017315


In [39]:
# Crear instancias de las clases
data_processor = DataProcessor()
classification_pipeline = ClassificationPipeline()

# Crear el pipeline completo
classification_pipeline = Pipeline([
    ('data_processor', data_processor),
    ('classification_pipeline', classification_pipeline)
])

classification_pipeline

In [40]:
x_test['RainToday_Yes']

70162   -0.549762
67360   -0.549762
36093   -0.549762
64411   -0.549762
64281   -0.549762
           ...   
47292   -0.549762
64724   -0.549762
48840   -0.549762
66278   -0.549762
68426   -0.549762
Name: RainToday_Yes, Length: 3040, dtype: float64

In [41]:
# Entrenar el pipeline con los datos de entrenamiento
classification_pipeline.fit(x_train, y_train_class)

# Hacer predicciones en los datos de prueba
y_pred_class = classification_pipeline.predict(x_test)

# Calcular y mostrar las métricas
metrics_class = classification_pipeline[1].calculate_metrics(y_test_class)
print("Métricas de clasificación:")
print(metrics_class)

Métricas de clasificación:
{'Accuracy': 0.7743421052631579, 'Precision': 0.5089285714285714, 'Recall': 0.7286931818181818, 'F1 Score': 0.5992990654205607}


# Desplegando los Pipelines

In [42]:
# Usar joblib para exportar los pipelines
joblib.dump(regression_pipeline, 'rain_prediction_regression.pkl')
joblib.dump(classification_pipeline, 'rain_prediction_classification.pkl')

['rain_prediction_classification.pkl']