# Steps

* Map 'RainTomorrow' and 'RainToday'
* Remove outliers
* Handle missing data
* Drop unnecessary features
* Standardize
* Encode

In [None]:
import pandas as pd
import numpy as np
import yaml
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from sklearn.metrics import classification_report
import xgboost as xgb
import os
import joblib

ImportError: dlopen(/Users/robertogarces/miniforge3/envs/datascience/lib/python3.9/site-packages/scipy/special/_ufuncs.cpython-39-darwin.so, 0x0002): symbol not found in flat namespace '_npy_asinh'

In [2]:
from sklearn.model_selection import train_test_split

def split_data_and_save(data_path, test_size):
    """
    Splits the raw weather dataset into training and testing sets and saves both.

    Reads ``weatherAUS.csv`` from ``data_path``, splits it with a fixed
    ``random_state=42`` so the split is reproducible, and writes
    ``train_weatherAUS.csv`` and ``test_weatherAUS.csv`` into the same directory
    (without the index column).

    Args:
        data_path (str): Directory that contains ``weatherAUS.csv``; the two
            output files are written to this directory as well.
        test_size (float): Proportion of the dataset to include in the test split (between 0 and 1).

    Returns:
        tuple: (train_df, test_df) Training and testing DataFrames.
    """
    df = pd.read_csv(f"{data_path}/weatherAUS.csv")
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=42)

    train_df.to_csv(f"{data_path}/train_weatherAUS.csv", index=False)
    test_df.to_csv(f"{data_path}/test_weatherAUS.csv", index=False)

    # Hand the split back as documented so callers do not have to re-read the CSVs.
    return train_df, test_df


def load_features(features_path: str):
    """Load a YAML configuration file and return its parsed content.

    Args:
        features_path (str): Path to the YAML file.

    Returns:
        The Python object produced by ``yaml.safe_load`` for the file content.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(features_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

def get_data(data_path: str) -> pd.DataFrame:
    """Read the CSV file at ``data_path`` and return it as a DataFrame."""
    frame = pd.read_csv(data_path)
    return frame

def load_config(config_path: str):
    """Load the YAML configuration file and return its parsed content.

    Args:
        config_path (str): Path to the YAML configuration file.

    Returns:
        The Python object produced by ``yaml.safe_load`` for the file content.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(config_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

ImportError: dlopen(/Users/robertogarces/miniforge3/envs/datascience/lib/python3.9/site-packages/scipy/special/_ufuncs.cpython-39-darwin.so, 0x0002): symbol not found in flat namespace '_npy_asinh'

# now with classes

In [None]:
# 1. Missing value handler
class MissingValueHandler(BaseEstimator, TransformerMixin):
    """Drop sparse columns, then drop every row that still contains a null.

    Parameters:
        threshold (float): Columns whose fraction of nulls (measured in ``fit``)
            is strictly greater than this value are removed in ``transform``.
    """

    def __init__(self, threshold=0.2):
        self.threshold = threshold

    def fit(self, X, y=None):
        """Record which columns of ``X`` exceed the null-fraction threshold."""
        null_fraction = X.isnull().mean()
        too_sparse = null_fraction > self.threshold
        self.columns_to_drop_ = null_fraction[too_sparse].index.tolist()
        return self

    def transform(self, X):
        """Remove the columns chosen in ``fit`` and all rows with remaining nulls.

        Note that the returned frame can have fewer rows than ``X``.
        """
        remaining = X.drop(columns=self.columns_to_drop_)
        return remaining.dropna()

# 2. Binary mapper
class BinaryMapper(BaseEstimator, TransformerMixin):
    """Replace the values of selected columns using a lookup mapping.

    Parameters:
        columns (list[str] | None): Columns to map. ``None`` or an empty list
            means no column is touched. Names missing from the frame are skipped.
        mapping (dict | None): Value -> replacement lookup. ``None`` selects the
            default ``{'No': 0, 'Yes': 1}``. The default is ``None`` instead of a
            dict literal so that no mutable object is shared between instances.
    """

    def __init__(self, columns=None, mapping=None):
        # Parameters are stored untouched; the default mapping is resolved in
        # transform().
        self.columns = columns
        self.mapping = mapping

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns mapped.

        Values that are not keys of the mapping become NaN (``Series.map``).
        """
        mapping = {'No': 0, 'Yes': 1} if self.mapping is None else self.mapping
        X = X.copy()
        for col in self.columns or []:
            if col in X.columns:
                X[col] = X[col].map(mapping)
        return X

# 3. Outlier remover
class OutlierRemover(BaseEstimator, TransformerMixin):
    """Filter out rows whose value reaches a per-feature upper quantile.

    Parameters:
        features (list[str] | None): Columns to inspect. Names that are not
            present in the frame passed to ``fit`` are ignored.
        quantile (float): Quantile computed per feature in ``fit`` and used as
            the cut-off in ``transform``.
    """

    def __init__(self, features=None, quantile=0.99):
        self.features = features
        self.quantile = quantile

    def fit(self, X, y=None):
        """Compute and store the cut-off value of every configured feature."""
        cutoffs = {}
        for name in (self.features or []):
            if name in X.columns:
                cutoffs[name] = X[name].quantile(self.quantile)
        self.thresholds_ = cutoffs
        return self

    def transform(self, X):
        """Keep rows that are strictly below each cut-off or null in that feature.

        Note that the returned frame can have fewer rows than ``X``.
        """
        filtered = X.copy()
        for name, cutoff in self.thresholds_.items():
            column = filtered[name]
            keep = column.isnull() | column.lt(cutoff)
            filtered = filtered.loc[keep]
        return filtered

# 4. Numerical scaler
class NumericalScaler(BaseEstimator, TransformerMixin):
    """Standardize the numeric columns of a DataFrame with ``StandardScaler``.

    Parameters:
        exclude: Column label, or a list/set of column labels, that must not be
            scaled (typically the target). ``None`` excludes nothing.
    """

    def __init__(self, exclude=None):
        self.exclude = exclude
        self.scaler = StandardScaler()

    def _excluded_labels(self):
        """Return ``exclude`` normalized to a list of column labels."""
        if self.exclude is None:
            return []
        # Only list/set containers are expanded; any other value (str, int,
        # tuple label, ...) is treated as one column label.
        if isinstance(self.exclude, (list, set, frozenset)):
            return list(self.exclude)
        return [self.exclude]

    def fit(self, X, y=None):
        """Select the numeric columns (minus the excluded ones) and fit the scaler."""
        excluded = self._excluded_labels()
        self.numerical_features_ = [
            col
            for col in X.select_dtypes(include='number').columns.tolist()
            if col not in excluded
        ]
        # Skip fitting when nothing is left to scale instead of failing on an
        # empty column selection.
        if self.numerical_features_:
            self.scaler.fit(X[self.numerical_features_])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the columns selected in ``fit`` standardized."""
        X = X.copy()
        if self.numerical_features_:
            X[self.numerical_features_] = self.scaler.transform(X[self.numerical_features_])
        return X

# 5. Categorical encoder
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Integer-encode every object/category column with its own ``LabelEncoder``.

    Values are converted to ``str`` before encoding, in both ``fit`` and
    ``transform``.
    """

    def fit(self, X, y=None):
        """Detect the object/category columns and fit one encoder per column."""
        categorical = X.select_dtypes(include=['object', 'category'])
        self.categorical_features_ = categorical.columns.tolist()

        fitted = {}
        for name in self.categorical_features_:
            encoder = LabelEncoder()
            encoder.fit(X[name].astype(str))
            fitted[name] = encoder
        self.encoders_ = fitted
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the columns seen in ``fit`` encoded.

        NOTE(review): LabelEncoder.transform is expected to raise on labels it
        did not see during fit - confirm new data cannot contain new categories.
        """
        encoded = X.copy()
        for name in self.encoders_:
            as_text = encoded[name].astype(str)
            encoded[name] = self.encoders_[name].transform(as_text)
        return encoded


In [None]:
def preprocessing(config):
    """
    Executes the data preprocessing pipeline and returns X, y, and the fitted pipeline.

    The function:
      1. splits the raw dataset into train/test CSV files (test size 0.2),
      2. loads the training split and drops the configured columns,
      3. fits the preprocessing pipeline (missing values -> binary mapping ->
         outlier removal -> scaling -> categorical encoding) on it,
      4. saves the fitted pipeline to ``<artifacts_path>/preprocessing_pipeline.joblib``,
      5. saves the features and target to ``<processed_data_path>/X_train.csv``
         and ``<processed_data_path>/y_train.csv``.

    Parameters:
        config (dict): Configuration dictionary with paths and preprocessing parameters.
            Required keys: 'raw_data_path', 'artifacts_path',
            'processed_data_path', 'target'.
            Optional keys: 'features_to_drop', 'missing_data_threshold'
            (default 0.2), 'features_to_map', 'features_with_outliers'.

    Returns:
        X (pd.DataFrame): Processed features.
        y (pd.Series): Target variable.
        preprocessing_pipeline (Pipeline): Fitted preprocessing pipeline.
    """

    split_data_and_save(config['raw_data_path'], test_size=0.2)

    # Load data
    df = get_data(f"{config['raw_data_path']}/train_weatherAUS.csv")

    # Drop unnecessary features
    df = df.drop(columns=config.get('features_to_drop', []))

    # Define the preprocessing pipeline
    preprocessing_pipeline = Pipeline(steps=[
        ('missing', MissingValueHandler(threshold=config.get('missing_data_threshold', 0.2))),
        ('binary', BinaryMapper(columns=config.get('features_to_map', []))),
        ('outliers', OutlierRemover(features=config.get('features_with_outliers', []))),
        ('scaling', NumericalScaler(exclude=config['target'])),
        ('encoding', CategoricalEncoder()),
    ])

    # Fit and transform the pipeline
    df_processed = preprocessing_pipeline.fit_transform(df)

    # Create the output directories up front so a fresh checkout does not fail
    # on the first write.
    os.makedirs(config['artifacts_path'], exist_ok=True)
    joblib.dump(preprocessing_pipeline, f"{config['artifacts_path']}/preprocessing_pipeline.joblib")

    # Separate features and target
    X = df_processed.drop(columns=[config['target']])
    y = df_processed[config['target']]

    os.makedirs(config['processed_data_path'], exist_ok=True)
    X.to_csv(f"{config['processed_data_path']}/X_train.csv", index=False)
    y.to_csv(f"{config['processed_data_path']}/y_train.csv", index=False)

    # Return what the docstring promises; the CSV/joblib files remain available
    # for callers that prefer to reload from disk.
    return X, y, preprocessing_pipeline


In [None]:
# Load the project configuration from YAML and run the preprocessing stage.
# preprocessing() writes the train/test split, the fitted pipeline and the
# X_train/y_train CSV files to the paths named in the config.
config = load_config('../config/config.yaml')
preprocessing(config)

# Training Pipeline 

In [2]:
# Reload the processed training data that preprocessing() wrote to disk.
# get_data() wraps pd.read_csv, so y is a DataFrame here, not a Series.
X = get_data(f"{config['processed_data_path']}/X_train.csv")
y = get_data(f"{config['processed_data_path']}/y_train.csv")

NameError: name 'get_data' is not defined

In [3]:
def train_evaluate_save_model(X, y, model_path, test_size=0.2):
    """
    Splits the data, trains an XGBoost classifier, evaluates it, and saves the model.

    The split uses ``random_state=42`` so repeated runs evaluate on the same
    hold-out rows. The classification report for the hold-out set is printed.

    Args:
        X (pd.DataFrame or np.ndarray): Feature matrix.
        y (pd.Series or np.ndarray): Target vector.
        model_path (str): Path to save the trained model (e.g., 'models/xgb_model.joblib').
            A bare file name (no directory part) saves into the current directory.
        test_size (float): Proportion of the dataset to include in the test split.

    Returns:
        model: The trained XGBoost model.
    """
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # Initialize and train the model
    # NOTE(review): use_label_encoder is deprecated in recent XGBoost releases -
    # confirm against the installed version before removing it.
    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    print("Classification Report:\n")
    print(classification_report(y_test, y_pred))

    # Ensure the directory exists and save the model.
    # os.path.dirname() returns '' for a bare file name and os.makedirs('')
    # raises, so only create the directory when there is one.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, model_path)

    return model


In [4]:
# Train on the reloaded data, print the report for an internal 20% hold-out,
# and save the model as model.joblib under config["models_path"].
model = train_evaluate_save_model(X, y, model_path=f'{config["models_path"]}/model.joblib', test_size=0.2)

NameError: name 'X' is not defined

# Predict

In [27]:
# Load the held-out split written by split_data_and_save() and drop the same
# configured columns that were dropped from the training data.
test_df = get_data(f"{config['raw_data_path']}/test_weatherAUS.csv")
test_df.drop(columns=config.get('features_to_drop', []), inplace=True)

# Apply the preprocessing pipeline fitted on the training split (transform
# only, no refit). The missing-value and outlier steps drop rows, so test_df
# can come back with fewer rows than were loaded.
pipeline = joblib.load(f"{config['artifacts_path']}/preprocessing_pipeline.joblib")
test_df = pipeline.transform(test_df)

In [29]:
# Separate features and target of the preprocessed test set. This rebinds the
# notebook-level X and y, which previously held the training data (the
# commented-out lines below would reload that training data instead).
#X = get_data(f"{config['processed_data_path']}/X_train.csv")
#y = get_data(f"{config['processed_data_path']}/y_train.csv")
X = test_df.drop(config['target'], axis=1)
y = test_df[config['target']]

In [31]:
# Score the preprocessed test set with the model trained above and print the
# per-class precision/recall/F1 report.
test_preds = model.predict(X)
print("Classification Report:\n")
print(classification_report(y, test_preds))

Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.95      0.91     17613
           1       0.75      0.56      0.64      5026

    accuracy                           0.86     22639
   macro avg       0.81      0.75      0.78     22639
weighted avg       0.85      0.86      0.85     22639

