In [None]:
# %pip install wandb==0.21.1 scikit-learn==1.6.1

In [None]:
!wandb login

### DATA

In [None]:
import wandb
import pandas as pd
from sklearn.datasets import fetch_california_housing

def upload_raw_dataset():
    """
    Publish the scikit-learn California Housing data as a W&B dataset artifact.

    Inside a run of job type ``upload-dataset`` the dataset frame is written
    to ``california_housing_raw.csv`` in the current working directory,
    attached to an artifact named ``california-housing-raw`` and logged to
    the ``house-price-prediction`` project.
    """
    csv_name = "california_housing_raw.csv"

    with wandb.init(
        project="house-price-prediction",
        job_type="upload-dataset",
    ) as run:
        # Fetch the dataset as a DataFrame and persist it locally in one step.
        fetch_california_housing(as_frame=True).frame.to_csv(csv_name, index=False)

        # Describe the artifact: a stable name plus the 'dataset' type so it
        # can be looked up by downstream runs.
        dataset_artifact = wandb.Artifact(
            "california-housing-raw",
            type="dataset",
            description="Raw California Housing dataset from scikit-learn.",
            metadata={"source": "scikit-learn.fetch_california_housing"},
        )
        dataset_artifact.add_file(csv_name)

        # Logging uploads the file and records a new artifact version.
        run.log_artifact(dataset_artifact)
        print("Successfully logged raw data artifact.")

upload_raw_dataset()

### TRAIN AND TRACK

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import wandb.sklearn
import joblib

def train_regressor():
    """
    Train a RandomForestRegressor on the logged housing data and track it in W&B.

    Within a single run (job type ``training``) this function:

    1. downloads the ``california-housing-raw:latest`` artifact, which also
       records it as an input of the run,
    2. splits the data 80/20 and fits the forest,
    3. logs ``mse`` and ``r2`` on the held-out split together with the W&B
       scikit-learn regressor plots,
    4. dumps the fitted model with joblib and logs it as the
       ``house-price-rf-regressor`` model artifact.

    Hyperparameters are read from the run's config, which is seeded with the
    defaults defined below.
    """
    # Default hyperparameters
    def_config = {
        "n_estimators": 100,
        "max_depth": 10,
        "min_samples_leaf": 2,
        "random_state": 42
    }

    # 1. Initialize a W&B run
    with wandb.init(
        project="house-price-prediction",
        job_type="training",
        config=def_config,
        tags=["random-forest", "regression", "housing"],
    ) as run:

        # Use the run handle consistently instead of the module-level
        # wandb.config / wandb.log globals.
        config = run.config

        # 2. Use the dataset artifact
        # This will download the artifact and create a lineage link.
        artifact = run.use_artifact('california-housing-raw:latest')
        artifact_dir = artifact.download()
        print(artifact_dir)
        data_path = f"{artifact_dir}/california_housing_raw.csv"
        df = pd.read_csv(data_path)
        print(df.head())

        # 3. Prepare data ('MedHouseVal' is the regression target)
        X = df.drop('MedHouseVal', axis=1)
        y = df['MedHouseVal']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=config.random_state
        )

        # 4. Train the model
        model = RandomForestRegressor(
            n_estimators=config.n_estimators,
            max_depth=config.max_depth,
            min_samples_leaf=config.min_samples_leaf,
            random_state=config.random_state
        )
        model.fit(X_train, y_train)

        # 5. Evaluate on the held-out split and log metrics
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        run.log({
            "mse": mse,
            "r2": r2
        })

        print(f"MSE: {mse:.4f}, R2: {r2:.4f}")

        # 6. Log scikit-learn plots
        # plot_regressor logs its charts to the active run itself and returns
        # None, so its result is deliberately not captured or logged again.
        wandb.sklearn.plot_regressor(
            model,
            X_train,
            X_test,
            y_train,
            y_test,
            model_name="RandomForestRegressor"
        )

        # 7. Save the model file
        model_filename = "rf_regressor.joblib"
        joblib.dump(model, model_filename)

        # 8. Create a model artifact
        model_artifact = wandb.Artifact(
            "house-price-rf-regressor",
            type="model",
            description="A trained RandomForestRegressor for housing data.",
            metadata=dict(run.config)  # Link hyperparameters
        )

        # 9. Add the model file to the artifact
        model_artifact.add_file(model_filename)

        # 10. Log the model artifact
        run.log_artifact(model_artifact)

        print("Successfully logged model artifact.")

train_regressor()

### PULL REGISTERED MODEL

In [None]:
import wandb
import joblib

# Open an inference run only for as long as it takes to fetch the model.
# The context manager finishes the run when the block exits (including on
# error), matching how the upload and training cells manage their runs.
with wandb.init(project="house-price-prediction", job_type="inference") as run:
    # Load artifact from model registry
    artifact = run.use_artifact("wandb-registry-registry-house-price/Registered-regressors:staging")
    artifact_dir = artifact.download()

# Load the model
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load artifacts from a registry you trust.
model = joblib.load(f"{artifact_dir}/rf_regressor.joblib")

In [None]:
# To be continued