In [3]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer            
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import mlflow
import mlflow.sklearn

# Standard-library modules used to download and unpack the dataset
import tarfile
import urllib

In [4]:
# Local directory for the dataset, and the remote location of its archive.
HOUSING_PATH = os.path.join("datasets", "housing")
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"

In [5]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Directory that receives the downloaded ``housing.tgz``
            and its extracted contents; created if it does not exist.

    The archive is downloaded on every call, replacing any earlier copy at
    ``<housing_path>/housing.tgz``.
    """
    # ``import urllib`` alone does not guarantee that the ``request`` submodule
    # is loaded; import it explicitly so the call below works on a fresh kernel.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)

    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would
            # be written outside ``housing_path`` (absolute paths, "..", links).
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Python builds without extraction filters: plain extraction.
            housing_tgz.extractall(path=housing_path)

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [6]:
# Download only when the CSV is not already on disk, so re-running the
# notebook neither repeats the download nor requires network access.
if not os.path.exists(os.path.join(HOUSING_PATH, "housing.csv")):
    fetch_housing_data()
housing = load_housing_data()

# Display the first few rows of the dataset
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [7]:
# Pull out the regression target; every remaining column is a feature.
y = housing["median_house_value"]
X = housing.drop(columns=["median_house_value"])

In [8]:
# Numerical and categorical columns
# Numeric columns are picked by dtype; every other column is treated as
# categorical. Taking the complement (instead of matching the ``object``
# dtype) guarantees that each feature lands in exactly one list, so columns
# stored with a dedicated string or ``category`` dtype are not left out of
# both lists and silently dropped by the column transformer.
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

In [9]:
# Preprocessing pipelines
# Numeric columns: fill missing values with the column mean, then standardise.
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories not seen during fit do not raise an error.
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

In [10]:
# Route each column group through its own preprocessing pipeline.
preprocessor = ColumnTransformer([
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
])

In [11]:
# Full model: preprocessing followed by a seeded random-forest regressor.
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(random_state=42, n_estimators=100)),
])

In [12]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Select the MLflow experiment that the runs below are recorded under.
# As the cell output shows, MLflow creates the experiment when no experiment
# with this name exists yet.
mlflow.set_experiment("housing-price-prediction")

2024/12/23 13:38:49 INFO mlflow.tracking.fluent: Experiment with name 'housing-price-prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///home/saachi/MLE/mlflow-handson-mle-assignment-3.2/notebooks/mlruns/847763988385398937', creation_time=1734941329117, experiment_id='847763988385398937', last_update_time=1734941329117, lifecycle_stage='active', name='housing-price-prediction', tags={}>

In [21]:
with mlflow.start_run():
    # Train on the training split and score on the held-out split.
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Read the hyperparameters back from the fitted estimator instead of
    # repeating them as literals, so the logged values cannot drift from the
    # model that was actually trained.
    regressor = model_pipeline.named_steps["regressor"]
    mlflow.log_param("model_type", type(regressor).__name__)
    mlflow.log_param("n_estimators", regressor.n_estimators)
    mlflow.log_param("random_state", regressor.random_state)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("r2", r2)
    # Store the whole pipeline (preprocessing + regressor) with the run.
    mlflow.sklearn.log_model(model_pipeline, "model")



In [25]:
from mlflow.tracking import MlflowClient

# Initialize the client
client = MlflowClient()

# Use search_experiments to list all experiments
experiments = client.search_experiments()

# Check if there are experiments
if not experiments:
    print("No experiments found.")
else:
    # Iterate over experiments and print their ID and name
    for exp in experiments:
        print(f"Experiment ID: {exp.experiment_id}, Name: {exp.name}")

    # Look the experiment up by name rather than taking experiments[0]:
    # other experiments may be present, so the first search result is not
    # guaranteed to be the one this notebook logged its run to.
    experiment = client.get_experiment_by_name("housing-price-prediction")
    if experiment is None:
        print("Experiment 'housing-price-prediction' not found.")
    else:
        experiment_id = experiment.experiment_id
        runs = client.search_runs(experiment_ids=[experiment_id])

        # Check if there are any runs
        if not runs:
            print(f"No runs found for experiment ID {experiment_id}.")
        else:
            # Iterate over the runs and print their parameters and metrics
            for run in runs:
                print(f"Run ID: {run.info.run_id}")
                print(f"Parameters: {run.data.params}")
                print(f"Metrics: {run.data.metrics}")


AttributeError: 'MlflowClient' object has no attribute 'list_experiments'

In [24]:
!pip install --upgrade mlflow



In [16]:
# Show the tracking URI MLflow is currently using, i.e. where runs are stored.
print(mlflow.get_tracking_uri())

file:///home/saachi/MLE/mlflow-handson-mle-assignment-3.2/notebooks/mlruns


In [20]:
# Report the installed MLflow version; client APIs differ between releases
# (see the AttributeError about `list_experiments` recorded above).
import mlflow  # also imported in the first cell; repeated so this cell runs alone
print(mlflow.__version__)

2.17.2
