In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
import mlflow.sklearn

# --- Read the processed train/test splits (DVC-tracked paths) ---
# The generator is consumed left to right, so train.csv is read before test.csv.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("data/processed/train.csv", "data/processed/test.csv")
)

# --- Label column ---
# Both the lookup and the drop below raise KeyError if this column is absent,
# so it has to match the header of the processed CSVs.
target_column = "median_house_value"

# --- Separate the label series from the feature frame for each split ---
# drop() returns a new frame; train_data / test_data themselves stay intact.
y_train = train_data[target_column]
X_train = train_data.drop(columns=target_column)
y_test = test_data[target_column]
X_test = test_data.drop(columns=target_column)

# === MLflow setup: tracking server on localhost:5000, experiment by name ===
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Housing-LinearRegression")

# === Train, evaluate and record everything inside a single MLflow run ===
with mlflow.start_run():
    # fit() hands back the fitted estimator, so construction and training chain.
    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Score the test-set predictions against the true targets.
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Record the model family first, then each metric under its display name
    # (same order as before: MAE, MSE, R2).
    mlflow.log_param("model_type", "LinearRegression")
    for metric_name, metric_value in (("MAE", mae), ("MSE", mse), ("R2", r2)):
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted estimator with the run under the artifact path "model".
    mlflow.sklearn.log_model(model, artifact_path="model")

    print(f"✅ Model logged to MLflow with R2 score: {r2:.4f}")
