In [6]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


class ModelPipeline:
    """Build, fit and score a scikit-learn pipeline of preprocessing + estimator.

    Intended call order:
    ``separate_features_target`` -> ``identify_features`` ->
    ``create_preprocessor`` -> ``create_pipeline`` -> ``train_pipeline`` ->
    ``evaluate_pipeline``.
    """

    def __init__(self, target_column):
        """Remember the target column name.

        Args:
            target_column: Label of the column to predict.
        """
        self.target_column = target_column
        # Both are populated later by create_preprocessor / create_pipeline.
        self.preprocessor = None
        self.pipeline = None

    def separate_features_target(self, df):
        """Split ``df`` into a feature frame and the target column.

        Args:
            df: DataFrame that contains ``self.target_column``.

        Returns:
            ``(X, y)``: ``df`` without the target column, and the target column.

        Raises:
            KeyError: If the target column is not present in ``df``.
        """
        X = df.drop(columns=self.target_column)
        y = df[self.target_column]
        return X, y

    def identify_features(self, X):
        """Classify the columns of ``X`` as numerical or categorical by dtype.

        Every integer/float width is treated as numerical (not only ``int64``
        and ``float64``), and both ``object`` and ``category`` columns are
        treated as categorical, so such columns are no longer left out of the
        preprocessor. Columns of any other dtype (e.g. bool, datetime,
        timedelta) are returned in neither index.

        Args:
            X: Feature DataFrame.

        Returns:
            ``(numerical_features, categorical_features)`` as pandas ``Index``
            objects of column labels.
        """
        # numpy registers timedelta64 as a subclass of signed integer, so it
        # has to be excluded explicitly when asking for 'number'.
        numerical_features = X.select_dtypes(
            include='number', exclude=['timedelta']
        ).columns
        categorical_features = X.select_dtypes(
            include=['object', 'category']
        ).columns
        return numerical_features, categorical_features

    def create_preprocessor(self, numerical_features, categorical_features):
        """Build the ColumnTransformer and store it on ``self.preprocessor``.

        Numerical columns: median imputation followed by standard scaling.
        Categorical columns: most-frequent imputation followed by one-hot
        encoding that ignores categories unseen during fitting.

        Args:
            numerical_features: Column labels routed to the numerical branch.
            categorical_features: Column labels routed to the categorical branch.
        """
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ])

    def create_pipeline(self, classifier):
        """Chain the stored preprocessor with an estimator.

        Args:
            classifier: Final estimator. The step is named ``'classifier'``
                regardless of whether a classifier or a regressor is passed.
        """
        self.pipeline = Pipeline(steps=[
            ('preprocessor', self.preprocessor),
            ('classifier', classifier)
        ])

    def train_pipeline(self, X_train, y_train):
        """Fit the pipeline and return it.

        Returns:
            The fitted scikit-learn ``Pipeline`` (the same object as
            ``self.pipeline``).
        """
        self.pipeline.fit(X_train, y_train)
        return self.pipeline

    def evaluate_pipeline(self, X_test, y_test):
        """Print and return the pipeline's default score on held-out data.

        The value is whatever ``Pipeline.score`` returns for the final
        estimator: mean accuracy for scikit-learn classifiers, R^2 for
        regressors. The printed label says "Accuracy" in both cases.
        """
        accuracy = self.pipeline.score(X_test, y_test)
        print(f'Model Accuracy: {accuracy}')
        return accuracy


In [4]:
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, mean_squared_error, r2_score

class ModelEvaluator:
    """Compute evaluation metrics for an already-fitted estimator or pipeline."""

    def __init__(self, pipeline):
        """Store the fitted object to evaluate.

        Args:
            pipeline: Fitted object exposing ``predict`` (and ``score`` for
                classification evaluation), e.g. a scikit-learn ``Pipeline``.
        """
        self.pipeline = pipeline

    def evaluate_classification_model(self, X_test, y_test):
        """Evaluate a classifier on held-out data.

        Returns:
            ``(accuracy, report, conf_matrix, mae, mse, r_squared)``.
            ``accuracy`` comes from ``pipeline.score``; ``report`` and
            ``conf_matrix`` from ``classification_report`` and
            ``confusion_matrix``. The last three are regression-style errors
            computed on the labels themselves; they are ``None`` when the
            labels cannot be treated as numbers (e.g. string classes), so the
            classification results are still returned instead of raising.
        """
        y_pred = self.pipeline.predict(X_test)
        accuracy = self.pipeline.score(X_test, y_test)
        report = classification_report(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        try:
            mae, mse, r_squared = self._regression_metrics(y_test, y_pred)
        except (TypeError, ValueError):
            # Numeric error metrics are undefined for non-numeric class labels.
            mae = mse = r_squared = None
        return accuracy, report, conf_matrix, mae, mse, r_squared

    def evaluate_regression_model(self, X_test, y_test):
        """Evaluate a regressor on held-out data.

        Returns:
            ``(mae, mse, r_squared)`` for the predictions on ``X_test``.
        """
        y_pred = self.pipeline.predict(X_test)
        return self._regression_metrics(y_test, y_pred)

    @staticmethod
    def _regression_metrics(y_true, y_pred):
        """Return ``(MAE, MSE, R^2)`` of ``y_pred`` against ``y_true``."""
        return (
            mean_absolute_error(y_true, y_pred),
            mean_squared_error(y_true, y_pred),
            r2_score(y_true, y_pred),
        )


In [8]:
import mlflow
import mlflow.sklearn

class MLflowTracker:
    """Thin convenience wrapper around the module-level MLflow tracking API."""

    def __init__(self, tracking_uri):
        """Remember the tracking URI; it is applied on every ``start_run``.

        Args:
            tracking_uri: Value passed to ``mlflow.set_tracking_uri``.
        """
        self.tracking_uri = tracking_uri

    def start_run(self, experiment_name=None, run_name=None):
        """Point MLflow at the tracking URI and open a run.

        Args:
            experiment_name: Experiment to activate. When ``None`` the
                experiment is left untouched, because
                ``mlflow.set_experiment`` rejects a missing name.
            run_name: Optional display name for the run.

        Returns:
            Whatever ``mlflow.start_run`` returns (the active run handle).
        """
        mlflow.set_tracking_uri(self.tracking_uri)
        if experiment_name is not None:
            mlflow.set_experiment(experiment_name)
        return mlflow.start_run(run_name=run_name)

    def log_model(self, model, model_name):
        """Log a scikit-learn model to the active run under ``model_name``."""
        mlflow.sklearn.log_model(model, model_name)

    def log_metrics(self, metrics):
        """Log every ``name -> value`` pair of ``metrics`` as an MLflow metric."""
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

    def log_params(self, params):
        """Log every ``name -> value`` pair of ``params`` as an MLflow parameter."""
        for param_name, param_value in params.items():
            mlflow.log_param(param_name, param_value)

    def log_artifact(self, local_path, artifact_path=None):
        """Upload the file at ``local_path``, optionally under ``artifact_path``."""
        mlflow.log_artifact(local_path, artifact_path)

    def end_run(self):
        """Close the currently active MLflow run."""
        mlflow.end_run()

ModuleNotFoundError: No module named 'mlflow'

In [7]:
# Load the preprocessed dataset (path is relative to the notebook's working directory)
data = pd.read_csv('./processed_data.csv')

# Initialize ModelPipeline object; 'Price' is the regression target
pipeline = ModelPipeline(target_column='Price')

# Separate features and target variable
X, y = pipeline.separate_features_target(data)

# Identify numerical and categorical features
numerical_features, categorical_features = pipeline.identify_features(X)

# Create preprocessor
pipeline.create_preprocessor(numerical_features, categorical_features)

# Define the regression model. The forest is seeded so that re-running the
# notebook reproduces the same fitted model and therefore the same metrics.
regressor = RandomForestRegressor(random_state=42)

# Create pipeline with preprocessor and regressor
pipeline.create_pipeline(regressor)

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the pipeline; `model` is the fitted scikit-learn Pipeline
model = pipeline.train_pipeline(X_train, y_train)

# Initialize ModelEvaluator object with the trained pipeline for regression
evaluator_regression = ModelEvaluator(model)

# Evaluate the regression model on the held-out split
mae_regression, mse_regression, r_squared_regression = evaluator_regression.evaluate_regression_model(X_test, y_test)

# Print the evaluation metrics for regression
print("Mean Absolute Error (Regression):", mae_regression)
print("Mean Squared Error (Regression):", mse_regression)
print("R-squared (Regression):", r_squared_regression)

Mean Absolute Error (Regression): 752.0131944444444
Mean Squared Error (Regression): 975027.9077916667
R-squared (Regression): 0.9269247292141694


In [None]:

# Load your dataset
data = pd.read_csv('./processed_data.csv')

# Initialize ModelPipeline object
pipeline = ModelPipeline(target_column='Price')

# Separate features and target variable
X, y = pipeline.separate_features_target(data)

# Identify numerical and categorical features
numerical_features, categorical_features = pipeline.identify_features(X)

# Create preprocessor
pipeline.create_preprocessor(numerical_features, categorical_features)

# Define the regression model. The forest is seeded so that the logged metrics
# can be reproduced by re-running the cell.
regressor = RandomForestRegressor(random_state=42)

# Create pipeline with preprocessor and regressor
pipeline.create_pipeline(regressor)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the pipeline
pipeline.train_pipeline(X_train, y_train)

# Evaluate the regression model. The evaluator needs the fitted scikit-learn
# pipeline (`pipeline.pipeline`), not the ModelPipeline wrapper, because it
# calls `.predict` on whatever it is given.
evaluator = ModelEvaluator(pipeline.pipeline)
mae_regression, mse_regression, r_squared_regression = evaluator.evaluate_regression_model(X_test, y_test)

# Initialize MLflowTracker with the tracking URI
tracker = MLflowTracker(tracking_uri="your_mlflow_tracking_uri")

# Start the MLflow run (exactly once: a second start_run while a run is active fails)
tracker.start_run(experiment_name="your_experiment_name", run_name="your_run_name")

# Evaluation metrics to log, keyed by metric name
evaluation_metrics = {
    "mae": mae_regression,
    "mse": mse_regression,
    "r_squared": r_squared_regression
}

try:
    # Log the trained model
    tracker.log_model(pipeline.pipeline, "random_forest_model")

    # Log evaluation metrics dynamically
    tracker.log_metrics(evaluation_metrics)
finally:
    # End MLflow run even if logging failed, so no run is left open in the kernel
    tracker.end_run()

# Print the evaluation metrics for regression
print("Mean Absolute Error (Regression):", mae_regression)
print("Mean Squared Error (Regression):", mse_regression)
print("R-squared (Regression):", r_squared_regression)