In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Reading The Cleaned Dataset

In [35]:
df = pd.read_csv("data/boston_house_prices.csv")
dataset=df[df.columns[:-1]]
dataset['Price'] = df[df.columns[-1]]

In [36]:
## Independent and Dependent features
X=dataset.iloc[:,:-1]
y=dataset.iloc[:,-1]

In [37]:
##Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [38]:
## Standardize the dataset
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)
import pickle
pickle.dump(scaler,open('scaling.pkl','wb'))

## Model Training with MLFlow Logging

In [39]:

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import mlflow

Linear Regression

In [40]:
from sklearn.linear_model import LinearRegression
with mlflow.start_run(run_name="simple_linear_regression"):
    regression=LinearRegression()
    regression.fit(X_train,y_train)
    ## on which parameters the model has been trained
    regression.get_params()
    
    ### Prediction With Test Data
    reg_pred=regression.predict(X_test)
    
    ## Residuals
    residuals=y_test-reg_pred

    mae = mean_absolute_error(y_test,reg_pred)
    mse = mean_squared_error(y_test,reg_pred)
    rmse = np.sqrt(mean_squared_error(y_test,reg_pred))

    mlflow.log_metric("mae", mae)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)

    mlflow.sklearn.log_model(regression, "simple_linear_regression_model")




Neural Network

In [41]:
import mlflow
import mlflow.keras
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import MeanAbsoluteError
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Define the neural network model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1)  # Assuming a regression problem with a single output
])

# Compile the model
model.compile(optimizer=Adam(), loss=MeanSquaredError(), metrics=[MeanAbsoluteError()])

# Start an MLflow run
with mlflow.start_run(run_name="simple_neural_network"):
    # Train the model
    history = model.fit(X_train, y_train, epochs=50, validation_split=0.2, verbose=1)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    # Log metrics
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("mae", mae)
    
    # Log the model
    mlflow.keras.log_model(model, "simple_neural_network_model")
    
    print(f"Model logged with MSE: {mse} and MAE: {mae}")


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - loss: 578.2606 - mean_absolute_error: 22.4383 - val_loss: 582.5418 - val_mean_absolute_error: 22.6317
Epoch 2/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 587.1130 - mean_absolute_error: 22.1672 - val_loss: 546.2503 - val_mean_absolute_error: 21.8230
Epoch 3/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 531.3760 - mean_absolute_error: 21.1355 - val_loss: 502.9244 - val_mean_absolute_error: 20.8293
Epoch 4/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 493.9694 - mean_absolute_error: 20.0763 - val_loss: 448.6924 - val_mean_absolute_error: 19.5363
Epoch 5/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 472.7986 - mean_absolute_error: 19.1763 - val_loss: 381.7090 - val_mean_absolute_error: 17.8324
Epoch 6/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/ste



Model logged with MSE: 14.017704827660953 and MAE: 2.3253118253734013


XGBoost

In [46]:
import xgboost as xgb

# Start an MLflow run
with mlflow.start_run(run_name="xgboost_regression"):
    
    # Define the model
    model = xgb.XGBRegressor(
        objective='reg:squarederror',  # For regression tasks
        n_estimators=100,              # Number of boosting rounds
        learning_rate=0.1,             # Step size shrinkage
        max_depth=5                    # Maximum depth of trees
    )

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # Log metrics
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("mae", mae)

    # Log the model
    mlflow.xgboost.log_model(model, "xgboost_model")

    print(f"Mean Squared Error: {mse}")
    print(f"Mean Absolute Error: {mae}")



Mean Squared Error: 8.914547661950342
Mean Absolute Error: 1.8985972535120297


## Hyperparameter Tuning with GridSearchCV

In [42]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
import json

Linear Regression with Hyperparameter Tuning and MLFlow Logging

In [43]:
# Define the model
model = Ridge()

# Define the parameter grid
param_grid = {
    'alpha': [0.1, 1.0, 10.0, 100.0],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

# Save the parameter grid to a JSON file
param_grid_filename = 'regression_param_grid.json'
with open(param_grid_filename, 'w') as f:
    json.dump(param_grid, f)

# Set up GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

# Start an MLflow run
with mlflow.start_run(run_name="ridge_regression_grid_search"):
    # Perform grid search
    grid_search.fit(X_train, y_train)
    
    # Best model from grid search
    best_model = grid_search.best_estimator_
    
    # Make predictions
    y_pred = best_model.predict(X_test)
    
    # Calculate metrics
    mae = mean_absolute_error(y_test,y_pred)
    mse = mean_squared_error(y_test,y_pred)
    rmse = np.sqrt(mean_squared_error(y_test,y_pred))

    score=r2_score(y_test,reg_pred)
    print(score)

    # Log parameters
    mlflow.log_params(grid_search.best_params_)
    
    # Log metrics
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("mae", mae)

    mlflow.log_artifact(param_grid_filename)

    # Log each combination of hyperparameters
    for i in range(len(grid_search.cv_results_['params'])):
        with mlflow.start_run(run_name=f"grid_search_{i}", nested=True):
            params = grid_search.cv_results_['params'][i]
            mean_test_score = -grid_search.cv_results_['mean_test_score'][i]
            std_test_score = grid_search.cv_results_['std_test_score'][i]
            
            mlflow.log_params(params)
            mlflow.log_metric("mse", mean_test_score)
            mlflow.log_metric("mae", std_test_score)

    
    # Log the model
    mlflow.sklearn.log_model(best_model, "ridge_regression_model")
    
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Model logged with MSE: {mse}")



0.695453788333041




Best parameters: {'alpha': 1.0, 'solver': 'auto'}
Model logged with MSE: 22.986067479524895


XGBoost Hyper Parameter Tuning

In [49]:
import xgboost as xgb
import mlflow
import mlflow.xgboost
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Generate or load data
X = np.random.rand(100, 10)
y = np.random.rand(100)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Define the model
model = xgb.XGBRegressor(objective='reg:squarederror')

# Define GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)

# Start an MLflow run
with mlflow.start_run(run_name="xgboost_grid_search"):
    
    # Fit GridSearchCV
    grid_search.fit(X_train, y_train)
    
    # Get the best model
    best_model = grid_search.best_estimator_
    
    # Make predictions
    y_pred = best_model.predict(X_test)
    
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    # Log the best parameters and metrics for this trial
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("mae", mae)

    # Log the best model
    mlflow.xgboost.log_model(best_model, "best_xgboost_model")

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Mean Squared Error: {mse}")
    print(f"Mean Absolute Error: {mae}")

    # Log each trial
    for i, (params, mean_score, _) in enumerate(zip(grid_search.cv_results_['params'], grid_search.cv_results_['mean_test_score'], grid_search.cv_results_['std_test_score'])):
        # Start a new MLflow run for each trial
        with mlflow.start_run(run_name=f"trial_{i}", nested=True):
            mlflow.log_params(params)
            mlflow.log_metric("mse", mean_test_score)
            mlflow.log_metric("mae", std_test_score)


Fitting 5 folds for each of 27 candidates, totalling 135 fits




Best parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}
Mean Squared Error: 0.07660191959996616
Mean Absolute Error: 0.24000931280543059


## Pickling The Best Model file For Deployment

In [44]:
import pickle
pickle.dump(best_model,open('best_reg_model.pkl','wb'))