In [1]:
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [2]:
data = pd.read_csv("data/cleaned_customer.csv",index_col=0)
data.head(3)

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,...,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus
0,13,460,3,4,3,1,5,3,5,5,...,5,5,25,18.0,0,1,0,1,0,1
1,25,235,3,2,3,3,1,3,1,1,...,4,1,1,6.0,0,1,1,0,0,0
2,26,1142,2,2,2,2,5,5,5,5,...,4,5,0,0.0,1,0,0,0,0,0


In [3]:
if data.isnull().sum().any():
    # Impute missing values or drop them
    data.fillna(data.mean(), inplace=True)


In [4]:
data = data.astype({col: 'float64' for col in data.select_dtypes('int64').columns})


In [5]:
X = data.drop("satisfaction", axis=1)  
y = data["satisfaction"]


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [8]:
mlflow.set_experiment("Customer_Satisfaction_Models")


<Experiment: artifact_location='file:///Users/priyankamalavade/Desktop/mlflow_miniproject/mlruns/194862450230422006', creation_time=1739026038958, experiment_id='194862450230422006', last_update_time=1739026038958, lifecycle_stage='active', name='Customer_Satisfaction_Models', tags={}>

In [9]:
# Log confusion matrix as a visual to MLflow
def log_confusion_matrix(y_true, y_pred, model_name):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=y_true.unique(), yticklabels=y_true.unique())
    plt.title(f"Confusion Matrix for {model_name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.savefig(f"cm_{model_name}.png")
    mlflow.log_artifact(f"cm_{model_name}.png")
    plt.close()


In [10]:
# Function to train and log models
def train_and_log_model(model, model_name):
    """Train the model, log parameters, metrics, and visuals in MLflow."""
    with mlflow.start_run():
        # Train the model
        model.fit(X_train_scaled, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test_scaled)
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)
        
        # Get class labels from y_test
        class_labels = list(y_test.unique())
        
        # Log parameters and metrics
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("f1_score", f1)
        
        # Log classification report details for each class (dynamically access classes)
        for label in class_labels:
            mlflow.log_metric(f"precision_class_{label}", report[str(label)]['precision'])
            mlflow.log_metric(f"recall_class_{label}", report[str(label)]['recall'])

        # Log confusion matrix
        log_confusion_matrix(y_test, y_pred, model_name)
        
        # Log the model with signature
        signature = infer_signature(X_train_scaled, y_train)
        mlflow.sklearn.log_model(model, model_name, signature=signature)

        print(f"{model_name} - Accuracy: {accuracy}, F1-Score: {f1}")


In [11]:
# Train and Log Five Models
# Logistic Regression
log_reg = LogisticRegression()
train_and_log_model(log_reg, "Logistic Regression")

# Random Forest Classifier
random_forest = RandomForestClassifier()
train_and_log_model(random_forest, "Random Forest")

# Gradient Boosting Classifier
grad_boost = GradientBoostingClassifier()
train_and_log_model(grad_boost, "Gradient Boosting")

# Support Vector Machine
svc = SVC()
train_and_log_model(svc, "Support Vector Machine")

# K-Nearest Neighbors Classifier
knn = KNeighborsClassifier()
train_and_log_model(knn, "K-Nearest Neighbors")
 

Logistic Regression - Accuracy: 0.8765218228189211, F1-Score: 0.8556480648064807
Random Forest - Accuracy: 0.9605408786872625, F1-Score: 0.9539946140035906
Gradient Boosting - Accuracy: 0.9407631971512439, F1-Score: 0.9310402778555823
Support Vector Machine - Accuracy: 0.9538039555363073, F1-Score: 0.9462726662189389
K-Nearest Neighbors - Accuracy: 0.9286367354795245, F1-Score: 0.9150386708679461


## Hyperparameter Tuning For Random Forest

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Define the model
random_forest = RandomForestClassifier(random_state=42)


In [14]:
# Set up the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [15]:
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2, scoring='accuracy')


In [16]:
# Start MLflow run
with mlflow.start_run():

    # Fit the grid search to the data
    grid_search.fit(X_train, y_train)
    
    # Get the best hyperparameters
    best_params = grid_search.best_params_
    print("Best Hyperparameters: ", best_params)
    
    # Best model
    best_rf = grid_search.best_estimator_

    # Make predictions with the best model
    y_pred_best_rf = best_rf.predict(X_test)

    # Calculate metrics
    accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)
    f1_best_rf = f1_score(y_test, y_pred_best_rf)

    print(f"Best Random Forest - Accuracy: {accuracy_best_rf}, F1-Score: {f1_best_rf}")

    # Log parameters, metrics, and model to MLflow
    mlflow.log_params(best_params)
    mlflow.log_metric("accuracy", accuracy_best_rf)
    mlflow.log_metric("f1_score", f1_best_rf)

    # Log the Random Forest model
    mlflow.sklearn.log_model(best_rf, "random_forest_model")


Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  12.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  12.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  12.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  22.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  21.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  21.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  36.4s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  15.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  39.9s
[CV] END max_depth=10, min_sa



Best Random Forest - Accuracy: 0.9626100765121987, F1-Score: 0.9563752737072596