In [None]:
"""
This project demonstrates how to use MLflow to track and version datasets, metrics, models, and parameters
while developing a machine learning experiment.

The task is customer churn prediction using classical machine learning models.

The data is a cleaned, preprocessed version of the Iranian Customer Churn Prediction dataset.

The data is already split into train, test and valid datasets.

The experimented models are: Random Forest, Support Vector Machine and XGBoost.

The model parameters that are set explicitly are: Random Forest (n_estimators) and Support Vector Machine (kernel).

The logged metrics are: Balanced Accuracy, Geometric Mean, Matthews Correlation Coefficient, Precision, Recall, F1 Score, and the Confusion Matrix counts (TN, FP, FN, TP).

The run name is made unique by appending a date and time stamp.

Model artifacts are logged as MLflow models.

Modular programming is used for the project to make it easier to maintain and debug.

Feel free to use this project as a template for your own machine learning experiments, and tune the parameters to your needs.

"""

'Try using MLflow Tracking to log metrics and parameters from a machine learning experiment.\n\nPackage up a simple machine learning model as an MLflow Project. \n\nUse the MLflow UI to compare multiple runs of an experiment\n\nTry deploying a machine learning model using the MLflow Model Registry and MLflow Deployments for LLMs. Observe how model governance and access control can be implemented.\n\nExplore using MLflow with a specific machine learning library like PyTorch or TensorFlow. See how logging model artifacts as MLflow models allows framework-agnostic deployment.  \n\n'

In [None]:
import pandas as pd
from sklearn.metrics import balanced_accuracy_score
from sklearn import metrics
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import matthews_corrcoef

import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import xgboost as xgb

import mlflow
from mlflow.models import infer_signature
from mlflow import log_metric

In [3]:
# Initialization: select the MLflow experiment that the runs in this notebook are recorded under.
# Uncomment the next line to log to a tracking server at that address instead of the local store.
# mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment('customer_churn_prediction')

2025/07/30 22:14:00 INFO mlflow.tracking.fluent: Experiment with name 'customer_churn_prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/PC/Desktop/current_focus/mlflow/mlflow_app/mlruns/430583840645811508', creation_time=1753902840892, experiment_id='430583840645811508', last_update_time=1753902840892, lifecycle_stage='active', name='customer_churn_prediction', tags={}>

In [4]:
#load the customer churn data
def load_data(data_dir='telecome_customer_churn'):
    """Read the pre-split churn CSV files into DataFrames.

    Args:
        data_dir: Directory that holds ``train.csv``, ``test.csv`` and
            ``valid.csv``. The default is the directory name that used to be
            hard-coded here, so a plain ``load_data()`` call is unchanged.

    Returns:
        tuple: ``(df_train, df_test, df_valid)`` as pandas DataFrames.
    """
    # One read per split; the tuple order matches the return order below.
    df_train, df_test, df_valid = (
        pd.read_csv(f'{data_dir}/{split}.csv')
        for split in ('train', 'test', 'valid')
    )

    return df_train, df_test, df_valid

In [5]:
def split_data(df, label=' Class'):
    """Separate a frame into its feature columns and an integer target.

    Args:
        df: DataFrame holding the feature columns plus the ``label`` column.
        label: Name of the target column (default ``' Class'``, with the
            leading space).

    Returns:
        tuple: ``(X, y)`` where ``X`` is ``df`` without the ``label`` column
        and ``y`` is that column cast to ``int``. ``df`` itself is not modified.
    """
    features = df.drop(columns=[label])
    target = df[label].astype('int')
    return features, target

In [6]:
def report_results(model, X_test, y_test, name, verbose=False):
    """Score ``model`` on one data split and log the metrics through MLflow.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features of the split to evaluate.
        y_test: True labels of the split. The confusion matrix is read as a
            2x2 table, so a binary problem is assumed.
        name: Prefix for every metric key (``'train'`` gives ``'train_F1'``).
        verbose: When true, also print the confusion matrix and the
            percentage-scaled scores.

    Returns:
        None. Every value is recorded with ``log_metric``. Bacc, F1, GM and
        MCC are logged as percentages rounded to two decimals; Recall and
        Precision are logged as plain fractions.
    """
    pred_y_test = model.predict(X_test)

    bacc = round(balanced_accuracy_score(y_test, pred_y_test) * 100, 2)
    f1 = round(metrics.f1_score(y_test, pred_y_test) * 100, 2)
    gm = round(geometric_mean_score(y_test, pred_y_test, average='weighted') * 100, 2)
    mcc = round(matthews_corrcoef(y_test, pred_y_test) * 100, 2)

    # Rows are true classes and columns are predicted classes.
    CM = metrics.confusion_matrix(y_test, pred_y_test)
    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]

    # Guard both ratios: a split with no actual positives (recall) or no
    # predicted positives (precision) would otherwise divide by zero.
    # 0.0 is reported in that case.
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    precision = TP / (TP + FP) if (TP + FP) else 0.0

    if verbose:
        print(name+'_Confusion Matrix', CM)
        print(name+'_Bacc',bacc)
        print(name+'_F1',f1)
        print(name+'_GM',gm)
        print(name+'_MCC',mcc)

    log_metric(name+'_TN', TN)
    log_metric(name+'_FN', FN)
    log_metric(name+'_TP', TP)
    log_metric(name+'_FP', FP)
    log_metric(name+'_Recall', recall)
    log_metric(name+'_Precision', precision)
    log_metric(name+'_Bacc',bacc)
    log_metric(name+'_F1',f1)
    log_metric(name+'_GM',gm)
    log_metric(name+'_MCC',mcc)

In [7]:
def make_classifier_RF(Xtrain,Ytrain,n_estimators,random_state=None):
    """Fit a random forest classifier on the training data.

    Args:
        Xtrain: Training features.
        Ytrain: Training labels.
        n_estimators: Number of trees in the forest.
        random_state: Optional seed for reproducible forests. The default
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # Keyword arguments make the hyper-parameters explicit at the call site.
    forest = RandomForestClassifier(n_estimators=n_estimators,
                                    random_state=random_state)
    forest.fit(Xtrain, Ytrain)
    return forest

In [8]:
def make_classifier_SVM(Xtrain,Ytrain, kernel='rbf'):
    """Fit a support vector classifier on the training data.

    Args:
        Xtrain: Training features.
        Ytrain: Training labels.
        kernel: Kernel name handed to ``svm.SVC`` (default ``'rbf'``).

    Returns:
        The fitted ``svm.SVC``, created with ``probability=True``.
    """
    classifier = svm.SVC(probability=True, kernel=kernel)
    classifier.fit(Xtrain, Ytrain)
    return classifier

In [9]:
def make_classifier_xgboost(Xtrain,Ytrain):
    """Fit an XGBoost classifier, built with no arguments, on the training data.

    Args:
        Xtrain: Training features.
        Ytrain: Training labels.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    booster = xgb.XGBClassifier()
    booster.fit(Xtrain, Ytrain)
    return booster

In [10]:
def get_data():
    """Load the three churn splits and build the modelling inputs.

    Returns:
        tuple: ``(data, df_train, df_test, df_valid)``. ``data`` is a dict
        with the keys ``'X_train'``, ``'y_train'``, ``'X_valid'`` and
        ``'y_valid'``, built from the train and validation frames. The test
        frame is returned as loaded, without a feature/target split.
    """
    df_train, df_test, df_valid = load_data()

    data = {}
    for split, frame in (('train', df_train), ('valid', df_valid)):
        data['X_' + split], data['y_' + split] = split_data(frame)

    return data, df_train, df_test, df_valid
# Load everything once at notebook level; later cells read `data` and the split frames.
data,df_train, df_test, df_valid = get_data()

In [11]:
def make_a_dataset(objective:str,df_in):
    """Record one data split as an MLflow dataset input in a run of its own.

    Args:
        objective: Which split ``df_in`` holds; one of ``'train'``,
            ``'test'`` or ``'valid'``. It is used in the run name, in the
            dataset source path and as the ``log_input`` context.
        df_in: DataFrame for that split; ``" Class"`` is registered as its
            target column.

    Returns:
        None.

    Raises:
        ValueError: If ``objective`` is not one of the accepted split names.
    """
    # Validate before opening a run, so a bad argument does not leave a
    # failed, empty run behind in the experiment.
    if objective not in ['test','valid','train']:
        raise ValueError('objective must be train, test or valid')

    run_stamp:str = datetime.datetime.now().strftime("%d/%b/%Y:%H_%M_%S")

    # The context manager ends the run on exit, so no explicit end_run() is needed.
    with mlflow.start_run():

        mlflow.set_tag("mlflow.runName", 'CCP_dataset/'+objective+'/'+run_stamp)

        dataset = mlflow.data.from_pandas(
            df=df_in,
            source='telecome_customer_churn/'+objective+'.csv', # Example source
            name="Iranian Customer Churn Dataset",
            targets=" Class"
        )
        mlflow.log_input(dataset=dataset, context=objective)

    return
# Register each split, in train/test/valid order, as its own MLflow dataset run.
for _objective, _frame in (('train', df_train), ('test', df_test), ('valid', df_valid)):
    make_a_dataset(_objective, _frame)


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


In [None]:
def make_a_run(run_name:str, model_name:str,
               model_params:dict, model_func:object,
               data:dict, infered_signature:object):
    """Train one model inside a fresh MLflow run, then log its metrics and the model.

    Args:
        run_name: Prefix of the MLflow run name; a timestamp is appended.
        model_name: Prefix of the logged model name; the same timestamp is
            appended after an underscore.
        model_params: Keyword arguments forwarded to ``model_func`` and also
            passed to ``log_model`` as ``params``.
        model_func: Callable invoked as
            ``model_func(X_train, y_train, **model_params)`` that returns a
            fitted model.
        data: Dict with the keys ``'X_train'``, ``'y_train'``, ``'X_valid'``
            and ``'y_valid'``.
        infered_signature: Model signature passed to ``log_model``.

    Returns:
        None.
    """
    run_stamp:str = datetime.datetime.now().strftime("%d_%b_%Y_%H_%M_%S")

    # start_run() used as a context manager closes the run when the block
    # exits (also on error), so end_run() is not called inside it.
    with mlflow.start_run():

        mlflow.set_tag("mlflow.runName", run_name+'_'+run_stamp)

        model = model_func(data['X_train'],data['y_train'],**model_params)

        # Metrics are logged per split under the 'train' and 'valid' prefixes.
        report_results(model, data['X_train'],data['y_train'], 'train')
        report_results(model, data['X_valid'],data['y_valid'], 'valid')

        mlflow.sklearn.log_model(
            sk_model=model,
            name=model_name+'_'+run_stamp,
            signature=infered_signature,
            input_example=data['X_train'][:1],
            params=model_params
            )

In [13]:
# Random Forest run with 200 trees.
make_a_run(
    run_name='RF_CCP',
    model_name='RF',
    model_func=make_classifier_RF,
    model_params={'n_estimators': 200},
    data=data,
    infered_signature=infer_signature(data['X_train'], data['y_train']),
)



In [14]:
# Support Vector Machine run with the RBF kernel.
# model_name has no trailing underscore: make_a_run already appends '_' plus the
# timestamp, so 'SVM_' would produce a logged model named 'SVM__<timestamp>'.
make_a_run(run_name='SVM_CCP',
           model_name='SVM',
           model_params={'kernel':'rbf'},
           model_func=make_classifier_SVM,
           data=data,
           infered_signature=infer_signature(data['X_train'],data['y_train'])
           )



In [15]:
# XGBoost run; no hyper-parameters are passed to the classifier factory.
make_a_run(
    run_name='XGB_CCP',
    model_name='XGB',
    model_func=make_classifier_xgboost,
    model_params={},
    data=data,
    infered_signature=infer_signature(data['X_train'], data['y_train']),
)

