In [33]:
import mlflow
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

In [34]:
mlflow.set_tracking_uri("http://localhost:5000")

df = pd.read_csv("../../data/raw/manual-delete-data.csv")
df = df.drop('customerID', axis=1)
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [35]:
def preprocess_data(df):
    categorical_columns = [
        'gender',
        'Partner',
        'Dependents',
        'PhoneService',
        'MultipleLines',
        'InternetService',
        'OnlineSecurity',
        'OnlineBackup',
        'DeviceProtection',
        'TechSupport',
        'StreamingTV',
        'StreamingMovies',
        'Contract',
        'PaperlessBilling',
        'PaymentMethod'
    ]

    for col in categorical_columns:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    numerical_columns = [
        'MonthlyCharges',
        'TotalCharges',
        'tenure'
    ]

    df[numerical_columns] = StandardScaler().fit_transform(df[numerical_columns])


    target_column = [
        'Churn'
    ]
    df['Churn'] = LabelEncoder().fit_transform(df['Churn'].astype(str))

In [36]:
def train_initial_model(X_train, y_train, X_test, y_test):
    """Train the initial logistic regression model and log it to MLflow."""
    with mlflow.start_run(run_name="Initial_Model_Training"):
        model = LogisticRegression(random_state=42, max_iter=1000)
        model.fit(X_train, y_train)

        # Log model parameters
        mlflow.log_param("model_type", "LogisticRegression")
        mlflow.log_param("max_iter", 1000)

        # Log model metrics
        accuracy = accuracy_score(y_test, model.predict(X_test))
        roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("roc_auc", roc_auc)

        # Log the model
        mlflow.sklearn.log_model(model, "model")

        print(f"Model trained with accuracy: {accuracy}, ROC AUC: {roc_auc}")

In [37]:
def calculate_psi(baseline, current, bins=10):
    """Calculate Population Stability Index (PSI) for a given feature."""
    baseline_bins = np.histogram(baseline, bins=bins)[0] / len(baseline)
    current_bins = np.histogram(current, bins=bins)[0] / len(current)

    psi = np.sum((baseline_bins - current_bins) * np.log(baseline_bins / current_bins))
    return psi

In [38]:
def monitor_drift_and_retrain(baseline, current, psi_threshold, X_train, y_train, X_test, y_test):
    """Monitor drift using PSI and retrain the model if necessary."""
    psi_value = calculate_psi(baseline, current)
    print(f"Calculated PSI: {psi_value}")

    # Log PSI to MLflow
    with mlflow.start_run(run_name="Drift_Monitoring"):
        mlflow.log_metric("PSI", psi_value)

        if psi_value > psi_threshold:
            print("Drift detected. Retraining the model...")

            # Retrain the model
            with mlflow.start_run(run_name="Model_Retraining"):
                model = LogisticRegression(random_state=42, max_iter=1000)
                model.fit(X_train, y_train)

                # Log retrained model parameters
                mlflow.log_param("model_type", "LogisticRegression")
                mlflow.log_param("max_iter", 1000)

                # Log retrained model metrics
                accuracy = accuracy_score(y_test, model.predict(X_test))
                roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_metric("roc_auc", roc_auc)

                # Log the retrained model
                mlflow.sklearn.log_model(model, "model", registered_model_name="CustomerChurnModel")

                print(f"Retrained model logged with accuracy: {accuracy}, ROC AUC: {roc_auc}")
        else:
            print("No significant drift detected. No retraining required.")


In [39]:
def analyze_correlation(data):
    """Analyze and print correlation between features."""
    correlation_matrix = data.corr(numeric_only=False)
    print("Feature Correlation Matrix:\n", correlation_matrix)
    return correlation_matrix


In [40]:
# # encode categorical column
# encoders = {}
# for column in categorical_columns:
#     le = LabelEncoder()
#     df[column] = le.fit_transform(df[column])
#     encoders[column] = le

# # encode target column
# le = LabelEncoder()
# df['Churn'] = le.fit_transform(df['Churn'])
# encoders['Churn'] = le


# df.info()
# df.head()

In [41]:
# corr_coef = df.corr(numeric_only=True).abs()
# threshold = 0.7

# dependent_var = []
# for index, row in corr_coef.iterrows():
#     for column, value in row.items():
#         if value > threshold and column > index:
#             dependent_var.append((index, column))

# print(f"Data berkorelasi tinggi:\n{dependent_var}")

In [45]:
df.copy(deep=True)
preprocess_data(df)

X = df.drop(columns=['Churn'])
y = df['Churn']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)

# Train initial model
train_initial_model(X_train, y_train, X_test, y_test)

# Monitor drift and retrain if necessary
monitor_drift_and_retrain(X_train['MonthlyCharges'], X_test['MonthlyCharges'], psi_threshold=0.1, 
                        X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)


# model_lr = LogisticRegression(max_iter=10000)
# model_lr.fit(X_train, y_train)
# y_pred = model_lr.predict(X_val)

# cm = confusion_matrix(y_val, y_pred)
# report = classification_report(y_val, y_pred)

# print(f"Report: \n{report}")
# print(f"Confusion Matrix: \n{cm}")



Model trained with accuracy: 0.8073916133617626, ROC AUC: 0.8594218724044599
🏃 View run Initial_Model_Training at: http://localhost:5000/#/experiments/0/runs/186134faef4c43c0b54235f8cfcf4017
🧪 View experiment at: http://localhost:5000/#/experiments/0
Calculated PSI: 0.008203056990974936
No significant drift detected. No retraining required.
🏃 View run Drift_Monitoring at: http://localhost:5000/#/experiments/0/runs/0937725f2ee043a88d15e4f2551b2e20
🧪 View experiment at: http://localhost:5000/#/experiments/0
