In [7]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import xgboost as xgb
import pandas as pd
import shap
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report
from customer_churn_pridiction.config import PROCESSED_DATA_DIR

In [14]:
# Load the processed train/test splits and separate the features from the label.
# `target` is the name of the label column in both CSV files.
target = 'Churn'

train_data = pd.read_csv(PROCESSED_DATA_DIR / 'train.csv')
test_data = pd.read_csv(PROCESSED_DATA_DIR / 'test.csv')

# Features are every column except the label; the label is kept as its own Series.
X_train, Y_train = train_data.drop(columns=[target]), train_data[target]
X_test, Y_test = test_data.drop(columns=[target]), test_data[target]

In [17]:
# Restore the persisted preprocessing objects that the cells below call
# `.transform` on.
# NOTE(review): joblib.load unpickles arbitrary objects - only load artifacts
# that were produced by this project's own pipeline.
standard_scaler, one_hot_encoder = (
    joblib.load(PROCESSED_DATA_DIR / artifact_name)
    for artifact_name in ('standard_scaler.joblib', 'one_hot_encoder.joblib')
)

In [19]:

# One-hot encode the categorical (object-dtype) training columns and join them
# back onto the non-object columns.
#
# The raw features are re-derived from `train_data` instead of reading `X_train`,
# because this cell overwrites `X_train` on its last line: reading it here would
# make a second execution of the cell see no object columns at all.
X_train_raw = train_data.drop(columns=[target])
X_train_categorical = X_train_raw.select_dtypes(include=['object'])

X_train_encoded = one_hot_encoder.transform(X_train_categorical)
# OneHotEncoder may return a SciPy sparse matrix; pd.DataFrame needs a dense array.
if hasattr(X_train_encoded, 'toarray'):
    X_train_encoded = X_train_encoded.toarray()

X_train_encoded_df = pd.DataFrame(
    X_train_encoded,
    columns=one_hot_encoder.get_feature_names_out(X_train_categorical.columns.tolist()),
    # Reuse the source index so the column-wise concat below aligns row for row
    # even if the frame does not carry a default RangeIndex.
    index=X_train_raw.index,
)

X_train = pd.concat([X_train_raw.select_dtypes(exclude=['object']), X_train_encoded_df], axis=1)

In [20]:
# One-hot encode the categorical (object-dtype) test columns with the same
# encoder used for the training data, then join them onto the non-object columns.
#
# The raw features are re-derived from `test_data` instead of reading `X_test`,
# because this cell overwrites `X_test` on its last line: reading it here would
# make a second execution of the cell see no object columns at all.
X_test_raw = test_data.drop(columns=[target])
X_test_categorical = X_test_raw.select_dtypes(include=['object'])

X_test_encoded = one_hot_encoder.transform(X_test_categorical)
# OneHotEncoder may return a SciPy sparse matrix; pd.DataFrame needs a dense array.
if hasattr(X_test_encoded, 'toarray'):
    X_test_encoded = X_test_encoded.toarray()

X_test_encoded_df = pd.DataFrame(
    X_test_encoded,
    columns=one_hot_encoder.get_feature_names_out(X_test_categorical.columns.tolist()),
    # Reuse the source index so the column-wise concat below aligns row for row
    # even if the frame does not carry a default RangeIndex.
    index=X_test_raw.index,
)

X_test = pd.concat([X_test_raw.select_dtypes(exclude=['object']), X_test_encoded_df], axis=1)

In [25]:
# Train an XGBoost churn classifier and record its parameters, accuracy metrics,
# the model, the preprocessing artifacts and SHAP plots in one MLflow run.
#
# NOTE(review): the saved output of this cell shows `set_experiment` raising
# "Experiment ... already exists"; the cause is not visible from the notebook -
# check the state of the MLflow tracking store before re-running.
mlflow.set_experiment('customer_churn_prediction_xgboost_experiment')

# Single source of truth for the hyperparameters, so the values logged to MLflow
# cannot drift from the ones the model is actually built with.
model_params = {
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.1,
    'random_state': 0,
}


def _log_current_figure(filename):
    """Save the active matplotlib figure to `filename`, log it to MLflow, then display it."""
    plt.savefig(filename)
    mlflow.log_artifact(filename)
    # Show only after saving: showing first releases the figure and savefig
    # would then write an empty canvas.
    plt.show()


with mlflow.start_run():
    # Scale each split once and reuse the result for fitting, prediction and SHAP.
    X_train_scaled = standard_scaler.transform(X_train)
    X_test_scaled = standard_scaler.transform(X_test)

    model = xgb.XGBClassifier(**model_params)
    model.fit(X_train_scaled, Y_train)

    mlflow.log_param('model', 'XGBoost')
    for param_name, param_value in model_params.items():
        mlflow.log_param(param_name, param_value)

    Y_train_pred = model.predict(X_train_scaled)
    Y_test_pred = model.predict(X_test_scaled)

    train_accuracy = accuracy_score(Y_train, Y_train_pred)
    test_accuracy = accuracy_score(Y_test, Y_test_pred)

    mlflow.log_metric('train_accuracy', train_accuracy)
    mlflow.log_metric('test_accuracy', test_accuracy)

    mlflow.sklearn.log_model(model, 'model')

    # Store the preprocessing objects next to the model so the run is self-contained.
    mlflow.log_artifact(PROCESSED_DATA_DIR / 'standard_scaler.joblib')
    mlflow.log_artifact(PROCESSED_DATA_DIR / 'one_hot_encoder.joblib')

    print(f'Train accuracy: {train_accuracy}')
    print(f'Test accuracy: {test_accuracy}')

    print('Classification report for test data')
    print(classification_report(Y_test, Y_test_pred))

    # SHAP values are computed on the scaled inputs the model was trained on;
    # the unscaled X_test is passed to the plots for feature names and values.
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test_scaled)

    # show=False keeps each figure open so it can be written to disk, and every
    # plot gets its own filename so no artifact overwrites another in the run.
    shap.summary_plot(shap_values, X_test, plot_type='bar', show=False)
    _log_current_figure('shap_summary_bar_plot.png')

    shap.summary_plot(shap_values, X_test, show=False)
    _log_current_figure('shap_summary_plot.png')

    for feature_name in ('Contract_Month-to-month', 'Tenure'):
        shap.dependence_plot(feature_name, shap_values, X_test, show=False)
        _log_current_figure(f'shap_dependence_plot_{feature_name}.png')
    

MlflowException: Experiment 'customer_churn_prediction_xgboost_experiment' already exists.