In [0]:
pip install mlflow

In [0]:
pip install --upgrade typing-extensions

In [0]:
# Restart the notebook's Python process (Databricks utility) — presumably so the
# packages installed in the previous cells are the ones that get imported.
dbutils.library.restartPython()

In [0]:
import os

# Directory that receives the EDA outputs (summary CSV and plot PNGs) written by
# later cells. Paths are built as f"{artifact_path}<file>", so the trailing slash
# is required.
artifact_path = '/Workspace/Users/arorapuneet1998@gmail.com/MLFlow/Artfacts/'

# Create the directory up front: to_csv / savefig do not create missing parent
# directories and would otherwise fail on a fresh workspace.
os.makedirs(artifact_path, exist_ok=True)

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [0]:
import mlflow as ml
import mlflow.sklearn
from mlflow.models import infer_signature

In [0]:
# Print the version of the MLflow command-line tool available to the notebook shell.
!mlflow --version

In [0]:
# Select the MLflow experiment that all subsequent runs in this notebook are recorded under.
mlflow.set_experiment("/Workspace/Users/arorapuneet1998@gmail.com/MLFlow/Customer_Churn_Prediction_DMML_Assignment")

### Loading the Churn Dataset and making the file structure for storing the artifacts

In [0]:
import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings (e.g. from
# scikit-learn or seaborn) — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [0]:
# Load the churn training data from the Spark catalog into a pandas DataFrame.
# (An earlier variant read 'DMML_TRAINING_DATA_churn_data.csv' with pd.read_csv.)
# NOTE(review): .limit(100) keeps only 100 rows — confirm this sample size is
# intentional for training and evaluation.
try:
    df = spark.table('dmml.raw.churn_training_data').limit(100).toPandas()
except Exception as exc:
    # spark.table never raises FileNotFoundError, so the former handler could not
    # fire; re-raise with context instead of calling exit(), which is not a
    # clean way to stop a notebook kernel.
    raise RuntimeError(
        "Error: could not load table 'dmml.raw.churn_training_data'. "
        "Please ensure the table exists and is accessible."
    ) from exc

print("Dataset loaded successfully.")

## Data Exploration and Visualization - Performing initial analysis

#### Display the first 5 rows of the dataframe

In [0]:
# Preview the first five rows of the loaded data under a short heading.
print("\nInitial Data Head:", df.head(), sep="\n")

#### Data Information & schema

In [0]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

#### Descriptive Statistics: includes count, mean, std and the five-number summary for all numeric columns

In [0]:
# Descriptive statistics of the numeric columns, computed once and reused for
# both the on-screen report and the CSV that is kept as an artifact.
summary_stats = df.describe()
print(summary_stats)

# Persist the statistics next to the other EDA artifacts.
summary_path = artifact_path + "summary_statistics.csv"
summary_stats.to_csv(summary_path)

In [0]:
# Display the Python type of summary_stats (the object returned by df.describe()).
type(summary_stats)

#### Checking for Missing Values

In [0]:
# Number of missing entries in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [0]:
print('Check for Duplicate CustomerID')

# True when at least one CustomerID value occurs more than once.
has_duplicate_ids = df['CustomerID'].duplicated().any()

if not has_duplicate_ids:
    print('No Duplicate User id found')
else:
    print("\nWarning: Duplicate user_ids found. Considering unique user_ids for analysis.")
    # Keep the first row of every CustomerID; df itself is modified in place.
    df.drop_duplicates(subset=['CustomerID'], inplace=True)
    print(f"Removed duplicates. New shape: {df.shape}")

In [0]:

# 1. Bar chart for the distribution of the target variable (Churn).
# Churn holds the string labels 'No' / 'Yes'. Seaborn orders the bars of an
# object column by first appearance in the data, so the bar order is pinned here;
# otherwise the fixed tick labels below could end up under the wrong bar.
churn_order = ['No', 'Yes']

plt.figure(figsize=(7, 5))
sns.countplot(x='Churn', data=df, order=churn_order, palette='viridis')
plt.title('Distribution of Target Variable (will_churn)')
plt.xlabel('Will Churn')
plt.ylabel('Number of Customers')
# Position 0 is 'No' and position 1 is 'Yes' because of churn_order above.
plt.xticks(ticks=[0, 1], labels=['No Churn (FALSE)', 'Churn (TRUE)'])
plt.savefig(f"{artifact_path}churn_distribution.png")
plt.show()

In [0]:
# Class counts of the target column.
churn_counts = df['Churn'].value_counts()
total_customers = len(df)

print(f"\nDistribution of 'will_churn':\n{churn_counts}")

# Share of customers in each class, reported as percentages.
churn_share = churn_counts['Yes'] / total_customers
print(f"Churn percentage: {churn_share * 100:.2f}%")

no_churn_share = churn_counts['No'] / total_customers
print(f"\nNo Churn percentage: {no_churn_share * 100:.2f}%")

# The data counts as balanced when the churn share lies within 40%-60%.
is_balanced = 0.4 <= churn_share <= 0.6
if is_balanced:
    print("The dataset appears to be relatively balanced.")
else:
    print("The dataset appears to be imbalanced.")

#### Histograms for numerical features

In [0]:

# Numeric columns explored here and reused by the correlation cell further down.
numerical_features = ['Age', 'Tenure', 'MonthlyCharges', 'TotalCharges']

# One histogram (with KDE overlay) per feature, laid out side by side.
fig, axes = plt.subplots(1, len(numerical_features), figsize=(15, 5), squeeze=False)

for ax, feature in zip(axes[0], numerical_features):
    sns.histplot(df[feature], kde=True, bins=20, color='skyblue', ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')

fig.tight_layout()
fig.savefig(f"{artifact_path}numerical_feature_distributions.png")
plt.show()


#### Bar charts for categorical features 

In [0]:
# Every object-typed column except the CustomerID identifier is plotted as a
# categorical feature. Filtering (instead of list.remove) does not raise when
# CustomerID is absent from the object columns.
categorical_cols = [
    col for col in df.select_dtypes(include=['object']).columns
    if col != 'CustomerID'
]

# Size the grid from the number of columns instead of assuming exactly four:
# a fixed 2x2 grid raises IndexError as soon as a fifth object column exists.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(categorical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(14, 5 * n_grid_rows), squeeze=False
)
axes = axes.flatten()

# Plot each categorical variable
for ax, col in zip(axes, categorical_cols):
    sns.countplot(x=col, data=df, palette='pastel', ax=ax)
    ax.set_title(f'Distribution of {col}', fontsize=14)
    ax.set_xlabel(col)
    ax.set_ylabel('Number of Customers')
    ax.tick_params(axis='x', rotation=30)  # Rotate x-axis labels

# Hide grid cells that have no column to show (odd number of columns).
for unused_ax in axes[len(categorical_cols):]:
    unused_ax.set_visible(False)

# Adjust layout
plt.tight_layout()
plt.savefig(f"{artifact_path}categorical_feature_distributions.png")
plt.show()

#### Correlation heatmap for numerical features

In [0]:
# Numeric copy of the features used for the correlation heatmap.
# Cast to float rather than int: an int cast truncates the fractional part of
# the charge columns (distorting the correlations) and fails outright when a
# column contains missing values.
df_corr = df[numerical_features].astype(float)

In [0]:
# Pairwise correlations between the numeric features, drawn as an annotated heatmap.
corr_matrix = df_corr.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap of Numerical Features')
fig.savefig(f"{artifact_path}correlation_heatmap.png")
plt.show()

## Data Preprocessing

### In Preprocessing step we will do following operations:

* Define features (X) and target (y)
* Handle Categorical Data: Convert categorical features into numerical format using One-Hot Encoding.
* Feature Scaling: Apply StandardScaler to the numerical features.
* Divide data into a training set and a testing set.

#### Define features (X) and target (y)

In [0]:
print("Droping user_id as it iss an identifier and not a predictive feature")

# Target: the Churn label column.
y = df['Churn']

# Features: everything except the identifier and the target itself.
X = df.drop(columns=['CustomerID', 'Churn'])

#### Convert categorical features into numerical format using One-Hot Encoding

In [0]:
from sklearn.preprocessing import OneHotEncoder
import joblib




In [0]:
from sklearn.preprocessing import OneHotEncoder
import joblib

# Categorical columns that must be turned into numeric indicator columns.
categorical_cols = ['Gender', 'ContractType', 'PaymentMethod']

# Fit a reusable encoder on the categorical columns.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# later removed, so passing `sparse=True` raises TypeError on current versions.
# Sparse output is the default under both names, so the argument is omitted.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X[categorical_cols])

# Save the encoder
joblib.dump(encoder, 'encoder.pkl')

# NOTE(review): the training features below come from pd.get_dummies with
# drop_first=True, not from the encoder saved above, so the saved encoder emits
# a different column layout than the one the models are trained on — confirm
# which encoding is meant to be used at inference time.
# NOTE(review): this cell rebinds X, so it cannot be re-run without first
# re-running the cell that defines X.

# Apply One-Hot Encoding
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True) # drop_first avoids multicollinearity

print("\nFeatures after One-Hot Encoding and Boolean Conversion:")
print(X.head())
print(f"Shape after encoding: {X.shape}")


In [0]:
# Display the encoded feature frame as the cell output.
X

#### Scaling of numerical features

Explanation for Feature Scaling:

Feature scaling is crucial because many machine learning algorithms (like Logistic Regression, SVMs, Neural Networks, and even distance-based algorithms like K-Nearest Neighbors) are sensitive to the magnitude and range of input features.
If features have vastly different scales, the feature with a larger range might dominate the cost function or distance calculations, leading to suboptimal model performance.
StandardScaler transforms the data to have a mean of 0 and a standard deviation of 1. This ensures that all features contribute equally to the model, preventing features with larger numerical values from disproportionately influencing the model's learning process.
For tree-based models like Random Forest, scaling is less critical but can sometimes still offer minor benefits or consistency in pipelines.


In [0]:
# Continuous columns that get standardised; the one-hot indicator columns are
# already 0/1 and are left untouched.
# NOTE(review): the scaler is fitted on the full data set before the train/test
# split, so test-set statistics leak into the training features — consider
# fitting on the training portion only.
features_to_scale = ['Age', 'Tenure', 'MonthlyCharges', 'TotalCharges']

scaler = StandardScaler()
scaler.fit(X[features_to_scale])
X[features_to_scale] = scaler.transform(X[features_to_scale])

print("\nFeatures after Scaling (first 5 rows of scaled columns):")
scaled_preview = X[features_to_scale].head()
print(scaled_preview)

#### Data Split: into a training set and a testing set.

In [0]:
# Hold out 25% of the rows for testing; stratifying on y keeps the churn ratio
# the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

train_shapes = f"{X_train.shape}, {y_train.shape}"
test_shapes = f"{X_test.shape}, {y_test.shape}"
print(f"\nTraining set shape (X_train, y_train): {train_shapes}")
print(f"Testing set shape (X_test, y_test): {test_shapes}")

#### Logging the data and EDA into MLflow

In [0]:
# Displays the path where a raw-data CSV would live.
# NOTE(review): no visible cell writes raw_data.csv — confirm it exists before
# enabling the logging cell below.
f'{artifact_path}raw_data.csv'

In [0]:
# with mlflow.start_run(run_name="01_RawData_and_EDA"):
    
#     # Logging the raw data and summary statistics
#     mlflow.log_artifact(f"{artifact_path}raw_data.csv", artifact_path="data")
#     mlflow.log_artifact(f"{artifact_path}summary_statistics.csv", artifact_path="eda")

#     # Log all EDA visualizations
#     for file in os.listdir("artifacts"):
#         if file.endswith(".png"):
#             mlflow.log_artifact(os.path.join("artifacts", file), artifact_path="eda")

## MLFlow Individual Model Logging

#### Defining the individual models and the variables

In [0]:
# Candidate classifiers, keyed by the name used for metric and artifact labels.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=1000),
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
)

# Running record of the best model by recall; -1 is below any real recall, so
# the first evaluated model always becomes the initial best.
best_recall, best_model, best_model_name = -1, None, None

#### Training, testing and logging all individual models in a single MLflow run

In [0]:
from sklearn.metrics import auc, precision_recall_curve, roc_curve

# Binary (0/1) encoding of the 'No'/'Yes' labels, needed by the precision,
# recall, F1 and curve functions below.
label_to_int = {'No': 0, 'Yes': 1}
y_test_bin = y_test.map(label_to_int)

# One MLflow run collects the metrics and diagnostic plots of every candidate
# model; metric names are prefixed with the model name to keep them apart.
with mlflow.start_run(run_name="02_AllModels_Metrics"):
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_bin = pd.Series(y_pred).map(label_to_int).values

        # Probability of the positive class; look its column up by label rather
        # than relying on 'Yes' happening to be at position 1.
        positive_idx = list(model.classes_).index('Yes')
        y_proba = model.predict_proba(X_test)[:, positive_idx]

        # Metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test_bin, y_pred_bin)
        rec = recall_score(y_test_bin, y_pred_bin)
        f1 = f1_score(y_test_bin, y_pred_bin)

        # Log metrics per model
        mlflow.log_metric(f"{name}_accuracy", acc)
        mlflow.log_metric(f"{name}_precision", prec)
        mlflow.log_metric(f"{name}_recall", rec)
        mlflow.log_metric(f"{name}_f1", f1)

        # Confusion Matrix
        cm = confusion_matrix(y_test_bin, y_pred_bin)
        plt.figure()
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
        plt.title(f"Confusion Matrix - {name}")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        cm_file = f"conf_matrix_{name}.png"
        plt.savefig(cm_file)
        mlflow.log_artifact(cm_file, artifact_path="confusion_matrices")
        # Show, then close: six figures are created per run and would otherwise
        # all stay open in memory.
        plt.show()
        plt.close()

        # ROC Curve
        fpr, tpr, _ = roc_curve(y_test_bin, y_proba)
        roc_auc = auc(fpr, tpr)
        plt.figure()
        plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
        plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(f"ROC Curve - {name}")
        plt.legend()
        roc_file = f"roc_curve_{name}.png"
        plt.savefig(roc_file)
        mlflow.log_artifact(roc_file, artifact_path="roc_curves")
        plt.show()
        plt.close()

        # Precision-Recall Curve
        precision_vals, recall_vals, _ = precision_recall_curve(y_test_bin, y_proba)
        plt.figure()
        plt.plot(recall_vals, precision_vals, label=name)
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.title(f"Precision-Recall Curve - {name}")
        plt.legend()  # the curve carries a label, so draw the legend for it
        pr_file = f"pr_curve_{name}.png"
        plt.savefig(pr_file)
        mlflow.log_artifact(pr_file, artifact_path="precision_recall_curves")
        plt.show()
        plt.close()

        # Track best model (highest recall on the test set; the first model
        # wins on ties because the comparison is strict).
        if rec > best_recall:
            best_recall = rec
            best_model_name = name
            best_model = model

#### MLflow run for logging the best model

In [0]:
import pickle

# Stop early with a clear message when the comparison cell has not selected a
# model, instead of logging and registering None.
if best_model is None:
    raise RuntimeError(
        "No best model selected; run the model comparison cell before this one."
    )

with mlflow.start_run(run_name=f"03_BestModel_{best_model_name}"):
    mlflow.log_param("model_name", best_model_name)
    mlflow.log_metric("best_recall", best_recall)

    # Record the model's input/output schema with the logged model so the
    # expected columns and prediction type are documented alongside it.
    signature = infer_signature(X_train, best_model.predict(X_train))

    # Save and register best model
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="model",
        signature=signature,
        registered_model_name="BestChurnModel_by_Recall"
    )

    # Also save explicit pkl
    # NOTE: only unpickle this file from a trusted source; pickle.load can
    # execute arbitrary code.
    with open("best_model.pkl", "wb") as f:
        pickle.dump(best_model, f)
    mlflow.log_artifact("best_model.pkl", artifact_path="model")

#### Running the MLFlow UI

In [0]:
import mlflow
# Tracking URI that MLflow is currently configured with; the cell below passes
# it to the MLflow UI as its backend store.
tracking = mlflow.get_tracking_uri()
tracking

In [0]:
import subprocess

# Launch MLflow UI as a background process on port 5000, backed by the current
# tracking store. The handle is kept so the server can later be stopped with
# mlflow_ui_process.terminate() instead of a blanket pkill.
mlflow_ui_process = subprocess.Popen([
    "mlflow", "ui",
    "--port", "5000",
    "--backend-store-uri", tracking
])
mlflow_ui_process

In [0]:
# Stop every running process whose command line matches "mlflow ui".
!pkill -f "mlflow ui"

In [0]:
!mlflow ui --port 5000 --backend-store-uri "file:///c:/Users/Charu%20Anant%20Rajput/OneDrive/Desktop/BITS%20Study%20Mateials/BITS%20Sem2/DataManagementForML/Assignment/DMML_Assignment/MLFlow/mlruns"