# Random Forest — Standalone Notebook (Excel)

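This notebook trains **one model** (a Random Forest classifier) on your Excel dataset; if the file cannot be read as an Excel workbook, it is loaded as CSV instead. Set the CONFIG values in the first code cell, then choose Run All.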

In [1]:
# ---- CONFIG: EDIT THESE ----
# Path of the dataset file (e.g., data/Heart_Disease.xlsx). The loader below
# tries Excel first and then CSV, so either format may be given here.
# NOTE(review): this is an absolute, user-specific path; prefer a path
# relative to the project before sharing the notebook.
DATASET_XLSX_PATH = '/Users/srilalitha_gunturu@optum.com/Downloads/Customer Churn.csv'
# Name of the column to predict (e.g., 'target').
TARGET_COLUMN = 'Churn'
# Seed shared by the train/test split and the classifier.
RANDOM_STATE = 42
# Fraction of rows held out for evaluation.
TEST_SIZE = 0.2
# ----------------------------

import pandas as pd
import numpy as np
import sklearn

# Excel loader with CSV fallback.
# The file is first read as an Excel workbook. If that fails for any reason,
# it is re-read as CSV under a short list of encodings. Each failed attempt is
# recorded so the final error reports why nothing could be loaded instead of
# discarding the underlying exceptions.
_df = None
_load_errors = []  # one "loader: ErrorType: message" string per failed attempt
try:
    _df = pd.read_excel(DATASET_XLSX_PATH, engine='openpyxl')
    print(f"[INFO] Loaded Excel file: {DATASET_XLSX_PATH}")
except Exception as _e:
    _load_errors.append(f"excel/openpyxl: {type(_e).__name__}: {_e}")
    print("[WARN] Excel load failed, trying CSV fallback...")
    # 'cp1252' must come before 'latin-1': latin-1 maps every possible byte,
    # so it never raises a decode error and any encoding listed after it
    # would never be tried for that reason.
    for _enc in ['utf-8', 'utf-8-sig', 'cp1252', 'latin-1']:
        try:
            _df = pd.read_csv(DATASET_XLSX_PATH, encoding=_enc)
            print(f"[INFO] Loaded CSV fallback with encoding={_enc}")
            break
        except Exception as _csv_err:
            # Best effort: remember the failure and move on to the next encoding.
            _load_errors.append(f"csv/{_enc}: {type(_csv_err).__name__}: {_csv_err}")
    if _df is None:
        # Raised inside the handler, so the Excel error is chained as context.
        raise ValueError(
            "Could not load dataset. Check path/format. Attempts: "
            + "; ".join(_load_errors)
        )

df = _df
# Fail early, with the available column names, if the target is misspelled.
assert TARGET_COLUMN in df.columns, f"Target '{TARGET_COLUMN}' not in columns: {df.columns.tolist()}"

from sklearn.preprocessing import LabelEncoder
# Encode the target as integer class ids; `le` is kept so the ids can be
# mapped back to the original labels later.
le = LabelEncoder()
y = le.fit_transform(df[TARGET_COLUMN])
X = df.drop(columns=[TARGET_COLUMN])

# Feature columns are routed by dtype: object/category columns are one-hot
# encoded, numeric columns are passed through unchanged.
categorical_cols = X.select_dtypes(include=['object','category']).columns.tolist()
numeric_cols     = X.select_dtypes(include=['number']).columns.tolist()

# Columns of any other dtype (e.g. bool, datetime) are in neither list, and the
# ColumnTransformers built below use the default remainder='drop', so they
# would be discarded silently. Report them so the omission is visible.
_unhandled_cols = [
    _col for _col in X.columns
    if _col not in categorical_cols and _col not in numeric_cols
]
if _unhandled_cols:
    print(f"[WARN] Columns with unsupported dtypes will be dropped by preprocessing: {_unhandled_cols}")

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# OHE compatibility: the keyword that selects sparse vs. dense output is
# `sparse_output` for scikit-learn >= 1.2 and `sparse` for older releases.
_maj, _min = (int(_part) for _part in sklearn.__version__.split('.')[:2])
_sparse_flag = 'sparse_output' if (_maj, _min) >= (1, 2) else 'sparse'

# Keyword arguments for the two encoder variants; categories not seen during
# fit are ignored at transform time in both.
_ohe_sparse = {'handle_unknown': 'ignore', _sparse_flag: True}
_ohe_dense = {'handle_unknown': 'ignore', _sparse_flag: False}

# Sparse variant: one-hot encode categoricals, pass numerics through.
preprocessor_sparse = ColumnTransformer(
    [
        ('cat', OneHotEncoder(**_ohe_sparse), categorical_cols),
        ('num', 'passthrough', numeric_cols),
    ]
)

# Dense variant: same routing, but the encoder returns a dense array.
preprocessor_dense = ColumnTransformer(
    [
        ('cat', OneHotEncoder(**_ohe_dense), categorical_cols),
        ('num', 'passthrough', numeric_cols),
    ]
)

from sklearn.model_selection import train_test_split
# Hold out TEST_SIZE of the rows for evaluation. The split is seeded for
# reproducibility and stratified on the encoded target so both partitions
# keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE,
)


[WARN] Excel load failed, trying CSV fallback...
[INFO] Loaded CSV fallback with encoding=utf-8


In [2]:
import numpy as np
from sklearn.metrics import roc_auc_score

def safe_auc(y_true, y_proba):
    """
    Return the ROC AUC as a float, or None if it cannot be computed.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_proba : array-like or None
        Predicted scores. A 1-D array is used as the positive-class score;
        a 2-D array with two columns uses column 1; a 2-D array with any
        other number of columns is scored as multiclass one-vs-rest with
        weighted averaging.

    Returns
    -------
    float or None
        None when `y_proba` is None, when `y_true` holds fewer than two
        distinct classes, when `y_proba` has more than two dimensions, or
        when the computation raises. In the last case a RuntimeWarning with
        the underlying error is emitted so the failure is not silent.
    """
    import warnings  # local import: keeps this cell self-contained

    if y_proba is None:
        return None
    try:
        # AUC is undefined when only one class is present in the truth labels.
        if len(np.unique(y_true)) < 2:
            return None

        scores = np.asarray(y_proba)
        if scores.ndim == 1:
            # Already the positive-class probability
            return float(roc_auc_score(y_true, scores))
        if scores.ndim == 2:
            if scores.shape[1] == 2:
                # Probability of positive class
                return float(roc_auc_score(y_true, scores[:, 1]))
            # Multiclass OvR weighted
            return float(roc_auc_score(y_true, scores, multi_class='ovr', average='weighted'))
        return None
    except Exception as exc:
        # Best effort by design: still return None, but report the reason.
        warnings.warn(
            f"safe_auc: AUC could not be computed ({type(exc).__name__}: {exc})",
            RuntimeWarning,
            stacklevel=2,
        )
        return None

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
# Random forest with a fixed seed so repeated runs give the same model.
rf_model = RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=200)

# Preprocessing and classifier are chained so the encoder is fitted on the
# training rows only.
pipe = Pipeline(steps=[
    ('prep', preprocessor_sparse),
    ('clf', rf_model),
])

# fit() returns the fitted pipeline itself; `clf` is the name used downstream.
clf = pipe.fit(X_train, y_train)


In [4]:
# Evaluate
from sklearn.metrics import (accuracy_score, roc_auc_score, precision_score, recall_score,
                             f1_score, matthews_corrcoef, confusion_matrix, classification_report)

# Predictions and probabilities: run inference on the test set once.
# `pred`/`proba` and `y_pred`/`y_proba` are both kept as names for the same
# arrays so any later cell using either spelling still works.
y_pred = clf.predict(X_test)
pred = y_pred
y_proba = clf.predict_proba(X_test) if hasattr(clf, 'predict_proba') else None
proba = y_proba

# Weighted averages weight each class's score by its support.
# zero_division=0 makes a class with no predicted samples score 0.
acc = accuracy_score(y_test, y_pred)
prec= precision_score(y_test, y_pred, average='weighted', zero_division=0)
rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1  = f1_score(y_test, y_pred, average='weighted', zero_division=0)
mcc = matthews_corrcoef(y_test, y_pred)
cm  = confusion_matrix(y_test, y_pred)
rep = classification_report(y_test, y_pred, output_dict=False, zero_division=0)
# safe_auc returns None when AUC cannot be computed (e.g. no probabilities).
auc = safe_auc(y_test,y_proba)

print({'accuracy': round(acc,4), 'auc': None if auc is None else round(auc,4),
       'precision': round(prec,4), 'recall': round(rec,4), 'f1': round(f1,4), 'mcc': round(mcc,4)})
print('Confusion Matrix:', cm)
print('Classification Report:')
print(rep)


{'accuracy': 0.9651, 'auc': 0.9877, 'precision': 0.9645, 'recall': 0.9651, 'f1': 0.9645, 'mcc': 0.8648}
Confusion Matrix: [[524   7]
 [ 15  84]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       531
           1       0.92      0.85      0.88        99

    accuracy                           0.97       630
   macro avg       0.95      0.92      0.93       630
weighted avg       0.96      0.97      0.96       630

