<a href="https://colab.research.google.com/github/sharbt/telco_churn_predictor/blob/main/TelcoChurn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
DATAFILE = "telco_customer_churn.xlsx"
RANDOM_STATE = 42

In [4]:
p = Path.cwd() / DATAFILE
if not p.exists():
    raise FileNotFoundError(f"Place '{DATAFILE}' in the working directory: {Path.cwd()}")
df = pd.read_excel(p)
df.columns = [str(c).strip().replace(" ", "_").lower() for c in df.columns]
print("Loaded:", df.shape)

Loaded: (1000, 14)


In [5]:
if "churn" not in df.columns:
    raise KeyError("No 'churn' column found. Rename target to 'churn'.")
df = df[~df["churn"].isnull()].copy()
df["churn"] = df["churn"].astype(str).str.strip().str.lower().map({"yes":1,"no":0})
df = df[~df["churn"].isnull()].copy()
y = df["churn"].astype(int)


In [6]:
X = df.drop(columns=["churn"])
num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
# coerce numeric-like object columns
for c in list(cat_cols):
    coerced = pd.to_numeric(X[c], errors="coerce")
    if coerced.notnull().sum() / len(X) > 0.9:
        X[c] = coerced
        num_cols.append(c)
        cat_cols.remove(c)
print("Numeric cols:", len(num_cols), "Categorical cols:", len(cat_cols))

Numeric cols: 4 Categorical cols: 9


In [7]:
# ---------------- Cell 5: Simple preprocessing ----------------
# numeric fill & scale
if num_cols:
    X_num = X[num_cols].fillna(X[num_cols].median())
    scaler = StandardScaler()
    X_num = pd.DataFrame(scaler.fit_transform(X_num), columns=num_cols, index=X.index)
else:
    X_num = pd.DataFrame(index=X.index)
# categorical one-hot
if cat_cols:
    X_cat = pd.get_dummies(X[cat_cols].fillna("missing"), drop_first=True)
else:
    X_cat = pd.DataFrame(index=X.index)
# combine
X_proc = pd.concat([X_num, X_cat], axis=1)
if X_proc.shape[1] == 0:
    raise RuntimeError("No usable features after preprocessing.")
print("Processed features shape:", X_proc.shape)

Processed features shape: (1000, 1016)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_proc, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE)
print("Train/test:", X_train.shape, X_test.shape)


Train/test: (800, 1016) (200, 1016)


In [9]:
log = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
rf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
log.fit(X_train, y_train)
rf.fit(X_train, y_train)

In [10]:
y_log = log.predict(X_test)
y_rf = rf.predict(X_test)
def metrics(y_true, y_pred):
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0)
    }
m_log = metrics(y_test, y_log)
m_rf = metrics(y_test, y_rf)
print("Logistic:", m_log)
print("RandomForest:", m_rf)

Logistic: {'accuracy': 0.75, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
RandomForest: {'accuracy': 0.745, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}


In [11]:
best = rf if m_rf["f1"] >= m_log["f1"] else log
best_name = "random_forest" if best is rf else "logistic_regression"
out = Path.cwd() / f"best_model_{best_name}.joblib"
joblib.dump(best, out)
print("Saved best model to", out)

Saved best model to /content/best_model_random_forest.joblib
