In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [41]:
df = pd.read_csv("/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")
print(df.head())
print(df.info())

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [42]:
df.drop("customerID", axis=1, inplace=True)
for col in df.select_dtypes(include=['object']).columns:
    if df[col].nunique() <= 2:
        df[col] = LabelEncoder().fit_transform(df[col])
    else:
        df = pd.get_dummies(df, columns=[col])

In [43]:
X = df.drop("Churn", axis=1)
y = df["Churn"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Hyperparameter Tuning

# Logistic Regression
log_reg_params = {"C": [0.01, 0.1, 1, 10], "solver": ["liblinear", "saga"]}
log_reg = GridSearchCV(LogisticRegression(max_iter=2000, class_weight="balanced"),
                       log_reg_params, cv=3, scoring="f1", n_jobs=1)
log_reg.fit(X_train, y_train)
print("Best Logistic Regression Params:", log_reg.best_params_)

# Random Forest
rf_params = {"n_estimators": [100, 200, 300],
             "max_depth": [5, 10, None],
             "min_samples_split": [2, 5, 10]}
rf = GridSearchCV(RandomForestClassifier(random_state=42, class_weight="balanced"),
                  rf_params, cv=3, scoring="f1", n_jobs=1)
rf.fit(X_train, y_train)
print("Best Random Forest Params:", rf.best_params_)

# KNN
knn_params = {"n_neighbors": [3, 5, 7, 9], "weights": ["uniform", "distance"]}
knn = GridSearchCV(KNeighborsClassifier(),
                   knn_params, cv=3, scoring="f1", n_jobs=1)
knn.fit(X_train, y_train)
print("Best KNN Params:", knn.best_params_)

# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)

# XGBoost
xgb_params = {"n_estimators": [100, 200],
              "max_depth": [3, 5, 7],
              "learning_rate": [0.01, 0.1, 0.2],
              "subsample": [0.8, 1]}
xgb = GridSearchCV(XGBClassifier(eval_metric="logloss", random_state=42,
                                 scale_pos_weight=(len(y_train[y_train==0]) / len(y_train[y_train==1]))),
                   xgb_params, cv=3, scoring="f1", n_jobs=1)
xgb.fit(X_train, y_train)
print("Best XGBoost Params:", xgb.best_params_)

# Final Evaluation

final_models = {
    "Logistic Regression": log_reg.best_estimator_,
    "Random Forest": rf.best_estimator_,
    "KNN": knn.best_estimator_,
    "Naive Bayes": nb,
    "XGBoost": xgb.best_estimator_
}

results = []
for name, model in final_models.items():
    preds = model.predict(X_test)

    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds)
    rec = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)

    results.append([name, acc, prec, rec, f1])

    print(f"\n{name}")
    print(classification_report(y_test, preds, digits=4))

results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1"])
print("\nFinal Tuned Model Comparison:\n", results_df)

In [45]:
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1"])
print("\nFinal Comparison:\n", results_df)


Final Comparison:
                  Model  Accuracy  Precision    Recall        F1
0  Logistic Regression  0.754436   0.525830  0.762032  0.622271
1        Random Forest  0.782825   0.608280  0.510695  0.555233
2                  KNN  0.767211   0.563889  0.542781  0.553134
3          Naive Bayes  0.284599   0.264487  0.951872  0.413953
4              XGBoost  0.776437   0.591331  0.510695  0.548063
