In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier


# Read the customer table from the CSV file, then print the per-column
# dtype / non-null summary of the loaded frame.
df = pd.read_csv(filepath_or_buffer="customers_.csv")
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


(None,
    customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
 0  7590-VHVEG  Female              0     Yes         No       1           No   
 1  5575-GNVDE    Male              0      No         No      34          Yes   
 2  3668-QPYBK    Male              0      No         No       2          Yes   
 3  7795-CFOCW    Male              0      No         No      45           No   
 4  9237-HQITU  Female              0      No         No       2          Yes   
 
       MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
 0  No phone service             DSL             No  ...               No   
 1                No             DSL            Yes  ...              Yes   
 2                No             DSL            Yes  ...               No   
 3  No phone service             DSL            Yes  ...              Yes   
 4                No     Fiber optic             No  ...               No   
 
   TechSupport StreamingTV StreamingMovie

In [None]:
# Show the first five rows of the frame as the cell output.
df.head(n=5)

In [2]:


# Clean and encode the customer table in `df`.
# Every step is written so the cell can be re-run on an already-processed
# frame without raising or corrupting columns.

# customerID is removed before modelling (presumably a row identifier).
# errors="ignore" keeps a second run from raising KeyError once the column
# is already gone; rebinding `df` avoids in-place mutation.
df = df.drop(columns=["customerID"], errors="ignore")

# object to numeric: values that cannot be parsed become NaN and are then
# replaced with 0.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0)

# Encoding of the target: Yes -> 1, No -> 0. The identity entries (1 -> 1,
# 0 -> 0) make the mapping idempotent; without them a second run would map
# the already-encoded target to all-NaN.
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0, 1: 1, 0: 0})
assert df["Churn"].notna().all(), "Churn contains labels other than Yes/No"

# Label Encoding of every remaining object-typed column.
categorical_cols = df.select_dtypes(include=["object"]).columns

# Start a fresh encoder registry only when there is something to encode (or
# none exists yet), so re-running on an encoded frame keeps the fitted
# encoders instead of replacing them with an empty dict.
if len(categorical_cols) > 0 or "label_encoders" not in globals():
    label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # kept so the integer codes can be decoded later


((5634, 19), (1409, 19))

In [None]:

# Separate the target column from the features, then hold out 20% of the
# rows as a test set. The split is seeded and stratified on the target.
y = df["Churn"]
X = df.drop("Churn", axis="columns")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:

# Standardize the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:

# Display the (rows, columns) shape of the scaled train and test arrays.
tuple(split.shape for split in (X_train_scaled, X_test_scaled))


##### Training

In [5]:

def train_evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score its test-set predictions.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        X_train: Training features passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test: Features passed to ``model.predict``.
        y_test: True labels the predictions are compared against.

    Returns:
        A ``(metrics, y_pred)`` tuple. ``metrics`` is a dict with the keys
        "Accuracy", "Precision", "Recall" and "F1 Score"; ``y_pred`` is the
        output of ``model.predict(X_test)``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # (label, scoring function) pairs, in the order they appear in the result.
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    metrics = {label: scorer(y_test, y_pred) for label, scorer in scorers}
    return metrics, y_pred

# Baseline models, each seeded with 42 and scored on the held-out test split.

# Logistic regression: the only model trained on the standardized features.
log_model = LogisticRegression(random_state=42, max_iter=1000)
log_results, y_pred_log = train_evaluate_model(
    model=log_model,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
)

# Random forest: trained on the unscaled features.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_results, y_pred_rf = train_evaluate_model(
    model=rf_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

# XGBoost: trained on the unscaled features.
xgb_model = XGBClassifier(random_state=42, eval_metric="logloss")
xgb_results, y_pred_xgb = train_evaluate_model(
    model=xgb_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

# Show the three metric dicts as the cell output.
(log_results, rf_results, xgb_results)


({'Accuracy': 0.7984386089425124,
  'Precision': 0.640625,
  'Recall': 0.5481283422459893,
  'F1 Score': 0.590778097982709},
 {'Accuracy': 0.794889992902768,
  'Precision': 0.6430976430976431,
  'Recall': 0.5106951871657754,
  'F1 Score': 0.5692995529061102},
 {'Accuracy': 0.7835344215755855,
  'Precision': 0.610223642172524,
  'Recall': 0.5106951871657754,
  'F1 Score': 0.5560407569141194})

###### Hyperparameter tuning

In [12]:


# Exhaustive search over a 2 x 2 x 2 x 2 XGBoost grid (16 candidates),
# scored by 3-fold cross-validated accuracy on the training split.
xgb_param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[3, 5],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 1.0],
)

xgb_grid = GridSearchCV(
    estimator=XGBClassifier(eval_metric="logloss", random_state=42),
    param_grid=xgb_param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=1,
)
xgb_grid.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated score.
(xgb_grid.best_params_, xgb_grid.best_score_)


Fitting 3 folds for each of 16 candidates, totalling 48 fits


({'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8},
 0.8047568335108272)

In [13]:

# Randomized search over a small XGBoost space: 4 of the 8 possible
# combinations are sampled and scored by 3-fold cross-validated accuracy
# on the training split.
xgb_param_dist = {
    "n_estimators": [50, 80],
    "max_depth": [3, 4],
    "learning_rate": [0.05, 0.1],
    "subsample": [0.8],
}

# RandomizedSearchCV
xgb_random = RandomizedSearchCV(
    XGBClassifier(eval_metric="logloss", random_state=42),
    xgb_param_dist,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    n_iter=4,
    verbose=1,
    # Seed the candidate sampling too. Without it a different subset of
    # combinations is drawn on every run, so the reported best parameters
    # and score are not reproducible.
    random_state=42,
)

xgb_random.fit(X_train, y_train)

print("Best Parameters:", xgb_random.best_params_)
print("Best Score:", xgb_random.best_score_)


Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best Parameters: {'subsample': 0.8, 'n_estimators': 80, 'max_depth': 3, 'learning_rate': 0.1}
Best Score: 0.806354277600284
