In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



In [5]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from xgboost import XGBClassifier

In [6]:
df = pd.read_csv("/content/WA_Fn-UseC_-Telco-Customer-Churn.csv")

print("Dataset Loaded Successfully!\n")
print(df.head())

Dataset Loaded Successfully!

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV Stream

In [7]:
df['TotalCharges'] = df['TotalCharges'].replace(" ", np.nan).astype(float)
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

In [8]:
df = df.drop('customerID', axis=1)

In [9]:
cat_cols = df.select_dtypes(include=['object']).columns

In [10]:
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])


In [11]:
X = df.drop("Churn", axis=1)
y = df["Churn"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Scale numerical values
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
rf = RandomForestClassifier(n_estimators=200, random_state=42)
gb = GradientBoostingClassifier()
ada = AdaBoostClassifier(n_estimators=150, random_state=42)
xgb = XGBClassifier(learning_rate=0.05, max_depth=5, n_estimators=200)

# Voting Classifier (Soft Voting)
voting = VotingClassifier(
    estimators=[
        ('rf', rf),
        ('gb', gb),
        ('ada', ada),
        ('xgb', xgb)
    ],
    voting='soft'
)


In [13]:
models = {
    "Random Forest": rf,
    "Gradient Boosting": gb,
    "AdaBoost": ada,
    "XGBoost": xgb,
    "Voting Ensemble": voting
}

print("\n----- Training Models -----\n")
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)



----- Training Models -----

Training Random Forest...
Training Gradient Boosting...
Training AdaBoost...
Training XGBoost...
Training Voting Ensemble...


In [14]:
print("\n----- Model Evaluation Results -----\n")
for name, model in models.items():
    print(f"ðŸ”¹ {name} Results:")
    y_pred = model.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("--------------------------------------------------\n")


----- Model Evaluation Results -----

ðŸ”¹ Random Forest Results:
Accuracy: 0.7984386089425124
Confusion Matrix:
 [[946  90]
 [194 179]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.91      0.87      1036
           1       0.67      0.48      0.56       373

    accuracy                           0.80      1409
   macro avg       0.75      0.70      0.71      1409
weighted avg       0.79      0.80      0.79      1409

--------------------------------------------------

ðŸ”¹ Gradient Boosting Results:
Accuracy: 0.8055358410220014
Confusion Matrix:
 [[941  95]
 [179 194]]
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.91      0.87      1036
           1       0.67      0.52      0.59       373

    accuracy                           0.81      1409
   macro avg       0.76      0.71      0.73      1409
weighted avg       0.80      0.81      0.80      1409

-----

In [15]:
def predict_new_customer(data_dict):
    # Convert dictionary â†’ DataFrame
    new_df = pd.DataFrame([data_dict])

    # Encode using same label encoder
    for col in new_df.columns:
        if col in cat_cols:
            new_df[col] = le.fit_transform(new_df[col])

    # Scale using earlier scaler
    new_scaled = scaler.transform(new_df)

    # Predict using the Voting Ensemble
    pred = voting.predict(new_scaled)[0]
    prob = voting.predict_proba(new_scaled)[0][1]

    print("Prediction:", "Churn" if pred == 1 else "Not Churn")
    print("Churn Probability:", prob)

In [17]:
print("\n----- New Customer Prediction Example -----\n")

sample_customer = {
    "gender": "Female",
    "SeniorCitizen": 0,
    "Partner": "Yes",
    "Dependents": "No",
    "tenure": 12,
    "PhoneService": "Yes",
    "MultipleLines": "No",
    "InternetService": "Fiber optic",
    "OnlineSecurity": "No",
    "OnlineBackup": "Yes",
    "DeviceProtection": "No",
    "TechSupport": "No",
    "StreamingTV": "Yes",
    "StreamingMovies": "Yes",
    "Contract": "Month-to-month",
    "PaperlessBilling": "Yes",
    "PaymentMethod": "Electronic check",
    "MonthlyCharges": 85.5,
    "TotalCharges": 1020.0
}
predict_new_customer(sample_customer)


----- New Customer Prediction Example -----

Prediction: Not Churn
Churn Probability: 0.4018222522273377
