In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv("tel_churn.csv")
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group
0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0,1 - 12
1,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0,25 - 36
2,Male,0,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1,1 - 12
3,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0,37 - 48
4,Female,0,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1,1 - 12


In [3]:
x = data.drop('Churn',axis=1)
x.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,tenure_group
0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,1 - 12
1,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,25 - 36
2,Male,0,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1 - 12
3,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,37 - 48
4,Female,0,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1 - 12


In [4]:
y = data["Churn"]
y.head()

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64

In [5]:
object_columns = [col for col in data.columns if data[col].dtype == 'O']

print(object_columns)

['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'tenure_group']


# With Imbalanced Data

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [7]:
categorical_columns = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 
    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'tenure_group'
]



column_transformer = ColumnTransformer(
    transformers=[('ohe', OneHotEncoder(drop='first', sparse_output=False), categorical_columns)],
    remainder='passthrough'  
)

models = {
    'Logistic Regression': (LogisticRegression(max_iter=500), {
        'classifier__C': [0.1, 1, 10],
        'classifier__solver': ['lbfgs', 'liblinear']
    }),
    'Decision Tree': (DecisionTreeClassifier(), {
        'classifier__max_depth': [5, 10, 20, None],
        'classifier__criterion': ['gini', 'entropy']
    }),
    'Random Forest': (RandomForestClassifier(), {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [5, 10, 20, None]
    }),
    'KNN': (KNeighborsClassifier(), {
        'classifier__n_neighbors': [3, 5, 7, 9],
        'classifier__weights': ['uniform', 'distance']
    })
}

X = data.drop(columns=['Churn'])  
y = data['Churn']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


for model_name, (model, param_grid) in models.items():
    pipe = Pipeline([
        ('column_transformer', column_transformer),
        ('classifier', model)  
    ])
    
   
    grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
   
    print(f"\n{model_name} Best Parameters: {grid_search.best_params_}")
    print(f"{model_name} Best Accuracy: {grid_search.best_score_:.4f}")


Logistic Regression Best Parameters: {'classifier__C': 0.1, 'classifier__solver': 'lbfgs'}
Logistic Regression Best Accuracy: 0.8055

Decision Tree Best Parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 5}
Decision Tree Best Accuracy: 0.7868

Random Forest Best Parameters: {'classifier__max_depth': 10, 'classifier__n_estimators': 100}
Random Forest Best Accuracy: 0.8027

KNN Best Parameters: {'classifier__n_neighbors': 9, 'classifier__weights': 'uniform'}
KNN Best Accuracy: 0.7790


# Apply SMOTE To Reduce Imbalancing

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.combine import SMOTEENN  
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd


print("Columns in dataset:", data.columns)

numeric_features = ['MonthlyCharges', 'TotalCharges']
categorical_columns = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 
    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'tenure_group'
]



X = data.drop(columns=['Churn'])  
y = data['Churn']  

column_transformer = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_columns),
        ('scaler', StandardScaler(), numeric_features)
    ],
    remainder='passthrough'
)
X_transformed = column_transformer.fit_transform(X)  

smote_enn = SMOTEENN()
X_resampled, y_resampled = smote_enn.fit_resample(X_transformed, y)

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

models = {
    'Logistic Regression': (LogisticRegression(max_iter=500), {
        'classifier__C': [0.1, 1, 10],
        'classifier__solver': ['lbfgs', 'liblinear']
    }),
    'Decision Tree': (DecisionTreeClassifier(), {
        'classifier__max_depth': [5, 10, 20, None],
        'classifier__criterion': ['gini', 'entropy']
    }),
    'Random Forest': (RandomForestClassifier(), {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [5, 10, 20, None]
    }),
    'KNN': (KNeighborsClassifier(), {
        'classifier__n_neighbors': [3, 5, 7, 9],
        'classifier__weights': ['uniform', 'distance']
    })
}
best_model = None
best_test_accuracy = 0
best_model_name = ""

# Iterate over models and apply GridSearchCV
for model_name, (model, param_grid) in models.items():
    pipe = Pipeline([
        ('classifier', model)  # The model step should be named 'classifier'
    ])
    
    # Apply GridSearchCV
    grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_test_pred = grid_search.best_estimator_.predict(X_test)
    
    # Compute test accuracy
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    # Print results
    print(f"\n{model_name} Best Parameters: {grid_search.best_params_}")
    print(f"{model_name} Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
    print(f"{model_name} Test Accuracy: {test_accuracy:.4f}")
    
    # Save the best model
    if test_accuracy > best_test_accuracy:
        best_test_accuracy = test_accuracy
        best_model = grid_search.best_estimator_
        best_model_name = model_name

# Print the best model overall
print(f"\n✅ Best Model: {best_model_name} with Test Accuracy: {best_test_accuracy:.4f}")

Columns in dataset: Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges',
       'TotalCharges', 'Churn', 'tenure_group'],
      dtype='object')

Logistic Regression Best Parameters: {'classifier__C': 10, 'classifier__solver': 'lbfgs'}
Logistic Regression Best Cross-Validation Accuracy: 0.9151
Logistic Regression Test Accuracy: 0.9165

Decision Tree Best Parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': None}
Decision Tree Best Cross-Validation Accuracy: 0.9373
Decision Tree Test Accuracy: 0.9451

Random Forest Best Parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 200}
Random Forest Best Cross-Validation Accuracy: 0.9621
Random Forest Test Accuracy: 0.9753

KNN Best Parameters: {'classifier__n_ne

# Conclusion
## 1. The Random Forest model achieved a 98% accuracy, making it the best-performing model.
## 2. The optimal hyperparameters were 200 estimators with no depth restriction, indicating strong generalization.

# Train Model

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the best Random Forest model with found parameters
best_rf = RandomForestClassifier(max_depth=20, n_estimators=200, random_state=42)

# Train the model on resampled data
best_rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_rf.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Test Accuracy: 0.9776
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97       549
           1       0.97      0.99      0.98       744

    accuracy                           0.98      1293
   macro avg       0.98      0.98      0.98      1293
weighted avg       0.98      0.98      0.98      1293



# Dumping Model

In [12]:
import pickle

# Save the column transformer
pickle.dump(column_transformer, open("column_transformer.pkl", "wb"))

# Save the best model
pickle.dump(best_rf, open("best_rf.pkl", "wb"))