In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline


In [61]:
df = pd.read_csv(r'D:\OneDrive\Documents\IIT\STAGE 02\Machine Learning\CW\data\processed\telco_data_cleaned.csv') 
df.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_0–12,tenure_group_13–24,tenure_group_25–36,tenure_group_37–48,tenure_group_49–60,tenure_group_61–72
0,0,29.85,29.85,0,True,False,False,True,True,False,...,False,False,True,False,True,False,False,False,False,False
1,0,56.95,1889.5,0,False,True,True,False,True,False,...,False,False,False,True,False,False,True,False,False,False
2,0,53.85,108.15,1,False,True,True,False,True,False,...,False,False,False,True,True,False,False,False,False,False
3,0,42.3,1840.75,0,False,True,True,False,True,False,...,True,False,False,False,False,False,False,True,False,False
4,0,70.7,151.65,1,True,False,True,False,True,False,...,False,False,True,False,True,False,False,False,False,False


In [62]:
# split features and target
X = df.drop('Churn', axis=1)
y = df['Churn']

In [63]:
# Stratified split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [64]:
# SMOTE for balancing classes
smote = SMOTEENN()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


In [65]:
# Scale numerical features
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns  # Identify numerical columns

scaler = StandardScaler()
X_resampled[numerical_cols] = scaler.fit_transform(X_resampled[numerical_cols]) # Fit and transform on training data
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [66]:
# Decision Tree Classifier
dt_classifier= DecisionTreeClassifier(criterion='gini', max_depth=10, min_samples_leaf=8, min_samples_split=10, random_state=100)
dt_classifier.fit(X_resampled, y_resampled) # Fit on resampled training data

In [67]:
# Predictions
y_pred_dt = dt_classifier.predict(X_test)

In [68]:
# Evaluate model
dt_classifier.score(X_test, y_test)
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Classification Report:\n", classification_report(y_test, y_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))


Accuracy: 0.7505330490405118
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.76      0.82      1033
           1       0.52      0.72      0.61       374

    accuracy                           0.75      1407
   macro avg       0.70      0.74      0.71      1407
weighted avg       0.79      0.75      0.76      1407

Confusion Matrix:
 [[786 247]
 [104 270]]


In [69]:
# Hyperparameter tuning with GridSearchCV
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('smoteenn', SMOTEENN()),
    ('model', DecisionTreeClassifier())
])

In [70]:
# Hyperparameter tuning with GridSearchCV
param_grid = {
    'model__max_depth': [3, 5, 7, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 5],
    'model__criterion': ['gini', 'entropy']
}

In [71]:
# Perform Grid Search
grid_dt = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,                 # 5-fold cross-validation
    scoring='f1',         # Focus on minority class performance
    n_jobs=-1
)

grid_dt.fit(X_resampled, y_resampled)



In [72]:
# Get best  model and make predictions
best_dt = grid_dt.best_estimator_
y_pred = best_dt.predict(X_test)

In [73]:
# Metrics for fine-tuned model
print("Best DT parameters:", grid_dt.best_params_)
print("Best CV F1-score:", grid_dt.best_score_)

Best DT parameters: {'model__criterion': 'gini', 'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2}
Best CV F1-score: 0.9376005165236816


In [74]:
# Evaluate fine-tuned model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



Accuracy: 0.7356076759061834
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.73      0.80      1033
           1       0.50      0.76      0.61       374

    accuracy                           0.74      1407
   macro avg       0.70      0.74      0.70      1407
weighted avg       0.79      0.74      0.75      1407

Confusion Matrix:
 [[750 283]
 [ 89 285]]


## Neural Network

In [75]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from kerastuner.tuners import Hyperband


In [76]:
# Define model
model = Sequential([
    Dense(32, activation='relu', input_shape=(X_resampled.shape[1],)),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # sigmoid for binary classification
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [77]:
# Compile model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [78]:
# Train model with early stopping
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
history = model.fit(X_resampled, y_resampled,
                    validation_split=0.2,
                    epochs=50,
                    batch_size=32,
                    callbacks=[early_stop])


Epoch 1/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8154 - loss: 0.4170 - val_accuracy: 0.9066 - val_loss: 0.2689
Epoch 2/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8834 - loss: 0.3008 - val_accuracy: 0.9076 - val_loss: 0.2267
Epoch 3/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8959 - loss: 0.2709 - val_accuracy: 0.9204 - val_loss: 0.2169
Epoch 4/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9039 - loss: 0.2589 - val_accuracy: 0.9087 - val_loss: 0.2348
Epoch 5/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9092 - loss: 0.2413 - val_accuracy: 0.9490 - val_loss: 0.1430
Epoch 6/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9070 - loss: 0.2358 - val_accuracy: 0.9140 - val_loss: 0.2368
Epoch 7/50
[1m118/118[0m 

In [79]:
# Evaluate model
y_pred_proba = model.predict(X_test)
# ---- 2. Convert probabilities to class labels (0 or 1) ----
y_pred = (y_pred_proba > 0.5).astype(int)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Test Accuracy: 0.7448471926083866

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.75      0.81      1033
           1       0.51      0.72      0.60       374

    accuracy                           0.74      1407
   macro avg       0.70      0.74      0.71      1407
weighted avg       0.78      0.74      0.76      1407


Confusion Matrix:
 [[777 256]
 [103 271]]


In [80]:
# Hyperparameter tuning with Keras Tuner
def build_model(hp):
    model = Sequential()

    # Number of neurons in first layer
    model.add(Dense(
        units = hp.Int('units_1', min_value=16, max_value=128, step=16),
        activation='relu',
        input_shape=(X_resampled.shape[1],)
    ))
    
    # Optional dropout
    model.add(Dropout(hp.Float('dropout_1', 0.0, 0.5, step=0.1)))

    # Second hidden layer (tunable units)
    model.add(Dense(
        units = hp.Int('units_2', min_value=16, max_value=128, step=16),
        activation='relu'
    ))
    model.add(Dropout(hp.Float('dropout_2', 0.0, 0.5, step=0.1)))

    # Output layer
    model.add(Dense(1, activation='sigmoid'))

    # Tune learning rate
    lr = hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model


In [81]:
# Initialize Keras Tuner
tuner = Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=20,
    factor=3,
    directory='tuner_results',
    project_name='churn_nn'
)


Reloading Tuner from tuner_results\churn_nn\tuner0.json


In [82]:
# hyperparameter search
tuner.search(
    X_resampled, y_resampled,
    epochs=20,
    validation_split=0.2
)



In [86]:
best_hp = tuner.get_best_hyperparameters(1)[0]
print(best_hp.values)



{'units_1': 80, 'dropout_1': 0.0, 'units_2': 64, 'dropout_2': 0.30000000000000004, 'learning_rate': 0.002952274106645383, 'tuner/epochs': 7, 'tuner/initial_epoch': 3, 'tuner/bracket': 2, 'tuner/round': 1, 'tuner/trial_id': '0008'}


In [87]:
# Get the best model
best_model = tuner.get_best_models(1)[0]


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


In [88]:
# Retrain best model
history = best_model.fit(
    X_resampled, y_resampled,
    validation_split=0.2,
    epochs=50,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5)]
)


Epoch 1/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9360 - loss: 0.1770 - val_accuracy: 0.9904 - val_loss: 0.0359
Epoch 2/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9365 - loss: 0.1643 - val_accuracy: 0.9873 - val_loss: 0.0453
Epoch 3/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9432 - loss: 0.1529 - val_accuracy: 0.9777 - val_loss: 0.0434
Epoch 4/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9458 - loss: 0.1448 - val_accuracy: 0.9798 - val_loss: 0.0546
Epoch 5/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9485 - loss: 0.1389 - val_accuracy: 0.9915 - val_loss: 0.0286
Epoch 6/50
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9503 - loss: 0.1341 - val_accuracy: 0.9926 - val_loss: 0.0271
Epoch 7/50
[1m118/118[0m 

In [90]:
test_loss, test_auc = best_model.evaluate(X_test, y_test)
print("Test AUC:", test_auc)


[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7626 - loss: 0.8248
Test AUC: 0.7626155018806458


In [89]:
# Evaluate best model
y_pred_proba = best_model.predict(X_test)
test_loss, test_acc = best_model.evaluate(X_test, y_test)
print("Test Accuracy:", test_acc)
# ---- 2. Convert probabilities to class labels (0 or 1) ----
y_pred = (y_pred_proba > 0.5).astype(int)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7626 - loss: 0.8248 
Test Accuracy: 0.7626155018806458
Test Accuracy: 0.7626154939587776

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.80      0.83      1033
           1       0.54      0.65      0.59       374

    accuracy                           0.76      1407
   macro avg       0.70      0.73      0.71      1407
weighted avg       0.78      0.76      0.77      1407


Confusion Matrix:
 [[830 203]
 [131 243]]
