In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline


In [235]:
df = pd.read_csv(r'D:\OneDrive\Documents\IIT\STAGE 02\Machine Learning\CW\data\processed\telco_data_cleaned.csv') 
df.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_0–12,tenure_group_13–24,tenure_group_25–36,tenure_group_37–48,tenure_group_49–60,tenure_group_61–72
0,0,29.85,29.85,0,True,False,False,True,True,False,...,False,False,True,False,True,False,False,False,False,False
1,0,56.95,1889.5,0,False,True,True,False,True,False,...,False,False,False,True,False,False,True,False,False,False
2,0,53.85,108.15,1,False,True,True,False,True,False,...,False,False,False,True,True,False,False,False,False,False
3,0,42.3,1840.75,0,False,True,True,False,True,False,...,True,False,False,False,False,False,False,True,False,False
4,0,70.7,151.65,1,True,False,True,False,True,False,...,False,False,True,False,True,False,False,False,False,False


In [236]:
# split features and target
X = df.drop('Churn', axis=1)
y = df['Churn']

In [237]:
# Stratified split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [238]:
# SMOTE for balancing classes
smote = SMOTEENN()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


In [239]:
# Scale numerical features
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns  # Identify numerical columns

scaler = StandardScaler()
X_resampled[numerical_cols] = scaler.fit_transform(X_resampled[numerical_cols]) # Fit and transform on training data
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [None]:
# Decision Tree Classifier
dt_classifier= DecisionTreeClassifier(criterion='gini', max_depth=10, min_samples_leaf=8, min_samples_split=10, random_state=100)
dt_classifier.fit(X_resampled, y_resampled) # Fit on resampled training data

In [241]:
# Predictions
y_pred_dt = dt_classifier.predict(X_test)

In [243]:
# Evaluate model
dt_classifier.score(X_test, y_test)
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Classification Report:\n", classification_report(y_test, y_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))


Accuracy: 0.7526652452025586
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.76      0.82      1033
           1       0.53      0.73      0.61       374

    accuracy                           0.75      1407
   macro avg       0.71      0.75      0.71      1407
weighted avg       0.79      0.75      0.76      1407

Confusion Matrix:
 [[786 247]
 [101 273]]


In [244]:
# Hyperparameter tuning with GridSearchCV
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('smoteenn', SMOTEENN()),
    ('model', DecisionTreeClassifier())
])

In [245]:
# Hyperparameter tuning with GridSearchCV
param_grid = {
    'model__max_depth': [3, 5, 7, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 5],
    'model__criterion': ['gini', 'entropy']
}

In [246]:
# Perform Grid Search
grid_dt = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,                 # 5-fold cross-validation
    scoring='f1',         # Focus on minority class performance
    n_jobs=-1
)

grid_dt.fit(X_resampled, y_resampled)



In [None]:
# Get best  model and make predictions
best_dt = grid_dt.best_estimator_
y_pred = best_dt.predict(X_test)

In [None]:
# Metrics for fine-tuned model
print("Best DT parameters:", grid_dt.best_params_)
print("Best CV F1-score:", grid_dt.best_score_)

Best DT parameters: {'model__criterion': 'entropy', 'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2}
Best CV F1-score: 0.9389834912283156


In [None]:
# Evaluate fine-tuned model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



Accuracy: 0.7455579246624022
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.75      0.81      1033
           1       0.51      0.74      0.61       374

    accuracy                           0.75      1407
   macro avg       0.70      0.74      0.71      1407
weighted avg       0.79      0.75      0.76      1407

Confusion Matrix:
 [[772 261]
 [ 97 277]]


## Neural Network