In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('../data/label_encoded_telecom_data.csv')

from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the specified columns
df[['TotalCharges', 'MonthlyCharges', 'tenure']] = scaler.fit_transform(df[['TotalCharges', 'MonthlyCharges', 'tenure']])

# Display the first few rows to verify the scaling
print(df[['TotalCharges', 'MonthlyCharges', 'tenure']].head())

   TotalCharges  MonthlyCharges    tenure
0     -0.994194       -1.161694 -1.280248
1     -0.173740       -0.260878  0.064303
2     -0.959649       -0.363923 -1.239504
3     -0.195248       -0.747850  0.512486
4     -0.940457        0.196178 -1.239504


In [43]:
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours

# Separate the features and the target variable
X = df.drop(['Churn', 'customerID'], axis=1)
y = df['Churn']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize the SMOTEENN object
smote_enn = SMOTEENN(random_state=42, enn=EditedNearestNeighbours(n_neighbors=4, n_jobs=-1), sampling_strategy='minority')

# Fit and resample the training data
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# Convert the resampled data back to a DataFrame
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['Churn'] = y_resampled

# Display the class distribution after resampling
print(df_resampled['Churn'].value_counts())

Churn
0    4130
1    2470
Name: count, dtype: int64


In [55]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, make_scorer, roc_auc_score, recall_score

scorer = make_scorer(f1_score, average='binary', pos_label = 1)

# Initialize the Decision Tree classifier
dt_classifier = RandomForestClassifier(random_state=42)

# Define the hyperparameters and their values for tuning
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV with the classifier, parameter grid, and scoring metric
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, scoring=scorer, cv=10, n_jobs=-1)

# Fit the model to the resampled data
grid_search.fit(X_resampled, y_resampled)

# Get the best estimator
best_dt_classifier = grid_search.best_estimator_

# Predict on the original data
y_pred = best_dt_classifier.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

print(classification_report(best_dt_classifier.predict(X_train), y_train))

print(f1_score(y_test, y_pred, average='binary', pos_label=1))

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

              precision    recall  f1-score   support

           0       0.85      0.87      0.86      1033
           1       0.62      0.58      0.60       374

    accuracy                           0.79      1407
   macro avg       0.74      0.72      0.73      1407
weighted avg       0.79      0.79      0.79      1407

              precision    recall  f1-score   support

           0       0.91      0.88      0.90      4278
           1       0.66      0.74      0.70      1347

    accuracy                           0.85      5625
   macro avg       0.79      0.81      0.80      5625
weighted avg       0.85      0.85      0.85      5625

0.5983379501385041
Best hyperparameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
