In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/label_encoded_telecom_data.csv')

In [3]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the specified columns
df[['TotalCharges', 'MonthlyCharges', 'tenure']] = scaler.fit_transform(df[['TotalCharges', 'MonthlyCharges', 'tenure']])

# Display the first few rows to verify the scaling
print(df[['TotalCharges', 'MonthlyCharges', 'tenure']].head())

   TotalCharges  MonthlyCharges    tenure
0     -0.994194       -1.161694 -1.280248
1     -0.173740       -0.260878  0.064303
2     -0.959649       -0.363923 -1.239504
3     -0.195248       -0.747850  0.512486
4     -0.940457        0.196178 -1.239504


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Define the features and target variable
X = df.drop('Churn', axis=1)
y = df['Churn']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Combine X_train and y_train for resampling
train_data = pd.concat([X_train, y_train], axis=1)

# Separate the majority and minority classes
majority_class = train_data[train_data.Churn == 0]
minority_class = train_data[train_data.Churn == 1]

# Upsample the minority class
minority_upsampled = resample(minority_class, 
                              replace=True,    # sample with replacement
                              n_samples=len(majority_class), # to match majority class
                              random_state=42) # reproducible results

# Combine majority class with upsampled minority class
upsampled_train_data = pd.concat([majority_class, minority_upsampled])

# Separate features and target variable
X_train_resampled = upsampled_train_data.drop('Churn', axis=1)
y_train_resampled = upsampled_train_data['Churn']

# Display the class distribution after resampling
print("Class distribution before resampling:", y_train.value_counts())
print("Class distribution after resampling:", y_train_resampled.value_counts())

Class distribution before resampling: Churn
0    4130
1    1495
Name: count, dtype: int64
Class distribution after resampling: Churn
0    4130
1    4130
Name: count, dtype: int64


In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer, classification_report

f1_scorer = make_scorer(f1_score, average='weighted')


In [8]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Define the parameter grid
param_grid = {
    'max_depth': [3, 5, 6],
    'min_samples_split': [2],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'], 
    'ccp_alpha': [0]  
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, scoring=f1_scorer, cv=5, n_jobs=-1)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

y_pred_dt = grid_search.predict(X_test)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best F1 score: ", f1_score(y_train, grid_search.predict(X_train), average='binary'))
print("Best test f1 score: ", f1_score(y_test, y_pred_dt, average='binary'))
print(classification_report(y_test, y_pred_dt))

Best parameters found:  {'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best F1 score:  0.5895212966453073
Best test f1 score:  0.5491923641703378
              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1033
           1       0.61      0.50      0.55       374

    accuracy                           0.78      1407
   macro avg       0.72      0.69      0.70      1407
weighted avg       0.77      0.78      0.77      1407



In [None]:
X_train_resampled.drop('customerID', axis=1, inplace=True)

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

f1_scorer = make_scorer(f1_score, average='weighted')

In [67]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Define the parameter grid
param_grid = {
    'max_depth': [3, 5],
    'min_samples_split': [2],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini'], 
    'ccp_alpha': [0]  
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, scoring=f1_scorer, cv=5, n_jobs=-1)

# Fit GridSearchCV to the resampled training data
grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best F1 score: ", f1_score(y_train_resampled, grid_search.predict(X_train_resampled)))
print("Best test f1 score: ", f1_score(y_test, grid_search.predict(X_test.drop('customerID', axis=1)), average='weighted'))

Best parameters found:  {'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best F1 score:  0.7847709696609162
Best test f1 score:  0.7519732674654809


In [73]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the parameter grid
rf_param_grid = {
    'n_estimators': [90],
    'max_depth': [3, 5],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'criterion': ['gini']
}

# Initialize GridSearchCV
rf_grid_search = GridSearchCV(estimator=rf_classifier, param_grid=rf_param_grid, scoring=f1_scorer, cv=5, n_jobs=-1)

# Fit GridSearchCV to the resampled training data
rf_grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best parameters and the best score
print("Best parameters found: ", rf_grid_search.best_params_)
print("Best F1 score: ", rf_grid_search.best_score_)
print("Best test f1 score: ", f1_score(y_test, rf_grid_search.best_estimator_.predict(X_test.drop('customerID', axis=1)), average='weighted'))   

Best parameters found:  {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 90}
Best F1 score:  0.7780142260257591
Best test f1 score:  0.7457033163973623


In [74]:
print("Best accuracy = ", rf_grid_search.best_estimator_.score(X_train_resampled, y_train_resampled))

Best accuracy =  0.7854721549636804


In [81]:
from xgboost import XGBClassifier

# Initialize the XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Define the parameter grid
xgb_param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 4],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0.0, 0.1],
    }

# Initialize GridSearchCV
xgb_grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=xgb_param_grid, scoring=f1_scorer, cv=5, n_jobs=-1)

# Fit GridSearchCV to the resampled training data
xgb_grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best parameters and the best score
print("Best parameters found: ", xgb_grid_search.best_params_)
print("Best F1 score: ", xgb_grid_search.best_score_)
print("Best test f1 score: ", f1_score(y_test, xgb_grid_search.predict(X_test.drop("customerID", axis=1)), average='weighted'))

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best parameters found:  {'colsample_bytree': 1.0, 'gamma': 0.1, 'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 100, 'subsample': 1.0}
Best F1 score:  0.8265258624178372
Best test f1 score:  0.7587314890604269


Parameters: { "use_label_encoder" } are not used.



In [83]:
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression classifier
lr_classifier = LogisticRegression(random_state=42, max_iter=1000)

# Define the parameter grid
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

# Initialize GridSearchCV
lr_grid_search = GridSearchCV(estimator=lr_classifier, param_grid=lr_param_grid, scoring=f1_scorer, cv=5, n_jobs=-1)

# Fit GridSearchCV to the resampled training data
lr_grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best parameters and the best score
print("Best parameters found: ", lr_grid_search.best_params_)
print("Best F1 score: ", lr_grid_search.best_score_)
print("Best test f1 score: ", f1_score(y_test, lr_grid_search.predict(X_test.drop("customerID", axis = 1)), average='weighted'))

Best parameters found:  {'C': 10, 'solver': 'liblinear'}
Best F1 score:  0.7695450258059281
Best test f1 score:  0.7426288167234645


In [89]:
from sklearn.svm import SVC

# Initialize the SVM classifier
svm_classifier = SVC(random_state=42)

# Define the parameter grid
svm_param_grid = {
    'C': [0.1, 0.5, 1],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Initialize GridSearchCV
svm_grid_search = GridSearchCV(estimator=svm_classifier, param_grid=svm_param_grid, scoring=f1_scorer, cv=5, n_jobs=-1)

# Fit GridSearchCV to the resampled training data
svm_grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best parameters and the best score
print("Best parameters found: ", svm_grid_search.best_params_)
print("Best F1 score: ", svm_grid_search.best_score_)
print("Best train f1 score: ", f1_score(y_test, svm_grid_search.predict(X_test.drop("customerID", axis=1)), average='weighted'))

Best parameters found:  {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Best F1 score:  0.7854515521835455
Best train f1 score:  0.7518961111110036


In [99]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize the KNN classifier
knn_classifier = KNeighborsClassifier()

# Define the parameter grid
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Initialize GridSearchCV
knn_grid_search = GridSearchCV(estimator=knn_classifier, param_grid=knn_param_grid, scoring=f1_scorer, cv=5, n_jobs=-1)

# Fit GridSearchCV to the resampled training data
knn_grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best parameters and the best score
print("Best parameters found: ", knn_grid_search.best_params_)
print("Best F1 score: ", knn_grid_search.best_score_)
print("Best test f1 score: ", f1_score(y_test, knn_grid_search.predict(X_test.drop("customerID", axis=1)), average='weighted'))

Best parameters found:  {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Best F1 score:  0.8378627174471571
Best test f1 score:  0.7223742974941906
