In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('../data/label_encoded_telecom_data.csv')

In [7]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the specified columns
df[['TotalCharges', 'MonthlyCharges', 'tenure']] = scaler.fit_transform(df[['TotalCharges', 'MonthlyCharges', 'tenure']])

# Display the first few rows to verify the scaling
print(df[['TotalCharges', 'MonthlyCharges', 'tenure']].head())

   TotalCharges  MonthlyCharges    tenure
0     -0.994194       -1.161694 -1.280248
1     -0.173740       -0.260878  0.064303
2     -0.959649       -0.363923 -1.239504
3     -0.195248       -0.747850  0.512486
4     -0.940457        0.196178 -1.239504


In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

X = df.drop('Churn', axis=1)
y = df['Churn']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize SMOTE
smote = SMOTE()

# Fit and apply SMOTE to the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Combine the resampled features and target variable into a new DataFrame
df_train_resampled = pd.DataFrame(X_train_resampled, columns=X.columns)
df_train_resampled['Churn'] = y_train_resampled

# Display the class distribution after resampling
print(df_train_resampled['Churn'].value_counts())


Churn
1    4130
0    4130
Name: count, dtype: int64


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# Define the parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1,5, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Decision Tree classifier
dt = DecisionTreeClassifier(random_state=42)

# Initialize Grid Search with cross-validation
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1, scoring='f1')

# Fit the grid search to the data
grid_search.fit(X_train_resampled.drop("customerID", axis=1), y_train_resampled)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Predict on the test set
y_pred = grid_search.predict(X_test.drop("customerID", axis=1))

# Calculate and print the F1 score on the test set
test_f1_score = f1_score(y_test, y_pred)
print("Best test F1 score: ", test_f1_score)
print("Best recall score: ", )

Best parameters found:  {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best cross-validation score:  0.814932487512501
Best test F1 score:  0.5787139689578714


In [15]:
from imblearn.over_sampling import ADASYN

# Remove the customerID column
X_train_no_id = X_train.drop('customerID', axis=1)

# Initialize ADASYN
adasyn = ADASYN()

# Fit and apply ADASYN to the training data
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_no_id, y_train)

# Combine the resampled features and target variable into a new DataFrame
df_train_adasyn = pd.DataFrame(X_train_adasyn, columns=X_train_no_id.columns)
df_train_adasyn['Churn'] = y_train_adasyn

# Display the class distribution after resampling
print(df_train_adasyn['Churn'].value_counts())

Churn
1    4187
0    4130
Name: count, dtype: int64


In [16]:
# Initialize the Decision Tree classifier
dt_adasyn = DecisionTreeClassifier(random_state=42)

# Initialize Grid Search with cross-validation
grid_search_adasyn = GridSearchCV(estimator=dt_adasyn, param_grid=param_grid, cv=5, n_jobs=-1, scoring='f1')

# Fit the grid search to the ADASYN resampled data
grid_search_adasyn.fit(X_train_adasyn, y_train_adasyn)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search_adasyn.best_params_)
print("Best cross-validation score: ", grid_search_adasyn.best_score_)

# Predict on the test set
y_pred_adasyn = grid_search_adasyn.predict(X_test.drop("customerID", axis=1))

# Calculate and print the F1 score on the test set
test_f1_score_adasyn = f1_score(y_test, y_pred_adasyn)
print("Best test F1 score: ", test_f1_score_adasyn)

Best parameters found:  {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5}
Best cross-validation score:  0.7783539823326457
Best test F1 score:  0.5537806176783813


In [17]:
# Perform grid search cross-validation on the original training data
grid_search_original = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1, scoring='f1')
grid_search_original.fit(X_train_no_id, y_train)

# Get the best parameters from the grid search
best_params = grid_search_original.best_params_
print("Best parameters found from original data: ", best_params)

# Initialize the Decision Tree classifier with the best parameters
dt_best = DecisionTreeClassifier(**best_params, random_state=42)

# Fit the classifier to the oversampled data
dt_best.fit(X_train_adasyn, y_train_adasyn)

# Predict on the test set
y_pred_best = dt_best.predict(X_test.drop("customerID", axis=1))

# Calculate and print the F1 score on the test set
test_f1_score_best = f1_score(y_test, y_pred_best)
print("Test F1 score with best parameters: ", test_f1_score_best)

Best parameters found from original data:  {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}
Test F1 score with best parameters:  0.5828460038986355
