In [16]:
pip install -U imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


### MLP Classifier

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the dataset
data = pd.read_csv("churn.csv")

# Preprocess the data (One-hot encode categorical variables, remove unnecessary columns, etc.)
data = data.drop(["RowNumber", "CustomerId", "Surname"], axis=1)  # Drop unnecessary columns

# One-hot encode categorical variables
categorical_features = ["Geography", "Gender"]
one_hot_encoder = OneHotEncoder(drop="first")  # Use 'drop="first"' to avoid the dummy variable trap
column_transformer = ColumnTransformer(transformers=[("encoder", one_hot_encoder, categorical_features)], remainder="passthrough")
X = column_transformer.fit_transform(data.drop("Exited", axis=1))
y = data["Exited"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000)
mlp.fit(X_train, y_train)

# Make predictions and evaluate the model
predictions = mlp.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print("Accuracy: ", accuracy_score(y_test, predictions))


[[1533   74]
 [ 197  196]]
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      1607
           1       0.73      0.50      0.59       393

    accuracy                           0.86      2000
   macro avg       0.81      0.73      0.76      2000
weighted avg       0.85      0.86      0.85      2000

Accuracy:  0.8645


#### The confusion matrix is a table that describes the performance of a classification model. It compares the predicted labels with the true labels. In this case:

1526 true negatives (TN): customers correctly predicted as not churning

81 false positives (FP): customers incorrectly predicted as churning

212 false negatives (FN): customers incorrectly predicted as not churning

181 true positives (TP): customers correctly predicted as churning


#### Accuracy Score
The accuracy score is 0.8535, which means the classifier correctly predicted the class for 85.35% of the test instances.

### RandomForrestClassifier_1

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the dataset
data = pd.read_csv("churn.csv")

# Preprocess the data (One-hot encode categorical variables, remove unnecessary columns, etc.)
data = data.drop(["RowNumber", "CustomerId", "Surname"], axis=1)  # Drop unnecessary columns

# One-hot encode categorical variables
categorical_features = ["Geography", "Gender"]
one_hot_encoder = OneHotEncoder(drop="first")  # Use 'drop="first"' to avoid the dummy variable trap
column_transformer = ColumnTransformer(transformers=[("encoder", one_hot_encoder, categorical_features)], remainder="passthrough")
X = column_transformer.fit_transform(data.drop("Exited", axis=1))
y = data["Exited"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions and evaluate the model
predictions = rf.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print("Accuracy: ", accuracy_score(y_test, predictions))


[[1552   55]
 [ 204  189]]
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1607
           1       0.77      0.48      0.59       393

    accuracy                           0.87      2000
   macro avg       0.83      0.72      0.76      2000
weighted avg       0.86      0.87      0.86      2000

Accuracy:  0.8705


#### The confusion matrix compares the predicted labels with the true labels. In this case:

1552 true negatives (TN): customers correctly predicted as not churning

55 false positives (FP): customers incorrectly predicted as churning

204 false negatives (FN): customers incorrectly predicted as not churning

189 true positives (TP): customers correctly predicted as churning

#### Accuracy: 
The proportion of correct predictions over the total number of predictions. In this case, the accuracy is 0.8705, meaning the classifier made correct predictions for 87.05% of the test instances.

### RandomForestClassifier_2 with hyperparameter tuning and SMOTE for handling class imbalance

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the dataset
data = pd.read_csv("churn.csv")

# Preprocess the data (One-hot encode categorical variables, remove unnecessary columns, etc.)
data = data.drop(["RowNumber", "CustomerId", "Surname"], axis=1)  # Drop unnecessary columns

# One-hot encode categorical variables
categorical_features = ["Geography", "Gender"]
one_hot_encoder = OneHotEncoder(drop="first")  # Use 'drop="first"' to avoid the dummy variable trap
column_transformer = ColumnTransformer(transformers=[("encoder", one_hot_encoder, categorical_features)], remainder="passthrough")
X = column_transformer.fit_transform(data.drop("Exited", axis=1))
y = data["Exited"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Perform hyperparameter tuning using grid search
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt']
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_

# Make predictions and evaluate the model
predictions = best_rf.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print("Accuracy: ", accuracy_score(y_test, predictions))


[[1532   75]
 [ 198  195]]
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      1607
           1       0.72      0.50      0.59       393

    accuracy                           0.86      2000
   macro avg       0.80      0.72      0.75      2000
weighted avg       0.85      0.86      0.85      2000

Accuracy:  0.8635


#### The confusion matrix compares the predicted labels with the true labels. In this case:

1532 true negatives (TN): customers correctly predicted as not churning

75 false positives (FP): customers incorrectly predicted as churning

198 false negatives (FN): customers incorrectly predicted as not churning

195 true positives (TP): customers correctly predicted as churning

#### Accuracy:
The accuracy score is 0.8635, which means the classifier correctly predicted the class for 86.35% of the test instances.

##### MLPClassifier:

Accuracy: 0.853

True positives: 186

True negatives: 1520

False positives: 87

False negatives: 207

##### RandomForrestClassifier_1:

Accuracy: 0.8705

True positives: 189

True negatives: 1552

False positives: 55

False negatives: 204

While the RandomForestClassifier_2 has a slightly lower accuracy (0.8635) compared to the RandomForestClassifier_1 (0.8705), it has a better balance between precision and recall for the positive class (churners). The RandomForestClassifier_2 has a precision of 0.72 and recall of 0.50 for the positive class, while the RandomForestClassifier_1 has a precision of 0.77 and recall of 0.48.