In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Read the churn records from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")

In [5]:
# Drop 'RowNumber', 'CustomerId', and 'Surname': they are treated here as
# identifiers that are unlikely to contribute to churn prediction.
# errors='ignore' keeps this cell idempotent: because `data` is reassigned from
# itself, a second run would otherwise raise KeyError on the already-removed
# columns.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

In [6]:
# The 'Exited' column is the label; every other remaining column is a feature
y = data['Exited']
X = data.drop(columns=['Exited'])

In [26]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [27]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Preprocessing: feature scaling for numerical variables and one-hot encoding
# for categorical variables.
numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
categorical_features = ['Geography', 'Gender']

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

# Create the preprocessor from the transformers defined above, instead of
# constructing a second, identical pair inline and leaving these unused.
# NOTE(review): ColumnTransformer defaults to remainder='drop', so any column of
# X that is not listed in numeric_features / categorical_features is discarded
# before the model sees it -- confirm against the CSV schema that nothing
# predictive is being excluded unintentionally.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [28]:
# Chain the shared preprocessor with a default-configured logistic regression
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ]
)


In [29]:
# Train on the training split, then predict labels for the held-out split.
# Pipeline.fit returns the pipeline itself, so the two calls can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

In [30]:
# Score the logistic regression predictions against the held-out labels
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report accuracy first, then the confusion matrix, then the per-class summary
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)

Accuracy: 0.8035
Confusion Matrix:
 [[1552   55]
 [ 338   55]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.97      0.89      1607
           1       0.50      0.14      0.22       393

    accuracy                           0.80      2000
   macro avg       0.66      0.55      0.55      2000
weighted avg       0.76      0.80      0.76      2000



In [32]:
from sklearn.ensemble import RandomForestClassifier

# Same preprocessing as before, now feeding a seeded 100-tree Random Forest
pipeline_rf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

# Train on the training split and predict the held-out split in one chain
# (Pipeline.fit returns the fitted pipeline)
y_pred_rf = pipeline_rf.fit(X_train, y_train).predict(X_test)

In [33]:
# Score the Random Forest predictions against the held-out labels
classification_rep_rf = classification_report(y_true=y_test, y_pred=y_pred_rf)
conf_matrix_rf = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)

# Report accuracy first, then the confusion matrix, then the per-class summary
print("Random Forest Model Accuracy:", accuracy_rf)
print("Confusion Matrix (Random Forest):\n", conf_matrix_rf)
print("Classification Report (Random Forest):\n", classification_rep_rf)

Random Forest Model Accuracy: 0.849
Confusion Matrix (Random Forest):
 [[1528   79]
 [ 223  170]]
Classification Report (Random Forest):
               precision    recall  f1-score   support

           0       0.87      0.95      0.91      1607
           1       0.68      0.43      0.53       393

    accuracy                           0.85      2000
   macro avg       0.78      0.69      0.72      2000
weighted avg       0.84      0.85      0.84      2000



In [35]:
# Cross-validation with Random Forest
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the Random Forest pipeline over the full X / y
cv_scores = cross_val_score(
    pipeline_rf,
    X,
    y,
    cv=5,
)
print("Cross-Validation Scores (Random Forest):", cv_scores)
print("Mean Cross-Validation Score (Random Forest):", cv_scores.mean())

Cross-Validation Scores (Random Forest): [0.846  0.853  0.8495 0.8465 0.854 ]
Mean Cross-Validation Score (Random Forest): 0.8497999999999999


In [36]:
from sklearn.ensemble import GradientBoostingClassifier

# Same preprocessing, now feeding a seeded 100-stage Gradient Boosting classifier
pipeline_gb = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ]
)

# Train on the training split and predict the held-out split in one chain
# (Pipeline.fit returns the fitted pipeline)
y_pred_gb = pipeline_gb.fit(X_train, y_train).predict(X_test)

# Score the Gradient Boosting predictions against the held-out labels
classification_rep_gb = classification_report(y_true=y_test, y_pred=y_pred_gb)
conf_matrix_gb = confusion_matrix(y_true=y_test, y_pred=y_pred_gb)
accuracy_gb = accuracy_score(y_true=y_test, y_pred=y_pred_gb)

print("Gradient Boosting Model Accuracy:", accuracy_gb)
print("Confusion Matrix (Gradient Boosting):\n", conf_matrix_gb)
print("Classification Report (Gradient Boosting):\n", classification_rep_gb)

# 5-fold cross-validation of the Gradient Boosting pipeline over the full X / y
cv_scores_gb = cross_val_score(
    pipeline_gb,
    X,
    y,
    cv=5,
)
print("Cross-Validation Scores (Gradient Boosting):", cv_scores_gb)
print("Mean Cross-Validation Score (Gradient Boosting):", cv_scores_gb.mean())

Gradient Boosting Model Accuracy: 0.855
Confusion Matrix (Gradient Boosting):
 [[1538   69]
 [ 221  172]]
Classification Report (Gradient Boosting):
               precision    recall  f1-score   support

           0       0.87      0.96      0.91      1607
           1       0.71      0.44      0.54       393

    accuracy                           0.85      2000
   macro avg       0.79      0.70      0.73      2000
weighted avg       0.84      0.85      0.84      2000

Cross-Validation Scores (Gradient Boosting): [0.851  0.858  0.8465 0.8605 0.855 ]
Mean Cross-Validation Score (Gradient Boosting): 0.8542000000000002


In [37]:
from sklearn.svm import SVC

# Same preprocessing, now feeding an RBF-kernel support vector classifier
pipeline_svc = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', SVC(kernel='rbf', random_state=42)),
    ]
)

# Train on the training split and predict the held-out split in one chain
# (Pipeline.fit returns the fitted pipeline)
y_pred_svc = pipeline_svc.fit(X_train, y_train).predict(X_test)

# Score the SVC predictions against the held-out labels
classification_rep_svc = classification_report(y_true=y_test, y_pred=y_pred_svc)
conf_matrix_svc = confusion_matrix(y_true=y_test, y_pred=y_pred_svc)
accuracy_svc = accuracy_score(y_true=y_test, y_pred=y_pred_svc)

print("SVC Model Accuracy:", accuracy_svc)
print("Confusion Matrix (SVC):\n", conf_matrix_svc)
print("Classification Report (SVC):\n", classification_rep_svc)

# 5-fold cross-validation of the SVC pipeline over the full X / y
cv_scores_svc = cross_val_score(
    pipeline_svc,
    X,
    y,
    cv=5,
)
print("Cross-Validation Scores (SVC):", cv_scores_svc)
print("Mean Cross-Validation Score (SVC):", cv_scores_svc.mean())


SVC Model Accuracy: 0.847
Confusion Matrix (SVC):
 [[1546   61]
 [ 245  148]]
Classification Report (SVC):
               precision    recall  f1-score   support

           0       0.86      0.96      0.91      1607
           1       0.71      0.38      0.49       393

    accuracy                           0.85      2000
   macro avg       0.79      0.67      0.70      2000
weighted avg       0.83      0.85      0.83      2000

Cross-Validation Scores (SVC): [0.8505 0.848  0.8445 0.853  0.8475]
Mean Cross-Validation Score (SVC): 0.8487
