In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [None]:
# Load the churn dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("churn_data.csv")

In [None]:
# Preview the first rows to check that the file parsed as expected.
data.head()

In [None]:
# Column dtypes and non-null counts, to spot missing values and text columns.
data.info()

In [None]:
# Rename the target column "Churn?" to "Churn" so later cells can refer to it without the question mark.
data = data.rename(columns={"Churn?":"Churn"})

In [None]:
# Confirm the target column is now called "Churn".
data.head()

In [None]:
# One-hot encode "State": the column is replaced by State-prefixed indicator columns.
# NOTE: this cell overwrites `data`, so re-running it on the already-encoded
# frame fails because "State" no longer exists.
data = pd.get_dummies(data, columns=['State'], prefix=['State'])

In [None]:
# Confirm "State" has been replaced by the indicator columns.
data.head()

In [None]:
# Remove the "Phone" column from the frame
# (presumably a per-customer identifier with no predictive value -- TODO confirm).
data = data.drop("Phone",axis=1)
data.head()

In [None]:
# Lookup used below to turn the 'no'/'yes' plan flags into 0/1.
boolean_mapping = {'no': 0, 'yes': 1}


In [None]:
# Encode "Int'l Plan" as 0/1.
# NOTE: Series.map yields NaN for any value that is not a key of
# boolean_mapping, so different spellings -- or re-running this cell on the
# already-encoded column -- silently turn entries into NaN.
data["Int'l Plan"] = data["Int'l Plan"].map(boolean_mapping)

In [None]:
# Encode "VMail Plan" as 0/1.
# NOTE: Series.map yields NaN for any value that is not a key of
# boolean_mapping, so different spellings -- or re-running this cell on the
# already-encoded column -- silently turn entries into NaN.
data["VMail Plan"] = data["VMail Plan"].map(boolean_mapping)

In [None]:
# Class counts of the target before it is label-encoded or resampled.
data["Churn"].value_counts()

In [None]:
# Encode the target labels as integers. LabelEncoder numbers the classes in
# sorted label order; `label_encoder.classes_` keeps the original labels.
label_encoder = LabelEncoder()
data["Churn"] = label_encoder.fit_transform(data['Churn'])
# Same counts as before, now keyed by the integer codes.
data["Churn"].value_counts()

In [None]:
# Separate the feature matrix from the target column.
X = data.drop("Churn",axis=1)
y = data["Churn"]
# Apply SMOTE to oversample the minority class
# NOTE(review): the resampling runs on the full dataset, before the train/test
# split that feature_selection() performs later. Synthetic rows (interpolated
# from real ones) can therefore end up in the test split, which leaks
# information and is likely to make the reported scores optimistic. Consider
# resampling the training split only.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X, y = smote.fit_resample(X, y)
# Class counts after resampling.
y.value_counts()

In [None]:
# Candidate classifiers, keyed by the display name that is also used to look up
# their search space in `grid`.
# The estimators that draw random numbers while fitting (tree, forest, boosting,
# MLP) get a fixed seed so a Restart & Run All reproduces the same scores; 42
# matches the seed used elsewhere in this notebook.
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Neural Network": MLPClassifier(random_state=42),
}


In [None]:
# Hyper-parameter search space for each classifier. The keys are the same
# display names used in `models`; an empty dict means "no tuning, evaluate the
# estimator as configured".
grid = {
    "Logistic Regression": dict(C=[0.1, 1, 10, 100]),
    "K-Nearest Neighbors": dict(n_neighbors=[3, 5, 7, 9]),
    "Naive Bayes": dict(),
    "Decision Tree": dict(max_depth=[None, 10, 20, 30]),
    "Random Forest": dict(
        n_estimators=[10, 50, 100, 200],
        max_depth=[None, 10, 20, 30],
    ),
    "Support Vector Machine": dict(
        kernel=["linear", "rbf"],
        C=[0.1, 1, 10, 100],
    ),
    "Gradient Boosting": dict(
        n_estimators=[10, 50, 100, 200],
        learning_rate=[0.1, 0.01, 0.001],
    ),
    # A single fixed architecture; only the iteration budget is searched.
    "Neural Network": dict(
        hidden_layer_sizes=[(1024, 512, 256, 128, 64, 32)],
        max_iter=[500, 1000, 5000, 10000],
    ),
}

In [None]:
def perform_classification(X_train,X_test,y_train,y_test,selected_model):
    """Tune one classifier with a 5-fold grid search and score it on the test split.

    Parameters
    ----------
    X_train, X_test : feature matrices for fitting and for evaluation.
    y_train, y_test : matching target vectors.
    selected_model : str
        Key into the notebook-level ``models`` and ``grid`` dicts.

    Returns
    -------
    list or None
        ``[selected_model, best_params, accuracy, report, cm]`` where
        ``report`` is the dict form of ``classification_report`` and ``cm``
        is the confusion matrix. ``None`` when ``selected_model`` is not a
        key of ``models``.

    Raises
    ------
    Exception
        Anything raised while fitting or scoring is propagated. Earlier the
        error was converted into an ``"Error: ..."`` string; a string is
        indexable and not ``None``, so callers treated it as a result and
        failed later with an unrelated formatting error.
    """
    if selected_model not in models:
        return None

    grid_search = GridSearchCV(models[selected_model], grid[selected_model], cv=5)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_

    # GridSearchCV (refit=True by default) has already refitted the best
    # configuration on the whole training split. Using it avoids training the
    # model a second time and leaves the shared estimator in `models` untouched.
    best_model = grid_search.best_estimator_

    # Evaluate on the held-out split.
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_pred)

    return [selected_model, best_params, accuracy, report, cm]

In [None]:
def feature_selection(X, y, k="all", test_size=0.2, random_state=42, resampler=None):
    """Split the data, standardise it and keep the ``k`` best features.

    Parameters
    ----------
    X, y : feature matrix and target vector.
    k : int or "all", default "all"
        Number of features kept by ``SelectKBest`` (ANOVA F-score); "all"
        keeps every feature.
    test_size : float, default 0.2
        Fraction of the rows held out for testing.
    random_state : int, default 42
        Seed for the train/test split.
    resampler : object with ``fit_resample(X, y)``, optional
        When given (e.g. a SMOTE instance) it is applied to the training split
        only, so the test split contains no resampled rows. Default ``None``
        leaves the training split as is.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    if resampler is not None:
        X_train, y_train = resampler.fit_resample(X_train, y_train)

    # Fit the scaler on the training split only, then reuse its statistics for
    # the test split so no test information influences the scaling.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Same rule for the selector: scores come from the training split alone.
    selector = SelectKBest(score_func=f_classif, k=k)
    X_train_selected = selector.fit_transform(X_train_scaled, y_train)
    X_test_selected = selector.transform(X_test_scaled)

    return X_train_selected, X_test_selected, y_train, y_test
    

In [None]:
# Build the train/test splits shared by every model below: scaled features,
# and with the default k="all" no feature is dropped by the selector.
X_train,X_test,y_train,y_test = feature_selection(X,y)

In [None]:
# Collects accuracy and best hyper-parameters of every model evaluated below.
model_performance = {}
performance = perform_classification(X_train,X_test,y_train,y_test,"Logistic Regression")

# Only a list/tuple is a successful run. Guard everything that indexes the
# result, so any other return value (e.g. None) is reported instead of raising.
if isinstance(performance, (list, tuple)):
    model_name, best_params, accuracy, report, cm = performance
    model_performance.update({model_name: f'{accuracy:.4f}', f"{model_name} best_params": best_params})
    print(f"**Model:** {model_name}")
    print(f"**Best Hyperparameters:** {best_params}")
    print(f"**Accuracy:** {accuracy:.2f}")

    # Heatmap of the first three columns of the classification report.
    classification_report_df = pd.DataFrame(report).T
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(classification_report_df.iloc[:, :3], annot=True, cmap="YlGnBu", cbar=False, fmt=".2f", linewidths=0.5, ax=ax)
    plt.show()

    if cm is not None:
        # Heatmap of the confusion matrix.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, annot_kws={"size": 16}, ax=ax)
        ax.set_xlabel('Predicted Labels', fontsize=14)
        ax.set_ylabel('True Labels', fontsize=14)
        ax.set_title(f'{model_name} : Confusion Matrix', fontsize=16)
        plt.show()
else:
    print(f"No results for 'Logistic Regression': perform_classification returned {performance!r}")

In [None]:
import warnings
# Suppress FutureWarnings related to the mode function
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.neighbors")
performance = perform_classification(X_train,X_test,y_train,y_test,"K-Nearest Neighbors")

# Only a list/tuple is a successful run. Guard everything that indexes the
# result, so any other return value (e.g. None) is reported instead of raising.
if isinstance(performance, (list, tuple)):
    model_name, best_params, accuracy, report, cm = performance
    model_performance.update({model_name: f'{accuracy:.4f}', f"{model_name} best_params": best_params})
    print(f"**Model:** {model_name}")
    print(f"**Best Hyperparameters:** {best_params}")
    print(f"**Accuracy:** {accuracy:.2f}")

    # Heatmap of the first three columns of the classification report.
    classification_report_df = pd.DataFrame(report).T
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(classification_report_df.iloc[:, :3], annot=True, cmap="YlGnBu", cbar=False, fmt=".2f", linewidths=0.5, ax=ax)
    plt.show()

    if cm is not None:
        # Heatmap of the confusion matrix.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, annot_kws={"size": 16}, ax=ax)
        ax.set_xlabel('Predicted Labels', fontsize=14)
        ax.set_ylabel('True Labels', fontsize=14)
        ax.set_title(f'{model_name} : Confusion Matrix', fontsize=16)
        plt.show()
else:
    print(f"No results for 'K-Nearest Neighbors': perform_classification returned {performance!r}")

In [None]:
performance = perform_classification(X_train,X_test,y_train,y_test,"Naive Bayes")

# Only a list/tuple is a successful run. Guard everything that indexes the
# result, so any other return value (e.g. None) is reported instead of raising.
if isinstance(performance, (list, tuple)):
    model_name, best_params, accuracy, report, cm = performance
    model_performance.update({model_name: f'{accuracy:.4f}', f"{model_name} best_params": best_params})
    print(f"**Model:** {model_name}")
    print(f"**Best Hyperparameters:** {best_params}")
    print(f"**Accuracy:** {accuracy:.2f}")

    # Heatmap of the first three columns of the classification report.
    classification_report_df = pd.DataFrame(report).T
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(classification_report_df.iloc[:, :3], annot=True, cmap="YlGnBu", cbar=False, fmt=".2f", linewidths=0.5, ax=ax)
    plt.show()

    if cm is not None:
        # Heatmap of the confusion matrix.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, annot_kws={"size": 16}, ax=ax)
        ax.set_xlabel('Predicted Labels', fontsize=14)
        ax.set_ylabel('True Labels', fontsize=14)
        ax.set_title(f'{model_name} : Confusion Matrix', fontsize=16)
        plt.show()
else:
    print(f"No results for 'Naive Bayes': perform_classification returned {performance!r}")

In [None]:
performance = perform_classification(X_train,X_test,y_train,y_test,"Decision Tree")

# Only a list/tuple is a successful run. Guard everything that indexes the
# result, so any other return value (e.g. None) is reported instead of raising.
if isinstance(performance, (list, tuple)):
    model_name, best_params, accuracy, report, cm = performance
    model_performance.update({model_name: f'{accuracy:.4f}', f"{model_name} best_params": best_params})
    print(f"**Model:** {model_name}")
    print(f"**Best Hyperparameters:** {best_params}")
    print(f"**Accuracy:** {accuracy:.2f}")

    # Heatmap of the first three columns of the classification report.
    classification_report_df = pd.DataFrame(report).T
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(classification_report_df.iloc[:, :3], annot=True, cmap="YlGnBu", cbar=False, fmt=".2f", linewidths=0.5, ax=ax)
    plt.show()

    if cm is not None:
        # Heatmap of the confusion matrix.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, annot_kws={"size": 16}, ax=ax)
        ax.set_xlabel('Predicted Labels', fontsize=14)
        ax.set_ylabel('True Labels', fontsize=14)
        ax.set_title(f'{model_name} : Confusion Matrix', fontsize=16)
        plt.show()
else:
    print(f"No results for 'Decision Tree': perform_classification returned {performance!r}")

In [None]:
performance = perform_classification(X_train,X_test,y_train,y_test,"Random Forest")

# Only a list/tuple is a successful run. Guard everything that indexes the
# result, so any other return value (e.g. None) is reported instead of raising.
if isinstance(performance, (list, tuple)):
    model_name, best_params, accuracy, report, cm = performance
    model_performance.update({model_name: f'{accuracy:.4f}', f"{model_name} best_params": best_params})
    print(f"**Model:** {model_name}")
    print(f"**Best Hyperparameters:** {best_params}")
    print(f"**Accuracy:** {accuracy:.2f}")

    # Heatmap of the first three columns of the classification report.
    classification_report_df = pd.DataFrame(report).T
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(classification_report_df.iloc[:, :3], annot=True, cmap="YlGnBu", cbar=False, fmt=".2f", linewidths=0.5, ax=ax)
    plt.show()

    if cm is not None:
        # Heatmap of the confusion matrix.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, annot_kws={"size": 16}, ax=ax)
        ax.set_xlabel('Predicted Labels', fontsize=14)
        ax.set_ylabel('True Labels', fontsize=14)
        ax.set_title(f'{model_name} : Confusion Matrix', fontsize=16)
        plt.show()
else:
    print(f"No results for 'Random Forest': perform_classification returned {performance!r}")

In [None]:
performance = perform_classification(X_train,X_test,y_train,y_test,"Support Vector Machine")

# Only a list/tuple is a successful run. Guard everything that indexes the
# result, so any other return value (e.g. None) is reported instead of raising.
if isinstance(performance, (list, tuple)):
    model_name, best_params, accuracy, report, cm = performance
    model_performance.update({model_name: f'{accuracy:.4f}', f"{model_name} best_params": best_params})
    print(f"**Model:** {model_name}")
    print(f"**Best Hyperparameters:** {best_params}")
    print(f"**Accuracy:** {accuracy:.2f}")

    # Heatmap of the first three columns of the classification report.
    classification_report_df = pd.DataFrame(report).T
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(classification_report_df.iloc[:, :3], annot=True, cmap="YlGnBu", cbar=False, fmt=".2f", linewidths=0.5, ax=ax)
    plt.show()

    if cm is not None:
        # Heatmap of the confusion matrix.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, annot_kws={"size": 16}, ax=ax)
        ax.set_xlabel('Predicted Labels', fontsize=14)
        ax.set_ylabel('True Labels', fontsize=14)
        ax.set_title(f'{model_name} : Confusion Matrix', fontsize=16)
        plt.show()
else:
    print(f"No results for 'Support Vector Machine': perform_classification returned {performance!r}")

In [None]:
performance = perform_classification(X_train,X_test,y_train,y_test,"Gradient Boosting")

# Only a list/tuple is a successful run. Guard everything that indexes the
# result, so any other return value (e.g. None) is reported instead of raising.
if isinstance(performance, (list, tuple)):
    model_name, best_params, accuracy, report, cm = performance
    model_performance.update({model_name: f'{accuracy:.4f}', f"{model_name} best_params": best_params})
    print(f"**Model:** {model_name}")
    print(f"**Best Hyperparameters:** {best_params}")
    print(f"**Accuracy:** {accuracy:.2f}")

    # Heatmap of the first three columns of the classification report.
    classification_report_df = pd.DataFrame(report).T
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(classification_report_df.iloc[:, :3], annot=True, cmap="YlGnBu", cbar=False, fmt=".2f", linewidths=0.5, ax=ax)
    plt.show()

    if cm is not None:
        # Heatmap of the confusion matrix.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, annot_kws={"size": 16}, ax=ax)
        ax.set_xlabel('Predicted Labels', fontsize=14)
        ax.set_ylabel('True Labels', fontsize=14)
        ax.set_title(f'{model_name} : Confusion Matrix', fontsize=16)
        plt.show()
else:
    print(f"No results for 'Gradient Boosting': perform_classification returned {performance!r}")


In [None]:
performance = perform_classification(X_train,X_test,y_train,y_test,"Neural Network")

# Only a list/tuple is a successful run. Guard everything that indexes the
# result, so any other return value (e.g. None) is reported instead of raising.
if isinstance(performance, (list, tuple)):
    model_name, best_params, accuracy, report, cm = performance
    model_performance.update({model_name: f'{accuracy:.4f}', f"{model_name} best_params": best_params})
    print(f"**Model:** {model_name}")
    print(f"**Best Hyperparameters:** {best_params}")
    print(f"**Accuracy:** {accuracy:.2f}")

    # Heatmap of the first three columns of the classification report.
    classification_report_df = pd.DataFrame(report).T
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(classification_report_df.iloc[:, :3], annot=True, cmap="YlGnBu", cbar=False, fmt=".2f", linewidths=0.5, ax=ax)
    plt.show()

    if cm is not None:
        # Heatmap of the confusion matrix.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, annot_kws={"size": 16}, ax=ax)
        ax.set_xlabel('Predicted Labels', fontsize=14)
        ax.set_ylabel('True Labels', fontsize=14)
        ax.set_title(f'{model_name} : Confusion Matrix', fontsize=16)
        plt.show()
else:
    print(f"No results for 'Neural Network': perform_classification returned {performance!r}")


In [None]:
# Summary gathered by the evaluation cells: per model, the accuracy (as a
# 4-decimal string) and the best hyper-parameters found by the grid search.
model_performance

In [None]:
def save_model(model=None, path='customer_churn_nn.joblib'):
    """Persist a fitted churn classifier with joblib.

    Parameters
    ----------
    model : estimator, optional
        An already fitted estimator to save. When omitted, the notebook's
        neural network (same architecture and ``max_iter`` as before) is
        built and fitted on the notebook-level ``X_train`` / ``y_train``
        first. The previous version dumped a freshly constructed, never
        fitted MLPClassifier, so the saved file could not make predictions.
    path : str, default 'customer_churn_nn.joblib'
        Destination file.

    Returns
    -------
    list of str
        The file name(s) written, as returned by ``joblib.dump``.

    Notes
    -----
    Only the classifier is saved. The scaler and feature selector created
    inside ``feature_selection`` are not persisted, so new data must be put
    through the same preprocessing before it is passed to the loaded model.
    """
    if model is None:
        model = MLPClassifier(hidden_layer_sizes=(1024,512,256,128,64,32),max_iter= 1000)
        model.fit(X_train, y_train)

    return joblib.dump(model, path)

save_model()
    