In [None]:
#importing required libraries
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib .pyplot as plt
from sklearn import preprocessing
import seaborn as sns
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy import stats
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn .model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import silhouette_score
from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook

import warnings
warnings.filterwarnings("ignore")
sns.set_theme(color_codes=True)


In [None]:
# Read the raw churn dataset from the Kaggle input directory and preview it.
CHURN_CSV_PATH = "/kaggle/input/customer-churn/churn.csv"
orignal_data = pd.read_csv(CHURN_CSV_PATH)
orignal_data.head()

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
orignal_data.info()

In [None]:
# Number of distinct values in each object-dtype column.
orignal_data.select_dtypes(include="object").nunique()

In [None]:
# Number of distinct values in each column matched by include="int".
orignal_data.select_dtypes(include="int").nunique()

In [None]:
# Number of distinct values in each column matched by include="float".
orignal_data.select_dtypes(include="float").nunique()

In [None]:
def drop_unwanted(data):
    """Return a copy of ``data`` without the columns excluded from the analysis.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains every column named below; a missing column makes
        ``DataFrame.drop`` raise ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    unwanted_columns = [
        "Phone",
        "Account Length",
        "Area Code",
        "VMail Message",
        "Day Mins",
        "Night Mins",
        "Intl Mins",
        "Eve Mins",
    ]
    return data.drop(columns=unwanted_columns)

In [None]:
# Working frame: the raw data minus the columns removed by drop_unwanted.
data_file= drop_unwanted(orignal_data)
data_file

In [None]:
# Count of customers per state, split by churn status.
plt.figure(figsize=(26, 8))

# Custom two-colour palette, one colour per churn class.
colors = ['#FF6F61', '#6B4226']

ax = sns.countplot(x="State", hue="Churn?", data=data_file, palette=colors)
ax.set_title("Churn by State", fontsize=16)
plt.xticks(rotation=75, fontsize=12)
plt.ylabel("Count", fontsize=14)

# Label each bar with its count. Going through ax.containers (the bar groups,
# one per hue level) touches only real bars, and bar_label formats the height
# itself, so a missing/NaN bar cannot crash an int() conversion.
for container in ax.containers:
    ax.bar_label(container, fmt="%.0f", label_type="edge", fontsize=12)

# NOTE(review): these labels are applied positionally, so they assume the
# first hue level is the non-churn class - confirm against the data.
legend_labels = ['No Churn', 'Churn']
plt.legend(title="Churn Status", labels=legend_labels, fontsize=12, title_fontsize=14)
sns.despine()
plt.show()

In [None]:
# Customers per state, split by voice-mail plan membership.
fig, ax = plt.subplots(figsize=(26, 8))
sns.countplot(data=data_file, x="State", hue="VMail Plan", palette="mako", ax=ax)
ax.set_title("visualsation of VMail plan by state")

# Tilt the state labels so they do not overlap.
plt.xticks(rotation=75)

# Write the count on top of each bar of the first two hue groups.
for group_index in (0, 1):
    ax.bar_label(ax.containers[group_index], fmt="%.0f", label_type="edge")

plt.show()

In [None]:
# Customers per state, split by international plan membership.
fig, ax = plt.subplots(figsize=(26, 8))
sns.countplot(data=data_file, x="State", hue="Int'l Plan", palette="magma", ax=ax)
ax.set_title("visualsation of Intl plan by state")

# Tilt the state labels so they do not overlap.
plt.xticks(rotation=75)

# Write the count on top of each bar of the first two hue groups.
for group_index in (0, 1):
    ax.bar_label(ax.containers[group_index], fmt="%.0f", label_type="edge")

plt.show()

In [None]:
# Object-dtype (categorical) columns of the working frame.
cat_vars = list(data_file.select_dtypes(include="object").columns)

# Three plots per row; round the row count up so every column gets an axis.
num_cols = len(cat_vars)
num_rows = (num_cols + 2) // 3

# Two-colour palette, one colour per churn class.
colors = ['#FF6F61', '#6B4226']

fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(20, 8*num_rows))
axs = axs.flatten()

# One stacked density histogram per categorical column, coloured by churn.
for panel, var in zip(axs, cat_vars):
    sns.histplot(data=data_file, x=var, hue="Churn?", stat='density',
                 multiple="stack", ax=panel, palette=colors)
    panel.set_title(var, fontsize=14)
    panel.set_ylabel("Density", fontsize=12)
    panel.set_xlabel("")  # the title already names the variable
    panel.tick_params(axis="x", rotation=45, labelsize=10)

# Drop the axes left unused on the last grid row.
for spare_panel in axs[num_cols:]:
    fig.delaxes(spare_panel)

# Lay out the panels first, then reserve head-room for the figure title.
fig.tight_layout()
fig.suptitle("Stacked Density Plots of Categorical Columns by Churn Status", fontsize=16)
fig.subplots_adjust(top=0.9)
plt.show()


In [None]:
# Columns of the working frame matched by include="int".
int_vars = data_file.select_dtypes(include="int").columns.tolist()

# Three plots per row; round the row count up so every variable gets an axis.
num_cols = len(int_vars)
num_rows = (num_cols + 2) // 3
fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
axs = axs.flatten()

# One histogram per integer variable, split by voice-mail plan.
for i, var in enumerate(int_vars):
    sns.histplot(x=var, hue="VMail Plan", data=data_file, ax=axs[i], palette="rocket")
    axs[i].set_title(var)

# Drop the axes left unused on the last grid row.
for spare_ax in axs[num_cols:]:
    fig.delaxes(spare_ax)

# Title, layout and display run unconditionally, so the figure is finished
# even when the grid has no spare axes to delete.
fig.suptitle("Visualisation of VMail plan by all numeric variables")
fig.tight_layout()
plt.show()

In [None]:
# Columns of the working frame matched by include="int".
int_vars = data_file.select_dtypes(include="int").columns.tolist()

# Three plots per row; round the row count up so every variable gets an axis.
num_cols = len(int_vars)
num_rows = (num_cols + 2) // 3
fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
axs = axs.flatten()

# One histogram per integer variable, split by international plan.
for i, var in enumerate(int_vars):
    sns.histplot(x=var, hue="Int'l Plan", data=data_file, ax=axs[i], palette="rocket")
    axs[i].set_title(var)

# Drop the axes left unused on the last grid row.
for spare_ax in axs[num_cols:]:
    fig.delaxes(spare_ax)

# Title, layout and display run unconditionally, so the figure is finished
# even when the grid has no spare axes to delete.
fig.suptitle("Visualisation of Int'l plan by all numeric variables")
fig.tight_layout()
plt.show()

In [None]:
# Columns of the working frame matched by include="int".
int_vars = data_file.select_dtypes(include="int").columns.tolist()

# Three plots per row; round the row count up so every variable gets an axis.
num_cols = len(int_vars)
num_rows = (num_cols + 2) // 3
fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
axs = axs.flatten()

# One histogram per integer variable, split by the churn target.
for i, var in enumerate(int_vars):
    sns.histplot(x=var, hue="Churn?", data=data_file, ax=axs[i], palette="mako")
    axs[i].set_title(var)

# Drop the axes left unused on the last grid row.
for spare_ax in axs[num_cols:]:
    fig.delaxes(spare_ax)

# Title, layout and display run unconditionally, so the figure is finished
# even when the grid has no spare axes to delete.
fig.suptitle(" Visualization of numeric variables in relationship with Target Variable ( churn )")
fig.tight_layout()
plt.show()

In [None]:
 # Get the names of all columns with data type ‘int'
int_vars = data_file.select_dtypes(include="int").columns.tolist()

# Three plots per row; round the row count up so every variable gets an axis.
num_cols = len(int_vars)
num_rows = (num_cols + 2) // 3
fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
axs = axs.flatten()

# One box plot per integer variable to expose outliers.
for i, var in enumerate(int_vars):
    sns.boxplot(x=data_file[var], ax=axs[i], palette="rocket")
    axs[i].set_title(var)

# Drop the axes left unused on the last grid row.
for spare_ax in axs[num_cols:]:
    fig.delaxes(spare_ax)

# Title, layout and display run unconditionally, so the figure is finished
# even when the grid has no spare axes to delete.
fig.suptitle(" Outlier Visualization for all numeric variables")
fig.tight_layout()
plt.show()

In [None]:
# Columns of the working frame matched by include="int".
int_vars = data_file.select_dtypes(include="int").columns.tolist()

# Three plots per row; round the row count up so every variable gets an axis.
num_cols = len(int_vars)
num_rows = (num_cols + 2) // 3
fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
axs = axs.flatten()

# One box plot per integer variable, with one box per churn class.
for i, var in enumerate(int_vars):
    sns.boxplot(y=var, x="Churn?", data=data_file, ax=axs[i], palette="magma")
    axs[i].set_title(var)
    axs[i].tick_params(axis="x", rotation=90)

# Drop the axes left unused on the last grid row.
for spare_ax in axs[num_cols:]:
    fig.delaxes(spare_ax)

# Title, layout and display run unconditionally, so the figure is finished
# even when the grid has no spare axes to delete.
fig.suptitle("Outlier Visualization of Target variable(churn) by all numeric variables")
fig.tight_layout()
plt.show()

In [None]:
# Columns of the working frame matched by include="float".
float_vars = data_file.select_dtypes(include="float").columns.tolist()

# Three plots per row; round the row count up so every variable gets an axis.
num_cols = len(float_vars)
num_rows = (num_cols + 2) // 3
fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
axs = axs.flatten()

# One histogram per float variable, split by the churn target. The loop must
# walk float_vars: the grid above is sized for them, and this cell should not
# depend on a list left behind by an earlier cell.
for i, var in enumerate(float_vars):
    sns.histplot(x=var, hue="Churn?", data=data_file, ax=axs[i], palette="cubehelix")
    axs[i].set_title(var)

# Drop the axes left unused on the last grid row.
for spare_ax in axs[num_cols:]:
    fig.delaxes(spare_ax)

# Title, layout and display run unconditionally, so the figure is finished
# even when the grid has no spare axes to delete.
fig.suptitle(" Churn Visualization of CallS  by (Float numeric variables)")
fig.tight_layout()
plt.show()

In [None]:
# List the distinct values of every object-dtype column.
object_columns = data_file.select_dtypes(include=["object"]).columns
for column_name in object_columns:
    distinct_values = data_file[column_name].unique()
    print(f"{column_name}: {distinct_values}")

In [None]:
# Replace every object-dtype column of data_file with integer codes, in place.
object_columns = data_file.select_dtypes(include=["object"]).columns
for column_name in object_columns:
    # A fresh encoder per column, so each column gets its own code table.
    label_encoder = preprocessing.LabelEncoder()
    data_file[column_name] = label_encoder.fit_transform(data_file[column_name])
    # Show which codes the column now contains.
    print(f"{column_name}: {data_file[column_name].unique()}")

In [None]:
# Annotated heatmap of the pairwise correlations between data_file's columns.
correlations = data_file.corr()
plt.figure(figsize=(20, 12))
sns.heatmap(correlations, annot=True, fmt=".2g")
plt.show()

In [None]:
#sns.pairplot(data=data_file, hue= "Churn?")
#plt.show()

In [None]:
def preprocess_data(df, columns_to_remove_outliers=None, threshold=3, test_size=0.3, random_state=0):
    """Split ``df`` into train/test sets and balance the training classes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target column ``"Churn?"`` and the features.
    columns_to_remove_outliers : list of str, optional
        Training rows whose absolute z-score exceeds ``threshold`` in any of
        these columns are removed. Nothing is removed when omitted.
    threshold : float
        Absolute z-score above which a value counts as an outlier.
    test_size : float
        Proportion of rows held out for testing.
    random_state : int
        Seed used for both the stratified split and the SMOTE resampling.

    Returns
    -------
    tuple
        ``(X_train_resampled, X_test, y_train_resampled, y_test, selected_columns)``.
        Only the training part is resampled; the test part is left as split.
    """
    # Separate features (X) and target variable (y)
    X = df.drop("Churn?", axis=1)
    y = df["Churn?"]

    # Returned unchanged for the caller's convenience; not used in this function.
    # NOTE(review): the list used to repeat "Night Calls" and omit "Eve Calls";
    # assumed to be a typo given the Calls/Charge pairing - confirm.
    selected_columns = ["Day Calls", "Day Charge", "Eve Calls", "Eve Charge", "Night Calls",
                        "Night Charge", "Intl Calls", "CustServ Calls"]

    # Stratified split keeps the class ratio the same in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    if columns_to_remove_outliers:
        # Z-scores are computed on the training rows only.
        z_scores = np.abs(stats.zscore(X_train[columns_to_remove_outliers]))

        # One boolean per training row: True when any selected column exceeds
        # the threshold. A positional mask avoids relying on index labels.
        outlier_mask = np.asarray(z_scores > threshold)
        if outlier_mask.ndim > 1:
            outlier_mask = outlier_mask.any(axis=1)

        X_train = X_train.loc[~outlier_mask]
        y_train = y_train.loc[~outlier_mask]

    # Oversample the minority class; seeded so repeated runs give the same rows.
    smote = SMOTE(sampling_strategy="minority", random_state=random_state)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    return X_train_resampled, X_test, y_train_resampled, y_test, selected_columns

X_train, X_test, y_train, y_test,selected_columns = preprocess_data(data_file)

In [None]:
def train_evaluate_classifier(classifier, X_train, X_test, y_train, y_test, param_grid=None):
    """Optionally tune a classifier, fit it, and report test-set performance.

    Parameters
    ----------
    classifier : estimator
        Scikit-learn style classifier; it is fitted in place.
    X_train, y_train : training features (a DataFrame, its column names label
        the importance plot) and training target.
    X_test, y_test : held-out features and target used for the metrics.
    param_grid : dict, optional
        Grid handed to ``GridSearchCV`` (5-fold). When given, the best
        combination is applied to ``classifier`` before the final fit.

    Returns
    -------
    tuple
        ``(best_params, best_score)`` from the grid search, or ``(None, None)``
        when no grid was supplied.

    Side effects: prints accuracy, precision and recall, and shows a
    feature-importance bar chart (when the model exposes one) and a
    confusion-matrix heatmap.
    """
    if param_grid:
        grid_search = GridSearchCV(classifier, param_grid, cv=5)
        grid_search.fit(X_train, y_train)
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_
        # The search works on clones, so the winning parameters must be copied
        # onto the classifier; otherwise the metrics below would describe the
        # untuned model rather than the one best_params refers to.
        classifier.set_params(**best_params)
    else:
        best_params = None
        best_score = None

    # Fit the (possibly tuned) classifier on the full training set.
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)

    # Linear models expose coef_, tree-based models feature_importances_;
    # coef_ takes precedence when both exist.
    if hasattr(classifier, 'coef_'):
        feature_importance = classifier.coef_[0]
    elif hasattr(classifier, 'feature_importances_'):
        feature_importance = classifier.feature_importances_
    else:
        feature_importance = None

    if feature_importance is not None:
        plt.figure(figsize=(15, 6))
        plt.bar(X_train.columns, feature_importance)
        plt.xticks(rotation=90)
        plt.title("Feature Importance Scores")
        plt.show()

    # Confusion matrix: rows are actual labels, columns are predictions.
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(10, 5))
    sns.heatmap(data=cm, linewidths=.5, annot=True, cmap="Blues")
    plt.ylabel("Actual label")
    plt.xlabel("Predicted label")
    plt.title(f"Confusion Matrix (Accuracy: {accuracy:.2f})")
    plt.show()

    return best_params, best_score

In [None]:
# Logistic regression, tuned and evaluated through train_evaluate_classifier.
lr = LogisticRegression(random_state=42)

# The grid is pinned to a single combination. A wider search space is kept
# here for reference:
#     'penalty':  ['l1', 'l2']                     regularization penalty
#     'C':        [0.001, 0.01, 0.1, 1, 10, 100]   regularization strength
#     'solver':   ['liblinear', 'lbfgs', 'saga']   solver algorithm
#     'max_iter': [100, 500, 1000]                 maximum number of iterations
param_grid_lr = {
    'C': [1],
    'penalty': ['l2'],
    'solver': ['liblinear'],
    'max_iter': [100],
}

best_params_lr, best_score_lr = train_evaluate_classifier(
    lr, X_train, X_test, y_train, y_test, param_grid_lr
)

In [None]:
# Parameter combination selected by the logistic-regression grid search.
best_params_lr

In [None]:
# Decision tree, tuned and evaluated through train_evaluate_classifier.
# The estimator itself is seeded (not only the grid below), so the model that
# is finally fitted is reproducible regardless of how the grid is applied.
dt = DecisionTreeClassifier(random_state=42)

# The grid is pinned to a single combination. A wider search space is kept
# here for reference:
#     "max_depth":         [3, 4, 5, 6, 7, 8]
#     "min_samples_split": [2, 3, 4]
#     "min_samples_leaf":  [1, 2, 3, 4]
#     "random_state":      [0, 42]
param_grid_dt = {'max_depth': [8],
 'min_samples_leaf': [1],
 'min_samples_split': [2],
 'random_state': [42]}

best_params_dt, best_score_dt = train_evaluate_classifier(dt, X_train, X_test, y_train, y_test, param_grid_dt)

In [None]:
# Parameter combination selected by the decision-tree grid search.
best_params_dt

In [None]:
# Random forest, tuned and evaluated through train_evaluate_classifier.
# Seeded so the bootstrap samples and feature draws, and therefore the
# reported metrics, are the same on every run.
rfc = RandomForestClassifier(class_weight="balanced", random_state=42)

# The grid is pinned to a single combination. A wider search space is kept
# here for reference:
#     "n_estimators":      [100, 200]
#     "max_depth":         [5]
#     "min_samples_split": [2, 5]
#     "min_samples_leaf":  [1, 2, 4]
#     "max_features":      ["sqrt", "log2"]
#     "criterion":         ["gini", "entropy"]
param_grid_rfc = {'criterion': ['gini'],
 'max_depth': [5],
 'max_features': ['log2'],
 'min_samples_leaf': [1],
 'min_samples_split': [5],
 'n_estimators': [200]}

best_params_rfc, best_score_rfc = train_evaluate_classifier(rfc, X_train, X_test, y_train, y_test, param_grid_rfc)

In [None]:
def preprocess_and_drop(data):
    """Build the clustering input: drop columns, label-encode, standardize.

    The target ``"Churn?"``, ``"State"`` and the columns listed below are
    removed, the remaining object-dtype columns are integer-encoded, and every
    int64/float64 column is then standardized.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing all of the dropped columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    dropped = [
        "Churn?", "State", "Phone", "Account Length", "Area Code",
        "VMail Message", "Day Mins", "Night Mins", "Intl Mins", "Eve Mins",
    ]
    data = data.drop(columns=dropped)

    # Integer-encode the object-dtype columns; the encoder is refitted on each
    # column in turn.
    label_encoder = LabelEncoder()
    categorical_cols = data.select_dtypes(include=["object"]).columns
    data[categorical_cols] = data[categorical_cols].apply(label_encoder.fit_transform)

    # Standardize every int64/float64 column.
    numeric_cols = data.select_dtypes(include=["float64", "int64"]).columns
    data[numeric_cols] = StandardScaler().fit_transform(data[numeric_cols])

    return data


In [None]:
# Encoded and standardized frame built from the raw data, used for clustering.
data_file_2 = preprocess_and_drop(orignal_data)
data_file_2

In [None]:
# Largest number of clusters tried by the k-selection helpers.
max_k = 10

def find_optimal_k_elbow(data, max_k):
    """Plot KMeans inertia for k = 1..max_k (the elbow curve).

    Parameters
    ----------
    data : array-like
        Feature matrix passed to ``KMeans.fit``.
    max_k : int
        Largest number of clusters to try.

    Returns
    -------
    None
        The function only draws the plot.
    """
    candidate_ks = range(1, max_k + 1)
    inertias = [
        KMeans(n_clusters=k, random_state=42).fit(data).inertia_
        for k in candidate_ks
    ]

    # Open a dedicated figure. The notebook selects the interactive
    # `%matplotlib notebook` backend, where an earlier figure can remain the
    # current one, so plotting without this would likely draw on top of it.
    plt.figure(figsize=(8, 6))
    plt.plot(candidate_ks, inertias, marker="o")
    plt.xlabel("Number of clusters (k)")
    plt.ylabel("Inertia")
    plt.title("Elbow Method")
    plt.show()

# Elbow curve for the full preprocessed feature set.
find_optimal_k_elbow(data_file_2, max_k)

In [None]:
def cluster_and_visualize(data, max_k):
    """Pick k by silhouette score, cluster ``data`` and plot the outcome.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame. It is modified in place: a ``"cluster"`` column holding
        1-based cluster ids is added, or overwritten if already present.
    max_k : int
        Largest number of clusters to try; must be at least 2 because the
        silhouette score is computed from k = 2 upwards.

    Raises
    ------
    ValueError
        If ``max_k`` is smaller than 2.

    Side effects: prints the chosen k, shows the silhouette curve next to the
    cluster-size bar chart, and draws one histogram grid per feature.
    """
    if max_k < 2:
        raise ValueError("max_k must be at least 2 to compute silhouette scores")

    # Leave out a "cluster" column written by an earlier call, so re-running
    # on the same frame does not feed the previous labels back in as a feature.
    features = data.drop(columns=["cluster"], errors="ignore")

    # Standardize data
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(features)

    # Silhouette score for every candidate k.
    candidate_ks = range(2, max_k + 1)
    silhouette_scores = []
    for k in candidate_ks:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(scaled_data)
        silhouette_scores.append(silhouette_score(scaled_data, kmeans.labels_))

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(candidate_ks, silhouette_scores, marker="o")
    plt.xlabel("Number of clusters (k)")
    plt.ylabel("Silhouette Score")
    plt.title("Silhouette Scores")

    # Map the best score back to its k by indexing the candidate range, so the
    # offset between list position and k (scores start at k = 2) cannot drift.
    optimal_k = candidate_ks[silhouette_scores.index(max(silhouette_scores))]
    print("Optimal number of clusters is:", optimal_k)

    # Fit KMeans with the optimal number of clusters
    kmeans = KMeans(n_clusters=optimal_k, random_state=42)
    pred = kmeans.fit_predict(scaled_data)
    data["cluster"] = pred + 1  # stored 1-based

    plt.subplot(1, 2, 2)
    sns.countplot(x=data["cluster"])
    plt.title("Distribution of Clusters")

    plt.tight_layout()
    plt.show()

    # One row of histograms per feature, one panel per cluster.
    for col in features.columns:
        diag = sns.FacetGrid(data, col="cluster", hue="cluster", palette="Set1")
        diag.map(plt.hist, col, bins=6, ec="k")

In [None]:
# Cluster the preprocessed frame; this also adds a "cluster" column to data_file_2.
cluster_and_visualize(data_file_2, max_k)

In [None]:
# Call-count columns used for the next clustering pass; this rebinds the
# selected_columns name assigned earlier from preprocess_data's return value.
selected_columns=["Day Calls","Eve Calls","Night Calls"]

In [None]:
# Feature matrix: only the selected columns of data_file_2.
X=data_file_2[selected_columns]

In [None]:
# Function to find optimal k using the Elbow Method
def find_optimal_k_elbow(X, max_k):
    """Plot the elbow curve and return a KMeans model fitted with the chosen k.

    The choice is a heuristic: take the position of the steepest drop in
    inertia between consecutive k values and add 3, capped at ``max_k`` so the
    result never exceeds the range that was evaluated.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``KMeans.fit``.
    max_k : int
        Largest number of clusters to try; must be at least 1.

    Returns
    -------
    sklearn.cluster.KMeans
        Model fitted on ``X`` with the selected number of clusters.

    Raises
    ------
    ValueError
        If ``max_k`` is smaller than 1.
    """
    if max_k < 1:
        raise ValueError("max_k must be at least 1")

    candidate_ks = range(1, max_k + 1)
    inertias = []
    for k in candidate_ks:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X)
        inertias.append(kmeans.inertia_)

    # Plot Elbow graph
    plt.figure(figsize=(8, 6))
    plt.plot(candidate_ks, inertias, marker="o")
    plt.xlabel("Number of clusters (k)")
    plt.ylabel("Inertia")
    plt.title("Elbow Method")
    plt.show()

    if len(inertias) < 2:
        # A single candidate leaves no differences to compare.
        optimal_k = 1
    else:
        # np.diff(inertias)[i] is the change from k = i + 1 to k = i + 2; the
        # most negative entry is the steepest drop.
        steepest_drop = int(np.argmin(np.diff(inertias)))
        optimal_k = min(steepest_drop + 3, max_k)

    kmeans = KMeans(n_clusters=optimal_k, random_state=42)
    kmeans.fit(X)

    return kmeans


# Function to find optimal k using the Silhouette Score
def find_optimal_k_silhouette(X, max_k):
    """Plot silhouette scores for k = 2..max_k and return the best KMeans model.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``KMeans.fit`` and ``silhouette_score``.
    max_k : int
        Largest number of clusters to try.

    Returns
    -------
    sklearn.cluster.KMeans
        Model fitted on ``X`` with the k that gave the highest silhouette score.
    """
    candidate_ks = range(2, max_k + 1)

    # Score one clustering per candidate k.
    silhouette_scores = []
    for n_clusters in candidate_ks:
        candidate = KMeans(n_clusters=n_clusters, random_state=42)
        candidate.fit(X)
        silhouette_scores.append(silhouette_score(X, candidate.labels_))

    # Silhouette curve.
    plt.figure(figsize=(8, 6))
    plt.plot(candidate_ks, silhouette_scores, marker="o")
    plt.xlabel("Number of clusters (k)")
    plt.ylabel("Silhouette Score")
    plt.title("Silhouette Score")
    plt.show()

    # Position 0 in the score list corresponds to k = 2.
    optimal_k = np.argmax(silhouette_scores) + 2

    best_model = KMeans(n_clusters=optimal_k, random_state=42)
    best_model.fit(X)
    return best_model

In [None]:
# Fit KMeans on the selected call-count features; k is chosen by the elbow helper.
kmeans_model = find_optimal_k_elbow(X, max_k)

# Predict the cluster of every row and store it in a "label" column of data_file_2.
data_file_2["label"] = kmeans_model.predict(X)
data_file_2

In [None]:
# 3D scatter of the call-count features, coloured by cluster label.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection="3d")

# Discrete colormap with one entry per cluster. The labels themselves are
# handed to scatter (rather than pre-computed RGBA values), so the colour bar
# is tied to the cluster ids, and a single-cluster result no longer divides
# by zero.
cluster_labels = data_file_2["label"]
n_clusters = int(cluster_labels.max()) + 1
cluster_cmap = plt.get_cmap("tab10", n_clusters)

scatter = ax.scatter(
    data_file_2["Day Calls"],
    data_file_2["Eve Calls"],
    data_file_2["Night Calls"],
    c=cluster_labels,
    cmap=cluster_cmap,
    # Half-unit margins centre every integer label inside its colour band.
    vmin=-0.5,
    vmax=n_clusters - 0.5,
    s=60,  # marker size
    alpha=0.7,  # transparency
)

# Set labels for the axes
ax.set_xlabel("Day Calls")
ax.set_ylabel("Eve Calls")
ax.set_zlabel("Night Calls")

# Set the title
ax.set_title("Clusters of Subscribers Based on Their Call Patterns")

# Colour bar with one tick per cluster id.
cbar = fig.colorbar(scatter, ax=ax, ticks=list(range(n_clusters)))
cbar.set_label("Cluster Labels")

# Light dashed grid and transparent panes keep the points readable.
ax.grid(True, linestyle='--', alpha=0.5)
ax.xaxis.pane.fill = False
ax.yaxis.pane.fill = False
ax.zaxis.pane.fill = False

# Customize the view angle
ax.view_init(elev=20, azim=60)

plt.show()
