In [234]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import chi2_contingency
from time import time
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import mutual_info_score
from math import log2
from scipy.stats import entropy
from scipy import stats


from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from time import time
from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report

import warnings
import os
os.environ["OMP_NUM_THREADS"] = '1'

In [203]:
Total_Accuracy = {
    "before_cluster": [],
    "after_cluster": []
}

In [204]:
train = pd.read_csv("./Datasets/Customer_Segmentation_4(20A)/customer_segmentation_data.csv")
data = train.copy()
print(train.shape)
train.head()

(53503, 20)


Unnamed: 0,Customer ID,Age,Gender,Marital Status,Education Level,Geographic Information,Occupation,Income Level,Behavioral Data,Purchase History,Interactions with Customer Service,Insurance Products Owned,Coverage Amount,Premium Amount,Policy Type,Customer Preferences,Preferred Communication Channel,Preferred Contact Time,Preferred Language,Segmentation Group
0,84966,23,Female,Married,Associate Degree,Mizoram,Entrepreneur,70541,policy5,04-10-2018,Phone,policy2,366603,2749,Group,Email,In-Person Meeting,Afternoon,English,Segment5
1,95568,26,Male,Widowed,Doctorate,Goa,Manager,54168,policy5,11-06-2018,Chat,policy1,780236,1966,Group,Mail,In-Person Meeting,Morning,French,Segment5
2,10544,29,Female,Single,Associate Degree,Rajasthan,Entrepreneur,73899,policy5,06-05-2021,Email,policy3,773926,4413,Group,Email,Mail,Evening,German,Segment3
3,77033,20,Male,Divorced,Bachelor's Degree,Sikkim,Entrepreneur,63381,policy5,09-02-2018,Chat,policy2,787815,4342,Family,Text,In-Person Meeting,Anytime,French,Segment3
4,88160,25,Female,Separated,Bachelor's Degree,West Bengal,Manager,38794,policy1,09-10-2018,Chat,policy4,366506,1276,Family,Email,Text,Weekends,English,Segment2


In [205]:
def calculate_gain_ratio(X, y, feature):

    # Calculate information gain
    info_gain = mutual_info_score(X[feature], y)
    
    # Calculate split information
    split_info = entropy(X[feature].value_counts(normalize=True), base=2)
    
    # Avoid division by zero
    if split_info == 0:
        return np.inf
    
    # Calculate gain ratio
    gain_ratio = info_gain / split_info
    
    return gain_ratio

def calculate_gini(y):

    # Count occurrences of each class
    class_counts = y.value_counts()
    
    # Calculate probabilities
    probabilities = class_counts / len(y)
    
    # Calculate Gini impurity
    gini_impurity = 1 - np.sum(probabilities ** 2)
    
    return gini_impurity

def calculate_gini_index(X, y, feature):

    # Get unique values of the feature
    values = X[feature].unique()
    
    gini_index = 0
    # Calculate weighted average of Gini impurities for each value of the feature
    for value in values:
        subset_y = y[X[feature] == value]
        subset_weight = len(subset_y) / len(y)
        gini_index += subset_weight * calculate_gini(subset_y)
    
    return gini_index


In [206]:
start = time()

# Separate features and target variable
X = train.drop(['Segmentation Group'], axis=1).copy()  # Features
y = train['Segmentation Group']  # Target variable

df_numerical = train.select_dtypes(include=['int', 'float']).copy()
df_numerical['Segmentation Group'] = train['Segmentation Group'].copy()

df_categorical = train.select_dtypes(include=['object']).copy()
target = 'Segmentation Group'
# Calculate information gain for each feature
NumX = X.select_dtypes(include=['int', 'float']).columns
info_gain = []
for col in X.columns:
    if col in NumX:
        num_bins = 3  # You can adjust this number according to your preference
        bin_width = (X[col].max() - X[col].min()) / num_bins
        
        # Create bins using equal-width binning
        bins = [X[col].min()-1 + i * bin_width for i in range(num_bins)]
        bins.append(X[col].max())
        
        # Binning numerical data
        X[col] = pd.cut(X[col], bins=bins, labels=[f'Bin_{i+1}' for i in range(num_bins)])
        info_gain.append(mutual_info_score(X[col], y))
    else:
        info_gain.append(mutual_info_score(X[col], y))
        # print('cat:',col)
        # info_gain.append(information_gain(train, col, target))




# Print information gain for each feature
feature_metrics = pd.DataFrame({"Features": X.columns})
feature_metrics['Info_Gain'] = info_gain

gain_ratios = {}
for col in X.columns:
    gain_ratios[col] = calculate_gain_ratio(X, y, col)

# Print gain ratio for each feature
# gain_ratios
feature_metrics['Gain_Ratio'] = gain_ratios.values()


# Calculate Gini impurity for the entire dataset
gini_impurity = calculate_gini(y)
print(f"Gini impurity for the entire dataset: {gini_impurity:.4f}")




# Gini Index....

gini_indices = {}
for feature in X.columns:
    gini_indices[feature] = calculate_gini_index(X, y, feature)


feature_metrics['Gini_Index'] = gini_indices.values()

# gini_impurities = {}
# for feature in X.columns:
#     feature_gini = calculate_gini_impurity(y[X[feature] == X[feature]])
#     gini_impurities[feature] = feature_gini


# feature_metrics['Gini'] = gini_impurities.values()


# print(feature_metrics)

print(f"Run Time: {time()-start:.3f} sec")
sorted_df = feature_metrics.sort_values(by='Gain_Ratio', ascending=False)
print("Sorting According to: Gain Ratio")
sorted_df

Gini impurity for the entire dataset: 0.7938
Run Time: 10.405 sec
Sorting According to: Gain Ratio


Unnamed: 0,Features,Info_Gain,Gain_Ratio,Gini_Index
9,Purchase History,0.09954,0.009239,0.757042
11,Insurance Products Owned,0.002539,0.001109,0.792786
3,Marital Status,0.002371,0.001032,0.792791
5,Geographic Information,0.005265,0.001031,0.791822
6,Occupation,0.002855,0.000905,0.792643
8,Behavioral Data,0.001981,0.000873,0.793055
10,Interactions with Customer Service,0.001876,0.000818,0.793018
15,Customer Preferences,0.001842,0.000796,0.793033
4,Education Level,0.001682,0.000728,0.793122
17,Preferred Contact Time,0.001558,0.000685,0.793172


In [207]:
selected_df_cols = list(sorted_df.head(15).Features)
selected_cols_rc = selected_df_cols.copy()
selected_df_cols.append("Segmentation Group")
selected_df = train[selected_df_cols].copy()
selected_df.head()

Unnamed: 0,Purchase History,Insurance Products Owned,Marital Status,Geographic Information,Occupation,Behavioral Data,Interactions with Customer Service,Customer Preferences,Education Level,Preferred Contact Time,Preferred Communication Channel,Age,Gender,Policy Type,Preferred Language,Segmentation Group
0,04-10-2018,policy2,Married,Mizoram,Entrepreneur,policy5,Phone,Email,Associate Degree,Afternoon,In-Person Meeting,23,Female,Group,English,Segment5
1,11-06-2018,policy1,Widowed,Goa,Manager,policy5,Chat,Mail,Doctorate,Morning,In-Person Meeting,26,Male,Group,French,Segment5
2,06-05-2021,policy3,Single,Rajasthan,Entrepreneur,policy5,Email,Email,Associate Degree,Evening,Mail,29,Female,Group,German,Segment3
3,09-02-2018,policy2,Divorced,Sikkim,Entrepreneur,policy5,Chat,Text,Bachelor's Degree,Anytime,In-Person Meeting,20,Male,Family,French,Segment3
4,09-10-2018,policy4,Separated,West Bengal,Manager,policy1,Chat,Email,Bachelor's Degree,Weekends,Text,25,Female,Family,English,Segment2


In [208]:
label_encoders = {}
for col in selected_df.columns:
    label_encoders[col] = LabelEncoder()
    selected_df[col] = label_encoders[col].fit_transform(selected_df[col])

selected_df.head()

Unnamed: 0,Purchase History,Insurance Products Owned,Marital Status,Geographic Information,Occupation,Behavioral Data,Interactions with Customer Service,Customer Preferences,Education Level,Preferred Contact Time,Preferred Communication Channel,Age,Gender,Policy Type,Preferred Language,Segmentation Group
0,270,1,1,22,3,4,4,0,0,0,1,5,0,2,0,4
1,942,0,4,10,5,4,0,2,2,3,1,8,1,2,1,4
2,387,2,3,27,3,4,1,0,0,2,2,11,0,2,2,2
3,582,1,0,28,3,4,0,4,1,1,1,2,1,1,1,2
4,630,3,2,34,5,0,0,0,1,4,4,7,0,1,0,1


In [209]:
x_df= selected_df.drop("Segmentation Group", axis='columns').copy()
y_df= selected_df['Segmentation Group'].copy()
sp = train_test_split(x_df,y_df,test_size=0.33, random_state=14)
x_train = sp[0]
x_test = sp[1]
y_train = sp[2]
y_test = sp[3]

Naive Bayes

In [210]:
naiveBayes = GaussianNB()
naiveBayes.fit(x_train,y_train)
naiveBayes_y_pred = naiveBayes.predict(x_test)
nb_result = classification_report(y_test, naiveBayes_y_pred, output_dict=True)
print(f"Accuracy: {(nb_result.get('accuracy')*100):.2f}%")
print(f"Precision: {(nb_result.get('weighted avg').get('precision')):.3f}")
print(f"Recall: {(nb_result.get('weighted avg').get('recall')):.3f}")
print(f"F1-Score: {(nb_result.get('weighted avg').get('f1-score')):.3f}")
warnings.filterwarnings('ignore')
Total_Accuracy['before_cluster'].append(float("{:.2f}".format((nb_result.get('accuracy')*100))))

Accuracy: 26.86%
Precision: 0.240
Recall: 0.269
F1-Score: 0.183


Decision Tree

In [211]:
decision_tree = DecisionTreeClassifier(max_leaf_nodes=5)
decision_tree.fit(x_train,y_train)
decision_tree_y_pred = decision_tree.predict(x_test)
dt_result = classification_report(y_test, decision_tree_y_pred, output_dict=True)
print(f"Accuracy: {(dt_result.get('accuracy')*100):.2f}%")
print(f"Precision: {(dt_result.get('weighted avg').get('precision')):.3f}")
print(f"Recall: {(dt_result.get('weighted avg').get('recall')):.3f}")
print(f"F1-Score: {(dt_result.get('weighted avg').get('f1-score')):.3f}")
warnings.filterwarnings('ignore')
Total_Accuracy['before_cluster'].append(float("{:.2f}".format((dt_result.get('accuracy')*100))))

Accuracy: 26.76%
Precision: 0.163
Recall: 0.268
F1-Score: 0.167


Random Forest

In [212]:
random_forest = RandomForestClassifier()
random_forest.fit(x_train,y_train)
random_forest_y_pred = random_forest.predict(x_test)
rf_result = classification_report(y_test, random_forest_y_pred, output_dict=True)
print(f"Accuracy: {(rf_result.get('accuracy')*100):.2f}%")
print(f"Precision: {(rf_result.get('weighted avg').get('precision')):.3f}")
print(f"Recall: {(rf_result.get('weighted avg').get('recall')):.3f}")
print(f"F1-Score: {(rf_result.get('weighted avg').get('f1-score')):.3f}")
Total_Accuracy['before_cluster'].append(float("{:.2f}".format((rf_result.get('accuracy')*100))))

Accuracy: 24.79%
Precision: 0.227
Recall: 0.248
F1-Score: 0.222


Bagging

In [213]:
start = time()
bagging = BaggingClassifier(n_estimators=50)
bagging.fit(x_train, y_train)
bagging_y_pred = bagging.predict(x_test)
bagging_result = classification_report(y_test, bagging_y_pred, output_dict=True)
print(f"Accuracy: {(bagging_result.get('accuracy')*100):.2f}%")
print(f"Precision: {(bagging_result.get('weighted avg').get('precision')):.3f}")
print(f"Recall: {(bagging_result.get('weighted avg').get('recall')):.3f}")
print(f"F1-Score: {(bagging_result.get('weighted avg').get('f1-score')):.3f}")
print(f"Run Time: {time()-start:.3f}")
Total_Accuracy['before_cluster'].append(float("{:.2f}".format((bagging_result.get('accuracy')*100))))

Accuracy: 23.24%
Precision: 0.218
Recall: 0.232
F1-Score: 0.219
Run Time: 9.787


AdaBoost

In [214]:
start = time()
adaboost = AdaBoostClassifier(n_estimators=200, random_state=1)
adaboost.fit(x_train, y_train)
adaboost_y_pred = adaboost.predict(x_test)
adaboost_result = classification_report(y_test, adaboost_y_pred, output_dict=True)
print(f"Accuracy: {(adaboost_result.get('accuracy')*100):.2f}%")
print(f"Precision: {(adaboost_result.get('weighted avg').get('precision')):.3f}")
print(f"Recall: {(adaboost_result.get('weighted avg').get('recall')):.3f}")
print(f"F1-Score: {(adaboost_result.get('weighted avg').get('f1-score')):.3f}")
print(f"Run Time: {time()-start:.3f}")
warnings.filterwarnings('ignore')
Total_Accuracy['before_cluster'].append(float("{:.2f}".format((adaboost_result.get('accuracy')*100))))

Accuracy: 27.86%
Precision: 0.262
Recall: 0.279
F1-Score: 0.231
Run Time: 5.875
