Kaggle Dataset: Bank Marketing

https://www.kaggle.com/datasets/henriqueyamahata/bank-marketing

In [18]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [2]:
# Load the data from pre-processed files
data = pd.read_csv("data/processed_data.csv")

print(data.shape)

(2213, 34)


In [3]:
# Split the data into features and target variable
X = data[['Age_Group', 'Education', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome', 'Customer_Tenure_Days', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Complain']]
y = data['Response']

In [4]:
# Perform one-hot encoding for categorical features
categorical_features = ['Age_Group', 'Education', 'Marital_Status', 'Kidhome', 'Teenhome', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Complain']
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)
print(X.shape)
X.head()

(2213, 38)


Unnamed: 0,Income,Customer_Tenure_Days,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,...,Kidhome_1,Kidhome_2,Teenhome_1,Teenhome_2,AcceptedCmp1_1,AcceptedCmp2_1,AcceptedCmp3_1,AcceptedCmp4_1,AcceptedCmp5_1,Complain_1
0,58138.0,4632,58,635,88,546,172,88,88,3,...,False,False,False,False,False,False,False,False,False,False
1,46344.0,4082,38,11,1,6,2,1,6,2,...,True,False,True,False,False,False,False,False,False,False
2,71613.0,4281,26,426,49,127,111,21,42,1,...,False,False,False,False,False,False,False,False,False,False
3,26646.0,4108,26,11,4,20,10,3,5,2,...,True,False,False,False,False,False,False,False,False,False
4,58293.0,4130,94,173,43,118,46,27,15,5,...,True,False,False,False,False,False,False,False,False,False


In [5]:
# Perform scaling on numerical features
scaler = StandardScaler()

numerical_features = ['Income', 'Customer_Tenure_Days', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']
X[numerical_features] = scaler.fit_transform(X[numerical_features])

X.head(5)

Unnamed: 0,Income,Customer_Tenure_Days,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,...,Kidhome_1,Kidhome_2,Teenhome_1,Teenhome_2,AcceptedCmp1_1,AcceptedCmp2_1,AcceptedCmp3_1,AcceptedCmp4_1,AcceptedCmp5_1,Complain_1
0,0.234435,1.527968,0.310773,0.978107,1.552498,1.690807,2.454109,1.4842,0.852974,0.350656,...,False,False,False,False,False,False,False,False,False,False
1,-0.234084,-1.189356,-0.380424,-0.872266,-0.637451,-0.71802,-0.650863,-0.633818,-0.733466,-0.169104,...,True,False,True,False,False,False,False,False,False,False
2,0.769733,-0.206179,-0.795142,0.358351,0.570796,-0.178264,1.339972,-0.146917,-0.03698,-0.688863,...,False,False,False,False,False,False,False,False,False,False
3,-1.016592,-1.060901,-0.795142,-0.872266,-0.561936,-0.655569,-0.504747,-0.585128,-0.752813,-0.169104,...,True,False,False,False,False,False,False,False,False,False
4,0.240593,-0.952208,1.554927,-0.391881,0.419766,-0.218411,0.152777,-0.000847,-0.559344,1.390175,...,True,False,False,False,False,False,False,False,False,False


In [6]:
X_np = X.to_numpy()
y_np = y.to_numpy()

Implement Kmeans Clustering Algorithm

In [7]:
def find_closest_centroids(X, centroids):
    """
    Computes the centroid memberships for every example
    
    Args:
        X (ndarray): (m, n) Input values      
        centroids (ndarray): k centroids
    
    Returns:
        idx (array_like): (m,) closest centroids
    
    """

    # Set K
    K = centroids.shape[0]

    idx = np.zeros(X.shape[0], dtype=int)

    for i in range(X.shape[0]):
          # Array to hold distance between X[i] and each centroids[j]
          distance = [] 
          for j in range(centroids.shape[0]):
              norm_ij = np.linalg.norm(X[i] - centroids[j])
              distance.append(norm_ij)

          idx[i] = np.argmin(distance)
    
    return idx

In [8]:
def compute_centroids(X, idx, K):
    """
    Returns the new centroids by computing the means of the 
    data points assigned to each centroid.
    
    Args:
        X (ndarray):   (m, n) Data points
        idx (ndarray): (m,) Array containing index of closest centroid for each 
                       example in X. Concretely, idx[i] contains the index of 
                       the centroid closest to example i
        K (int):       number of centroids
    
    Returns:
        centroids (ndarray): (K, n) New centroids computed
    """
    
    # Useful variables
    m, n = X.shape

    centroids = np.zeros((K, n))

    for k in range(K):   
          points = X[idx == k]  
          centroids[k] = np.mean(points, axis = 0)
    
    return centroids

In [9]:
def run_kmeans(X, initial_centroids, max_iters=10, log=True):
    """
    Runs the K-Means algorithm on data matrix X, where each row of X
    is a single example
    """
    
    # Initialize values
    m, n = X.shape
    K = initial_centroids.shape[0]
    centroids = initial_centroids
    previous_centroids = centroids    
    idx = np.zeros(m)
    
    # Run K-Means
    for i in range(max_iters):
        
        #Output progress
        if log:
            print("K-Means iteration %d/%d" % (i, max_iters-1))
        
        # For each example in X, assign it to the closest centroid
        idx = find_closest_centroids(X, centroids)
            
        # Given the memberships, compute new centroids
        centroids = compute_centroids(X, idx, K)

    return centroids, idx

In [10]:
def kmeans_init_centroids(X, K):
    """
    This function initializes K centroids that are to be 
    used in K-Means on the dataset X
    
    Args:
        X (ndarray): Data points 
        K (int):     number of centroids/clusters
    
    Returns:
        centroids (ndarray): Initialized centroids
    """
    
    # Randomly reorder the indices of examples
    randidx = np.random.permutation(X.shape[0])
    
    # Take the first K examples as centroids
    centroids = X[randidx[:K]]
    
    return centroids

In [11]:
# Set initial centroids
K = 2

initial_centroids = kmeans_init_centroids(X_np, K)

print(initial_centroids)

[[0.8711116550096073 0.3471673846659758 -1.3826588391979737
  2.063422192310181 -0.36056112044564975 0.32580501480967095
  -0.39515944622322724 -0.3660224898009498 -0.61738494290674
  -0.6888633320189731 0.3328424821788868 0.4539699879575771
  1.29068803426126 -1.370045588382378 True False False False False False
  True False True False False False False False False False False False
  False False False False False False]
 [-0.5326193769633656 -1.3326330292807533 1.278448368615414
  -0.6795188231360328 -0.662623050303206 -0.7135591827582353
  -0.6873920674532452 -0.6581628251926913 -0.5399976374039014
  -0.6888633320189731 -0.39680626775858396 -0.22945557894726507
  -1.170837437952609 -0.13269928702601722 False False True False False
  False True False False True False False False False False False True
  False False False False False False False]]


In [12]:
max_iters = 10

centroids, y_pred = run_kmeans(X_np, initial_centroids, max_iters)

K-Means iteration 0/9
K-Means iteration 1/9
K-Means iteration 2/9
K-Means iteration 3/9
K-Means iteration 4/9
K-Means iteration 5/9
K-Means iteration 6/9
K-Means iteration 7/9
K-Means iteration 8/9
K-Means iteration 9/9


In [13]:
# Print final centroids
print(centroids)

[[ 0.79272116  0.12747844  0.02442568  0.88118466  0.75711348  0.86853117
   0.77183327  0.7496997   0.63277815 -0.15091814  0.61901962  0.90598735
   0.85380894 -0.67502752  0.18961625  0.2731377   0.31489842  0.00112867
   0.53611738  0.14672686  0.22911964  0.          0.11173815  0.37358916
   0.21218962  0.25733634  0.04288939  0.          0.0778781   0.00225734
   0.38487585  0.02031603  0.1489842   0.02595937  0.08126411  0.12189616
   0.18171558  0.00790068]
 [-0.52927728 -0.08511372 -0.01630833 -0.58834183 -0.50550304 -0.57989346
  -0.51533103 -0.50055308 -0.4224879   0.10076373 -0.41330172 -0.60490188
  -0.57006384  0.4506966   0.2720422   0.25244913  0.21100226  0.03993971
   0.48304446  0.17709118  0.20874152  0.00226074  0.09947249  0.39638282
   0.21250942  0.25923135  0.02863602  0.00150716  0.61567445  0.0331575
   0.50941974  0.02486812  0.0075358   0.00527506  0.06857573  0.04220045
   0.          0.00979653]]


In [14]:
# Calculate the accuracy of kmeans clustering
accuracy = max(np.mean(y_pred == y_np), 1 - np.mean(y_pred == y_np))
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 62.99%


Run KMeans Clustering multiple times with different initial centroids

In [17]:
# init final accuracy and centroids
final_accuracy = 0.0
final_centroids = None

for i in range(10):
    initial_centroids = kmeans_init_centroids(X_np, K)

    centroids, y_pred = run_kmeans(X_np, initial_centroids, max_iters, log=False)
    accuracy = max(np.mean(y_pred == y_np), 1 - np.mean(y_pred == y_np))
    if accuracy > final_accuracy:
        final_accuracy = accuracy
        final_centroids = centroids

print(f"Final Test Accuracy: {final_accuracy * 100:.2f}%")
print(f"Final Centroids: {final_centroids}")

Final Test Accuracy: 63.22%
Final Centroids: [[ 0.80005878  0.11827411  0.02303772  0.88538516  0.76820116  0.88133912
   0.78523073  0.76266865  0.62572051 -0.1661338   0.61094289  0.91557514
   0.86414083 -0.69080139  0.19085714  0.27428571  0.31314286  0.00114286
   0.53714286  0.14857143  0.22971429  0.          0.112       0.37028571
   0.21485714  0.25714286  0.04342857  0.          0.07314286  0.00228571
   0.38057143  0.01942857  0.14971429  0.02628571  0.08114286  0.12114286
   0.184       0.008     ]
 [-0.52320735 -0.07734667 -0.01506577 -0.57900748 -0.5023737  -0.57636154
  -0.51351038 -0.49875566 -0.4091969   0.10864505 -0.39953291 -0.59875056
  -0.56511452  0.45175726  0.27055306  0.25186846  0.21300448  0.03961136
   0.48281016  0.17563528  0.20852018  0.00224215  0.09940209  0.39835575
   0.21076233  0.2593423   0.0284006   0.00149477  0.61434978  0.0328849
   0.51121076  0.02541106  0.00822123  0.00523169  0.06875934  0.04334828
   0.          0.00971599]]


No significant improvement was obsereved in the accuracy of the clustering algorithm.

Try KMeans clustering method provided by scikit-learn:

In [19]:
# KMeans clustering
kmeans = KMeans(n_clusters=K, random_state=42)
kmeans.fit(X)

# Get final centroids
centroids = kmeans.cluster_centers_

# Get cluster labels (indexes)
labels = kmeans.labels_

print("Final Centroids:\n", centroids)

accuracy = max(np.mean(labels == y_np), 1 - np.mean(labels == y_np))
print(f"Accuracy: {accuracy * 100:.2f}%")

Final Centroids:
 [[-5.29277278e-01 -8.51137153e-02 -1.63083313e-02 -5.88341830e-01
  -5.05503045e-01 -5.79893460e-01 -5.15331028e-01 -5.00553078e-01
  -4.22487901e-01  1.00763730e-01 -4.13301718e-01 -6.04901878e-01
  -5.70063842e-01  4.50696595e-01  2.72042200e-01  2.52449133e-01
   2.11002261e-01  3.99397136e-02  4.83044461e-01  1.77091183e-01
   2.08741522e-01  2.26073851e-03  9.94724943e-02  3.96382818e-01
   2.12509420e-01  2.59231349e-01  2.86360211e-02  1.50715901e-03
   6.15674454e-01  3.31574981e-02  5.09419744e-01  2.48681236e-02
   7.53579503e-03  5.27505652e-03  6.85757347e-02  4.22004521e-02
  -2.22044605e-16  9.79653353e-03]
 [ 7.92721160e-01  1.27478443e-01  2.44256835e-02  8.81184660e-01
   7.57113477e-01  8.68531175e-01  7.71833266e-01  7.49699700e-01
   6.32778154e-01 -1.50918137e-01  6.19019616e-01  9.05987349e-01
   8.53808937e-01 -6.75027519e-01  1.89616253e-01  2.73137698e-01
   3.14898420e-01  1.12866817e-03  5.36117381e-01  1.46726862e-01
   2.29119639e-01 -4.33