In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import manifold #needed for multidimensional scaling (MDS) and t-SNE
from sklearn import cluster #needed for k-Means clustering
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, FunctionTransformer #needed for data preparation

from sklearn.base import BaseEstimator, TransformerMixin
from datetime import datetime

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config

In [3]:
# Read the bank data set and report its size as (rows, columns).
df = pd.read_csv("./bank.csv")
n_examples, n_attributes = df.shape
print('(number of examples, number of attributes): ', (n_examples, n_attributes))

(number of examples, number of attributes):  (2000, 17)


In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import manifold #needed for multidimensional scaling (MDS) and t-SNE
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

# Read the bank data set from the working directory.
df = pd.read_csv("bank.csv")

# Colour palette for the cluster scatter plots, indexed by cluster number.
colors = np.array([
    'orange',
    'blue',
    'lime',
    'khaki',
    'pink',
    'green',
    'purple',
])

# points - a 2D array of (x,y) coordinates of data points
# labels - an array of numeric labels in the interval [0..k-1], one for each point
# centers - a 2D array of (x, y) coordinates of cluster centers
# title - title of the plot


def clustering_scatterplot(points, labels, centers, title):
    """Draw a 2-D scatter plot of clustered points, one colour per cluster.

    Parameters
    ----------
    points : array-like of shape (n_points, 2)
        (x, y) coordinates of the data points.
    labels : array-like of shape (n_points,)
        Integer cluster label of each point.
    centers : array-like of shape (n_centers, 2) or None
        (x, y) coordinates of the cluster centres, drawn as large red stars.
        Pass None to skip them.
    title : str
        Title of the plot.

    The plot is drawn through the pyplot interface and is not shown here;
    the caller is expected to call ``plt.show()``.
    """
    # Accept plain lists as well as arrays: boolean masking needs ndarrays.
    points = np.asarray(points)
    labels = np.asarray(labels)

    # Iterate over the labels that actually occur. Looping over
    # range(number of unique labels) assumes the labels are exactly 0..k-1;
    # with a gap (e.g. {0, 2}) it would draw an empty cluster 1 and silently
    # drop cluster 2.
    for label in np.unique(labels):
        members = labels == label
        plt.scatter(points[members, 0],
                    points[members, 1],
                    c=colors[label % colors.size],  # cycle the palette if needed
                    label='cluster ' + str(label))

    # plot the centers of the clusters
    if centers is not None:
        centers = np.asarray(centers)
        plt.scatter(centers[:, 0], centers[:, 1], c='r', marker='*', s=500)

    plt.title(title)
    plt.legend()
    plt.xlabel('x')
    plt.ylabel('y')

# Map month abbreviations to numeric months
month_map = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
}

def parse_date_to_weekday(date_str, year=2022):
    """Convert a '<day>_<month abbreviation>' string into a weekday number.

    Parameters
    ----------
    date_str : str
        Day of month and three-letter month abbreviation joined by an
        underscore, e.g. ``'17_feb'``. The month is matched
        case-insensitively and surrounding whitespace is ignored.
    year : int, default 2022
        Year used to build the calendar date. The input string carries no
        year, so the returned weekday is only valid for this assumed year.

    Returns
    -------
    int
        ``datetime.weekday()`` value: Monday=0 ... Sunday=6.

    Raises
    ------
    ValueError
        If ``date_str`` does not contain exactly one underscore, the day is
        not an integer, or the date does not exist in ``year``
        (e.g. ``'29_feb'`` in a non-leap year).
    KeyError
        If the month abbreviation is not a key of ``month_map``.
    """
    # Split by underscore -> ['17', 'feb']
    day_str, month_str = date_str.split('_')
    month = month_map[month_str.strip().lower()]
    return datetime(year, month, int(day_str)).weekday()



# Derive the weekday of the last contact: join day and month into a
# "<day>_<month>" key (e.g. "17_feb") and convert every key with
# parse_date_to_weekday. The key is only an intermediate value and is not kept
# as a column. No missing-value handling is done at this step.
df['contact_weekday'] = (
    df['day'].astype(str) + '_' + df['month']
).apply(parse_date_to_weekday)

# Split the frame into the target column and the remaining feature columns.
df_Y = df["subscribed"]
df_X = df.drop(columns="subscribed")

# Impute missing values.

# Numeric 'age': replace gaps with the column median.
age_median = df_X['age'].median()
df_X['age'] = df_X['age'].fillna(age_median)

# Categorical columns: missing entries become an explicit 'unknown' category.
for categorical_column in ('job', 'poutcome', 'contact', 'education'):
    df_X[categorical_column] = df_X[categorical_column].fillna('unknown')

# Encode categorical features

# One-hot encode the nominal columns in a single call. With `columns=`,
# get_dummies removes the originals and appends "<column>_<value>" indicator
# columns in the order listed. dtype=float keeps the feature matrix purely
# numeric: boolean indicators (the default in recent pandas versions) mixed
# with float columns would make df_X.to_numpy() fall back to object dtype.
df_X = pd.get_dummies(
    df_X,
    columns=['job', 'marital', 'contact', 'poutcome'],
    dtype=float,
)

# Binary yes/no columns -> 1/0. Any other value maps to NaN.
for binary_column in ('default', 'housing', 'loan'):
    df_X[binary_column] = df_X[binary_column].map({'yes': 1, 'no': 0})

month_order = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
}
df_X['month'] = df_X['month'].map(month_order)

# Cyclical encoding of the month (period 12) so that December and January end
# up next to each other.
# NOTE(review): the raw 1-12 'month' column stays in df_X next to its sin/cos
# pair -- confirm that this is intended.
df_X['month_sin'] = np.sin(2 * np.pi * df_X['month'] / 12)
df_X['month_cos'] = np.cos(2 * np.pi * df_X['month'] / 12)

# Ordinal encoding of education; 'unknown' gets -1 plus a separate indicator
# column so it can be told apart from the real levels.
edu_order = {
    'unknown': -1,
    'primary': 1,
    'secondary': 2,
    'tertiary': 3
}

df_X['education_level'] = df_X['education'].map(edu_order)
df_X['education_unknown'] = (df_X['education'] == 'unknown').astype(int)
df_X = df_X.drop(columns='education')

# Cyclical encoding of the day of month. A sine on its own is ambiguous (two
# different days share the same value), so the cosine is added as well, as for
# month and weekday. The period is fixed at 31 instead of the largest day that
# happens to occur in the data, so the encoding does not depend on the sample.
DAYS_PER_MONTH_CYCLE = 31
df_X['day_sin'] = np.sin(2 * np.pi * df_X['day'] / DAYS_PER_MONTH_CYCLE)
df_X['day_cos'] = np.cos(2 * np.pi * df_X['day'] / DAYS_PER_MONTH_CYCLE)

# add sin and cos for contact weekday
df_X['contact_weekday_sin'] = np.sin(2 * np.pi * df_X['contact_weekday'] / 7)
df_X['contact_weekday_cos'] = np.cos(2 * np.pi * df_X['contact_weekday'] / 7)

# handle outliers

# 'age' and 'campaign': clip to the IQR fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
for clipped_column in ('age', 'campaign'):
    Q1, Q3 = (df_X[clipped_column].quantile(q) for q in (0.25, 0.75))
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    df_X[clipped_column] = np.clip(df_X[clipped_column], lower_bound, upper_bound)

# 'balance', 'duration' and 'previous': log transform. Each column is shifted
# by |min| + 1 first, which keeps the argument of the logarithm at 1 or above.
for logged_column in ('balance', 'duration', 'previous'):
    column_values = df_X[logged_column]
    df_X[logged_column] = np.log(column_values + abs(column_values.min()) + 1)

# Rescale the numeric columns. Each column is fitted and transformed on its
# own with the scaler listed for it.
scaler_by_column = {
    'age': StandardScaler,
    'balance': StandardScaler,
    'day': MinMaxScaler,
    'duration': StandardScaler,
    'campaign': RobustScaler,
    'pdays': StandardScaler,
    'previous': StandardScaler,
}
for column_name, scaler_class in scaler_by_column.items():
    df_X[column_name] = scaler_class().fit_transform(df_X[[column_name]])

print(df_X.head())



# Cluster the preprocessed feature frame (df_X) with k-means.
k = 3  # Number of clusters

# Initialize the KMeans model
kmeans = KMeans(n_clusters=k, random_state=42)

# Fit the model to the preprocessed data
kmeans.fit(df_X)

# Cluster label of every row and the k cluster centres in feature space
cluster_labels = kmeans.labels_
cluster_centers = kmeans.cluster_centers_

# Add the cluster labels to the original DataFrame
df['cluster'] = cluster_labels

# Extract the processed data as a float matrix. The explicit dtype matters:
# a frame that mixes boolean indicator columns with float columns would
# otherwise be converted to an object array.
preprocessed_data = df_X.to_numpy(dtype=float)

# Stack the centres below the data so both are embedded in the same 2-D space
data_and_centers = np.r_[preprocessed_data, cluster_centers]

# MDS starts from a random configuration; fixing the seed makes the figure
# reproducible from one run to the next.
XYcoordinates = manifold.MDS(
    n_components=2,
    normalized_stress='auto',
    random_state=42,
).fit_transform(data_and_centers)
print("Transformation complete")

# Total number of embedded rows (data points plus the k centres)
n_points = XYcoordinates.shape[0]

# The last k rows of the embedding are the centres, everything before them the
# data points; cluster_labels already holds exactly one label per data point.
clustering_scatterplot(points=XYcoordinates[:-k, :],
                       labels=cluster_labels,
                       centers=XYcoordinates[-k:, :],
                       title="KMeans Clustering with MDS")

plt.show()

# add the cluster labels to the original DataFrame
df['cluster'] = cluster_labels

# Count the rows per (cluster, subscribed) combination to see the distribution
# of subscription status in each cluster.
df_pivot = df.pivot_table(index='cluster', columns='subscribed', aggfunc='size', fill_value=0)

# Select the outcome columns by name rather than renaming them by position:
# positional renaming mislabels the counts if the column order differs and
# fails if one of the outcomes never occurs.
unexpected_outcomes = set(df_pivot.columns) - {'no', 'yes'}
if unexpected_outcomes:
    raise ValueError(
        f"unexpected values in 'subscribed': {sorted(map(str, unexpected_outcomes))}"
    )
df_pivot = df_pivot.reindex(columns=['no', 'yes'], fill_value=0)

df_pivot = df_pivot.reset_index()
df_pivot.columns.name = None  # Remove the name of the columns index

# Share of each outcome within a cluster, in percent
df_pivot['total'] = df_pivot['no'] + df_pivot['yes']
df_pivot['yes_percentage'] = df_pivot['yes'] / df_pivot['total'] * 100
df_pivot['no_percentage'] = df_pivot['no'] / df_pivot['total'] * 100

# Clusters with the highest subscription rate first
df_pivot = df_pivot.sort_values(by='yes_percentage', ascending=False).reset_index(drop=True)
print(df_pivot)

# Second 2-D view of the same clustering, this time with t-SNE.

# Initialize t-SNE with a fixed seed so the embedding is reproducible
tsne = TSNE(n_components=2, random_state=42)

# Embed the data points and the k cluster centres together so that they share
# one 2-D coordinate system.
combined_data = np.vstack([preprocessed_data, cluster_centers])
tsne_results_combined = tsne.fit_transform(combined_data)

# The last k rows of the embedding are the centres, the rest the data points
tsne_data_points = tsne_results_combined[:-k]
tsne_centers = tsne_results_combined[-k:]

print("t-SNE transformation complete")

# Use clustering scatterplot function to visualize the clusters
clustering_scatterplot(points=tsne_data_points,
                       labels=cluster_labels,
                       centers=tsne_centers,
                       title="KMeans Clustering with t-SNE")
plt.show()


# apply elbow method to find the optimal number of clusters
def elbow_method(data, max_k):
    """Plot the k-means SSE (inertia) for k = 1 .. max_k.

    Parameters
    ----------
    data : array-like
        Feature matrix passed to ``KMeans.fit`` unchanged.
    max_k : int
        Largest number of clusters to try (inclusive).

    One KMeans model with ``random_state=42`` is fitted per k and the
    resulting curve is shown with ``plt.show()``. Nothing is returned.
    """
    candidate_ks = range(1, max_k + 1)
    sse = [
        KMeans(n_clusters=n_clusters, random_state=42).fit(data).inertia_
        for n_clusters in candidate_ks
    ]

    plt.figure(figsize=(10, 6))
    plt.plot(candidate_ks, sse, marker='o')
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('SSE')
    plt.grid()
    plt.show()


# Run the elbow analysis on the preprocessed feature matrix for k = 1..10
elbow_method(preprocessed_data, max_k=10)
