In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from numpy import nan
from numpy import isnan
from sklearn.preprocessing import StandardScaler

# Load the raw credit-card holder data (one row per customer).
dataset = pd.read_csv('data/credit-card-holder-data/cc-general.csv')

# CUST_ID is an identifier, not a feature. Keep it as the row index when it is
# unique so rows stay traceable to customers; otherwise discard it entirely.
# Reassigning (rather than mutating in place) keeps the cell easy to reason about.
if dataset['CUST_ID'].is_unique:
    dataset = dataset.set_index('CUST_ID')
    print("CUST_ID is unique and is now the index.")
else:
    dataset = dataset.drop(columns='CUST_ID')
    print("CUST_ID was not unique and has been removed.")

# Replace every NaN with the mean of its column.
imputer = SimpleImputer(missing_values=nan, strategy='mean')
imputed_values = imputer.fit_transform(dataset)
# The imputer returns a bare array, so we lose both the column names and the
# row index; restore both so the CUST_ID index set above is not discarded.
imputed_dataset = pd.DataFrame(
    imputed_values,
    columns=dataset.columns,
    index=dataset.index,
)


# Standardise the imputed features; the result is what the clustering cell fits on.
scaled_imputed_dataset = StandardScaler().fit_transform(imputed_dataset)
print("Scaled Imputed Dataset Shape : ", scaled_imputed_dataset.shape)

CUST_ID is unique and is now the index.
Scaled Imputed Dataset Shape :  (8950, 17)


In [2]:
from sklearn.cluster import KMeans
# Run tag: kmeans-noc-9--init-k-means++--n_init-15--tol-1e-05--algo-elkan
# NOTE(review): the tag above says "noc-9" but the model below is built with
# n_clusters=3 -- confirm which value is intended.
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    n_init=15,
    max_iter=500,
    tol=1e-05,
    random_state=111,  # fixed seed so the cluster assignment is reproducible
    algorithm='elkan',
)
# Fit on the standardised, imputed feature matrix; labels end up in kmeans.labels_.
kmeans.fit(scaled_imputed_dataset)

In [4]:
import matplotlib.pyplot as plt
from itertools import combinations
import numpy as np
import os

# Save one 3D scatter plot (coloured by k-means cluster) for every
# 3-column combination of the imputed dataset.
columns = imputed_dataset.columns

# Materialise the combinations once so they can be both counted and iterated,
# instead of building the iterator twice.
column_combinations = list(combinations(columns, 3))
num_combinations = len(column_combinations)

# Create the output directory once, up front; exist_ok avoids the separate
# existence check on every iteration.
output_dir = '/tmp/data/3d-data/'
os.makedirs(output_dir, exist_ok=True)

# The legend depends only on the cluster labels, so build it once rather than
# recomputing it (and scanning every label) for each figure.
unique_labels = np.unique(kmeans.labels_)
# With a single cluster the largest label is 0; clamp the divisor to 1 so the
# colour lookup does not divide by zero.
label_span = max(int(unique_labels.max()), 1)
legend_handles = [
    plt.Line2D(
        [0], [0],
        marker='o',
        color='w',
        label=f'Cluster {label}',
        markerfacecolor=plt.cm.viridis(label / label_span),
        markersize=10,
    )
    for label in unique_labels
]

run = 0
for run, combo in enumerate(column_combinations, start=1):
    x, y, z = combo
    print('')
    print('------------------------------------------------------------------------------------------------')
    print(f'Run {run}/{num_combinations}  Now running combo : x {x} - y {y} - z {z}')
    print('------------------------------------------------------------------------------------------------')
    print('') 

    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(projection='3d')

    scatter = ax.scatter(imputed_dataset[x], imputed_dataset[y], imputed_dataset[z], c=kmeans.labels_, cmap='viridis', s=20)

    ax.set_xlabel(x)
    ax.set_ylabel(y)
    ax.set_zlabel(z)

    ax.legend(handles=legend_handles, title="Clusters")

    filename = f"{x}_{y}_{z}.png"
    fig.savefig(os.path.join(output_dir, filename))
    # Close this specific figure so hundreds of figures do not pile up in memory.
    plt.close(fig)

print("All plots with legends are saved.")


------------------------------------------------------------------------------------------------
Run 1/680  Now running combo : x BALANCE - y BALANCE_FREQUENCY - z PURCHASES
------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------
Run 2/680  Now running combo : x BALANCE - y BALANCE_FREQUENCY - z ONEOFF_PURCHASES
------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------
Run 3/680  Now running combo : x BALANCE - y BALANCE_FREQUENCY - z INSTALLMENTS_PURCHASES
------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------
Run 4/680  Now running combo : x BALANCE - y BALANCE_FREQUENCY 

In [5]:
import matplotlib.pyplot as plt
from itertools import combinations
import numpy as np
import os

# Save one 2D scatter plot (coloured by k-means cluster) for every
# 2-column combination of the imputed dataset.
columns = imputed_dataset.columns

# Materialise the combinations once so they can be both counted and iterated,
# instead of building the iterator twice.
column_combinations = list(combinations(columns, 2))
num_combinations = len(column_combinations)

# Create the output directory once, up front; exist_ok avoids the separate
# existence check on every iteration.
output_dir = '/tmp/data/2d-data/'
os.makedirs(output_dir, exist_ok=True)

# The legend depends only on the cluster labels, so build it once rather than
# recomputing it (and scanning every label) for each figure.
unique_labels = np.unique(kmeans.labels_)
# With a single cluster the largest label is 0; clamp the divisor to 1 so the
# colour lookup does not divide by zero.
label_span = max(int(unique_labels.max()), 1)
legend_handles = [
    plt.Line2D(
        [0], [0],
        marker='o',
        color='w',
        label=f'Cluster {label}',
        markerfacecolor=plt.cm.viridis(label / label_span),
        markersize=10,
    )
    for label in unique_labels
]

run = 0
for run, combo in enumerate(column_combinations, start=1):
    x, y = combo
    print('')
    print('------------------------------------------------------------------------------------------------')
    print(f'Run {run}/{num_combinations}  Now running combo : x {x} - y {y}')
    print('------------------------------------------------------------------------------------------------')
    print('') 

    fig, ax = plt.subplots(figsize=(12, 8))

    scatter = ax.scatter(imputed_dataset[x], imputed_dataset[y], c=kmeans.labels_, cmap='viridis', s=20)

    ax.set_xlabel(x)
    ax.set_ylabel(y)

    ax.legend(handles=legend_handles, title="Clusters")

    filename = f"{x}_{y}.png"
    fig.savefig(os.path.join(output_dir, filename))
    # Close this specific figure so the figures do not pile up in memory.
    plt.close(fig)

print("All 2D plots with legends are saved.")


------------------------------------------------------------------------------------------------
Run 1/136  Now running combo : x BALANCE - y BALANCE_FREQUENCY
------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------
Run 2/136  Now running combo : x BALANCE - y PURCHASES
------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------
Run 3/136  Now running combo : x BALANCE - y ONEOFF_PURCHASES
------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------
Run 4/136  Now running combo : x BALANCE - y INSTALLMENTS_PURCHASES
------------------------------------------------------------------