In [7]:
import pandas as pd
from sklearn.impute import SimpleImputer
from numpy import nan
from numpy import isnan
from sklearn.preprocessing import StandardScaler

# Load the raw credit-card holder data.
dataset = pd.read_csv('data/credit-card-holder-data/cc-general.csv')

# CUST_ID is an identifier, not a feature: keep it as the row index when it is
# unique, otherwise discard it. Re-binding (instead of inplace=True) keeps the
# step explicit about producing a new frame.
if dataset['CUST_ID'].is_unique:
    dataset = dataset.set_index('CUST_ID')
    print("CUST_ID is unique and is now the index.")
else:
    dataset = dataset.drop(columns='CUST_ID')
    print("CUST_ID was not unique and has been removed.")

# Replace missing values with the per-column mean.
imputer = SimpleImputer(missing_values=nan, strategy='mean')
imputed_values = imputer.fit_transform(dataset)
# Imputing returns a bare array, so we lose the column names and the row index;
# add both back so rows stay aligned with the original frame (and CUST_ID).
imputed_dataset = pd.DataFrame(
    imputed_values,
    columns=dataset.columns,
    index=dataset.index,
)

# Standardise every feature before clustering; the result is used by the
# clustering sweeps further down the notebook.
scaled_imputed_dataset = StandardScaler().fit_transform(imputed_dataset)
print("Scaled Imputed Dataset Shape : ", scaled_imputed_dataset.shape)

CUST_ID is unique and is now the index.
Scaled Imputed Dataset Shape :  (8950, 17)


In [8]:
import numpy as np
import skfuzzy as fuzz
from itertools import product
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt
import os


def run_cmeans_tests(dataset, params, plot=False, location='/tmp/cmenas-data'):
    """Sweep fuzzy c-means hyper-parameters and report clustering quality.

    For every (error, fuzziness coefficient) pair in the grid, fuzzy c-means is
    fitted once per cluster count. Each fit is hardened to crisp labels (argmax
    of the membership matrix) and scored with the silhouette, Calinski-Harabasz
    and Davies-Bouldin indices, alongside the fuzzy partition coefficient (FPC)
    returned by the fit. One line of scores is printed per fit.

    Parameters
    ----------
    dataset : array-like of shape (n_samples, n_features)
        Data to cluster. The same data is used for fitting and for scoring.
    params : dict
        Must provide:
        - 'error': iterable of stopping tolerances,
        - 'fuzziness_coefficient': iterable of fuzzifier values (``m``),
        - 'number_of_clusters': exclusive upper bound of the cluster counts;
          counts ``2 .. number_of_clusters - 1`` are evaluated.
    plot : bool, default False
        When truthy, save one four-panel PNG (one panel per score) for every
        parameter combination.
    location : str
        Directory the PNG files are written to; created if missing. Only
        touched when ``plot`` is truthy.

    Returns
    -------
    None
        Results are printed and, optionally, written as figures.
    """
    samples = np.asarray(dataset)
    # The samples are transposed for the fit, while the sklearn metrics below
    # receive them untransposed (one row per sample).
    data = samples.T

    # Materialise the grid once so it can be both counted and iterated.
    param_combinations = list(product(params['error'],
                                      params['fuzziness_coefficient']))
    num_combinations = len(param_combinations)
    cluster_range = range(2, params['number_of_clusters'])

    print("Param Combinations : ", param_combinations)

    separator = '------------------------------------------------------------------------------------------------'

    for run, (error, fuzziness_coefficient) in enumerate(param_combinations, start=1):
        sil_scores = []  # silhouette score
        ch_scores = []   # calinski harabasz score
        db_scores = []   # davies bouldin score
        fpc_scores = []  # fuzzy partition coefficient

        print('')
        print(separator)
        print(f'Run {run}/{num_combinations}  Now running combo : error {error} - fuzziness coefficient {fuzziness_coefficient}')
        print(separator)
        print('')

        for n_clusters in cluster_range:
            _, u, _, _, _, _, fpc = fuzz.cluster.cmeans(
                data, c=n_clusters, m=fuzziness_coefficient, error=error,
                maxiter=1000, seed=111, init=None)
            # Harden the fuzzy memberships: each sample goes to its strongest cluster.
            labels = np.argmax(u, axis=0)

            if np.unique(labels).size < 2:
                # The sklearn metrics reject a partition with a single label;
                # record NaN instead of aborting the whole sweep.
                sil_score = ch_score = db_score = float('nan')
            else:
                # Score against the data that was actually clustered, not a
                # notebook-level global.
                sil_score = silhouette_score(samples, labels)
                ch_score = calinski_harabasz_score(samples, labels)
                db_score = davies_bouldin_score(samples, labels)

            fpc_scores.append(fpc)
            sil_scores.append(sil_score)
            ch_scores.append(ch_score)
            db_scores.append(db_score)

            print(f'Number of clusters {n_clusters} : Silhouette Score {sil_score} : Calinski Harabasz Score {ch_score} : Davies Bouldin Score {db_score} : fpc {fpc}')

        if plot:
            fig, axes = plt.subplots(4, 1, figsize=(5, 10))  # width, height

            # (scores, label, line style) for each panel, top to bottom.
            panels = [
                (sil_scores, 'silhouette score', {'marker': 'o'}),
                (ch_scores, 'calinski-harabasz score', {'marker': 'o', 'color': 'r'}),
                (db_scores, 'davies-bouldin score', {'marker': 'o', 'color': 'g'}),
                (fpc_scores, 'fuzzy partition coefficient', {'marker': 'o', 'color': 'g'}),
            ]
            for ax, (scores, label, line_style) in zip(axes, panels):
                ax.plot(cluster_range, scores, **line_style)
                ax.set_title(f'{label} vs number of clusters')
                ax.set_xlabel('number of clusters')
                ax.set_ylabel(label)

            filename = f"noc-{params['number_of_clusters']}--error-{error}--fc-{fuzziness_coefficient}.png"
            # exist_ok avoids the check-then-create race of os.path.exists().
            os.makedirs(location, exist_ok=True)

            fig.subplots_adjust(hspace=.5, wspace=.5)
            fig.savefig(os.path.join(location, filename))
            # Close explicitly: one figure is created per combination.
            plt.close(fig)



In [17]:
from sklearn.cluster import KMeans
from itertools import product
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import os


def run_kmeans_tests(dataset, params, plot=False, location='/tmp/kmenas-data'):
    """Sweep KMeans hyper-parameters and report clustering quality.

    For every (init, n_init, tol, algo) combination in the grid, KMeans is
    fitted once per cluster count and scored with inertia plus the silhouette,
    Calinski-Harabasz and Davies-Bouldin indices. One line of scores is printed
    per fit.

    Parameters
    ----------
    dataset : array-like of shape (n_samples, n_features)
        Data to cluster. The same data is used for fitting and for scoring.
    params : dict
        Must provide:
        - 'init', 'n_init', 'tol', 'algo': iterables of values forwarded to
          ``KMeans`` as ``init``, ``n_init``, ``tol`` and ``algorithm``,
        - 'number_of_clusters': exclusive upper bound of the cluster counts;
          counts ``2 .. number_of_clusters - 1`` are evaluated.
    plot : bool, default False
        When truthy, save one four-panel PNG (one panel per score) for every
        parameter combination.
    location : str
        Directory the PNG files are written to; created if missing. Only
        touched when ``plot`` is truthy.

    Returns
    -------
    None
        Results are printed and, optionally, written as figures.
    """
    # Materialise the grid once so it can be both counted and iterated.
    param_combinations = list(product(params['init'],
                                      params['n_init'],
                                      params['tol'],
                                      params['algo']))
    num_combinations = len(param_combinations)
    cluster_range = range(2, params['number_of_clusters'])

    separator = '------------------------------------------------------------------------------------------------'

    for run, (init, n_init, tol, algo) in enumerate(param_combinations, start=1):
        inertia_scores = []  # list of inertia scores
        sil_scores = []      # list of silhouette score
        ch_scores = []       # list of calinski harabasz score
        db_scores = []       # list of davies bouldin score

        print('')
        print(separator)
        print(f'Run {run}/{num_combinations}  Now running combo : init {init} - n_init {n_init} - tol {tol} - algo {algo}')
        print(separator)
        print('')

        for n_clusters in cluster_range:
            kmeans = KMeans(n_clusters=n_clusters, init=init, n_init=n_init,
                            max_iter=500, tol=tol, random_state=111,
                            algorithm=algo)
            kmeans.fit(dataset)

            # Score against the data that was actually clustered, not a
            # notebook-level global.
            sil_score = silhouette_score(dataset, kmeans.labels_)
            ch_score = calinski_harabasz_score(dataset, kmeans.labels_)
            db_score = davies_bouldin_score(dataset, kmeans.labels_)

            inertia_scores.append(kmeans.inertia_)
            sil_scores.append(sil_score)
            ch_scores.append(ch_score)
            db_scores.append(db_score)

            print(f'Number of clusters {n_clusters} : Silhouette Score {sil_score} : Calinski Harabasz Score {ch_score} : Davies Bouldin Score {db_score}')

        if plot:
            fig, axes = plt.subplots(4, 1, figsize=(5.5, 10))  # width, height

            # (scores, label, line style) for each panel, top to bottom.
            panels = [
                (inertia_scores, 'inertia score', {'marker': 'o', 'color': 'r'}),
                (sil_scores, 'silhouette score', {'marker': 'o'}),
                (ch_scores, 'calinski-harabasz score', {'marker': 'o', 'color': 'r'}),
                (db_scores, 'davies-bouldin score', {'marker': 'o', 'color': 'g'}),
            ]
            for ax, (scores, label, line_style) in zip(axes, panels):
                ax.plot(cluster_range, scores, **line_style)
                ax.set_title(f'{label} vs number of clusters')
                ax.set_xlabel('number of clusters')
                ax.set_ylabel(label)

            # Inertia values are large; engineering notation keeps ticks readable.
            axes[0].yaxis.set_major_formatter(ticker.EngFormatter())

            filename = f"noc-{params['number_of_clusters']}--init-{init}--n_init-{n_init}--tol-{tol}--algo-{algo}.png"
            # exist_ok avoids the check-then-create race of os.path.exists().
            os.makedirs(location, exist_ok=True)

            fig.subplots_adjust(hspace=1, wspace=1)
            fig.savefig(os.path.join(location, filename))
            # Close explicitly: one figure is created per combination.
            plt.close(fig)

In [10]:
# Search grid for the fuzzy c-means sweep: every error tolerance is combined
# with every fuzziness coefficient. "number_of_clusters" is the exclusive upper
# bound of range(2, ...) inside run_cmeans_tests.
cmeans_params = {
    "number_of_clusters": 9,
    "error": [0.005, 0.009, 0.05, 0.5],
    "fuzziness_coefficient": [1.1, 1.2, 1.3, 1.5, 1.6, 1.7, 1.8, 1.9, 2, 3, 4],
}

run_cmeans_tests(
    scaled_imputed_dataset,
    params=cmeans_params,
    plot=True,
    location='/tmp/data/cmeans/one',
)

Param Combinations :  <itertools.product object at 0xffffb019f100>

------------------------------------------------------------------------------------------------
Run 1/44  Now running combo : error 0.005 - fuzziness coefficient 1.1
------------------------------------------------------------------------------------------------



Number of clusters 2 : Silhouette Score 0.20913837075644592 : Calinski Harabasz Score 1706.0199290784296 : Davies Bouldin Score 1.9124332513936189 : fpc 0.9409512971602996
Number of clusters 3 : Silhouette Score 0.18235319446218642 : Calinski Harabasz Score 1568.448001466996 : Davies Bouldin Score 1.7623566606332652 : fpc 0.9331743204987715
Number of clusters 4 : Silhouette Score 0.19688309330074086 : Calinski Harabasz Score 1597.2444026025203 : Davies Bouldin Score 1.5799691428441558 : fpc 0.9325525739088946
Number of clusters 5 : Silhouette Score 0.19154991179340866 : Calinski Harabasz Score 1455.6430894338262 : Davies Bouldin Score 1.4733168017822598 : fpc 0.9280125406942286
Number of clusters 6 : Silhouette Score 0.2030842620208946 : Calinski Harabasz Score 1360.8552564605218 : Davies Bouldin Score 1.3525479548332708 : fpc 0.9388974677071411
Number of clusters 7 : Silhouette Score 0.21380851808084933 : Calinski Harabasz Score 1361.0125748342075 : Davies Bouldin Score 1.397548204105

In [18]:
# Search grid for the KMeans sweep: the cartesian product of init strategy,
# n_init, tolerance and algorithm. "number_of_clusters" is the exclusive upper
# bound of range(2, ...) inside run_kmeans_tests.
kmeans_params = {
    "number_of_clusters": 9,
    "init": ['k-means++', 'random'],
    "n_init": [1, 3, 5, 7, 9, 11, 13, 15],
    "tol": [0.1, 0.0001, 0.00001],
    "algo": ['lloyd', 'elkan'],
}

run_kmeans_tests(
    scaled_imputed_dataset,
    params=kmeans_params,
    plot=True,
    location='/tmp/data/kmeans/one',
)


------------------------------------------------------------------------------------------------
Run 1/96  Now running combo : init k-means++ - n_init 1 - tol 0.1 - algo lloyd
------------------------------------------------------------------------------------------------

Number of clusters 2 : Silhouette Score 0.3375076523389319 : Calinski Harabasz Score 1589.4806935953684 : Davies Bouldin Score 1.6825815308831145
Number of clusters 3 : Silhouette Score 0.2448468714779771 : Calinski Harabasz Score 1603.582407540481 : Davies Bouldin Score 1.6096987720282334
Number of clusters 4 : Silhouette Score 0.1926670665209963 : Calinski Harabasz Score 1589.6010814890308 : Davies Bouldin Score 1.6017404545174736
Number of clusters 5 : Silhouette Score 0.1902322447909543 : Calinski Harabasz Score 1469.478538591397 : Davies Bouldin Score 1.479043631345901
Number of clusters 6 : Silhouette Score 0.17497084240217345 : Calinski Harabasz Score 1310.0534750733111 : Davies Bouldin Score 1.63418405887405

In [16]:
# testing kmeans default params
kmeans_params = {
    "number_of_clusters" : 9 ,
    "init" : [ 'k-means++' ],
    "n_init" : [ 1 ],
    "tol" : [ 0.0001 ],
    "algo" : [ 'lloyd' ]
}
run_kmeans_tests(scaled_imputed_dataset, params=kmeans_params, plot=True, location='/tmp/data/kmeans/two')


------------------------------------------------------------------------------------------------
Run 1/1  Now running combo : init k-means++ - n_init 1 - tol 0.0001 - algo lloyd
------------------------------------------------------------------------------------------------

Number of clusters 2 : Silhouette Score 0.27991297227096823 : Calinski Harabasz Score 1609.2327121291971 : Davies Bouldin Score 1.8000347099419787
Number of clusters 3 : Silhouette Score 0.25024795274747624 : Calinski Harabasz Score 1604.420308678222 : Davies Bouldin Score 1.5984540971640675
Number of clusters 4 : Silhouette Score 0.1974482519028498 : Calinski Harabasz Score 1597.4890120045775 : Davies Bouldin Score 1.5760581389736181
Number of clusters 5 : Silhouette Score 0.18963969210537016 : Calinski Harabasz Score 1469.7617035721971 : Davies Bouldin Score 1.4797351799242295
Number of clusters 6 : Silhouette Score 0.19737761940537168 : Calinski Harabasz Score 1408.3466149536869 : Davies Bouldin Score 1.4481389