In [1]:
import os, sys

import numpy as np
import pandas as pd

from os.path import join

from ikmeans import IKMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
pwd = os.getcwd()
data_dir = join(pwd, 'data/')
need_normalization = False
need_standardization = True
need_pca = True
filename = 'Mall_customers.csv'
datapath = join(data_dir, filename)
num_comps = 2
num_clusters = 3
num_iterations = 20

In [3]:
# f = open('data/data.csv')
# lines = f.read()
# print(type(lines))
# new_lines = ''
# for line in lines:
#     new_lines += str(line.encode('utf-8').strip())
# print(len(new_lines))

In [4]:
df = pd.read_csv(datapath, low_memory=False, encoding='utf-8')

In [5]:
print(len(df))

200


In [6]:
print(df.head())

   CustomerID   Genre  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40


In [7]:
print(df.dtypes)

CustomerID                 int64
Genre                     object
Age                        int64
Annual Income (k$)         int64
Spending Score (1-100)     int64
dtype: object


In [8]:
columns = df.columns.tolist()
cols = columns
# cols = columns[1:]
# cols = ['Survived', 'Pclass', 'Age', 'SibSp', 'Fare']
# cols = columns[:-1]
# cols = ['Passenger Count', 'Adjusted Passenger Count']
# cols = columns[1:]
cols = columns[2:]
df = df[cols]
for col in cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.fillna(0)
df = df.replace([-np.inf, np.inf], np.nan)
df = df.dropna()
df = df.loc[:, (df != 0).any(axis=0)]
print(len(df))
print(df.dtypes)

200
Age                       int64
Annual Income (k$)        int64
Spending Score (1-100)    int64
dtype: object


In [9]:
df = df.sample(
    frac=1.0,
    random_state=1,
)
df = df.reset_index(drop=True)

In [10]:
if need_normalization is True:
    normalizer = StandardScaler()
    tmp = normalizer.fit_transform(df)
    df = pd.DataFrame(tmp, columns=df.columns)
elif need_standardization is True:
    scaler = MinMaxScaler()
    tmp = scaler.fit_transform(df)
    df = pd.DataFrame(tmp, columns=df.columns)

In [11]:
X = df
if need_pca is True:
    pca = PCA(
        n_components=num_comps,
        svd_solver='auto',
    )
    X = pca.fit_transform(X)
else:
    X = X.to_numpy()

In [12]:
for x in X:
    for a in x:
        if np.isinf(a):
            print(x)

In [13]:
data = X
print(data[:5])
print(data.shape)

[[-0.16911371  0.14206818]
 [ 0.47574107 -0.24308493]
 [ 0.39964286  0.12035475]
 [ 0.33201674 -0.43457463]
 [ 0.10246773  0.07535735]]
(200, 2)


In [14]:
models = ['ikmeans', 'kmeans']

In [15]:
ikmeans = IKMeans(
    n_clusters=num_clusters,
    max_iter=num_iterations,
    verbose=0,
)

In [16]:
kmeans = KMeans(
    n_clusters=num_clusters,
    max_iter=num_iterations,
    init='k-means++',
    algorithm='auto',
#     n_init=50,
    verbose=0,
)

In [17]:
algorithms = {
    'kmeans': kmeans,
    'ikmeans': ikmeans,
}

In [None]:
for model in models:
    algorithm = algorithms[model].fit(data)
    centers = algorithm.cluster_centers_
    labels = algorithms[model].predict(data)
    fig, ax = plt.subplots()
    print(int(algorithms[model].inertia_))
    scatter = plt.scatter(
        X[:, 0],
        X[:, 1],
        c=labels,
    )
    handles, labels = scatter.legend_elements()
    legend = ax.legend(
        handles,
        labels,
        loc='upper right',
        title='Label',
    )
    ax.add_artist(legend)
    plt.title(model+' '+str(num_clusters)+ ' clusters')
    plt.xlabel('Component A')
    plt.ylabel('Component B')
    plt.show()

9