In [1]:
import os, sys

import numpy as np
import pandas as pd

from os.path import join

from library import SBKMeans, ErrorChecker
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans

In [2]:
# --- Input location ----------------------------------------------------------
# The CSV is expected in a `data/` folder under the current working directory.
filename = 'total_data_na.csv'
pwd = os.getcwd()
data_dir = join(pwd, 'data/')
datapath = join(data_dir, filename)

# --- Preprocessing switches --------------------------------------------------
# Scaling flags are checked in this order further down; only the first one
# that is True takes effect.
need_normalization, need_standardization = True, False
need_pca = True

# --- Model hyper-parameters --------------------------------------------------
num_comps = 2          # number of PCA components kept when need_pca is True
num_clusters = 5       # cluster count passed to both SBKMeans and KMeans
num_iterations = 300   # iteration cap passed to both SBKMeans and KMeans

In [3]:
# Load the raw table.  low_memory=False makes pandas parse the file in a
# single pass rather than in chunks, so each column's dtype is inferred once.
df = pd.read_csv(
    datapath,
    low_memory=False,
)

In [4]:
# Number of rows in the freshly loaded table.
print(df.shape[0])

143


In [5]:
# Plain-text preview of the first five rows.
print(df.head(n=5))

            PLAYER  Mat.x  Inns.x  NO  Runs.x  HS  Avg.x   BF    SR.x  X100  \
0      Aaron Finch     10       9   1     134  46  16.75  100  134.00     0   
1   AB de Villiers     12      11   2     480  90  53.33  275  174.54     0   
2  Abhishek Sharma      3       3   2      63  46     63   33  190.90     0   
3   Ajinkya Rahane     15      14   1     370  65  28.46  313  118.21     0   
4       Alex Hales      6       6   0     148  45  24.66  118  125.42     0   

   ...   Ov  Runs.y  Wkts  BBI  Avg.y  Econ  SR.y  X4w  X5w  y  
0  ...  0.0       0     0    0      0   0.0     0    0    0  0  
1  ...  0.0       0     0    0      0   0.0     0    0    0  0  
2  ...  0.0       0     0    0      0   0.0     0    0    0  0  
3  ...  0.0       0     0    0      0   0.0     0    0    0  0  
4  ...  0.0       0     0    0      0   0.0     0    0    0  0  

[5 rows x 25 columns]


In [6]:
# Show the dtype pandas inferred for every column.  In the recorded output
# Avg.x, Avg.y and SR.y come back as `object` rather than numeric, which is
# why the next cell coerces every column with pd.to_numeric.
print(df.dtypes)

PLAYER     object
Mat.x       int64
Inns.x      int64
NO          int64
Runs.x      int64
HS          int64
Avg.x      object
BF          int64
SR.x      float64
X100        int64
X50         int64
X4s         int64
X6s         int64
Mat.y       int64
Inns.y      int64
Ov        float64
Runs.y      int64
Wkts        int64
BBI         int64
Avg.y      object
Econ      float64
SR.y       object
X4w         int64
X5w         int64
y           int64
dtype: object


In [7]:
# Drop the first column by position (PLAYER in the recorded dtypes listing) and
# coerce every remaining column to a numeric dtype in a single pass.
# `DataFrame.apply` builds a new frame, so nothing is assigned column-by-column
# into a frame that was itself produced by slicing another frame -- the pattern
# this replaces can emit pandas' SettingWithCopyWarning.  Because of
# errors='coerce', values that cannot be parsed as numbers become NaN.
# NOTE: this cell is not idempotent -- re-running it without re-running the
# load cell drops one more leading column each time.
df = df.iloc[:, 1:].apply(pd.to_numeric, errors='coerce')

# Missing / unparseable entries are replaced by 0 ...
df = df.fillna(0)

# ... whereas rows containing +/-inf are removed: inf is first mapped to NaN
# and dropna() then discards those rows.  The order matters: fillna(0) must run
# before this step, otherwise the inf rows would be zero-filled, not dropped.
df = df.replace([-np.inf, np.inf], np.nan).dropna()

# Keep only the columns that contain at least one non-zero value.
df = df.loc[:, (df != 0).any(axis=0)]
print(len(df))

143


In [8]:
# Shuffle all rows (frac=1.0 keeps every row) with a fixed seed so the order is
# repeatable, then renumber the index from 0 and discard the old one.
df = df.sample(frac=1.0, random_state=1).reset_index(drop=True)

In [9]:
# Optional feature scaling, selected by the two config flags; the first flag
# that is True wins and at most one scaler is applied.  The scaled array is
# wrapped back into a DataFrame that keeps the original column names.
# NOTE(review): the flag names look swapped relative to the scalers they
# select -- need_normalization picks StandardScaler (zero-mean / unit-variance
# standardisation) while need_standardization picks MinMaxScaler (min-max
# normalisation).  Behaviour is left untouched here; confirm which pairing is
# actually intended before renaming or swapping anything.
if need_normalization is True:
    normalizer = StandardScaler()
    df = pd.DataFrame(normalizer.fit_transform(df), columns=df.columns)
elif need_standardization is True:
    scaler = MinMaxScaler()
    df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

In [10]:
# Build the feature matrix X as a NumPy array: either the projection of the
# table onto its first `num_comps` principal components, or the raw values.
if need_pca is True:
    pca = PCA(n_components=num_comps, svd_solver='auto')
    X = pca.fit_transform(df)
else:
    X = df.to_numpy()

In [11]:
# Sanity check: print every row of X that holds an infinite value.  A row is
# printed once per infinite entry it contains, in row order; nothing is printed
# when X is entirely finite (no output was recorded for this cell).
for row_idx in np.nonzero(np.isinf(X))[0]:
    print(X[row_idx])

In [12]:
# `data` is the array handed to both clustering runs below.  Show its first
# five rows followed by its shape.
data = X
print(data[:5], data.shape, sep='\n')

[[ 0.72042344 -2.36568164]
 [-2.73225787 -1.89400323]
 [ 0.07945916  1.40378715]
 [ 7.37203169  1.40441952]
 [-1.72141235 -3.17838008]]
(143, 2)


In [13]:
# Instantiate the project's SBKMeans clusterer with the cluster count and
# iteration budget taken from the config cell.
sbkmeans = SBKMeans(n_clusters=num_clusters, n_iters=num_iterations)

In [14]:
# Fit SBKMeans on the feature matrix.  Per the recorded output, fit() prints
# the initial centers it starts from.  Its return value is kept as `centers`
# and later passed to ErrorChecker.
# NOTE(review): presumably the return value is the final fitted centers (the
# same thing as `sbkmeans.centers`) -- confirm against library.SBKMeans.
centers = sbkmeans.fit(data)

Initial centers:
[[-1.89000451e+00 -1.32695409e+00]
 [ 7.37203169e+00  1.40441952e+00]
 [-1.58452328e+00  2.16521354e+00]
 [ 1.78368273e+00 -1.71882952e+00]
 [ 5.80628650e+00  3.47656563e-03]]


In [15]:
# Show the centers stored on the fitted SBKMeans object (one row per cluster in
# the recorded output).
print(sbkmeans.centers)

[[-2.68738893 -0.05409887]
 [ 5.18611986  0.2593172 ]
 [-0.70763264  3.67203206]
 [-0.58322841 -2.45591975]
 [ 1.80561361 -1.78988086]]


In [16]:
# Assign each row of `data` to a cluster using the fitted SBKMeans model.
labels = sbkmeans.predict(data)

In [17]:
# Peek at the cluster labels of the first five rows.
print(labels[:5])

[4, 0, 2, 1, 3]


In [18]:
# Bundle the data with the SBKMeans result (the value returned by fit() and the
# labels from predict()) so the clustering can be scored.
error_checker = ErrorChecker(X=data, centers=centers, labels=labels)

In [19]:
# Score the SBKMeans clustering with ErrorChecker's potential function.
# NOTE(review): presumably a total point-to-center distance measure where lower
# is better -- confirm against library.ErrorChecker.
dist_total = error_checker.potential_function()

In [20]:
# Report the potential obtained with SBKMeans.
print(dist_total)

19327.897957343488


In [21]:
# Baseline for comparison: scikit-learn's KMeans with k-means++ seeding and the
# same cluster count / iteration cap as SBKMeans.
#
# * `algorithm` is left at the library default.  The previous explicit
#   algorithm='auto' was deprecated in scikit-learn 1.1 and is rejected by
#   1.3+, while on older releases 'auto' already was the default.
# * n_init is pinned to 10 (the long-standing default) because scikit-learn 1.4
#   changed the default to 'auto', which would silently run a single
#   initialisation with k-means++.
# * random_state makes the baseline reproducible across kernel restarts; the
#   seed matches the one used when shuffling the rows.
kmeans = KMeans(
    n_clusters=num_clusters,
    max_iter=num_iterations,
    init='k-means++',
    n_init=10,
    random_state=1,
)

In [22]:
# Fit the scikit-learn baseline and get a cluster label for every row.
# Note: this overwrites the `labels` produced earlier by SBKMeans; the next
# cell reads kmeans.labels_ rather than this variable.
labels = kmeans.fit_predict(data)

In [23]:
# Score the scikit-learn KMeans result with the same ErrorChecker that was used
# for SBKMeans.  Arguments are passed by keyword, exactly as in the SBKMeans
# evaluation, so both runs bind X / centers / labels identically regardless of
# the constructor's positional order.
# Note: this rebinds `error_checker`, replacing the SBKMeans instance.
error_checker = ErrorChecker(
    X=data,
    centers=kmeans.cluster_centers_,
    labels=kmeans.labels_,
)

In [24]:
# Potential of the scikit-learn KMeans clustering.  This rebinds `dist_total`,
# so the SBKMeans value computed earlier is no longer available afterwards.
dist_total = error_checker.potential_function()

In [25]:
# Report the potential obtained with scikit-learn KMeans, for comparison with
# the SBKMeans figure printed earlier.
print(dist_total)

21738.439210085406
