#### Machine learning tasks

##### Supervised learning (we know the class or value of our training data)
* classification (predicting the class of a data point) - decision_tree_classifier, mlp_classifier, kneighbours
* regression (predicting the value of a data point) - decision_tree_regressor, mlp_regressor
* time series forecasting - arima, ...

##### Unsupervised learning (we do not know the class or value of our training data)
* clustering (assigning clusters to data points based on proximity / similarity) - kmeans

###### kmeans
* k is an input -- this is how many clusters you want
* initially places k cluster centers randomly
* each data point is assigned to the closest cluster center
* cluster center is moved to shorten distances with assigned data points
* data points are re-examined and assigned to the closest relocated cluster center
* cluster center is moved again to shorten distances with assigned data points
* this repeats until the assignments do not change (convergence)
* the goal is to lower the average distance between points in a cluster and their cluster center


`* data points`
`+ cluster centers`
`k = 2`
| *1     *2   *2
| +1       +2
| *1       
| *1       *2
-----------
 

 https://www.dannyadam.com/blog/2019/07/kmeans1d-globally-optimal-efficient-1d-k-means/

 within cluster variance

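Token:
[REDACTED - an access token was committed here in plain text. Revoke it, and load credentials from an environment variable or a secrets manager instead of storing them in the notebook.]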

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans

In [3]:
# Minimum-users thresholds that later cells iterate over.
k = [500, 1000, 10000]

# Column names are supplied explicitly through `names=`.
columns = ['user_key', 'hashvalue']
df = pd.read_csv('../assignments/kakao_cluster_data.csv', names=columns)
print(len(df))

10000000


In [4]:
# Order the rows by hash value and renumber the index, then attach a dense
# rank: rows sharing a hashvalue share a rank and ranks increase without gaps.
df = df.sort_values(by='hashvalue').reset_index(drop=True)
df['rank'] = df['hashvalue'].rank(method='dense')
print(len(df))
display(df.head())

10000000


Unnamed: 0,user_key,hashvalue,rank
0,1bc820ac5e295c68d696a32da57b65e8,635656006,1.0
1,36053bc85af28f528e35f5c72b4c4186,1944196966,2.0
2,1c1fbe6ab4b02d34b3275c10f812c3c6,1944196966,2.0
3,15dc58cd35a16cedac13fb0dbbddd296,3055813318,3.0
4,07477b17b5d618909bcd65fb10e8105c,3080390514,4.0


In [5]:
# Independent copy of the first 1,000,000 rows of the sorted frame.
df_test = df.iloc[:1000000].copy()
display(df_test.head())

Unnamed: 0,user_key,hashvalue,rank
0,1bc820ac5e295c68d696a32da57b65e8,635656006,1.0
1,36053bc85af28f528e35f5c72b4c4186,1944196966,2.0
2,1c1fbe6ab4b02d34b3275c10f812c3c6,1944196966,2.0
3,15dc58cd35a16cedac13fb0dbbddd296,3055813318,3.0
4,07477b17b5d618909bcd65fb10e8105c,3080390514,4.0


#### Compare within-cluster variance (inertia) and run times of kmeans1d, kmeans, & kmeans with custom init to set initial centroids

In [6]:
%%timeit

import kmeans1d

# NOTE: this cell runs under %%timeit, so the body executes several times and
# every print below appears once per run.
df_1d = df_test.copy()
n_clusters = 100

# `clusters` holds one cluster index per row; `centroids` is indexed by that index.
clusters, centroids = kmeans1d.cluster(df_1d['hashvalue'], n_clusters)
df_1d[f'kmeans_10'] = clusters
# display(df_1d)

# Within-cluster sum of squared distances, computed in one vectorised pass
# instead of a Python-level loop over every row.
assigned_centers = np.asarray(centroids, dtype=float)[np.asarray(clusters)]
squared_distances = (df_1d['hashvalue'].to_numpy(dtype=float) - assigned_centers) ** 2
inertia_ = float(squared_distances.sum())

counts = df_1d['kmeans_10'].value_counts()
# The printed "inertia" is the sum of squares divided by the number of rows.
print(f"inertia: {int(inertia_/len(df_1d['hashvalue']))}")
print(f"number_of_clusters: {df_1d['kmeans_10'].nunique()}")
print(f"counts_per_cluster:\n{counts}")
print(f"min_value: {min(counts.values)}")
print(f"max_value: {np.max(counts.values)}")
print(f"total_counts: {sum(counts.values)}")

inertia: 37581746335667896451072
number_of_clusters: 100
counts_per_cluster:
kmeans_10
56    115503
48     79402
41     70274
72     60327
62     40297
       ...  
15      1687
18      1640
46      1458
45      1349
19      1292
Name: count, Length: 100, dtype: int64
min_value: 1292
max_value: 115503
total_counts: 1000000
inertia: 37581746335667896451072
number_of_clusters: 100
counts_per_cluster:
kmeans_10
56    115503
48     79402
41     70274
72     60327
62     40297
       ...  
15      1687
18      1640
46      1458
45      1349
19      1292
Name: count, Length: 100, dtype: int64
min_value: 1292
max_value: 115503
total_counts: 1000000
inertia: 37581746335667896451072
number_of_clusters: 100
counts_per_cluster:
kmeans_10
56    115503
48     79402
41     70274
72     60327
62     40297
       ...  
15      1687
18      1640
46      1458
45      1349
19      1292
Name: count, Length: 100, dtype: int64
min_value: 1292
max_value: 115503
total_counts: 1000000
inertia: 3758174633566789

In [None]:
%%timeit

from sklearn.cluster import KMeans

# NOTE: this cell runs under %%timeit, so the body executes several times and
# every print below appears once per run.
df_kmeans = df_test.copy()
n_clusters = 100

# n_init is pinned explicitly: leaving it unset triggers the scikit-learn
# warning seen in the recorded output (default_n_init=10), and the result
# would then depend on the installed version's default.
# NOTE(review): random_state=True presumably acts as the integer seed 1 -
# consider passing an explicit int.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=True)
kmeans.fit(df_kmeans[['hashvalue']])
clusters = kmeans.labels_
df_kmeans[f'kmeans_10'] = clusters
# display(df_kmeans)

counts = df_kmeans['kmeans_10'].value_counts()
# The printed "inertia" is KMeans.inertia_ divided by the number of rows.
print(f"inertia: {int(kmeans.inertia_/len(df_kmeans['hashvalue']))}")
print(f"number_of_clusters: {df_kmeans['kmeans_10'].nunique()}")
print(f"counts_per_cluster:\n{counts}")
print(f"min_value: {min(counts.values)}")
print(f"max_value: {np.max(counts.values)}")
print(f"total_counts: {sum(counts.values)}")

  super()._check_params_vs_input(X, default_n_init=10)


inertia: 39984861634750375264256
number_of_clusters: 100
counts_per_cluster:
kmeans_10
5     115543
0      79092
8      70275
18     60367
14     40297
       ...  
85      1508
59      1395
98      1286
72       931
48       929
Name: count, Length: 100, dtype: int64
min_value: 929
max_value: 115543
total_counts: 1000000


  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
%%timeit

from sklearn.cluster import KMeans

def custom_init(X, n_clusters, random_state=True):
    """Pick initial centroids spaced evenly along the dense rank of ``hashvalue``.

    Parameters
    ----------
    X : DataFrame
        Must contain a numeric ``rank`` column and a ``hashvalue`` column.
    n_clusters : int
        Number of centroids wanted.
    random_state : any
        Accepted for call compatibility; not used.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(m, 1)`` with ``m <= n_clusters`` holding the
        distinct ``hashvalue`` entries found on the rank grid.

    Raises
    ------
    ZeroDivisionError
        If ``n_clusters`` exceeds the largest rank (the grid step becomes 0).
    """
    # Local is called max_rank so the built-in max() is not shadowed.
    max_rank = int(X['rank'].max())
    step = int(max_rank / n_clusters)
    start = int(step / 2 - 1)
    # Keep the rows whose rank sits on the grid start, start+step, start+2*step, ...
    on_grid = (X['rank'] >= start) & (X['rank'] % step == start % step)
    initial_centroids = X.loc[on_grid, 'hashvalue'].unique()
    # Truncating `step` can leave more grid points than requested, and KMeans
    # needs an init array with exactly n_clusters rows - drop the surplus.
    return initial_centroids[:n_clusters].reshape(-1, 1)

# NOTE: this cell runs under %%timeit, so the body executes several times and
# every print below appears once per run.
n_clusters = 100
df_init = df_test.copy()

# Seed KMeans with the rank-spaced centroids; a single run (n_init=1) is
# requested because the starting centroids are given explicitly.
seed_centroids = custom_init(df_init, n_clusters, random_state=True)
kmeans_init = KMeans(n_clusters=n_clusters, init=seed_centroids, n_init=1)
kmeans_init.fit(df_init[['hashvalue']])
clusters = kmeans_init.labels_
df_init['kmeans_10'] = clusters
# display(df_init)

counts = df_init['kmeans_10'].value_counts()
mean_inertia = int(kmeans_init.inertia_ / len(df_init['hashvalue']))
print(f"inertia: {mean_inertia}")
print(f"number_of_clusters: {df_init['kmeans_10'].nunique()}")
print(f"counts_per_cluster:\n{counts}")
print(f"min_value: {counts.values.min()}")
print(f"max_value: {counts.values.max()}")
print(f"total_counts: {counts.values.sum()}")

inertia: 77968163971352762515456
number_of_clusters: 100
counts_per_cluster:
37    115431
31     84216
26     71641
57     60241
23     52106
       ...  
12      2468
29      2271
94      2239
76      1879
70      1851
Name: kmeans_10, Length: 100, dtype: int64
min_value: 1851
max_value: 115431
total_counts: 1000000
inertia: 77968163971352762515456
number_of_clusters: 100
counts_per_cluster:
37    115431
31     84216
26     71641
57     60241
23     52106
       ...  
12      2468
29      2271
94      2239
76      1879
70      1851
Name: kmeans_10, Length: 100, dtype: int64
min_value: 1851
max_value: 115431
total_counts: 1000000
inertia: 77968163971352762515456
number_of_clusters: 100
counts_per_cluster:
37    115431
31     84216
26     71641
57     60241
23     52106
       ...  
12      2468
29      2271
94      2239
76      1879
70      1851
Name: kmeans_10, Length: 100, dtype: int64
min_value: 1851
max_value: 115431
total_counts: 1000000
inertia: 77968163971352762515456
number_of_

#### Test in for loop to find n_clusters that raises datapoints above threshold 

In [None]:
import kmeans1d

# df_loop_1d is an alias of df_test (no copy), so the label column written
# below is also added to df_test.
df_loop_1d = df_test
k_test = 10000
start_clusters = 100
coarse_step = int(0.2 * start_clusters)
fine_step = int(0.1 * start_clusters)

# Re-cluster with fewer and fewer clusters until none is smaller than k_test.
new_clusters = start_clusters
print(f"new_clusters: {new_clusters}")
while True:
    clusters, centroids = kmeans1d.cluster(df_loop_1d['hashvalue'], new_clusters)
    df_loop_1d['kmeans_10'] = clusters
    value_counts = df_loop_1d['kmeans_10'].value_counts()
    undersized = len(value_counts[value_counts < k_test].values)
    print(f"n_clusters < k: {undersized}")
    print(value_counts)
    if undersized == 0:
        break
    # Shrink quickly while many clusters are too small, then one at a time.
    if undersized > coarse_step:
        new_clusters = len(centroids) - coarse_step
    elif undersized > fine_step:
        new_clusters = len(centroids) - fine_step
    else:
        new_clusters = len(centroids) - 1
    print(f"new_clusters: {new_clusters}")
print(f"DONE: {len(centroids)} clusters")

new_clusters: 100
n_clusters < k: 79
56    115503
48     79402
41     70274
72     60327
62     40297
       ...  
15      1687
18      1640
46      1458
45      1349
19      1292
Name: kmeans_10, Length: 100, dtype: int64
new_clusters: 80
n_clusters < k: 57
44    123992
38     79440
32     70757
57     61583
48     40369
       ...  
18      2338
9       2336
17      2152
36      2111
16      1745
Name: kmeans_10, Length: 80, dtype: int64
new_clusters: 60
n_clusters < k: 36
33    128592
29     84254
24     71857
43     65743
22     55622
37     43472
54     40414
50     33152
30     30989
59     29809
56     20364
53     20036
46     16469
34     16360
18     16197
40     15330
0      14978
52     14033
38     13819
25     13523
35     12288
3      12284
36     10521
31     10109
58      9696
57      9519
44      9507
39      9001
2       8326
1       8217
17      7964
32      7572
51      7400
49      7384
7       7372
26      7288
48      7131
55      6598
20      6550
16      6397


In [None]:
from sklearn.cluster import KMeans

# df_loop_kmeans is an alias of df_test (no copy), so the label column written
# below is also added to df_test.
df_loop_kmeans = df_test
k_test = 10000
start_clusters = 100
coarse_step = int(0.2 * start_clusters)
fine_step = int(0.1 * start_clusters)

# Re-cluster with fewer and fewer clusters until none is smaller than k_test.
# The initial fit and every retry share one code path.
new_clusters = start_clusters
print(f"new_clusters: {new_clusters}")
while True:
    # n_init is pinned so the run does not rely on (or warn about) the
    # version-dependent scikit-learn default.
    kmeans = KMeans(n_clusters=new_clusters, n_init=10, random_state=True)
    kmeans.fit(df_loop_kmeans[['hashvalue']])
    clusters = kmeans.labels_
    centroids = kmeans.cluster_centers_
    df_loop_kmeans['kmeans_10'] = clusters
    value_counts = df_loop_kmeans['kmeans_10'].value_counts()
    undersized = len(value_counts[value_counts < k_test])
    print(f"n_clusters < k: {undersized}")
    print(value_counts)
    if undersized == 0:
        break
    # Shrink quickly while many clusters are too small, then one at a time.
    if undersized > coarse_step:
        new_clusters = len(centroids) - coarse_step
    elif undersized > fine_step:
        new_clusters = len(centroids) - fine_step
    else:
        new_clusters = len(centroids) - 1
    print(f"new_clusters: {new_clusters}")
print(f"DONE: {len(centroids)} clusters")

new_clusters: 100
n_clusters < k: 79
7     115503
1      79148
22     70757
9      61542
18     40341
       ...  
42      1438
37      1410
6       1364
91      1345
59      1290
Name: kmeans_10, Length: 100, dtype: int64
new_clusters: 80
n_clusters < k: 56
11    123949
9      79440
21     70747
18     62146
3      40384
       ...  
16      2350
72      2244
38      2109
60      1975
34      1917
Name: kmeans_10, Length: 80, dtype: int64
new_clusters: 60
n_clusters < k: 34
4     128554
6      84255
2      71834
11     65701
10     55659
22     42337
13     38461
30     31167
9      28973
51     28774
19     21137
38     19775
7      16524
52     16298
41     15176
24     14977
5      14928
16     14824
3      14336
50     13891
20     12920
1      12401
18     12273
17     11491
43     10512
54     10252
55      9124
29      8507
34      8496
53      8133
31      8082
48      7842
21      7009
12      6707
35      6603
59      6563
23      6543
39      6417
37      6400
40      6110


In [None]:
from sklearn.cluster import KMeans

# Search settings: initial cluster count and the minimum acceptable cluster size.
start_clusters = 100
k_test = 10000
# Alias of df_test (no copy): columns written to df_loop_init also land on df_test.
df_loop_init = df_test

def custom_init(X, n_clusters, random_state=True):
    """Pick initial centroids spaced evenly along the dense rank of ``hashvalue``.

    Parameters
    ----------
    X : DataFrame
        Must contain a numeric ``rank`` column and a ``hashvalue`` column.
    n_clusters : int
        Number of centroids wanted.
    random_state : any
        Accepted for call compatibility; not used.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(m, 1)`` with ``m <= n_clusters`` holding the
        distinct ``hashvalue`` entries found on the rank grid.

    Raises
    ------
    ZeroDivisionError
        If ``n_clusters`` exceeds the largest rank (the grid step becomes 0).
    """
    # Local is called max_rank so the built-in max() is not shadowed.
    max_rank = int(X['rank'].max())
    step = int(max_rank / n_clusters)
    start = int(step / 2 - 1)
    # Keep the rows whose rank sits on the grid start, start+step, start+2*step, ...
    on_grid = (X['rank'] >= start) & (X['rank'] % step == start % step)
    initial_centroids = X.loc[on_grid, 'hashvalue'].unique()
    # Truncating `step` can leave more grid points than requested, and KMeans
    # needs an init array with exactly n_clusters rows - drop the surplus.
    return initial_centroids[:n_clusters].reshape(-1, 1)


coarse_step = int(0.2 * start_clusters)
fine_step = int(0.1 * start_clusters)

# Re-cluster with fewer and fewer clusters, each time seeded by custom_init,
# until no cluster is smaller than k_test.
new_clusters = start_clusters
print(f"new_clusters: {new_clusters}")
while True:
    seed_centroids = custom_init(df_loop_init, new_clusters, random_state=True)
    kmeans_init = KMeans(n_clusters=new_clusters, init=seed_centroids, n_init=1)
    kmeans_init.fit(df_loop_init[['hashvalue']])
    clusters = kmeans_init.labels_
    centroids = kmeans_init.cluster_centers_
    df_loop_init['kmeans_10'] = clusters
    value_counts = df_loop_init['kmeans_10'].value_counts()
    undersized = len(value_counts[value_counts < k_test].values)
    print(f"n_clusters < k: {undersized}")
    print(value_counts)
    if undersized == 0:
        break
    # Shrink quickly while many clusters are too small, then one at a time.
    if undersized > coarse_step:
        new_clusters = len(centroids) - coarse_step
    elif undersized > fine_step:
        new_clusters = len(centroids) - fine_step
    else:
        new_clusters = len(centroids) - 1
    print(f"new_clusters: {new_clusters}")
print(f"DONE: {len(centroids)} clusters")

new_clusters: 100
n_clusters < k: 81
37    115431
31     84216
26     71641
57     60241
23     52106
       ...  
12      2468
29      2271
94      2239
76      1879
70      1851
Name: kmeans_10, Length: 100, dtype: int64
new_clusters: 80
n_clusters < k: 58
30    116418
25     84265
21     84160
46     61305
18     53232
       ...  
68      3076
19      3023
56      2722
45      2565
23      2304
Name: kmeans_10, Length: 80, dtype: int64
new_clusters: 60
n_clusters < k: 32
22    120564
18     87742
16     84920
34     63507
27     42320
41     32804
19     31277
14     29895
13     27911
0      22106
58     20753
11     19311
53     19047
48     17140
37     16566
2      16048
24     14784
30     13830
29     13538
49     12394
35     12017
50     11623
25     11598
20     11498
26     10470
23     10213
1      10167
5      10016
46      9769
17      9540
6       9252
4       9102
12      8964
9       8838
10      8593
44      8322
39      8073
45      7345
56      7114
47      6963


#### Put in main loop across various thresholds

In [None]:
# For each threshold, show how many clusters the full data set would be split
# into if every cluster held exactly `threshold` rows.
total_rows = len(df)
for threshold in k:
    starting_clusters = int(total_rows/threshold)
    print(f"threshold: {threshold}")
    print(f"n_clusters: {starting_clusters}")

threshold: 500
n_clusters: 20000
threshold: 1000
n_clusters: 10000
threshold: 10000
n_clusters: 1000


In [None]:
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
import math

# Alias, not a copy: columns assigned to df_trial below are also added to df_test.
df_trial = df_test

def custom_init(X, n_clusters, random_state=True):
    """Return at most ``n_clusters`` rank-spaced ``hashvalue`` entries as a column vector.

    ``X`` needs a numeric ``rank`` column and a ``hashvalue`` column.
    ``random_state`` is accepted but not used. The result has shape ``(m, 1)``
    with ``m <= n_clusters``.
    """
    ranks = X['rank']
    top_rank = int(ranks.max())
    step = math.floor(top_rank / n_clusters)
    start = int(step / 2 - 1)
    # Rows whose rank lies on the grid start, start+step, start+2*step, ...
    on_grid = ranks.ge(start) & ranks.mod(step).eq(start % step)
    grid_values = X.loc[on_grid, 'hashvalue'].unique()
    return grid_values[:n_clusters].reshape(-1, 1)


def _fit_minibatch(frame, n):
    """Fit MiniBatchKMeans on ``hashvalue`` seeded by custom_init; return (labels, centers)."""
    # model = KMeans(n_clusters=n, init=custom_init(frame, n, random_state=True), n_init=1)
    model = MiniBatchKMeans(n_clusters=n, init=custom_init(frame, n, random_state=True), n_init=1)
    model.fit(frame[['hashvalue']])
    return model.labels_, model.cluster_centers_


# Start from an empty result on every run instead of probing locals(), so a
# re-run never carries over entries from an earlier execution.
number_of_clusters = {}

for min_users in k:
    n_clusters = int(len(df_trial)/min_users)
    coarse_step = int(0.2 * n_clusters)
    fine_step = int(0.1 * n_clusters)

    new_clusters = n_clusters
    while True:
        clusters, centroids = _fit_minibatch(df_trial, new_clusters)
        df_trial['kmeans'] = clusters
        value_counts = df_trial['kmeans'].value_counts()
        # Size of every fitted cluster, including clusters that received no
        # points: those are absent from value_counts but are still part of
        # len(centroids), so they must count as undersized.
        cluster_sizes = np.bincount(clusters, minlength=len(centroids))
        undersized = int((cluster_sizes < min_users).sum())
        if undersized == 0:
            # As before, DONE is printed only when the very first fit already passes.
            if new_clusters == n_clusters:
                print(f"DONE: {len(centroids)} clusters")
            break
        # Shrink quickly while many clusters are too small, then one at a time.
        if undersized > coarse_step:
            new_clusters = len(centroids) - coarse_step
        elif undersized > fine_step:
            new_clusters = len(centroids) - fine_step
        else:
            new_clusters = len(centroids) - 1

    number_of_clusters[min_users] = len(centroids)

print(f"Number of clusters needed for minimum_users @ {len(df_trial['hashvalue'])} datapoints:")
print(number_of_clusters)

Number of clusters needed for minimum_users @ 1000000 datapoints:
{500: 178, 1000: 146, 10000: 24}


Number of clusters needed for minimum_users @ 1000000 datapoints:
{500: 178, 1000: 146, 10000: 24}


#### Custom init takes ranked dataframe & equally spaces centroids

In [None]:
def custom_init(X, n_clusters, random_state=True):
    """Pick rank-spaced initial centroids and print the grid parameters used.

    Parameters
    ----------
    X : DataFrame
        Must contain a numeric ``rank`` column and a ``hashvalue`` column.
    n_clusters : int
        Number of centroids wanted.
    random_state : any
        Accepted for call compatibility; not used.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(m, 1)`` with ``m <= n_clusters`` holding the
        distinct ``hashvalue`` entries found on the rank grid.

    Raises
    ------
    ZeroDivisionError
        If ``n_clusters`` exceeds the largest rank (the grid step becomes 0).
    """
    # Local is called max_rank so the built-in max() is not shadowed.
    max_rank = int(X['rank'].max())
    step = int(max_rank / n_clusters)
    start = int(step / 2 - 1)
    # Keep the rows whose rank sits on the grid start, start+step, start+2*step, ...
    on_grid = (X['rank'] >= start) & (X['rank'] % step == start % step)
    # Truncating `step` can leave more grid points than requested, and KMeans
    # needs an init array with exactly n_clusters rows - drop the surplus.
    initial_centroids = X.loc[on_grid, 'hashvalue'].unique()[:n_clusters]
    print(f"max_rank: {max_rank}")
    print(f"start: {start}")
    print(f"step: {step}")
    print(f"n_centroids: {len(initial_centroids)}")
    return initial_centroids.reshape(-1, 1)

# --- Walk through the custom_init steps by hand on df_test -------------------
X_test = df_test
n_clusters_test = int(len(X_test['hashvalue'])/k[1])
max_test = int(X_test['rank'].max())
step_test = int(max_test/n_clusters_test)
start_test = int(step_test/2 - 1)
print(f"max_rank: {max_test}")
print(f"n_clusters: {n_clusters_test}")
print(f"start: {start_test}")
print(f"step: {step_test}")
selected_rows_test = X_test[(X_test['rank'] >= start_test) & (X_test['rank'] % step_test == start_test % step_test)]
result_values_test = selected_rows_test['hashvalue']
display(selected_rows_test)
initial_centroids_test = result_values_test.unique()
print(f"initial_centroids:\n{initial_centroids_test}")
print(f"n_centroids: {len(result_values_test.unique())}")

# Same computation through the function, for comparison with the manual steps.
array_test = custom_init(X_test, n_clusters_test)
display(array_test)
print(f"array_shape: {array_test.shape}\n")


# --- Same walk-through on the full frame -------------------------------------
X = df
n_clusters = int(len(X['hashvalue'])/k[2])
# Stored as max_rank rather than `max`: a module-level `max` would replace the
# built-in max() for every cell executed afterwards in this kernel.
max_rank = int(X['rank'].max())
step = int(max_rank/n_clusters)
start = int(step/2 - 1)
print(f"max_rank: {max_rank}")
print(f"n_clusters: {n_clusters}")
print(f"start: {start}")
print(f"step: {step}")
selected_rows = X[(X['rank'] >= start) & (X['rank'] % step == start % step)]
result_values = selected_rows['hashvalue']
display(selected_rows)
initial_centroids = result_values.unique()
print(f"initial_centroids_array:\n{initial_centroids}")
print(f"n_centroids: {len(result_values.unique())}")

array = custom_init(X, n_clusters)
display(array)
print(f"array_shape: {array.shape}\n")

max_rank: 38704
n_clusters: 100
start: 192
step: 387


Unnamed: 0,user_key,hashvalue,rank,kmeans
1495,239f7feda1603de505bbe47f27edffa2,360570492487,192.0,1
13497,25ed82f07c0a30eb5e7e7d80d00ffbb0,814830520176,579.0,2
14241,3048677ceae253cd77df9541273a231a,1194427668295,966.0,5
15060,0f2ef2202fe4b011f263a554213c7770,1454683633478,1353.0,7
16721,0a8c73f1692c3b60dbf832952669b0dd,1675904411589,1740.0,9
...,...,...,...,...
97285,42734cbc6eb7cd0ee04510e8e4fb2bf4,38459719173970,37344.0,193
97286,0938631f7b1776f2f6c60daf862bd11d,38459719173970,37344.0,193
98486,57f1b3122cb2756111c64721daa95ad6,38707210438231,37731.0,195
99124,49a32a85bb23f526493b7ba871f516fd,39084561276486,38118.0,197


initial_centroids:
[  360570492487   814830520176  1194427668295  1454683633478
  1675904411589  1783370843073  1916313438034  2150171129667
  2808769026928  3288798204738  3670983268196  3943334626947
  4126048148806  4388679040840  4960249059174  5354279138636
  5588309218131  5857687795807  6081703531911  6170545054090
  6331283191142  6598003152502  7147462022454  7270161588000
  7611548898118  7802704590662  8058292351809  8281081722352
  8362602156292  8506541801268  8758469086876  9455213937266
 10139007919993 10619976107878 11546863010418 12095561909984
 12748396566370 13229434213750 13815905682210 14336620122531
 14877926425364 15075745591126 15594883648586 15995321127842
 16203136347700 16555136476598 16758780335682 17154937087319
 17542544605988 18710825750754 19055166859210 19313959540422
 19438899061506 19622110918488 20433167131346 21084273837392
 21496471729091 21649754531718 21913748207362 22577426683754
 23169561550406 23444976049862 23691940328240 23809011371590
 2389

array([[  360570492487],
       [  814830520176],
       [ 1194427668295],
       [ 1454683633478],
       [ 1675904411589],
       [ 1783370843073],
       [ 1916313438034],
       [ 2150171129667],
       [ 2808769026928],
       [ 3288798204738],
       [ 3670983268196],
       [ 3943334626947],
       [ 4126048148806],
       [ 4388679040840],
       [ 4960249059174],
       [ 5354279138636],
       [ 5588309218131],
       [ 5857687795807],
       [ 6081703531911],
       [ 6170545054090],
       [ 6331283191142],
       [ 6598003152502],
       [ 7147462022454],
       [ 7270161588000],
       [ 7611548898118],
       [ 7802704590662],
       [ 8058292351809],
       [ 8281081722352],
       [ 8362602156292],
       [ 8506541801268],
       [ 8758469086876],
       [ 9455213937266],
       [10139007919993],
       [10619976107878],
       [11546863010418],
       [12095561909984],
       [12748396566370],
       [13229434213750],
       [13815905682210],
       [14336620122531],


array_shape: (100, 1)

max_rank: 3293033
n_clusters: 1000
start: 1645
step: 3293


Unnamed: 0,user_key,hashvalue,rank
16546,2b5619ae56dc30c25012ef4a1fb6593b,1659486876355,1645.0
25021,062152c3fd8d4745afa764bc8572880b,4201348809598,4938.0
35576,010f75260d7f4312b189bb19db413a8a,6505019766023,8231.0
46678,0ca8fcea5b2fd9a859337391915b913b,8558687255933,11524.0
53306,10a4919ca6ad0569cd6405416c2f3679,13773550421666,14817.0
...,...,...,...
9975661,0b8cd6b55452d2c69a2b745478068061,1124070580824259,3281473.0
9975662,2b73a977b00221e61bbbbbacea56cf18,1124070580824259,3281473.0
9988359,4cf40a2183080f6ad46b761672ae3245,1124414846943235,3284766.0
9993628,012aca2f8b273e2f2d018abccd412d9a,1124892489300179,3288059.0


initial_centroids_array:
[   1659486876355    4201348809598    6505019766023    8558687255933
   13773550421666   16965622419014   21014027388740   23829412734737
   26287853792670   32608275035906   36846306636390   38847806707396
   41360990538999   42881656704519   45081984695269   48766563538439
   51316867426866   52717395712081   56838450554706   59087336202100
   61112874071902   65559304552690   68898642923378   71130306307658
   72544543244100   74353616378706   75828476787011   76535199438390
   77235141091076   77996902257474   78621316744010   78879928760390
   80343594374407   82206599767330   84125384495874   85246796899253
   86168614222662   86707119369936   87657849482775   90749830497008
   93542532667971   94396224264792   96070342496303   96718103082955
  101025585771118  103020556115281  104853868034631  106704556478449
  107874836809334  108543351443297  109241308628039  109908639204026
  111223371724785  111797875790662  112384262549205  112799593005650
  1129501

array([[   1659486876355],
       [   4201348809598],
       [   6505019766023],
       [   8558687255933],
       [  13773550421666],
       [  16965622419014],
       [  21014027388740],
       [  23829412734737],
       [  26287853792670],
       [  32608275035906],
       [  36846306636390],
       [  38847806707396],
       [  41360990538999],
       [  42881656704519],
       [  45081984695269],
       [  48766563538439],
       [  51316867426866],
       [  52717395712081],
       [  56838450554706],
       [  59087336202100],
       [  61112874071902],
       [  65559304552690],
       [  68898642923378],
       [  71130306307658],
       [  72544543244100],
       [  74353616378706],
       [  75828476787011],
       [  76535199438390],
       [  77235141091076],
       [  77996902257474],
       [  78621316744010],
       [  78879928760390],
       [  80343594374407],
       [  82206599767330],
       [  84125384495874],
       [  85246796899253],
       [  86168614222662],
 

array_shape: (1000, 1)



In [None]:
from sklearn.cluster import MiniBatchKMeans

# X_test is an alias of df_test, so the label column added at the end of this
# cell is visible through both names.
X_test = df_test
print(f"X_shape: {X_test.shape}")
n_clusters_test = int(len(X_test['hashvalue'])/k[1])

# Build the starting centroids first, then hand them to MiniBatchKMeans.
seed_centroids_test = custom_init(X_test, n_clusters_test, random_state=True)
minibatch_kmeans_test = MiniBatchKMeans(n_clusters=n_clusters_test, init=seed_centroids_test, n_init=1)
# minibatch_kmeans_test = MiniBatchKMeans(n_clusters=n_clusters_test, init=array_test, n_init=1)
features_test = X_test[['hashvalue']]
print(f"X_shape: {features_test.shape}")
minibatch_kmeans_test.fit(features_test)
clusters_test = minibatch_kmeans_test.labels_

df_test[f'kmeans_{n_clusters_test}'] = clusters_test
display(df_test)

X_shape: (10000, 4)
max_rank: 286
start: 13
step: 28
n_centroids: 10
X_shape: (10000, 1)


Unnamed: 0,user_key,hashvalue,rank,kmeans_10
0,1bc820ac5e295c68d696a32da57b65e8,635656006,1.0,0
1,36053bc85af28f528e35f5c72b4c4186,1944196966,2.0,0
2,1c1fbe6ab4b02d34b3275c10f812c3c6,1944196966,2.0,0
3,15dc58cd35a16cedac13fb0dbbddd296,3055813318,3.0,0
4,07477b17b5d618909bcd65fb10e8105c,3080390514,4.0,0
...,...,...,...,...
9995,46ffdd1913b7f33c82c36830f9540d6e,535424903000,286.0,5
9996,56e034d8731620203085ecd7e0be0713,535424903000,286.0,5
9997,03359547f749ffc0904e5e33e4dbff95,535424903000,286.0,5
9998,401f21cf67664bd25e4a870cdaf0103e,535424903000,286.0,5


In [None]:
# Read the label column written by the MiniBatchKMeans cell, which names it
# f'kmeans_{n_clusters_test}'. A hard-coded 'kmeans_10' only matches when
# n_clusters_test happens to be 10; otherwise it picks up a stale column.
label_col_test = f'kmeans_{n_clusters_test}'
counts_test = df_test[label_col_test].value_counts()
print(f"number_of_clusters: {df_test[label_col_test].nunique()}")
print(f"counts_per_cluster:\n{counts_test}")
print(f"min_value: {min(counts_test.values)}")
print(f"max_value: {np.max(counts_test.values)}")
print(f"total_counts: {sum(counts_test.values)}")

number_of_clusters: 7
counts_per_cluster:
3    8318
5    1006
0     269
6     234
9     106
2      59
1       8
Name: kmeans_10, dtype: int64
min_value: 8
max_value: 8318
total_counts: 10000


In [None]:
from sklearn.cluster import MiniBatchKMeans

# X is an alias of df, so the label column added at the end of this cell is
# visible through both names.
X = df
n_clusters = int(len(X['hashvalue'])/k[2])

# Build the starting centroids first, then hand them to MiniBatchKMeans.
seed_centroids = custom_init(X, n_clusters, random_state=True)
minibatch_kmeans = MiniBatchKMeans(n_clusters=n_clusters, init=seed_centroids, n_init=1)
# minibatch_kmeans = MiniBatchKMeans(n_clusters=n_clusters, init=array, n_init=1)
features = X[['hashvalue']]
print(f"X_shape: {features.shape}")
minibatch_kmeans.fit(features)
clusters = minibatch_kmeans.labels_

df[f'kmeans_{n_clusters}'] = clusters
display(df)

max_rank: 3293033
start: 1645
step: 3293
n_centroids: 1000
X_shape: (10000000, 1)


Unnamed: 0,user_key,hashvalue,rank,kmeans_1000
0,1bc820ac5e295c68d696a32da57b65e8,635656006,1.0,769
1,36053bc85af28f528e35f5c72b4c4186,1944196966,2.0,769
2,1c1fbe6ab4b02d34b3275c10f812c3c6,1944196966,2.0,769
3,15dc58cd35a16cedac13fb0dbbddd296,3055813318,3.0,769
4,07477b17b5d618909bcd65fb10e8105c,3080390514,4.0,769
...,...,...,...,...
9999995,458e620b2209f2961fa2fa4075fe4081,1125893967321395,3293029.0,771
9999996,10cc208ab9fd2c6afe69428da8309728,1125894093296674,3293030.0,771
9999997,062888e953e7fd8dbb24074cbed31269,1125894696294579,3293031.0,771
9999998,281dd96652f4b1c2190c5632cce25cc8,1125895437562999,3293032.0,771


In [None]:
# Summarise how the rows are spread over the labels stored in 'kmeans_1000'.
labels_1000 = df['kmeans_1000']
counts = labels_1000.value_counts()
sizes = counts.values
print(f"number_of_clusters: {labels_1000.nunique()}")
print(f"counts_per_cluster:\n{counts}")
print(f"min_value: {sizes.min()}")
print(f"max_value: {sizes.max()}")
print(f"total_counts: {sizes.sum()}")

number_of_clusters: 995
counts_per_cluster:
15     524503
97     186996
149    158026
292    122340
154    116136
        ...  
251        33
880        30
473        27
143        10
85          1
Name: kmeans_1000, Length: 995, dtype: int64
min_value: 1
max_value: 524503
total_counts: 10000000


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=41af8bd7-a5ed-4334-a2fe-992dcc7ea742' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>