-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKmeans_Standalone_PhaseSpace.py
112 lines (87 loc) · 3.5 KB
/
Kmeans_Standalone_PhaseSpace.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/python
""" Kmeans algorithm and output diagnostics in JSON """
""" From http://flothesof.github.io/k-means-numpy.html """
# Load modules
import numpy as np
import pandas as pd
""" Choose the dataset and the output directory """
# Working directory and the directory that receives the JSON output.
folder = './'
target = '/Users/jmartinez/Sites/kmeansvisu_cargo/Data/'

# Active dataset; uncomment one of the alternatives to switch.
#dataset = 'UnequalVar'
#dataset = 'Mixture2D'
dataset = 'AnisotropBlob'

# Load the tab-separated, header-less CSV for the chosen dataset, then keep
# the raw value array used by the K-means helpers and the number of rows.
filename = ''.join(('./data/', dataset, '.csv'))
data_points = pd.read_csv(filename, header=None, sep='\t')
dataplot = data_points.values
datacount = len(data_points)
""" Modules for the Kmeans calculation """
def initialize_centroids(points, clusters):
    """Pick the starting centroids at random from the data points.

    A copy of ``points`` is shuffled in place using NumPy's global random
    state, so the caller's data is left untouched; the first ``clusters``
    entries of that shuffled copy are returned.
    """
    shuffled = points.copy()
    np.random.shuffle(shuffled)
    return shuffled[:clusters]
def closest_centroid(points, centroids):
    """Return, for each point, the index of its nearest centroid.

    Nearness is measured with the Manhattan (L1) distance: the absolute
    coordinate differences are summed. Ties go to the lowest centroid
    index, as ``argmin`` reports the first minimum.
    """
    # Inserting an axis after the centroid axis lets broadcasting pair every
    # centroid with every point; the coordinate axis ends up last (axis 2).
    # NOTE(review): assumes points is (n, d) and centroids is (k, d).
    offsets = points - centroids[:, np.newaxis]
    # Euclidean alternative: np.sqrt((offsets ** 2).sum(axis=2))
    manhattan = np.sum(np.abs(offsets), axis=2)
    return manhattan.argmin(axis=0)
def calculate_inertia(points, closest, centroids):
    """Return the Manhattan distance of each point to its assigned centroid.

    Parameters:
        points: data points, one per row.
        closest: for each point, the index of the centroid it is assigned to
            (as produced by ``closest_centroid``).
        centroids: current centroids, one per row.

    Returns:
        An array with one row per centroid and one column per point. Entry
        ``[k, i]`` is the L1 distance from point ``i`` to centroid ``k`` when
        ``closest[i] == k`` and 0 otherwise, so summing row ``k`` gives the
        sum of distances within cluster ``k``.
    """
    # Euclidean alternative:
    # distances = np.sqrt(((points - centroids[:, np.newaxis])**2).sum(axis=2))
    distances = np.abs(points - centroids[:, np.newaxis]).sum(axis=2)
    # Keep only the distances of points that belong to each centroid; before,
    # ``closest`` was ignored and every row covered all points.
    cluster_ids = np.arange(centroids.shape[0])[:, np.newaxis]
    is_member = np.asarray(closest) == cluster_ids
    return np.where(is_member, distances, 0)
def move_centroids(points, closest, centroids):
    """Return the new centroids: the mean of the points assigned to each one.

    Parameters:
        points: data points, one per row.
        closest: for each point, the index of its assigned centroid.
        centroids: current centroids, one per row; only used for their count
            and as the fallback position of a centroid with no points.

    Returns:
        An array with one row per centroid. A centroid that has no assigned
        points keeps its previous position instead of becoming NaN (the mean
        of an empty selection), which would corrupt every later distance and
        assignment step.
    """
    updated = []
    for k in range(centroids.shape[0]):
        members = points[closest == k]
        if len(members):
            updated.append(members.mean(axis=0))
        else:
            # Empty cluster: leave the centroid where it was.
            updated.append(centroids[k])
    return np.array(updated)
""" Parameter definition """
# Run parameters.
samples = 100     # number of random initialisations of K-means
iterations = 40   # centroid updates performed per initialisation
clusters = 3      # number of centroids

# Accumulators filled by the sampling loop. Each name is bound to its own
# fresh list so that none of them alias one another.
cargo_centroid = list()
cargo_inertia = list()
supercargo_inertia = list()
cargo_clustersize = list()
cargo_shifts = list()
supercargo_shifts = list()
phase_x = list()
phase_y = list()
sphase_x = list()
sphase_y = list()
""" Loop for many initializations, or samples, over Kmeans """
# Run K-means from `samples` random initialisations. For every run, record the
# cluster sizes and the per-centroid distance sums after each iteration, then
# write one JSON file per (run, cluster) pairing those two series.
for s in range(samples):
    # Start from randomly chosen data points.
    c = initialize_centroids(dataplot, clusters)

    for _ in range(iterations):
        # Assign points to the current centroids and move each centroid.
        c = move_centroids(dataplot, closest_centroid(dataplot, c), c)
        cargo_centroid.append(c)

        # Assignment with respect to the moved centroids; computed once and
        # shared by the inertia and the cluster-size bookkeeping below.
        groups = closest_centroid(dataplot, c)

        # One summed value per row returned by calculate_inertia.
        inertia = calculate_inertia(dataplot, groups, c)
        supercargo_inertia.append([np.sum(row) for row in inertia])

        # Number of data points assigned to each cluster.
        supercargo_shifts.append(
            [np.count_nonzero(groups == m) for m in range(clusters)]
        )

    # Phase space: for each cluster, pair its size (x) with its distance sum
    # (y) across the iterations of this run, and save the pairs as JSON.
    for v in range(clusters):
        sphase_x.append([sizes[v] for sizes in supercargo_shifts])
        sphase_y.append([sums[v] for sums in supercargo_inertia])
        phase_pd = pd.DataFrame({"phase_x": sphase_x[v], "phase_y": sphase_y[v]})
        phase_pd.reset_index().to_json(
            orient='records',
            path_or_buf=target + dataset + '_PhaseSpace_' + str(s) + '_' + str(v) + '.json'
        )

    # Clear the per-run accumulators before the next initialisation.
    sphase_x = []
    sphase_y = []
    supercargo_shifts = []
    supercargo_inertia = []