-
Notifications
You must be signed in to change notification settings - Fork 1
/
Canopy.py
136 lines (123 loc) · 4.46 KB
/
Canopy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!usr/bin/env python
# -*- coding:utf-8 -*-
"""
@time: 2018/07/17 17:39
@author: https://github.com/AlanConstantine/CanopyByPython/blob/master/canopy.py
@file: Canopy.py
@software:diagramalgorithm
@note: Canopy粗聚类,用于前期找K值范围
"""
import math
import random
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
# 随机生成500个二维[0,1)平面点
dataset = np.random.rand(500, 2)
class Canopy:
def __init__(self, dataset):
self.dataset = dataset
self.t1 = 0
self.t2 = 0
# 设置初始阈值
def setThreshold(self, t1, t2):
if t1 > t2:
self.t1 = t1
self.t2 = t2
else:
print('t1 needs to be larger than t2!')
# 使用欧式距离进行距离的计算
def euclideanDistance(self, vec1, vec2):
return math.sqrt(((vec1 - vec2)**2).sum())
# 根据当前dataset的长度随机选择一个下标
def getRandIndex(self):
return random.randint(0, len(self.dataset) - 1)
# 聚类核心代码
def clustering(self):
if self.t1 == 0:
print('Please set the threshold.')
else:
canopies = [] # 用于存放最终归类结果
# while len(self.dataset) != 0:
# 20180324修改
while len(self.dataset) > 1:
rand_index = self.getRandIndex()
current_center = self.dataset[rand_index] # 随机获取一个中心点,定为P点
current_center_list = [] # 初始化P点的canopy类容器
delete_list = [] # 初始化P点的删除容器
self.dataset = np.delete(self.dataset, rand_index,
0) # 删除随机选择的中心点P
for datum_j in range(len(self.dataset)):
datum = self.dataset[datum_j]
distance = self.euclideanDistance(
current_center, datum) # 计算选取的中心点P到每个点之间的距离
if distance < self.t1:
# 若距离小于t1,则将点归入P点的canopy类
current_center_list.append(datum)
if distance < self.t2:
delete_list.append(datum_j) # 若小于t2则归入删除容器
# 根据删除容器的下标,将元素从数据集中删除
self.dataset = np.delete(self.dataset, delete_list, 0)
canopies.append((current_center, current_center_list))
return canopies
# 画图使用
def showCanopy(canopies, dataset, t1, t2):
fig = plt.figure()
sc = fig.add_subplot(111)
colors = [
'brown', 'green', 'blue', 'y', 'r', 'tan', 'dodgerblue', 'deeppink',
'orangered', 'peru', 'blue', 'y', 'r', 'gold', 'dimgray', 'darkorange',
'peru', 'blue', 'y', 'r', 'cyan', 'tan', 'orchid', 'peru', 'blue', 'y',
'r', 'sienna'
]
markers = [
'*', 'h', 'H', '+', 'o', '1', '2', '3', ',', 'v', 'H', '+', '1', '2',
'^', '<', '>', '.', '4', 'H', '+', '1', '2', 's', 'p', 'x', 'D', 'd',
'|', '_'
]
for i in range(len(canopies)):
canopy = canopies[i]
center = canopy[0]
components = canopy[1]
sc.plot(
center[0],
center[1],
marker=markers[i],
color=colors[i],
markersize=10)
t1_circle = plt.Circle(
xy=(center[0], center[1]),
radius=t1,
color='dodgerblue',
fill=False)
t2_circle = plt.Circle(
xy=(center[0], center[1]), radius=t2, color='skyblue', alpha=0.2)
sc.add_artist(t1_circle)
sc.add_artist(t2_circle)
for component in components:
sc.plot(
component[0],
component[1],
marker=markers[i],
color=colors[i],
markersize=1.5)
maxvalue = np.amax(dataset)
minvalue = np.amin(dataset)
plt.xlim(minvalue - t1, maxvalue + t1)
plt.ylim(minvalue - t1, maxvalue + t1)
plt.show()
# 调用初始化使用
def main():
t1 = 0.6
t2 = 0.4
gc = Canopy(dataset)
gc.setThreshold(t1, t2)
canopies = gc.clustering()
print('Get %s initial centers.' % len(canopies))
showCanopy(canopies, dataset, t1, t2)
if __name__ == '__main__':
t_s = datetime.now()
main()
t_e = datetime.now()
usedtime = t_e - t_s
print('[%s]' % usedtime)