/
clustring.py
118 lines (101 loc) · 3.41 KB
/
clustring.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import sys
from math import log, sqrt
from itertools import combinations
def cosine_distance(a, b):
cos = 0.0
a_tfidf = a["tfidf"]
for token, tfidf in b["tfidf"].iteritems():
if token in a_tfidf:
cos += tfidf * a_tfidf[token]
return cos
def normalize(features):
norm = 1.0 / sqrt(sum(i**2 for i in features.itervalues()))
for k, v in features.iteritems():
features[k] = v * norm
return features
def add_tfidf_to(documents):
tokens = {}
for id, doc in enumerate(documents):
tf = {}
doc["tfidf"] = {}
doc_tokens = doc.get("tokens", [])
for token in doc_tokens:
tf[token] = tf.get(token, 0) + 1
num_tokens = len(doc_tokens)
if num_tokens > 0:
for token, freq in tf.iteritems():
tokens.setdefault(token, []).append((id, float(freq) / num_tokens))
doc_count = float(len(documents))
for token, docs in tokens.iteritems():
idf = log(doc_count / len(docs))
for id, tf in docs:
tfidf = tf * idf
if tfidf > 0:
documents[id]["tfidf"][token] = tfidf
for doc in documents:
doc["tfidf"] = normalize(doc["tfidf"])
def choose_cluster(node, cluster_lookup, edges):
new = cluster_lookup[node]
if node in edges:
seen, num_seen = {}, {}
for target, weight in edges.get(node, []):
seen[cluster_lookup[target]] = seen.get(
cluster_lookup[target], 0.0) + weight
for k, v in seen.iteritems():
num_seen.setdefault(v, []).append(k)
new = num_seen[max(num_seen)][0]
return new
def majorclust(graph):
cluster_lookup = dict((node, i) for i, node in enumerate(graph.nodes))
count = 0
movements = set()
finished = False
while not finished:
finished = True
for node in graph.nodes:
new = choose_cluster(node, cluster_lookup, graph.edges)
move = (node, cluster_lookup[node], new)
if new != cluster_lookup[node] and move not in movements:
movements.add(move)
cluster_lookup[node] = new
finished = False
clusters = {}
for k, v in cluster_lookup.iteritems():
clusters.setdefault(v, []).append(k)
return clusters.values()
def get_distance_graph(documents):
class Graph(object):
def __init__(self):
self.edges = {}
def add_edge(self, n1, n2, w):
self.edges.setdefault(n1, []).append((n2, w))
self.edges.setdefault(n2, []).append((n1, w))
graph = Graph()
doc_ids = range(len(documents))
graph.nodes = set(doc_ids)
for a, b in combinations(doc_ids, 2):
graph.add_edge(a, b, cosine_distance(documents[a], documents[b]))
return graph
def get_documents():
texts = [
"foo blub baz",
"foo bar baz",
"asdf bsdf csdf",
"foo bab blub",
"csdf hddf kjtz",
"123 456 890",
"321 890 456 foo",
"123 890 uiop",
]
return [{"text": text, "tokens": text.split()}
for i, text in enumerate(texts)]
def main(args):
documents = get_documents()
add_tfidf_to(documents)
dist_graph = get_distance_graph(documents)
for cluster in majorclust(dist_graph):
print "========="
for doc_id in cluster:
print documents[doc_id]["text"]
if __name__ == '__main__':
main(sys.argv)