In [54]:
# top 5k dice keywords (note carried over from the original notebook)
NUM_CLUSTERS = 100
# number of cluster synonyms to map to
NUM_CLUSTER_SYNONYMS = 5

# Every input and output path below is rooted here.
ROOT_FOLDER = "/Users/simon.hughes/Google Drive/PhD/Data/CoralBleaching/PhraseExtractionAnalysis"

# Optional files: the cells that use them are skipped while these are None.
KEY_WORDS_FILE = None
SYNONYMS_QRY_FILE = None
SYNONYMS_INDEX_FILE = None

# Inputs
PHRASES_FILE = ROOT_FOLDER + "/Phrases.txt"
MODEL_FILE = ROOT_FOLDER + "/model.w2v"

# Outputs: one cluster dump per clustering algorithm
AP_CLUSTERS_FILE = ROOT_FOLDER + "/Ap_Clusters.txt"
KM_CLUSTERS_FILE = ROOT_FOLDER + "/KMeans_Clusters.txt"

In [25]:
#Shared
#just used to load phrases file
def load_stop_words(stop_words_file):
    """Load one entry per line from a text file into a set of lower-cased strings.

    Each line is stripped of surrounding whitespace. Blank lines and lines
    whose first non-blank character is "#" are ignored.

    Args:
        stop_words_file: path of the text file to read.

    Returns:
        A set containing the lower-cased, stripped lines of the file.
    """
    stop_words = set()
    with open(stop_words_file) as f:
        for line in f:
            word = line.strip()
            # A blank line strips to "", and indexing word[0] on it raised
            # IndexError; skip blanks together with comment lines.
            if not word or word.startswith("#"):
                continue
            stop_words.add(word.lower())
    return stop_words

In [26]:
import time
# Wall-clock start of the whole run; compared against time.time() in the final cell.
grand_start = time.time()

In [27]:
import numpy as np
from collections import defaultdict

#functions
def is_valid_search_keyword(kw):
    """Return True when `kw` looks like a plain keyword rather than a query expression.

    A keyword is rejected when it:
      * contains a parenthesis, or one of the whole words and / or / not /
        true / TRUE / false / FALSE (matched with a space on either side),
      * is a single one-character token, or
      * has any token that starts with "-" (a negated term).
    """
    # Pad with spaces so operator words at either end still match " word ".
    padded = " " + kw + " "
    banned_fragments = ("(", ")", " and ", " or ", " not ",
                        " true ", " TRUE ", " false ", " FALSE ")
    if any(fragment in padded for fragment in banned_fragments):
        return False

    tokens = kw.split(" ")

    # remove single char keywords
    if len(tokens) == 1 and len(tokens[0]) == 1:
        return False

    # remove queries with negations in them
    for token in tokens:
        if token.strip().startswith("-"):
            return False
    return True

def map_keyword(kw):
    """Return `kw` with every space character replaced by an underscore."""
    return "_".join(kw.split(" "))

def get_vector(item, model):
    """Look up the raw embedding row for `item`.

    Returns `model.syn0[model.vocab[item].index]`, or None when `item`
    is not a key of `model.vocab`.
    """
    if item in model.vocab:
        row = model.vocab[item].index
        return model.syn0[row]
    return None

def get_norm_vector(item, model):
    """Return the embedding of `item` scaled to unit L2 length.

    Returns None when `item` is not in `model.vocab`. A vector whose norm
    is exactly zero is returned unscaled.
    """
    if item not in model.vocab:
        return None
    # for deserialized models, the norm vectors are not stored,
    # so normalise the raw vector here
    raw = get_vector(item, model)
    length = np.linalg.norm(raw)
    return raw if length == 0 else raw / length

def extract_clusters(ids, id2kwd):
    """Group keywords by cluster label.

    Args:
        ids: iterable of cluster labels; the position of each label is the
            keyword id it belongs to.
        id2kwd: mapping from keyword id (position in `ids`) to keyword.

    Returns:
        A defaultdict(set) mapping each label to the set of its keywords.
    """
    members_by_label = defaultdict(set)
    for position, cluster_lbl in enumerate(ids):
        members_by_label[cluster_lbl].add(id2kwd[position])
    return members_by_label

def extract_centroids(km_clusterer):
    """Map each cluster index to its unit-length centroid.

    Reads `km_clusterer.cluster_centers_` and divides every centroid by
    its L2 norm; a centroid whose norm is not greater than zero is stored
    unchanged.

    Returns:
        dict of {cluster index: centroid vector}.
    """
    normalised = dict()
    for label, center in enumerate(km_clusterer.cluster_centers_):
        magnitude = np.linalg.norm(center)
        normalised[label] = center / magnitude if magnitude > 0.0 else center
    return normalised

def compute_cluster_similarities(kwds, kwd2id, vectors, lbl2centroid):
    """Rank every cluster centroid by similarity to each keyword.

    Similarity is the inner product of the keyword vector and the centroid,
    which equals cosine similarity when both are unit-normalised.

    Args:
        kwds: sized collection of keywords to score.
        kwd2id: mapping from keyword to its index in `vectors`.
        vectors: sequence of keyword vectors, indexed by keyword id.
        lbl2centroid: mapping from cluster label to centroid vector.

    Returns:
        dict mapping each keyword to a list of (label, similarity) pairs
        sorted by descending similarity.
    """
    kwd2cluster_sims = dict()
    for kwd in kwds:
        nvec = vectors[kwd2id[kwd]]
        sims = [(lbl, np.inner(nvec, centroid))
                for lbl, centroid in lbl2centroid.items()]
        # Index into the pair: tuple-unpacking lambda parameters are
        # Python 2-only syntax and fail to compile on Python 3.
        sims = sorted(sims, key=lambda pair: -pair[1])
        kwd2cluster_sims[kwd] = sims
        if len(kwd2cluster_sims) % 1000 == 0:
            # Report against the `kwds` argument; the old message read the
            # notebook-global `all_kwds`, which may be absent or different.
            print("%i computed out of %i" % (len(kwd2cluster_sims), len(kwds)))
    return kwd2cluster_sims

# expand at query time
# use with tfidf (on cluster labels) at index time by just mapping to cluster label
def write_most_similar_clusters(topn, kwd2cluster_sims, synonym_qry_fname, synonyn_index_fname):
    """Write the query-time and index-time synonym files for cluster expansion.

    Keywords are written in sorted order. Keywords with no cluster
    similarities are omitted from both files.

    Args:
        topn: number of top-ranked clusters to emit per keyword in the
            query file.
        kwd2cluster_sims: mapping from keyword to a list of
            (label, similarity) pairs, best match first.
        synonym_qry_fname: output path; each line is
            "<keyword>=>cluster_<lbl>|<sim> cluster_<lbl>|<sim> ...".
        synonyn_index_fname: output path; each line is
            "<keyword>=>cluster_<lbl>" using only the top cluster.
    """
    kwords = sorted(kwd2cluster_sims.keys())
    cluster_label = lambda lbl: "cluster_" + str(lbl)

    with open(synonym_qry_fname, "w+") as qry_f:
        for kword in kwords:
            cl_sims = kwd2cluster_sims[kword]
            # unlike the other methods, we DO want to include the first cluster here
            # as it's a cluster rather than the top 10 or top 30 keyword method
            top_clusters = cl_sims[:topn]
            if len(top_clusters) > 0:
                qry_f.write("%s=>" % kword)
                for lbl, sim in top_clusters:
                    qry_f.write("%s|%f " % (cluster_label(lbl), sim))
                qry_f.write("\n")

    with open(synonyn_index_fname, "w+") as f:
        for kword in kwords:
            cl_sims = kwd2cluster_sims[kword]
            # Skip keywords with no clusters, as the query file does;
            # indexing [0] on an empty list raised IndexError.
            if not cl_sims:
                continue
            # get top cluster label
            lbl, sim = cl_sims[0]
            f.write("%s=>%s\n" % (kword, cluster_label(lbl)))

In [28]:
import gensim, time
from gensim.models.word2vec import Word2Vec

# Load the previously saved word2vec model from MODEL_FILE; its `vocab` and
# `syn0` attributes are what get_vector / get_norm_vector read.
model = Word2Vec.load(MODEL_FILE)

In [29]:
# The stop-word loader is reused here only as a generic "one entry per line"
# reader: the result is a set of lower-cased phrases.
phrases = load_stop_words(PHRASES_FILE)
len(phrases)

1194

In [30]:
# Optional extra keywords: only read when KEY_WORDS_FILE is configured.
keywords = []
un_keywords = set()
if KEY_WORDS_FILE is not None:
    with open(KEY_WORDS_FILE) as f:
        for line in f:
            kw = line.strip()
            # drop blank lines and anything that looks like a query expression
            if not kw or not is_valid_search_keyword(kw):
                continue
            keywords.append(kw)
print("%i keywords loaded from %s" % (len(keywords), KEY_WORDS_FILE))

0 keywords loaded from None


In [31]:
# Build the vocabulary to cluster: phrases plus keywords, minus anything the
# model does not know, plus the in-vocabulary words of multi-word entries.
all_kwds = phrases.union(keywords)
# alternative: all_kwds = set(keywords)
for kwd in list(all_kwds):  # iterate over a snapshot, the set is mutated below
    if kwd not in model.vocab:
        all_kwds.remove(kwd)
    splt = kwd.split(" ")
    if len(splt) <= 1:
        continue
    # add in single word tokens from keywords
    for wd in splt:
        if wd.strip() and wd in model.vocab:
            all_kwds.add(wd)

# Assign each term a dense integer id equal to its row in `vectors`.
id2kwd = dict()
kwd2id = dict()
vectors = []
for term_id, term in enumerate(all_kwds):
    vec = get_norm_vector(term, model)
    id2kwd[term_id] = term
    kwd2id[term] = term_id
    vectors.append(vec)

len(all_kwds), len(vectors)

(1176, 1176)

# Cluster the Vectors

## Affinity Propagation

In [32]:
from sklearn import cluster
from sklearn.cluster import AffinityPropagation
import time
start = time.time()

# don't parallelize (n_jobs = -1), doesn't seem to work
# NOTE(review): the comment above looks copied from the k-means cell; no n_jobs
# argument is passed here. Affinity propagation chooses the cluster count itself,
# which is why the count is derived from the labels below.
print("Clustering vectors into clusters via AP")
ap_clusterer = AffinityPropagation()
# one cluster label per row of `vectors`
ap_ids = ap_clusterer.fit_predict(vectors)

end = time.time()
print("Creating %i clusters took %i seconds" % (len(set(ap_ids)), end - start))

Clustering vectors into clusters via AP
Creating 82 clusters took 0 seconds


In [33]:
# label -> set of member keywords, and label -> unit-normalised centroid, for AP
ap_lbl2cluster = extract_clusters(ap_ids, id2kwd)
ap_lbl2centroid = extract_centroids(ap_clusterer)

# sanity check: both mappings should have one entry per cluster
len(ap_lbl2cluster), len(ap_lbl2centroid)

(82, 82)

## Examine the AP Clusters

In [41]:
# Display the member set of every AP cluster for manual inspection (long output).
ap_lbl2cluster.values()

[{'affect',
  'affects',
  'another',
  'another reason',
  'certain',
  'effect',
  'effects',
  'example',
  'final',
  'finally',
  'main reason',
  'major',
  'negative',
  'one',
  'one way',
  'stressors',
  'unbalanced',
  'weather'},
 {'central',
  'climate',
  'eastern pacific regions',
  'equatorial',
  'especially',
  'greatly',
  'int',
  'major shifts',
  'movement',
  'pacific regions',
  'swell',
  'warm water eastward',
  'water levels',
  'worlds'},
 {'caused',
  'environment',
  'health',
  'increased stress',
  'most dangerous threat',
  'most dangerous threats',
  'put stress',
  'those',
  'threats'},
 {'coral polyps range',
  'diameter',
  'foot',
  'pinhead',
  'range',
  'size',
  'tiny',
  'tiny up',
  'up'},
 {'10of',
  '3of',
  '5of',
  'above',
  'even',
  'increase over',
  'normal',
  'ocean water temperatures increase',
  'places',
  'say',
  'waters shift eastward ocean water'},
 {'big problem',
  'bleaching',
  'conclusion coral',
  'coral bleach',
  'c

In [42]:
from sklearn import cluster
from sklearn.cluster import KMeans
import time
start = time.time()

# don't parallelize (n_jobs = -1), doesn't seem to work
print("Clustering vectors into %i clusters" % NUM_CLUSTERS)
# k-means initialisation is random; pin the seed so the clusters written to
# file and the downstream synonym mappings are the same on every run.
km_clusterer = KMeans(n_clusters=NUM_CLUSTERS, n_jobs=1, verbose=1, n_init=5, random_state=42)
# one cluster label per row of `vectors`
ids = km_clusterer.fit_predict(vectors)

end = time.time()
print("Creating %i clusters took %i seconds" % (NUM_CLUSTERS, end - start))

Clustering vectors into 100 clusters
Initialization complete
Iteration  0, inertia 337.257
Iteration  1, inertia 237.088
Iteration  2, inertia 231.604
Iteration  3, inertia 229.067
Iteration  4, inertia 227.822
Iteration  5, inertia 227.560
Iteration  6, inertia 227.396
Iteration  7, inertia 227.321
Converged at iteration 7
Initialization complete
Iteration  0, inertia 334.312
Iteration  1, inertia 235.941
Iteration  2, inertia 230.795
Iteration  3, inertia 229.187
Iteration  4, inertia 228.479
Iteration  5, inertia 228.339
Iteration  6, inertia 228.273
Iteration  7, inertia 228.227
Iteration  8, inertia 228.191
Converged at iteration 8
Initialization complete
Iteration  0, inertia 340.482
Iteration  1, inertia 238.389
Iteration  2, inertia 233.003
Iteration  3, inertia 231.386
Iteration  4, inertia 230.706
Iteration  5, inertia 230.220
Iteration  6, inertia 230.152
Converged at iteration 6
Initialization complete
Iteration  0, inertia 340.290
Iteration  1, inertia 238.158
Iteration  2

In [43]:
# label -> set of member keywords, and label -> unit-normalised centroid, for k-means
lbl2cluster = extract_clusters(ids, id2kwd)
lbl2centroid = extract_centroids(km_clusterer)

# sanity check: both mappings should have one entry per cluster
len(lbl2cluster), len(lbl2centroid)

(100, 100)

## Examine the k-Means Clusters

In [50]:
# Display the member set of every k-means cluster for manual inspection (long output).
lbl2cluster.values()

[{'believe',
  'get bleached',
  'many different',
  'reasons',
  'two',
  'various',
  'ways'},
 {'back',
  'deeper colder',
  'deeper colder waters',
  'drag warm surface waters westward',
  'eastern pacific rise',
  'pacific',
  'surface'},
 {'big problem',
  'bleaching',
  'conclusion coral',
  'coral bleach',
  'coral reef',
  'loss',
  'population',
  'why coral'},
 {'algae', 'coral', 'coral gets', 'corals', 'off', 'produces'},
 {'alive',
  'carbon dioxide',
  'carbon dioxide decreases',
  'co2 decreases',
  'co2 levels',
  'healthy',
  'keep coral',
  'keeps',
  'needed',
  'temperature increases',
  'water increases'},
 {'algae living',
  'basically',
  'colors',
  'give',
  'gives',
  'giving',
  'inside',
  'lives',
  'own',
  'tissues',
  'type',
  'wich'},
 {'alter',
  'changed',
  'couple',
  'depending',
  'direction',
  'drastically',
  'every',
  'normally',
  'shift',
  'shifting',
  'shifting trade winds',
  'temperatures',
  'trade winds change',
  'trade winds shift

# Dump Clusters to File

In [65]:
def clusters_to_file(fname, lbl2Clusters):
    """Write one line per cluster to `fname`.

    Line format: "<label>|<size>|<phrase>,<phrase>,..." where the label is
    right-justified to 4 characters, the size to 5, and the phrases are
    ordered by number of space-separated words, then alphabetically.
    """
    def by_word_count_then_text(phrase):
        return (len(phrase.split(" ")), phrase)

    with open(fname, "w+") as out:
        for lbl, phrases in lbl2Clusters.items():
            ordered = sorted(phrases, key=by_word_count_then_text)
            fields = [
                str(lbl).rjust(4, ' '),
                str(len(phrases)).rjust(5, ' '),
                ",".join(ordered),
            ]
            out.write("|".join(fields))
            out.write("\n")

In [66]:
# Write the AP and k-means clusters to their respective files.
clusters_to_file(AP_CLUSTERS_FILE, ap_lbl2cluster)
clusters_to_file(KM_CLUSTERS_FILE, lbl2cluster)

# Conceptual Search Output Files

In [44]:
import time
start = time.time()

# Rank the k-means centroids (lbl2centroid) by similarity for every keyword.
kwd2cluster_sims = compute_cluster_similarities(all_kwds, kwd2id, vectors, lbl2centroid)
end = time.time()
print("Sorting the clusters for each of the %i keywords took %i seconds" % (len(all_kwds),end - start))

1000 computed out of 1176
Sorting the clusters for each of the 1176 keywords took 0 seconds


In [45]:
# Only write the synonym files when both output paths are configured.
if SYNONYMS_QRY_FILE is not None and SYNONYMS_INDEX_FILE is not None:
    write_most_similar_clusters(NUM_CLUSTER_SYNONYMS, kwd2cluster_sims, SYNONYMS_QRY_FILE, SYNONYMS_INDEX_FILE)

In [46]:
# Total wall-clock time since grand_start was recorded near the top of the notebook.
grand_end = time.time()
print("Cluster generation and processing took %i seconds" % (grand_end - grand_start))

Cluster generation and processing took 13 seconds
