# Let's Load the Full Google News Model

In [59]:
# Folder holding the phrase list and the generated synonym file for this analysis.
ROOT_FOLDER = "/Users/simon.hughes/Google Drive/PhD/Data/CoralBleaching/PhraseExtractionAnalysis"
# Phrase list read by load_stop_words further down.
PHRASES_FILE = ROOT_FOLDER + "/Phrases.txt"
# Synonym file written by write_clusters and read back by build_synonym_filter.
CLUSTER_SYN_FILE = ROOT_FOLDER + "/word2vec_temp_syns.txt"
# Pre-trained vectors, loaded below as a binary word2vec file.
MODEL_FILE = "/Users/simon.hughes/Documents/Dice Data/Word2Vec/GoogleNews-vectors-negative300.bin"

In [89]:
from collections import defaultdict

class SynonymMapper(object):
    """Replaces known phrases in a token stream with their mapped synonyms.

    Attributes:
        mapper: dict mapping a space-joined phrase to an iterable (normally a
            set) of replacement terms.
        nested: nested dict used as a token trie; ``nested[t1][t2]...`` exists
            for every token prefix of every phrase in ``mapper``.
        case_sensitive: when False, tokens are lower-cased before lookup.
        synonyms: set of every replacement term found in ``mapper``.
    """

    def __init__(self, mapper, nested, case_sensitive=False):
        self.case_sensitive = case_sensitive
        self.mapper = mapper
        self.nested = nested
        # Flatten all right-hand sides so is_synonym is a single set lookup.
        self.synonyms = set()
        for rhs in self.mapper.values():
            self.synonyms.update(rhs)

    def is_synonym(self, term):
        """Return True if ``term`` is one of the replacement terms."""
        return term in self.synonyms

    def map_synonyms(self, tokens, debug=False):
        """Map the longest matching phrases in ``tokens`` to their synonyms.

        At each position the token trie is walked for as long as consecutive
        tokens are present, remembering the longest prefix that is a key of
        ``mapper``. On a match the sorted replacement terms are emitted and
        the matched tokens are skipped; otherwise the token is emitted with
        its original casing.

        Args:
            tokens: iterable of string tokens (a list or a lazy iterator).
            debug: when True, print the matching progress.

        Returns:
            A new list of tokens.
        """
        # Materialise first: the loop below needs len() and random access,
        # which lazy map()/filter() results do not provide on Python 3.
        tokens = list(tokens)
        size = len(tokens)
        if not self.case_sensitive:
            tmp_tokens = [s.lower() for s in tokens]
        else:
            tmp_tokens = tokens
        mapped = []
        ix = 0
        while ix < size:
            if debug:
                print("ix %s" % ix)
            best, best_key = None, None
            tmp_ix = ix
            max_ix = ix
            current = ""
            d = self.nested
            while tmp_ix < size and tmp_tokens[tmp_ix] in d:
                current += tmp_tokens[tmp_ix] + " "
                key = current.strip()
                if key in self.mapper:
                    if debug:
                        if best is not None:
                            print("%s %s new best: %s => %s" % (ix, tmp_ix, key, self.mapper[key]))
                        else:
                            print("%s %s best: %s => %s" % (ix, tmp_ix, key, self.mapper[key]))
                    best = self.mapper[key]
                    best_key = key
                    max_ix = tmp_ix
                # Descend one level in the trie and try to extend the phrase.
                d = d[tmp_tokens[tmp_ix]]
                tmp_ix += 1
            if not best:
                # No phrase starts here: retain the token with its original casing.
                mapped.append(tokens[ix])
            else:
                # Skip every token consumed by the longest match.
                ix = max_ix
                # `best` is normally a set; sort it for a deterministic order.
                for item in sorted(best):
                    mapped.append(item)
            ix += 1
        return mapped

    def __repr__(self):
        return "Synonym Mapper: %i synonyms mapped" % len(self.mapper)

def build_synonym_filter(files, case_sensitive=False):
    """Build a SynonymMapper from one or more synonym files.

    File format, one rule per line (blank lines and lines starting with
    ``#`` are skipped):

    * ``a,b=>x,y`` maps each of ``a`` and ``b`` to the set ``{x, y}``.
    * ``a,b,c`` maps each entry to the whole group ``{a, b, c}``.

    Args:
        files: a single path or an iterable of paths.
        case_sensitive: passed through to SynonymMapper. When False the
            left-hand-side phrases are lower-cased, because the mapper
            lower-cases tokens before looking them up.

    Returns:
        A SynonymMapper over the merged rules of all files.

    Raises:
        ValueError: if a line contains more than one ``=>``.
    """
    # Recursively defined defaultdict: a token trie of arbitrary depth.
    def dd():
        return defaultdict(dd)

    mapper = defaultdict(set)
    nested_map = defaultdict(dd)

    # (str, type(u"")) is (str, unicode) on Python 2 and just str on Python 3.
    if isinstance(files, (str, type(u""))):
        files = [files]

    for f in files:
        # Read-only access is enough; "r+" would fail on read-only files.
        with open(f, "r") as fin:
            for line in fin:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                if "=>" in line:
                    sides = line.split("=>")
                    if len(sides) != 2:
                        raise ValueError("Malformed synonym rule in %s: %r" % (f, line))
                    left, right = sides
                    right = set(r.strip() for r in right.split(",") if r.strip())
                    left_parts = [l.strip() for l in left.split(",")]
                else:
                    left_parts = [l.strip() for l in line.split(",")]
                    right = set(l for l in left_parts if l)

                for syn in left_parts:
                    if not syn:
                        # e.g. a trailing comma; an "" key would match empty tokens
                        continue
                    if not case_sensitive:
                        syn = syn.lower()
                    mapper[syn].update(right)

                    # Register every token prefix of the phrase in the trie.
                    tokens = syn.split(" ")
                    d = nested_map[tokens[0]]
                    for token in tokens[1:]:
                        d = d[token]
    return SynonymMapper(mapper, nested_map, case_sensitive)

#String processing
def white_space_tokenize(s):
    """Split ``s`` on single space characters.

    Runs of spaces produce empty-string tokens, and other whitespace
    (tabs, newlines) is left inside the tokens.
    """
    tokens = s.split(" ")
    return tokens

# Characters treated as trailing punctuation.
__punct__ = set(".?!,;:")
def remove_punct_at_end(s):
    """Strip trailing punctuation from ``s`` but always keep at least one character."""
    end = len(s)
    # Move the cut point left while the last kept character is punctuation.
    while end > 1 and s[end - 1] in __punct__:
        end -= 1
    return s[:end]

#Token Filters
def fact_len_filter(max_len):
    """Build a token filter that keeps tokens with at least ``max_len`` characters.

    NOTE: despite its name, ``max_len`` acts as a *minimum* length (the test
    is ``len(s) >= max_len``). The name is kept so existing callers still work.

    Returns:
        A function taking an iterable of tokens and returning a list.
    """
    def len_filter(tokens):
        # A real list (not a lazy filter object) so later stages can use
        # len() and indexing on both Python 2 and Python 3.
        return [s for s in tokens if len(s) >= max_len]
    return len_filter

# Drops empty-string tokens (minimum length 1).
remove_empty_tokens_filter = fact_len_filter(1)

def lower_case_filter(tokens):
    """Lower-case a single string, or every token of an iterable of strings.

    Returns:
        A lower-cased string when given a string, otherwise a list of
        lower-cased tokens.
    """
    # (str, type(u"")) is (str, unicode) on Python 2 and just str on Python 3,
    # so unicode strings are no longer split into single characters.
    if isinstance(tokens, (str, type(u""))):
        return tokens.lower()
    # List rather than a lazy map object so later stages can index it.
    return [t.lower() for t in tokens]

def remove_punct_at_end_filter(tokens):
    """Apply remove_punct_at_end to every token and return the results as a list."""
    # `__punct__` is defined once, next to remove_punct_at_end; the identical
    # second definition is not needed here.
    # List rather than a lazy map object so later stages can index it.
    return [remove_punct_at_end(t) for t in tokens]

def fact_is_synonym_filter(syn_mapper):
    """Build a token filter that keeps only tokens ``syn_mapper.is_synonym`` accepts.

    Args:
        syn_mapper: object exposing ``is_synonym(term)``.

    Returns:
        A function taking an iterable of tokens and returning a list.
    """
    def is_synonym_filter(tokens):
        # List rather than a lazy filter object, for Python 2/3 consistency.
        return [t for t in tokens if syn_mapper.is_synonym(t)]
    return is_synonym_filter

def is_cluster_filter(tokens):
    """Keep only tokens that start with ``"cluster_"``; returns a list."""
    # List rather than a lazy filter object, for Python 2/3 consistency.
    return [s for s in tokens if s.startswith("cluster_")]

def remove_cluster_filter(tokens):
    """Remove every occurrence of ``"cluster_"`` from each token; returns a list."""
    # List rather than a lazy map object, for Python 2/3 consistency.
    return [s.replace("cluster_", "") for s in tokens]

def analyze(s, filters):
    """Feed ``s`` through each callable in ``filters`` in order.

    The output of one filter is the input of the next; with no filters
    ``s`` is returned unchanged.
    """
    result = s
    for apply_filter in filters:
        result = apply_filter(result)
    return result

In [2]:
from gensim.models.word2vec import Word2Vec

# Load the pre-trained vectors from MODEL_FILE (binary word2vec format); this takes a while.
# NOTE(review): newer gensim releases expose this loader on KeyedVectors instead of
# Word2Vec - confirm against the installed gensim version.
full_model = Word2Vec.load_word2vec_format(MODEL_FILE, binary=True)

In [3]:
# Sanity check: the terms closest to "whitening" in the pre-trained model.
full_model.most_similar("whitening")

[(u'teeth_whitening', 0.7431495189666748),
 (u'tooth_whitening', 0.7134369611740112),
 (u'Teeth_whitening', 0.6533203125),
 (u'microdermabrasion', 0.6517258882522583),
 (u'Whitening', 0.6478936672210693),
 (u'tooth_bleaching', 0.6321505904197693),
 (u'skin_lightening', 0.6302615404129028),
 (u'whiten', 0.6267747282981873),
 (u'whiten_teeth', 0.6228857040405273),
 (u'whitening_strips', 0.6214999556541443)]

In [38]:
# Multi-word phrases are stored with underscores; show the 20 nearest terms.
full_model.most_similar("coral_bleaching", topn=20)

[(u'Coral_bleaching', 0.6434895992279053),
 (u'coral_reefs', 0.6368824243545532),
 (u'bleaching', 0.6276558637619019),
 (u'algal_blooms', 0.6163963079452515),
 (u'harmful_algal_blooms', 0.5943963527679443),
 (u'harmful_algal_bloom', 0.5877514481544495),
 (u'coral_spawning', 0.5838446617126465),
 (u'corals', 0.5820288062095642),
 (u'algal_bloom', 0.5637736916542053),
 (u'extinctions', 0.5617671012878418),
 (u'reefs', 0.5594044327735901),
 (u'mass_extinctions', 0.5590829253196716),
 (u'Ocean_acidification', 0.551794171333313),
 (u'acidification', 0.5517348051071167),
 (u'amphibian_declines', 0.5485835075378418),
 (u'coral_reef_habitats', 0.547818124294281),
 (u'Harmful_algal_blooms', 0.5466654300689697),
 (u'toxic_algae_blooms', 0.5465958118438721),
 (u'ocean_acidity', 0.5426976680755615),
 (u'coral', 0.5412527918815613)]

## Cluster Our Common Terms and Phrases

In [30]:

def load_stop_words(stop_words_file):
    """Load a set of lower-cased entries from a text file, one entry per line.

    Blank lines and lines starting with ``#`` are skipped. Despite the name,
    this notebook uses it to load the phrase list.

    Args:
        stop_words_file: path of the file to read.

    Returns:
        A set of stripped, lower-cased lines.
    """
    stop_words = set()
    with open(stop_words_file) as f:
        for line in f:
            word = line.strip()
            # Check for emptiness first: word[0] raises IndexError on blank lines.
            if not word or word.startswith("#"):
                continue
            stop_words.add(word.lower())
    return stop_words

# The phrase list shares the stop-word file format, so the same loader is reused.
phrases = load_stop_words(PHRASES_FILE)

In [37]:
# Print the multi-word phrases the pre-trained model knows, either as-is or
# with a capitalised first letter; the model joins words with underscores.
for p in phrases:
    if " " in p:
        p = p.replace(" ", "_")
        if p in full_model.vocab or p[0].upper() + p[1:] in full_model.vocab:
            # print() with one argument gives the same output on Python 2 and 3
            print(p)

marine_biologist
coral_reef
delicate_balance
tropical_storms
shallow_waters
bleaching_coral
coral_bleaching
nutrient_rich
symbiotic_relationship
coral_polyps
stinging_tentacles
coral_reefs
climate_change
marine_biologists
bleached_coral
coral_polyp
negatively_affect
ocean_salinity
invertebrate_animals
carbon_dioxide


In [36]:
# Coverage check: how many single-word phrases exist verbatim in the model vocabulary.
single_word_phrases = [p for p in phrases if " " not in p]
total = len(single_word_phrases)
in_mdl = sum(1 for p in single_word_phrases if p in full_model.vocab)
total, in_mdl

(691, 680)

In [44]:
# Map each lower-cased vocabulary entry to one original-cased key. When several
# keys share a lower-cased form, the one that compares greatest wins.
lc2key = dict()
for key in full_model.vocab.keys():
    lckey = key.lower()
    current = lc2key.get(lckey)
    if current is None or key > current:
        lc2key[lckey] = key
len(lc2key), len(full_model.vocab)

(2702089, 3000000)

In [52]:
import numpy as np
from collections import defaultdict

def get_vector(item, model):
    """Return the unit-length vector for ``item``, or None if it is not in the vocabulary.

    The row is read from ``model.syn0`` at the index stored on the vocabulary
    entry. A vector whose norm is not positive is returned unscaled.
    """
    if item not in model.vocab:
        return None
    entry = model.vocab[item]
    raw = model.syn0[entry.index]
    length = np.linalg.norm(raw)
    return raw if length <= 0 else raw / length

def extract_clusters(ids, id2kwd):
    """Group keywords by cluster label.

    Args:
        ids: cluster labels, where position ``i`` holds the label of item ``i``.
        id2kwd: mapping from item position to keyword.

    Returns:
        A defaultdict mapping each label to the set of its keywords.
    """
    grouped = defaultdict(set)
    for position, cluster_id in enumerate(ids):
        grouped[cluster_id].add(id2kwd[position])
    return grouped

# Build the matrix that gets clustered, plus index <-> phrase lookups.
# Phrases with no entry in the pre-trained model are skipped.
vectors = []
ix2vec, ix2phrase, phrase2ix = dict(), dict(), dict()
for phrase in phrases:
    # The model stores multi-word phrases with underscores.
    key = phrase.lower().replace(" ", "_")
    if key not in lc2key:
        continue
    full_key = lc2key[key]
    vec = get_vector(full_key, full_model)
    if vec is None:
        continue
    # The row index is the position the vector will take in `vectors`.
    ix = len(vectors)
    vectors.append(vec)
    ix2vec[ix] = vec
    ix2phrase[ix] = phrase
    phrase2ix[phrase] = ix

In [48]:
from sklearn import cluster
from sklearn.cluster import AffinityPropagation
import time
# Time the clustering step with wall-clock timestamps.
start = time.time()

# don't parallelize (n_jobs = -1), doesn't seem to work
print("Clustering vectors into clusters via AP")
ap_clusterer = AffinityPropagation()
# One cluster label per row of `vectors`.
ap_ids = ap_clusterer.fit_predict(vectors)

end = time.time()
n_clusters = len(set(ap_ids))
elapsed = end - start
print("Creating %i clusters took %i seconds" % (n_clusters, elapsed))

Clustering vectors into clusters via AP
Creating 123 clusters took 1 seconds


In [53]:
# Group phrases by AP cluster label: label -> set of phrases.
ap_lbl2cluster = extract_clusters(ap_ids, ix2phrase)
# print() with one argument gives the same output on Python 2 and 3
print(len(ap_lbl2cluster))

123


In [99]:
# Eyeball the (label, phrases) pairs produced by the clustering.
ap_lbl2cluster.items()

[(0, {'body', 'skeleton', 'skeletons'}),
 (1, {'lead', 'leading', 'leads'}),
 (2, {'completely', 'drastically', 'greatly', 'overall'}),
 (3, {'above', 'double', 'high', 'higher', 'lower'}),
 (4, {'movement', 'phenomena', 'phenomenon'}),
 (5, {'based', 'depend', 'depending', 'depends', 'relies', 'rely'}),
 (6, {'climate', 'conditions', 'enviroment', 'environment', 'nature'}),
 (7, {'plant', 'plants'}),
 (8,
  {'contains',
   'covers',
   'gives',
   'helps',
   'produces',
   'provide',
   'provided',
   'provides'}),
 (9,
  {'affect',
   'affected',
   'affecting',
   'affects',
   'effected',
   'hurt',
   'include',
   'negatively',
   'negatively affect',
   'occur',
   'weaken'}),
 (10, {'die', 'dies', 'dying', 'kill', 'sick', 'starvation', 'starve'}),
 (11, {'large', 'size', 'small', 'tiny'}),
 (12, {'drag', 'turn', 'turning', 'turns'}),
 (13, {'play', 'plays', 'role'}),
 (14, {'forced', 'threaten', 'threatened', 'threatens'}),
 (15,
  {'alter',
   'altered',
   'change',
   'chan

In [73]:
def write_clusters(lbl2cluster, synonyn_fname):
    """Write the clusters as synonym rules, one ``phrase=>cluster_<label>`` per line.

    Args:
        lbl2cluster: mapping from cluster label to an iterable of phrases.
        synonyn_fname: path of the file to (over)write.
    """
    with open(synonyn_fname, "w+") as out:
        for lbl, members in lbl2cluster.items():
            target = "cluster_" + str(lbl)
            # Sorted so the lines within one cluster have a stable order.
            for phrase in sorted(members):
                out.write("%s=>%s\n" % (phrase, target))

# Round-trip through a synonym file: write phrase=>cluster_<label> rules, then
# load them into a case-insensitive mapper.
write_clusters(ap_lbl2cluster, CLUSTER_SYN_FILE)
syn_mapper = build_synonym_filter([CLUSTER_SYN_FILE], False)

In [90]:
# Sentence -> cluster ids, applied in order by analyze(): tokenize on spaces,
# strip trailing punctuation, lower-case, replace known phrases with their
# "cluster_<label>" token, keep only those tokens, then drop the prefix.
analysis_chain = [
                  white_space_tokenize,
                  remove_punct_at_end_filter,
                  lower_case_filter,
                  syn_mapper.map_synonyms,
                  is_cluster_filter,
                  remove_cluster_filter]

In [95]:
import pandas as pd

def apply_cluster_extraction(s):
    """Run ``s`` through ``analysis_chain`` and return the resulting labels sorted.

    The labels are strings, so the order is lexicographic ("116" sorts before "32").
    """
    labels = list(analyze(s, analysis_chain))
    labels.sort()
    return labels

# Pipe-delimited predictions export; judging by the columns, one row per essay sentence.
fname = "/Users/simon.hughes/Google Drive/PhD/Data/CoralBleaching/Results/predictions_causal_and_codes.txt"
data = pd.read_csv(fname, sep="|")
# Keep only the columns used below.
data = data[["Essay", "Sent Number", "Processed Sentence", "Concept Codes"]]
# After astype(str) a missing value reads "nan"; normalise those to "".
data["Concept Codes"] = data["Concept Codes"].astype(str).apply(lambda s: "" if s == "nan" else s)

#data["Concept_Codes"] = data["Concept Codes"].apply(to_concepts_only)
#del data["Concept Codes"]
# Sorted cluster labels (as strings) found in each processed sentence.
data["Clusters"] = data["Processed Sentence"].apply(apply_cluster_extraction)
#data["Num_Concept_Codes"] = data["set_Concept_Codes"].apply(lambda st: len(st))

data.head(10)

Unnamed: 0,Essay,Sent Number,Processed Sentence,Concept Codes,Clusters
0,EBA1415_AEKD_4_CB_ES-05568.ann,1,What leads to differences in the rates of cora...,50,"[1, 116, 32, 61, 76]"
1,EBA1415_AEKD_4_CB_ES-05568.ann,2,Coral is often mistaken for a rock but it is m...,,"[104, 11, 21, 28, 30, 43, 46, 47]"
2,EBA1415_AEKD_4_CB_ES-05568.ann,3,Coral bleaching shows bleaching and healthy bl...,50,"[10, 117, 31, 45, 51, 52, 76, 98]"
3,EBA1415_AEKD_4_CB_ES-05568.ann,4,Coral bleaching is almost noticeable in the pa...,50,"[112, 76, 83, 92]"
4,EBA1415_AEKD_4_CB_ES-05572.ann,1,The part of coral called zooanthellae are not ...,5,"[28, 58, 62, 82]"
5,EBA1415_AEKD_4_CB_ES-05572.ann,2,And if they get or much sunlight they start to...,"5,50,_C->R,_CRel,_RRel,Causer,Result,Causer:5,...","[110, 113, 58, 64, 82, 88]"
6,EBA1415_AEKD_4_CB_ES-05572.ann,3,The reason why is because the zooanthellae if ...,4,"[114, 118, 43, 82]"
7,EBA1415_AEKD_4_CB_ES-05572.ann,4,The coral also need INFREQUENT water temperatu...,,"[121, 60, 63, 92]"
8,EBA1415_AEKD_4_CB_ES-05572.ann,5,Also its a threats for us because means that m...,11,"[105, 23, 26, 36, 53, 55, 57, 81]"
9,EBA1415_AEKD_4_CB_ES-05572.ann,6,Also the water us getting to salty,13,"[20, 82, 92]"


In [97]:
# One list of cluster-label strings per sentence; used as the training
# "sentences" for the second Word2Vec model below.
cluster_tokens = data["Clusters"].values
cluster_tokens[0:5]

array([['1', '116', '32', '61', '76'],
       ['104', '11', '21', '28', '30', '43', '46', '47'],
       ['10', '117', '31', '45', '51', '52', '76', '98'],
       ['112', '76', '83', '92'], ['28', '58', '62', '82']], dtype=object)

In [131]:
import gensim, time
from gensim.models.word2vec import Word2Vec
# Destination for the cluster-label model; only used by the commented-out save below.
NEW_MODEL_FILE = "%s/word2vec_meta_model.w2v" % ROOT_FOLDER

start = time.time()

print("Training Model. This could take a while (10-60 mins for moderate collections). Get a coffee")
# Train on sequences of cluster labels rather than words: each "sentence" is the
# sorted label list of one essay sentence.
# NOTE(review): the similarities printed in the next cell are all ~0.9998, which
# suggests these settings barely separate the labels - confirm the hyper-parameters.
model = Word2Vec(cluster_tokens, iter=100, size=100, window=5, min_count=5, workers=8, sample=1e-5, hs=0, negative=20)
#model.save(NEW_MODEL_FILE)
end = time.time()
# print() with one argument gives the same output on Python 2 and 3
print("Took %s seconds" % (end - start))

Training Model. This could take a while (10-60 mins for moderate collections). Get a coffee
Took 54.3845000267 seconds


In [110]:
# For the first ten clusters, list the five clusters whose labels are closest in
# the label-level model.
# The loop variables are deliberately not called `phrases`/`key`: rebinding
# `phrases` here would overwrite the global phrase set that the vector-building
# cell iterates over, breaking a re-run of that cell.
# list() keeps the slice valid where dict.items() returns a view (Python 3).
for cluster_lbl, members in list(ap_lbl2cluster.items())[0:10]:
    joined = ",".join(sorted(members))
    print(joined.strip())
    for lbl, sim in model.most_similar(positive=[str(cluster_lbl)], topn=5):
        # Model vocabulary entries are label strings; cluster keys are ints.
        sim_cl = ap_lbl2cluster[int(lbl)]

        joined_2 = ",".join(sorted(sim_cl))
        # Same text as the old `print sim, "\t" + joined_2`
        print("%s %s" % (sim, "\t" + joined_2))
    print("")

body,skeleton,skeletons
0.999792516232 	about,deeper,less,more,self,than
0.999785363674 	called,considered,known,recognized
0.999779939651 	bottom,chart,charts,graph,graphs,guide
0.999778866768 	level,levels,rate,rates
0.999771237373 	according,said,says,stated,stressed

lead,leading,leads
0.999781906605 	carbon,carbon dioxide,carbon dioxide co2,chemicals,co2,energy
0.999781250954 	america,asia,atlantic,australia,pacific,versa
0.999776124954 	limestone,rock,rocks,surface
0.999773979187 	around,away,back,down,loose,off,out,through,together,up
0.999768018723 	atlantic ocean,costal,equatorial,ocean,oceans,pacific ocean,pool,sea,seawater,shallow waters,underwater,upwelling,water,waters

completely,drastically,greatly,overall
0.99977850914 	believe,clear,explain,figure,know,learned,look,matter,say,see,think
0.999767959118 	around,away,back,down,loose,off,out,through,together,up
0.999761164188 	bleached coral,bleaching coral,coral bleaching,coral reef,coral reefs,corals,marine,papua,reef,ree

In [120]:
# Collect one normalised vector per cluster label in the label-level model,
# remembering which row belongs to which label.
ix2clusterlbl = dict()
vectors2 = []
for ix, key in enumerate(model.vocab):
    vec = get_vector(key, model)
    ix2clusterlbl[ix] = key
    vectors2.append(vec)

In [121]:
# Second pass: cluster the cluster-label vectors themselves.
start = time.time()

# don't parallelize (n_jobs = -1), doesn't seem to work
print("Clustering vectors into clusters via AP")
ap_clusterer2 = AffinityPropagation()
# One meta-cluster label per row of `vectors2`.
ap2_ids = ap_clusterer2.fit_predict(vectors2)

end = time.time()
n_clusters = len(set(ap2_ids))
elapsed = end - start
print("Creating %i clusters took %i seconds" % (n_clusters, elapsed))

Clustering vectors into clusters via AP
Creating 15 clusters took 0 seconds


In [127]:
# Meta-cluster label -> set of first-level cluster labels (as strings).
ap2_clusters = extract_clusters(ap2_ids, ix2clusterlbl)

In [130]:
# For every meta-cluster, print how many first-level clusters it holds, followed
# by the union of their phrases.
for key, items in ap2_clusters.items():
    words = set()
    for lbl in items:
        # Labels are strings in the model vocabulary; cluster keys are ints.
        wds = ap_lbl2cluster[int(lbl)]
        words.update(wds)
    joined = ",".join(sorted(words))
    # Same text as the old `print str(len(items)).ljust(5), joined`
    print("%s %s" % (str(len(items)).ljust(5), joined))
    print("")

18    about,above,algae,alot,anchor,anchors,animal,animals,any,begin,body,bring,cant,cold,colder,colonies,come,conclude,conclusion,cooler,coral polyp,coral polyps,creatures,deeper,did,do,does,done,double,eachother,eat,else,end,etc,f,factors,find,finish,fish,food,forced,get,gets,getting,going,got,h2o,high,higher,hotter,human,humans,int,invertebrate,invertebrate animals,lastly,less,let,lower,mean,meaning,means,more,normal,normally,now,o2,often,own,probably,process,recieve,results,right,seem,seems,self,skeleton,skeletons,sometimes,start,started,starts,stinging tentacles,stop,stress,stressors,tend,than,thats,therefore,threaten,threatened,threatens,type,upset,upsets,usually,warm,warmer,wich,work,world,worlds,worse,wrong,you,zooxanthellae

1     case,cases

10    bleach,colder than,different types,different ways,dragged,drop,dropped,dropping,drops,extreme,feed,glucose,higher than,home,keep,keeps,lack,life,limestone,live,lives,living,looses,lose,loses,losing,loss,lost,made,make,makes,making,m