In [43]:
# Settings
# NOTE(review): ROOT_FOLDER is an absolute, machine-specific path; every other
# path below is derived from it, so this is the only one to edit.
ROOT_FOLDER = "/Users/simon.hughes/Google Drive/PhD/Data/CoralBleaching/PhraseExtractionAnalysis"

DOCS_FOLDER     = "%s/ProcessedDocs" % ROOT_FOLDER
STOP_WORDS_FILE = "%s/minor_en_stop_words.txt" % ROOT_FOLDER
PHRASES_FILE    = "%s/Phrases.txt" % ROOT_FOLDER
# Optional second phrase / synonym file; None means "phrases only"
KEY_WORDS_FILE  = None

MODEL_FILE = "%s/model.w2v" % ROOT_FOLDER
# Regex (not a glob) matched against each file name in DOCS_FOLDER.
# Raw string: "\." is a regex escape, not a Python string escape.
FILE_MASK = r".*\.txt"
# Sentences with fewer tokens than this (after analysis) are dropped
MIN_SENT_LENGTH = 5

# W2vec settings
MIN_WD_COUNT = 3 #for word2vec model # setting to 10 seems to remove some of the noise
WINDOW_SIZE  = 5
VECTOR_SIZE  = 300
WORKERS = 8
TRAINING_ITERATIONS = 300

In [44]:
# Shared
import re
from collections import defaultdict

def load_stop_words(stop_words_file):
    """
    Load a stop word list from a text file holding one word per line.

    Blank lines and lines starting with "#" (after trimming) are skipped;
    every remaining word is lower-cased.

    stop_words_file :   string
                            path of the stop word file

    returns         :   set of lower-cased stop words
    """
    stop_words = set()
    with open(stop_words_file) as f:
        for line in f:
            word = line.strip()
            # A blank line used to raise IndexError on word[0]; skip it instead
            if not word or word.startswith("#"):
                continue
            stop_words.add(word.lower())
    return stop_words

# Compiled once at import time. Raw string so that "\s" reaches the regex
# engine as written instead of relying on Python leaving an unknown string
# escape untouched.
re_collapse_spaces = re.compile(r"\s+")
def collapse_spaces(s):
    """Replace every run of whitespace in s with a single space (no trimming)."""
    return re_collapse_spaces.sub(" ", s)

# Separator characters that get replaced by a space:  ; : ' " * / ) , ( |  and whitespace.
# Raw string: the backslashes are regex escapes, not Python string escapes
# (the compiled pattern is identical to the previous non-raw literal).
re1 = re.compile(r"""[;:'"\*/\),\(\|\s]+""")
def clean_str(s):
    """
    Normalise a raw sentence before tokenizing.

    Converts s to str, drops possessive "'s", turns hyphens, backslashes and
    every run of the separator characters in re1 into a single space, and
    trims leading / trailing whitespace.

    s       :   any object accepted by str()
    returns :   cleaned string
    """
    s = str(s).replace("'s"," ")
    # "-" and "\" are replaced up front instead of being part of the character class
    s = s.replace("-", " ").replace("\\"," ")
    s = re1.sub(" ",s).strip()
    return collapse_spaces(s)

# Token validity check
# Characters that typically come from html / js residue
__bad_chars__ = "<>{}[]~@"
# Punctuation that may not terminate a valid term
__punct__ = set(".?!,;:")
def is_valid_term(term):
    """
    Decide whether a token is worth keeping.

    Rejected: empty tokens, single non-alphabetic characters, tokens holding
    any of __bad_chars__, tokens ending in punctuation, tokens containing
    "function(", "!" or "?", and tokens where at least 75% of the characters
    are digits or non-alphanumeric.

    term    :   string
    returns :   bool
    """
    length = len(term)
    if length == 0:
        return False
    if length == 1:
        # single letters are kept, otherwise terms such as c and r are lost
        return term.isalpha()
    # no html or js terms
    if any(bad in term for bad in __bad_chars__):
        return False
    if term[-1] in __punct__:
        return False
    if "function(" in term:
        return False
    if "!" in term or "?" in term:
        return False
    # count everything that is not a letter (digits and symbols alike)
    non_letters = sum(1 for ch in term if ch.isdigit() or not ch.isalnum())
    # reject when 75% or more of the term is made of such characters
    return (float(non_letters) / length) < 0.75

def find_files(folder, regex, remove_empty = False):
    """
    Find all files matching the [regex] pattern in [folder]

    folder       :   string
                         folder to search (not recursive)
    regex        :   string (NOT regex object)
                         pattern searched for (re.search, case-insensitive)
                         in each entry name of [folder]
    remove_empty :   bool
                         when True, entries whose size is 0 bytes are dropped

    returns      :   sorted list of absolute paths
    """
    # Local import: this cell only imports `re`; `os` is otherwise brought in
    # by a later cell, which made the function depend on execution order.
    import os

    matches = [os.path.abspath(os.path.join(folder, f))
               for f in os.listdir(folder)
               if re.search(regex, f, re.IGNORECASE)]

    if remove_empty:
        matches = [f for f in matches if os.path.getsize(f) > 0]
    return sorted(matches)

In [45]:
from collections import defaultdict

class SynonymMapper(object):
    """
    Greedy, longest-match phrase / synonym replacer working on token lists.

    mapper          :   dict of string -> set of strings
                            phrase (its tokens joined by single spaces)
                            -> replacement terms
    nested          :   nested dict of token -> dict
                            prefix tree over the tokens of the phrases; walked
                            token by token while matching
    case_sensitive  :   bool
                            when False the incoming tokens are lower-cased
                            before lookup; the keys of [mapper] and [nested]
                            are used exactly as given
    """
    def __init__(self, mapper, nested, case_sensitive=False):
        self.case_sensitive = case_sensitive
        self.mapper = mapper
        self.nested = nested
        # Every right-hand-side term of the mapping, for is_synonym()
        self.synonyms = set()
        for rhs in self.mapper.values():
            self.synonyms.update(rhs)

    def is_synonym(self, term):
        """Return True when [term] is one of the replacement (right-hand-side) terms."""
        return term in self.synonyms

    def map_synonyms(self, tokens, debug=False):
        """
        Replace every phrase found in [tokens] by its mapped terms.

        At each position the longest run of tokens that forms a key of the
        mapper wins and is replaced by its terms in sorted order; tokens that
        start no known phrase are copied through with their original casing.

        tokens  :   list of strings
        debug   :   bool
                        print the matching steps
        returns :   list of strings
        """
        mapped = []
        size = len(tokens)
        if not self.case_sensitive:
            # A real list, not map(): the tokens are indexed below and map()
            # returns a one-shot iterator on Python 3
            tmp_tokens = [s.lower() for s in tokens]
        else:
            tmp_tokens = tokens
        ix = 0
        while ix < size:
            if debug:
                # single-argument print() behaves the same on Python 2 and 3
                print("ix %s" % ix)
            best = None
            tmp_ix = ix
            max_ix = ix
            current = ""
            d = self.nested
            # Walk the prefix tree for as long as the next token continues a phrase
            while tmp_ix < size and tmp_tokens[tmp_ix] in d:
                current += tmp_tokens[tmp_ix] + " "
                key = current.strip()
                if key in self.mapper:
                    if debug:
                        label = "best:" if best is None else "new best:"
                        print("%s %s %s %s => %s" % (ix, tmp_ix, label, key, self.mapper[key]))
                    best = self.mapper[key]
                    max_ix = tmp_ix
                d = d[tmp_tokens[tmp_ix]]
                tmp_ix += 1
            if not best:
                #retain original casing
                mapped.append(tokens[ix])
            else:
                # skip the tokens consumed by the matched phrase
                ix = max_ix
                #best is a set - sort for a deterministic order
                mapped.extend(sorted(best))
            ix += 1
        return mapped

    def __repr__(self):
        return "Synonym Mapper: %i synonyms mapped" % len(self.mapper)

def build_synonym_filter(files, case_sensitive=False):
    """
    Build a SynonymMapper from one or more phrase / synonym files.

    File format, one rule per line (blank lines and lines starting with "#"
    are ignored):
        a,b,c           every listed term maps to the set {a, b, c}
        a,b => x,y      every term on the left maps to the set {x, y}
    Terms are trimmed of surrounding whitespace, so "a, b => x" is read the
    same as "a,b=>x".

    files           :   string or list of strings
                            path(s) of the files to load
    case_sensitive  :   bool
                            forwarded to SynonymMapper. The terms themselves
                            are stored with the casing found in the file.
                            NOTE(review): with case_sensitive=False only
                            lower-case phrases can match - confirm the phrase
                            files are lower-cased.

    returns         :   SynonymMapper
    """
    # recursively define a defaultdict generator
    def dd():
        return defaultdict(dd)

    mapper = defaultdict(set)
    nested_map = defaultdict(dd)

    # Accept a single path as well as a list of paths (str, or unicode on Python 2)
    try:
        string_types = basestring
    except NameError:
        string_types = str
    if isinstance(files, string_types):
        files = [files]

    for f in files:
        # Read-only: the files are never written, "r+" needlessly required write access
        with open(f, "r") as fin:
            for line in fin:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                if "=>" in line:
                    left, right = line.split("=>")
                    right = set(part.strip() for part in right.split(","))
                    left_parts = [part.strip() for part in left.split(",")]
                else:
                    left_parts = [part.strip() for part in line.split(",")]
                    right = set(left_parts)

                for syn in left_parts:
                    mapper[syn].update(right)

                    # Register the phrase token by token in the prefix tree;
                    # indexing a defaultdict creates the missing levels
                    d = nested_map
                    for token in syn.split(" "):
                        d = d[token]
    return SynonymMapper(mapper, nested_map, case_sensitive)

In [46]:
#String processing
def white_space_tokenize(s):
    """
    Split s on single space characters.

    Consecutive spaces are not merged, so they produce empty tokens.
    """
    tokens = s.split(" ")
    return tokens

# Punctuation characters stripped from the end of a token
__punct__ = set(".?!,;:")
def remove_punct_at_end(s):
    """
    Strip trailing punctuation from s, always leaving at least one character.

    s       :   string
    returns :   s without its trailing run of __punct__ characters
    """
    end = len(s)
    # move the cut-off point left past punctuation, but never past the first character
    while end > 1 and s[end - 1] in __punct__:
        end -= 1
    return s[:end]

#Token Filters
def fact_len_filter(max_len):
    """
    Build a token filter that keeps only tokens of at least [max_len] characters.

    max_len :   int
                    despite its name this is the MINIMUM length a token needs
                    in order to be kept (name retained for existing callers)

    returns :   function(tokens) -> list of tokens
    """
    def len_filter(tokens):
        # List comprehension: always a real list, also on Python 3 where
        # filter() is lazy and would break len() on the result
        return [tok for tok in tokens if len(tok) >= max_len]
    return len_filter

# Drops zero-length tokens
remove_empty_tokens_filter = fact_len_filter(1)

def lower_case_filter(tokens):
    """
    Lower-case a single string or every token of a token list.

    tokens  :   string or iterable of strings
    returns :   lower-cased string when given a string, otherwise a list of
                lower-cased tokens
    """
    # Anything string-like (str, and unicode on Python 2) is lower-cased whole;
    # lists and other token containers have no lower() method
    if hasattr(tokens, "lower"):
        return tokens.lower()
    # A real list rather than map(), which is lazy on Python 3
    return [t.lower() for t in tokens]

# Same punctuation set that remove_punct_at_end reads (repeated definition)
__punct__ = set(".?!,;:")

def remove_punct_at_end_filter(tokens):
    """
    Apply remove_punct_at_end to every token.

    tokens  :   iterable of strings
    returns :   list of tokens without trailing punctuation
    """
    # A real list rather than map(), which is lazy on Python 3
    return [remove_punct_at_end(tok) for tok in tokens]

def fact_is_synonym_filter(syn_mapper):
    """
    Build a token filter that keeps only the tokens for which
    syn_mapper.is_synonym(token) is true.

    syn_mapper  :   object with an is_synonym(term) method
    returns     :   function(tokens) -> list of tokens
    """
    def is_synonym_filter(tokens):
        # A real list rather than filter(), which is lazy on Python 3
        return [tok for tok in tokens if syn_mapper.is_synonym(tok)]
    return is_synonym_filter

def valid_term_filter(tokens):
    """
    Keep only the tokens accepted by is_valid_term.

    tokens  :   iterable of strings
    returns :   list of tokens
    """
    # A real list rather than filter(), which is lazy on Python 3
    return [tok for tok in tokens if is_valid_term(tok)]

In [47]:
def fact_stop_word_filter(case_sensitive, stop_words_file):
    """
    Build a token filter that removes stop words.

    The stop word file holds one word per line; blank lines and lines
    starting with "#" are ignored.

    case_sensitive  :   bool
                            True  - words are kept as written and tokens are
                                    compared exactly
                            False - words are lower-cased and tokens are
                                    lower-cased for the comparison only
    stop_words_file :   string
                            path of the stop word file

    returns         :   function(tokens) -> list of tokens that are not stop words
    """
    stop_words = set()
    with open(stop_words_file) as f:
        for line in f:
            word = line.strip()
            # A blank line used to raise IndexError on word[0]; skip it instead
            if not word or word.startswith("#"):
                continue
            if not case_sensitive:
                word = word.lower()
            stop_words.add(word)

    def cs_stop_filter(tokens):
        return [tok for tok in tokens if tok not in stop_words]

    def stop_filter(tokens):
        # the surviving tokens keep their original casing
        return [tok for tok in tokens if tok.lower() not in stop_words]

    if case_sensitive:
        return cs_stop_filter
    else:
        return stop_filter

# Notebook-wide stop word filter: case-insensitive, words read from STOP_WORDS_FILE
stop_filter = fact_stop_word_filter(False, STOP_WORDS_FILE)

In [48]:
def analyze(s, filters):
    """
    Run s through a chain of processing steps.

    s       :   initial input (typically a raw sentence)
    filters :   iterable of one-argument callables, applied in order; each
                receives the previous step's result

    returns :   result of the last step (s itself when filters is empty)
    """
    result = s
    for step in filters:
        result = step(result)
    return result

def debug_analyze(s, filters):
    """
    Same as analyze, but prints the input and the result of every step.

    Each printed line is the step name left-justified to 20 characters,
    a space, and the step's result (list results are joined with "|").

    s       :   initial input (typically a raw sentence)
    filters :   iterable of one-argument callables, applied in order

    returns :   result of the last step
    """
    pad = 20
    temp = s
    # single-argument print() produces the same output on Python 2 and 3
    print("%s %s" % ("START".ljust(pad), temp))
    for f in filters:
        temp = f(temp)
        if isinstance(temp, list):
            s_temp = "|".join(map(str, temp))
        else:
            s_temp = str(temp)
        # __name__ exists on functions, bound methods and builtins on both
        # Python 2 and 3 (func_name does not); fall back to repr() for
        # callables without a name
        name = getattr(f, "__name__", None) or repr(f)
        print("%s %s" % (name.ljust(pad), s_temp))
    return temp

In [49]:
# The phrase file is always loaded; the key word file only when one is configured
synonym_files = [PHRASES_FILE] + ([KEY_WORDS_FILE] if KEY_WORDS_FILE else [])
syn_mapper = build_synonym_filter(synonym_files, False)
# last expression of the cell: shows the mapper's repr
syn_mapper

Synonym Mapper: 1194 synonyms mapped

In [50]:
#Skills from text
# NOTE(review): the heading above and the test sentence below look like
# leftovers from a different (job skills) data set - confirm.
# Filter keeping only mapped terms; built here but not part of the chain below.
is_a_synonym_filter = fact_is_synonym_filter(syn_mapper)
# Processing steps applied in order by analyze / debug_analyze: the first step
# receives the raw sentence, every later step receives the previous result.
analysis_chain = [clean_str,
                  white_space_tokenize,
                  remove_punct_at_end_filter,
                  #valid_term_filter,
                  lower_case_filter,
                  stop_filter,
                  syn_mapper.map_synonyms, 
                  remove_empty_tokens_filter]
                  #, is_a_synonym_filter] - just train the model longer

#Test
# Runs one sample sentence through the chain, printing the result of each step
rslt = debug_analyze("$150k as400 Sr.\ Java/j2ee and the C#.! developer. FIT \"HOT\" dev. -IBM's business, sql server management", 
                     analysis_chain)

START                $150k as400 Sr.\ Java/j2ee and the C#.! developer. FIT "HOT" dev. -IBM's business, sql server management
clean_str            $150k as400 Sr. Java j2ee and the C#.! developer. FIT HOT dev. IBM business sql server management
white_space_tokenize $150k|as400|Sr.|Java|j2ee|and|the|C#.!|developer.|FIT|HOT|dev.|IBM|business|sql|server|management
remove_punct_at_end_filter $150k|as400|Sr|Java|j2ee|and|the|C#|developer|FIT|HOT|dev|IBM|business|sql|server|management
lower_case_filter    $150k|as400|sr|java|j2ee|and|the|c#|developer|fit|hot|dev|ibm|business|sql|server|management
stop_filter          $150k|as400|sr|java|j2ee|c#|developer|fit|hot|dev|ibm|business|sql|server|management
map_synonyms         $150k|as400|sr|java|j2ee|c#|developer|fit|hot|dev|ibm|business|sql|server|management
len_filter           $150k|as400|sr|java|j2ee|c#|developer|fit|hot|dev|ibm|business|sql|server|management


In [51]:
import os, re, time
# Load every non-empty file matching FILE_MASK in DOCS_FOLDER and collect its
# lines as sentences, timing the whole step.
start = time.time()

sentences = []
# True => zero-byte files are skipped
files = find_files(DOCS_FOLDER, FILE_MASK, True)
print("%s files found in %s" % (len(files), DOCS_FOLDER))

# NOTE(review): documents is never filled in this cell - confirm whether a later cell uses it
documents = []
for i, fname in enumerate(files):
    with open(fname) as f:
        contents = f.read()
        # one sentence per line of the file
        sentences.extend(contents.split("\n"))
end = time.time()
print("Loading %i sentences took %s seconds" % (len(sentences), str(end - start)))

1093 files found in /Users/simon.hughes/Google Drive/PhD/Data/CoralBleaching/PhraseExtractionAnalysis/ProcessedDocs
Loading 10112 sentences took 0.185217142105 seconds


In [52]:
# Run every sentence through the analysis chain and keep the token lists that
# have at least MIN_SENT_LENGTH tokens.
print len(sentences)
tokenized = []
print("Tokenizing sentences")
for i, sent in enumerate(sentences):
    tokens = analyze(sent, analysis_chain)
    if len(tokens) >= MIN_SENT_LENGTH:
        tokenized.append(tokens)
    # progress indicator: one line per 1000 sentences
    if i % 1000 == 0:
        print(i)

10112
Tokenizing sentences
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


In [53]:
# Preview: print the first 100 tokenized sentences, tokens separated by "|"
# (multi-word tokens are phrases produced by the synonym mapper)
for t in tokenized[0:100]:
    print "|".join(t)

what leads|to|differences|in|rates|of|coral bleaching
coral|is|often mistaken|for|rock|it|is|made up|of|tiny animals called polyps
coral bleaching|shows|bleaching|healthy|bleaching event|that|makes coral vulnerable|to|disease starvation
coral bleaching|is|almost|noticeable|in|pacific ocean
part|of|coral|called|zooanthellae|are|not|getting|sunlight
if|they|get|much|sunlight|they|start|to|loses|their|color
reason why|is|because|zooanthellae|if|not|getting|up|with|carbon dioxide
coral|also|need water temperature|because|they|are|sensitive
also|its|threats|for|us|because|means|that|more|extreme storms|are|most|likely|to|occurs
also|water|us|getting|to|salty
well|based|on|what|i|read|corals|are|their|colors|coral bleaching|are|serious problem|with|serious impact|on|worlds coral reefs
this|is|serious problem|because|coral bleaching|is|most noticeable|in|pacific ocean|ocean covers about|0|0|of|entire globe|some|corals|are|sensitive|to|how|salty|water|us
massive coral bleaching event|in|0000|i

# Train Model

In [54]:
import gensim, time
from gensim.models.word2vec import Word2Vec

# Train a Word2Vec model on the tokenized sentences, save it to MODEL_FILE and
# report the elapsed time.
start = time.time()

print("Training Model. This could take a while (10-60 mins for moderate collections). Get a coffee")
# Iterations, vector size, window, minimum word count and worker count come from
# the settings cell; sample, hs and negative are fixed here.
# NOTE(review): the keyword names (iter, size) depend on the installed gensim
# version - confirm before upgrading gensim. No seed is passed, so results are
# presumably not reproducible run to run - TODO confirm.
model = Word2Vec(tokenized, iter=TRAINING_ITERATIONS, size=VECTOR_SIZE, window=WINDOW_SIZE, min_count=MIN_WD_COUNT, workers=WORKERS, sample=1e-5, hs=0, negative=20)
model.save(MODEL_FILE)
end = time.time()
print "Took %s seconds" % (end - start)

Training Model. This could take a while (10-60 mins for moderate collections). Get a coffee
Took 1335.32123804 seconds


In [56]:
# For each probe term, list up to 20 of its 100 most similar model terms,
# keeping only those that syn_mapper recognises as mapped terms.
# The probe terms are given as one comma separated string; each is trimmed.
tokens = map(lambda s: s.strip(), "coral, bleaching, coral bleaching, white, trade winds, salinity, death, dying, fresh water, storms, rainfall, water temperatures, temps".split(","))
for token in tokens:
    print token
    # NOTE(review): model.vocab depends on the installed gensim version - confirm
    if token not in model.vocab:
        print "Token not in vocab\n"
        continue
    # number of neighbours printed so far for this probe term
    freq = 0
    for pair in model.most_similar(positive=[token], topn=100):
        # pair is (term, similarity); only mapped terms are shown
        if syn_mapper.is_synonym(pair[0]):
            print "\t", pair
            freq += 1
            if freq >= 20:
                break
    print ""

coral
	('algae', 0.8424152135848999)
	('corals', 0.8412076234817505)
	('off', 0.788943886756897)
	('finally', 0.7392733097076416)
	('zooxanthellae', 0.7365037798881531)
	('so', 0.7308151721954346)
	('coral gets', 0.7219845056533813)
	('being', 0.7218189835548401)
	('out', 0.7132343649864197)
	('food', 0.7000315189361572)
	('relies', 0.6980425715446472)
	('nutrient rich environment', 0.6968252658843994)
	('survival', 0.6960312128067017)
	('stay healthy', 0.6959972381591797)
	('algae provides', 0.6952272653579712)
	('like', 0.6939720511436462)
	('get', 0.6934082508087158)
	('reason why', 0.6928682327270508)
	('coral so', 0.6912604570388794)
	('any', 0.6900298595428467)

bleaching
	('conclusion coral', 0.8251010179519653)
	('coral bleaching occurs', 0.7763916254043579)
	('likely', 0.767128050327301)
	('big problem', 0.7635579109191895)
	('result', 0.7597436904907227)
	('takes place', 0.7582629919052124)
	('alot', 0.7567083239555359)
	('makes coral', 0.7540161609649658)
	('event', 0.753566