In [1]:
#params
# NOTE(review): ROOT_FOLDER is an absolute, machine-specific path; point it at
# your own copy of the data before running.
ROOT_FOLDER = "/Users/simon.hughes/Google Drive/PhD/Data/CoralBleaching/PhraseExtractionAnalysis"

DOCS_FOLDER = "%s/ProcessedDocs" % ROOT_FOLDER
# Raw string: "\." is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape warning on modern Python. The value is unchanged.
# NOTE(review): the pattern is not anchored with "$", so a name such as
# "notes.txt.bak" also matches when used with re.search - confirm that is intended.
FILE_MASK = r".*\.txt"

MIN_DOC_FREQ = 10
MAX_PHRASE_LEN = 10
STOP_WORDS_FILE = "%s/en_stop_words.txt" % ROOT_FOLDER
PHRASES_FILE    = "%s/Phrases.txt" % ROOT_FOLDER
PHRASE_FREQ_FILE = "%s/phrase_freq.txt" % ROOT_FOLDER

In [4]:
def find_files(folder, regex, remove_empty = False):
    """
    List the entries of [folder] whose names match [regex], as sorted absolute paths

    folder          :   string
                            folder to search (not recursive)
    regex           :   string (NOT regex object)
                            pattern applied to each entry name with re.search,
                            ignoring case
    remove_empty    :   bool
                            when True, matches whose size is 0 bytes are dropped

    returns         :   list of strings
                            absolute paths, in ascending order
    """
    selected = []
    for entry_name in os.listdir(folder):
        # Only the bare entry name is tested against the pattern, not the path.
        if re.search(regex, entry_name, re.IGNORECASE) is None:
            continue
        full_path = os.path.abspath(os.path.join(folder, entry_name))
        if remove_empty and not os.path.getsize(full_path) > 0:
            continue
        selected.append(full_path)
    return sorted(selected)

In [5]:
import os, re, time
# Time the whole load: file discovery plus reading every document.
start = time.time()

# The third argument is remove_empty=True, so zero-byte files are skipped.
files = find_files(DOCS_FOLDER, FILE_MASK, True)
print("%s files found in %s" % (len(files), DOCS_FOLDER))
# One entry per file; each entry is that file's text split on newlines.
documents = []
# NOTE(review): the enumerate index `i` is not used inside the loop.
for i, fname in enumerate(files):
    with open(fname) as f:
        contents = f.read()
        # NOTE(review): a file that ends with "\n" yields a trailing "" element
        # here - confirm that empty sentences are acceptable downstream.
        documents.append(contents.split("\n"))
end = time.time()
print("Loading %i documents took %s seconds" % (len(files), str(end - start)))

1093 files found in /Users/simon.hughes/Google Drive/PhD/Data/CoralBleaching/PhraseExtractionAnalysis/ProcessedDocs
Loading 1093 documents took 0.548434972763 seconds


### Flatten Documents Into a List of Sentences

In [12]:
from IterableFP import flatten
# Merge the per-document line lists into one list (flatten comes from the
# project-local IterableFP module).
flattened = flatten(documents)
# Sanity check: document count, total line count across documents, flattened length.
len(documents), sum(map(len,documents)), len(flattened)

(1093, 10112, 10112)

### Write into a Single File to Train the Model

In [13]:
# Write every sentence to a single text file, one per line; this file is the
# training input for the embedding model.
merged_file = "%s/merged_documents.txt" % ROOT_FOLDER
# Mode "w" rather than "w+": the handle is only written to, never read back.
with open(merged_file, "w") as f:
    for sent in flattened:
        f.write("{sent}\n".format(sent=sent))

In [14]:
import fasttext

# Skipgram model
# Trains on merged_file; 'model' is presumably the output name/prefix - TODO
# confirm against the installed fasttext version.
# NOTE(review): no seed is set in this cell, so the vectors and the neighbour
# lists may differ between runs - confirm.
model = fasttext.skipgram(merged_file, 'model')

In [19]:
# List the model's public attributes (names that do not start with "_").
[att for att in dir(model) if att[0] != "_"]

['bucket',
 'cosine_similarity',
 'dim',
 'encoding',
 'epoch',
 'loss_name',
 'lr_update_rate',
 'maxn',
 'min_count',
 'minn',
 'model_name',
 'neg',
 't',
 'word_ngrams',
 'words',
 'ws']

In [166]:
# Peek at the first ten vocabulary entries, converted to str.
list(map(str,model.words))[0:10]

['secondly',
 'all',
 'surrounded',
 'customary',
 'skeleton',
 'chain',
 'zooxantheallae',
 'caused',
 'lack',
 'results']

In [165]:
# Length of the vector for a vocabulary word. The previous key carried a stray
# "u'" prefix (pasted from a unicode repr), so it looked up a token that is not
# the intended word.
len(model["zooxantheallae"])

100

In [119]:
# Sorted vocabulary, and the model vector for each word in that same order.
words = sorted(map(str,model.words))
vectors = [ model[wd] for wd in words]
# Show the last word and the length of its vector.
words[-1], len(vectors[-1])

('zooxanthelle', 100)

In [120]:
import numpy as np

def to_unit(vec):
    """
    Scale [vec] to unit (L2) length

    vec     :   array-like of numbers
                    vector to normalise

    returns :   numpy array
                    vec divided by its norm; a zero-norm input is returned
                    as zeros rather than NaNs
    """
    a = np.asarray(vec)
    norm = np.linalg.norm(a)
    # A zero vector has no direction. Dividing by its zero norm would fill the
    # result with NaNs, and NaN similarities sort to the end of argsort output,
    # so they would rank first once the order is reversed. Divide by 1 instead
    # so the result stays all zeros.
    if norm == 0:
        norm = 1.0
    return a / norm

# Normalise every word vector, then stack them into a single array.
unit_vects = [to_unit(v) for v in vectors]
npvects = np.asarray(unit_vects)
# Display the array's dimensions.
npvects.shape

(1099, 100)

In [146]:
# Dot product of every normalised row with every other row, i.e. the cosine
# similarity between each pair of words.
sim_mat = np.dot(npvects, npvects.T)
# For each row, the column indices ordered by ascending similarity.
sort = np.argsort(sim_mat, 1)
# Array version of the word list so it can be indexed with an index array.
npwords = np.asarray(words)

def sim_oov(word, n):
    """
    Rank the vocabulary by cosine similarity to a word that is not in [words]

    word    :   string
                    out-of-vocabulary word; its vector is fetched with model[word]
    n       :   int
                    number of neighbours to return

    returns :   list of (word, similarity) tuples, most similar first
    """
    print("OOV: " + word)
    vect = to_unit(np.asarray(model[word]))
    # One matrix-vector product gives the dot product of the query with every
    # row of npvects, replacing the per-row Python loop.
    sims = np.dot(npvects, vect)
    # Named `order` so the module-level `sort` matrix is not shadowed.
    order = np.argsort(sims)
    best_ixs = order[::-1][:n]
    # list(...) so the result is concrete on Python 3, where zip() is lazy.
    return list(zip(list(npwords[best_ixs]), sims[best_ixs]))

def sim(word, n):
    """
    Return the [n] vocabulary words most similar to [word]

    word    :   string
                    query word; if it is not in [words] the lookup is
                    delegated to sim_oov
    n       :   int
                    number of neighbours to return

    returns :   list of (word, similarity) tuples, most similar first,
                never including the query word itself
    """
    try:
        ix = words.index(word)
    except ValueError:
        return sim_oov(word, n)

    # Row ix of `sort` is ascending, so reverse it to get most similar first.
    ranked = sort[ix][::-1]
    # Drop the query's own index explicitly instead of assuming it sits at
    # rank 0; a tie with an identical vector could otherwise put another word
    # first and leave the query word in its own neighbour list.
    best_ixs = ranked[ranked != ix][:n]
    # list(...) so the result is concrete on Python 3, where zip() is lazy.
    return list(zip(list(npwords[best_ixs]), sim_mat[ix, best_ixs]))

In [122]:
# Spot-check a slice of the sorted vocabulary.
words[100:110]

['begins',
 'being',
 'believe',
 'below',
 'benefit',
 'benefits',
 'besides',
 'best',
 'better',
 'between']

In [160]:
# Fifteen nearest neighbours of a word that is in the vocabulary.
sim("carbon", 15)

[('dioxide', 0.98619950633326869),
 ('h2o', 0.96418850774736065),
 ('h20', 0.93600624330409432),
 ('combine', 0.92501390725069399),
 ('sunlight', 0.92361746602342687),
 ('o2', 0.91614789155348852),
 ('co2', 0.91537846575742943),
 ('uses', 0.89975905284668867),
 ('perform', 0.89127001041832687),
 ('sun', 0.88131955612913482),
 ('oxygen', 0.87667874229120557),
 ('energy', 0.87654803077455057),
 ('undergo', 0.87645247346046096),
 ('forms', 0.87040642972755777),
 ('glucose', 0.85853073001284541)]

In [161]:
# Upper-case "CO2" is not in `words` (the vocabulary holds lower-case "co2"),
# so this call goes through sim_oov.
sim("CO2", 15)

OOV: CO2


[('carbon', 0.13456267074009276),
 ('dioxide', 0.1331747726512979),
 ('combine', 0.11780791362204424),
 ('region', 0.1159234558747855),
 ('increases', 0.10892794568463748),
 ('co2', 0.10845458176318787),
 ('normal', 0.10821249078154474),
 ('uses', 0.10746493251727182),
 ('combination', 0.10497513351146462),
 ('decreases', 0.10124439334279565),
 ('h2o', 0.10029915998011496),
 ('sugars', 0.10004220878036442),
 ('information', 0.099676239341521686),
 ('formation', 0.099559469696993405),
 ('f', 0.095265474336958228)]

In [162]:
# Fifteen nearest neighbours of "reef".
sim("reef", 15)

[('building', 0.8777039612138563),
 ('reefs', 0.83572360524295131),
 ('85o', 0.81726598841211051),
 ('sensitive', 0.81512361039371029),
 ('receive', 0.81279670545082605),
 ('85f', 0.80730541539214118),
 ('85of', 0.80297436802084809),
 ('00', 0.80037764887466178),
 ('need', 0.79395965516832434),
 ('reefbase', 0.78395979147477257),
 ('degrees', 0.7772631882226595),
 ('process', 0.76893028551598241),
 ('degree', 0.7669855115819828),
 ('recieve', 0.76680169161120881),
 ('work', 0.75328544035764622)]

In [170]:
# Fifteen nearest neighbours of "white".
sim("white", 15)

[('turns', 0.991513863250301),
 ('turn', 0.96825131707259848),
 ('turning', 0.91270043611615925),
 ('leaves', 0.89751763502791682),
 ('bleach', 0.89398285270311129),
 ('due', 0.8939353581564583),
 ('bleached', 0.8919246513078517),
 ('begin', 0.8914034486879544),
 ('dies', 0.88919053746041932),
 ('die', 0.88137499947020959),
 ('leave', 0.88048577449405563),
 ('turned', 0.87790464345089358),
 ('death', 0.87499876747401273),
 ('loose', 0.87462611749150787),
 ('dead', 0.87156491447809825)]