In [1]:
from infgain.corpora import load_example_corpus

### Load text from example corpus.

In [2]:
# Load the example corpus through infgain's loader; the preview cell below
# shows that the result is one raw text string.
raw_data = load_example_corpus()

In [3]:
# Preview only the first 500 characters instead of dumping the whole corpus.
raw_data[:500]

'BACKGROUND: Rivaroxaban is currently used to prevent stroke in patients with atrial fibrillation. Measuring coagulation function may help clinicians to understand the effects of this drug and the associated risk of bleeding. METHODS AND RESULTS: Rivaroxaban was given to 136 patients with non-valvular atrial fibrillation. Mean age was 74.5+/-9.0 years (men: 63.2%) and mean CHADS2 score (+/-SD) was 1.8+/-1.2. Prothrombin times (PTs) and plasma soluble fibrin (SF) levels were examined in 84 out of '

In [4]:
from infgain.preprocessing import sent_tokenize

### Tokenize into sentences.

In [5]:
# Split the raw corpus text into sentences with infgain's sentence tokenizer.
sentences = sent_tokenize(raw_data)

In [6]:
from infgain.preprocessing import word_tokenize

### Tokenize every sentence into words.

In [7]:
# One list of word tokens per sentence, in the original sentence order.
sentences_tokenized = [word_tokenize(sentence) for sentence in sentences]

In [8]:
from infgain.preprocessing import make_ngrams

### Make n-grams (up to length 3) out of every sentence.

In [9]:
# Build the n-grams of every tokenized sentence; 3 is the size argument
# passed to make_ngrams.
ngrams = [make_ngrams(tokens, 3) for tokens in sentences_tokenized]

In [10]:
from infgain.preprocessing import remove_bad_ngrams

### Remove ngrams containing stopwords.

In [11]:
# Filter each sentence's n-grams through remove_bad_ngrams.
# NOTE: this rebinds `ngrams`, so re-running the cell filters the
# already-filtered lists again.
ngrams = [remove_bad_ngrams(sentence_ngrams) for sentence_ngrams in ngrams]

In [12]:
# Inspect the filtered n-grams of the first two sentences; each n-gram is a
# tuple of tokens.
ngrams[:2]

[[('background',),
  ('rivaroxaban',),
  ('currently',),
  ('used',),
  ('prevent',),
  ('stroke',),
  ('patients',),
  ('atrial',),
  ('fibrillation',),
  ('background', 'rivaroxaban'),
  ('currently', 'used'),
  ('prevent', 'stroke'),
  ('atrial', 'fibrillation')],
 [('measuring',),
  ('coagulation',),
  ('function',),
  ('may',),
  ('help',),
  ('clinicians',),
  ('understand',),
  ('effects',),
  ('drug',),
  ('associated',),
  ('risk',),
  ('bleeding',),
  ('measuring', 'coagulation'),
  ('coagulation', 'function'),
  ('function', 'may'),
  ('may', 'help'),
  ('help', 'clinicians'),
  ('associated', 'risk'),
  ('measuring', 'coagulation', 'function'),
  ('coagulation', 'function', 'may'),
  ('function', 'may', 'help'),
  ('may', 'help', 'clinicians')]]

In [13]:
from infgain.preprocessing import make_dict

### Make dictionary out of ngrams.

In [14]:
# make_dict returns two objects. `reversed_dictionary` is indexed by integer id
# further down and yields token strings; `dictionary` is presumably the inverse
# mapping (token -> id) -- TODO confirm against make_dict.
dictionary, reversed_dictionary = make_dict(ngrams)

### Replace out-of-vocabulary (OOV) words with UNK.

In [50]:
from infgain.preprocessing import replace_unk_list

# Run every tokenized sentence through replace_unk_list against the dictionary...
sentences_unked = [replace_unk_list(tokens, dictionary) for tokens in sentences_tokenized]
# ...then discard sentences that came back empty.
sentences_unked = [sentence for sentence in sentences_unked if len(sentence) > 0]

In [51]:
from infgain.preprocessing import make_tupled_dicts

# Build two lookup tables from the dictionary. The lookups below query both
# with a tuple of ids: one resolves to the '_'-joined n-gram entry, the other
# to the '~'-joined ("tilda") entry.
dict_ngram_id_by_tuple, dict_tilda_id_by_tuple = make_tupled_dicts(dictionary)

In [52]:
from infgain.preprocessing import ngram_id, tilda_id

# Look up the id pair (3, 4) in both tables.
# NOTE(review): the second lookup calls ngram_id on the tilda table, while the
# imported tilda_id is unused in this cell and tilda_id_wrap further down uses
# tilda_id -- confirm whether tilda_id was intended here.
ngram_id((3,4), dict_ngram_id_by_tuple), ngram_id((3,4), dict_tilda_id_by_tuple)

(10, 16568)

In [53]:
# Map ids back to tokens as a sanity check: 16568 and 10 are the ids returned
# for the pair (3, 4) in the previous cell; 3 and 4 are its component ids.
reversed_dictionary[16568], reversed_dictionary[10], reversed_dictionary[3], reversed_dictionary[4]

('currently~used', 'currently_used', 'currently', 'used')

# Train a word2gauss model

In [54]:
from word2gauss import GaussianEmbedding, iter_pairs
from word2gauss.words import Vocabulary

# Wrap the dictionary in a word2gauss Vocabulary; it is passed to iter_pairs
# and to nearest_neighbors further down.
vocab_gauss = Vocabulary(dictionary)
# One embedding per dictionary entry. The literal 100 is presumably the
# embedding dimensionality -- TODO confirm against the GaussianEmbedding signature.
embed = GaussianEmbedding(len(dictionary), 100,
                          covariance_type='diagonal',
                          energy_type='KL')

In [55]:
# TODO: Replace these wrappers with a cleaner way of binding the lookup dictionaries (e.g. functools.partial or a closure).

def ngram_id_wrap(x):
    """Single-argument adapter around ``ngram_id``.

    Forwards ``x`` together with the notebook-level
    ``dict_ngram_id_by_tuple`` table and returns whatever ``ngram_id`` returns.
    """
    lookup_table = dict_ngram_id_by_tuple
    return ngram_id(x, lookup_table)

def tilda_id_wrap(x):
    """Single-argument adapter around ``tilda_id``.

    Forwards ``x`` together with the notebook-level
    ``dict_tilda_id_by_tuple`` table and returns whatever ``tilda_id`` returns.
    """
    lookup_table = dict_tilda_id_by_tuple
    return tilda_id(x, lookup_table)

In [56]:
%%time
# Build the training-pair iterator over the first 1000 UNK-replaced sentences,
# then hand it to the embedding trainer.
training_pairs = iter_pairs(
    sentences_unked[:1000],
    vocab_gauss,
    ngram_id_wrap,
    tilda_id_wrap,
    batch_size=10,
    nsamples=2,
    window=5,
    n=3
)
embed.train(training_pairs, n_workers=8)

CPU times: user 16.3 s, sys: 2.4 s, total: 18.8 s
Wall time: 17.8 s


In [58]:
def nearest(token, num=10):
    """Return the words nearest to ``token`` in the trained embedding.

    Queries ``embed.nearest_neighbors`` with the one-element tuple ``(token,)``
    and keeps only the ``"word"`` field of each result.

    Args:
        token: Dictionary token to query (e.g. ``'amino_acid'``).
        num: Number of neighbours requested from ``nearest_neighbors``.

    Returns:
        List of neighbour words in the order returned by the model.
    """
    neighbours = embed.nearest_neighbors([(token,)], vocab=vocab_gauss, num=num)
    words = []
    for neighbour in neighbours:
        words.append(neighbour["word"])
    return words

In [59]:
# Neighbours of a three-word ('_'-joined) token; the query token itself comes
# back as the first result.
nearest('normal_breast_tissue')

['normal_breast_tissue',
 'hypotension',
 'propafenone',
 'egfr~therapy',
 'c_virus_hcv',
 'analysis~sensitivity',
 'cystic~fibrosis~transmembrane',
 'analysis~regression',
 'cases_NUMBER',
 'interfering_rna']

In [61]:
# Neighbours of a two-word ('_'-joined) token.
nearest('amino_acid')

['amino_acid',
 'subjugated',
 'state~steady',
 'associated~significantly',
 'pcr_positive',
 'cells~dendritic',
 'characterize',
 'genes~pmqr',
 'agarose_gel',
 'anti_egfr']

In [62]:
# Neighbours of a single-word token.
nearest('wistar')

['wistar',
 'circular',
 'cell_based',
 'glucose_uptake',
 'phase',
 'g~phu',
 'procedures~surgical',
 'drug~resistance',
 'activation~platelet',
 'ampa~receptors']