### Load text from example corpus.

In [1]:
from infgain.corpora import load_example_corpus

# Load the example corpus shipped with infgain and preview its first 500
# characters (the recorded output shows it is one long string).
raw_data = load_example_corpus()
raw_data[:500]

'BACKGROUND: Rivaroxaban is currently used to prevent stroke in patients with atrial fibrillation. Measuring coagulation function may help clinicians to understand the effects of this drug and the associated risk of bleeding. METHODS AND RESULTS: Rivaroxaban was given to 136 patients with non-valvular atrial fibrillation. Mean age was 74.5+/-9.0 years (men: 63.2%) and mean CHADS2 score (+/-SD) was 1.8+/-1.2. Prothrombin times (PTs) and plasma soluble fibrin (SF) levels were examined in 84 out of '

### Tokenize into sentences.

In [5]:
from infgain.preprocessing import sent_tokenize

# Split the raw corpus text into sentences and show the first two.
sentences = sent_tokenize(raw_data)
sentences[:2]

['BACKGROUND: Rivaroxaban is currently used to prevent stroke in patients with atrial fibrillation.',
 'Measuring coagulation function may help clinicians to understand the effects of this drug and the associated risk of bleeding.']

### Test the biophrases package.

In [6]:
import biophrases

In [7]:
# Score phrases with TF-IDF over the raw sentence strings.
# NOTE(review): the second argument (3) looks like the n-gram length -- every
# phrase in the recorded output has three words -- confirm against biophrases.
biophrases.tfidf.tf_idf(sentences, 3)

[('polymerase chain reaction', 11.362448566739916),
 ('95 confidence interval', 7.922701379195197),
 ('play important role', 5.2094924286619744),
 ('plays important role', 4.9733519828839485),
 ('reactive oxygen species', 4.8987498711889721),
 ('squamous cell carcinoma', 4.6722764844803182),
 ('chain reaction pcr', 4.5712127978658748),
 ('tumor necrosis factor', 3.7453355208694652),
 ('enzyme linked immunosorbent', 3.6802877553304598),
 ('linked immunosorbent assay', 3.5508129375336779),
 ('growth factor receptor', 3.4363087722726569),
 ('epstein barr virus', 3.4084036980924823),
 ('nitric oxide synthase', 3.2601624601371797),
 ('magnetic resonance imaging', 3.2245201280067755),
 ('breast cancer cells', 3.2004304253117635),
 ('oxygen species ros', 3.1848754015372895),
 ('green fluorescent protein', 3.0815272367882782),
 ('associated increased risk', 3.0527133910069222),
 ('aim study investigate', 3.0508569871681916),
 ('human immunodeficiency virus', 3.0195425927917272),
 ('truncated 2

In [8]:
# Graph-based bigram scoring on the raw sentence strings.
# NOTE(review): the recorded output of this call is an empty list, while the
# same function called on `sentences_tokenized` returns scored bigrams -- it
# presumably expects token lists rather than strings; confirm.
biophrases.graphs.bigrams_from_all(sentences, 'pagerank')

[]

In [15]:
# Same scoring method via `bigrams_from_best`, on the first 100 sentences only.
# NOTE(review): the recorded result is the single entry
# (('inderr', 'inderr'), 0), which looks like a placeholder rather than a real
# bigram; the meaning of `t=3` is not visible here -- confirm both against
# biophrases.
biophrases.graphs.bigrams_from_best(sentences[:100], 'pagerank', t=3)

[(('inderr', 'inderr'), 0)]

In [31]:
# `sentences_tokenized` must exist before this call, but the word-tokenization
# cell sits further down the notebook. Build the token lists here so the cell
# also runs on a fresh kernel (Restart & Run All) instead of raising NameError.
from infgain.preprocessing import word_tokenize

sentences_tokenized = [word_tokenize(sentence) for sentence in sentences]

# Score bigrams with the 'pagerank' method over the tokenized sentences.
biophrases.graphs.bigrams_from_all(sentences_tokenized, 'pagerank')

[(('NUMBER', 'cells'), 0.029928714815935654),
 (('patients', 'NUMBER'), 0.029824204927204428),
 (('NUMBER', 'cell'), 0.029745080292629726),
 (('NUMBER', 'protein'), 0.028919252493583326),
 (('NUMBER', 'study'), 0.028816424213198605),
 (('NUMBER', 'expression'), 0.028812454762573478),
 (('results', 'NUMBER'), 0.02871279654376853),
 (('NUMBER', 'using'), 0.028677287880700193),
 (('NUMBER', 'two'), 0.028504499719120627),
 (('may', 'NUMBER'), 0.028473647739997474),
 (('NUMBER', 'activity'), 0.028446894424177405),
 (('NUMBER', 'gene'), 0.028409954670881428),
 (('associated', 'NUMBER'), 0.028339216506612808),
 (('NUMBER', 'also'), 0.028324795223895666),
 (('NUMBER', 'induced'), 0.02825179510871247),
 (('NUMBER', 'treatment'), 0.028210546906832255),
 (('NUMBER', 'increased'), 0.02818848339171159),
 (('NUMBER', 'data'), 0.028167820018612583),
 (('NUMBER', 'acid'), 0.028155777506314084),
 (('NUMBER', 'human'), 0.02814604638661225),
 (('used', 'NUMBER'), 0.028144798849847254),
 (('NUMBER', 'type

(('inderr', 'inderr'), 0)

### Tokenize every sentence into words.

In [25]:
from infgain.preprocessing import word_tokenize

# Apply word_tokenize to each sentence, keeping the original sentence order.
sentences_tokenized = [word_tokenize(sent) for sent in sentences]

### Make ngrams out of every sentence.

In [4]:
from infgain.preprocessing import make_ngrams

# Build the ngrams of every tokenized sentence. The preview further down shows
# 1-, 2- and 3-grams, so the 3 appears to be the maximum ngram length.
ngrams = [make_ngrams(tokens, 3) for tokens in sentences_tokenized]

### Remove ngrams containing stopwords.

In [5]:
from infgain.preprocessing import remove_bad_ngrams

# Run remove_bad_ngrams over each sentence's ngram list.
# Note: this rebinds `ngrams`, so the unfiltered lists are no longer available.
ngrams = [remove_bad_ngrams(sent_ngrams) for sent_ngrams in ngrams]

In [6]:
# Preview the filtered ngrams of the first two sentences.
ngrams[:2]

[[('background',),
  ('rivaroxaban',),
  ('currently',),
  ('used',),
  ('prevent',),
  ('stroke',),
  ('patients',),
  ('atrial',),
  ('fibrillation',),
  ('background', 'rivaroxaban'),
  ('currently', 'used'),
  ('prevent', 'stroke'),
  ('atrial', 'fibrillation')],
 [('measuring',),
  ('coagulation',),
  ('function',),
  ('may',),
  ('help',),
  ('clinicians',),
  ('understand',),
  ('effects',),
  ('drug',),
  ('associated',),
  ('risk',),
  ('bleeding',),
  ('measuring', 'coagulation'),
  ('coagulation', 'function'),
  ('function', 'may'),
  ('may', 'help'),
  ('help', 'clinicians'),
  ('associated', 'risk'),
  ('measuring', 'coagulation', 'function'),
  ('coagulation', 'function', 'may'),
  ('function', 'may', 'help'),
  ('may', 'help', 'clinicians')]]

### Make dictionary out of ngrams.

In [7]:
from infgain.preprocessing import make_dict

# make_dict returns a pair: `dictionary` and `reversed_dictionary`. Later cells
# index `reversed_dictionary` by integer id and get token strings back.
dictionary, reversed_dictionary = make_dict(ngrams)

### Replace OOV-words with UNK.

In [8]:
from infgain.preprocessing import replace_unk_list

# Pass every tokenized sentence through replace_unk_list with the dictionary,
# then discard results that came back empty.
sentences_unked = [replace_unk_list(tokens, dictionary) for tokens in sentences_tokenized]
sentences_unked = [unked for unked in sentences_unked if len(unked) > 0]

### Make dictionaries of tuples -> token_id.

In [9]:
from infgain.preprocessing import make_tupled_dicts

# Build the two tuple-keyed lookup tables consumed by ngram_id / tilda_id below.
dict_ngram_id_by_tuple, dict_tilda_id_by_tuple = make_tupled_dicts(dictionary)

In [10]:
from infgain.preprocessing import ngram_id, tilda_id

# TODO: consider functools.partial for these single-argument wrappers. Note
# that partial would capture the dictionary object when it is created, whereas
# a def looks up the module-level name on every call.
def ngram_id_wrap(x):
    """Call ``ngram_id`` on ``x`` with the module-level ``dict_ngram_id_by_tuple``."""
    return ngram_id(x, dict_ngram_id_by_tuple)

def tilda_id_wrap(x):
    """Call ``tilda_id`` on ``x`` with the module-level ``dict_tilda_id_by_tuple``."""
    return tilda_id(x, dict_tilda_id_by_tuple)

# Sanity check: look up both ids for the id tuple (3, 4).
ngram_id((3,4), dict_ngram_id_by_tuple), tilda_id((3,4), dict_tilda_id_by_tuple)

(10, 21033)

In [11]:
# Decode ids back to tokens to check the previous lookup.
# NOTE(review): the recorded tilda id for (3, 4) is 21033, but this cell
# inspects the hard-coded id 20510 -- the outputs appear to come from different
# runs; re-run and update the id (or reuse the values returned above).
reversed_dictionary[20510], reversed_dictionary[10], reversed_dictionary[3], reversed_dictionary[4]

('high~throughput', 'currently_used', 'currently', 'used')

### Train a word2gauss Gaussian embedding model.

In [22]:
from word2gauss import GaussianEmbedding, iter_pairs
from word2gauss.words import Vocabulary

# Wrap the ngram dictionary in a word2gauss Vocabulary.
vocab_gauss = Vocabulary(dictionary)

# The embedding is sized by the number of dictionary entries.
# NOTE(review): 100 is presumably the embedding dimensionality -- confirm
# against the word2gauss GaussianEmbedding signature.
embed = GaussianEmbedding(len(dictionary),
                          100,
                          covariance_type='diagonal',
                          energy_type='KL')

In [23]:
# Keep only the sentences that split into more than one whitespace-separated token.
sentences_unked = [sent for sent in sentences_unked if len(sent.split()) > 1]

In [24]:
%%time
# Train on the first 10000 UNK-replaced sentences only; the recorded run took
# about 2.5 minutes of wall time.
# NOTE(review): no random seed is set anywhere in view, so the learned
# embedding (and the nearest-neighbour lists below) may differ between runs --
# confirm whether word2gauss exposes a seed.
embed.train(
    iter_pairs(sentences_unked[:10000], vocab_gauss, ngram_id_wrap,
               tilda_id_wrap, batch_size=10, nsamples=2,
               window=5, n=3),
    n_workers=8)

CPU times: user 2min 34s, sys: 19.3 s, total: 2min 53s
Wall time: 2min 37s


In [25]:
def nearest(token, num=10):
    """Return the ``"word"`` field of each neighbour the embedding reports for ``token``.

    ``token`` is wrapped in a one-element tuple and passed as a single-item
    query list to ``embed.nearest_neighbors``; ``num`` is forwarded unchanged.
    """
    neighbours = embed.nearest_neighbors([(token,)], vocab=vocab_gauss, num=num)
    words = []
    for entry in neighbours:
        words.append(entry["word"])
    return words

In [26]:
# Inspect the neighbours of one bigram token; in the recorded output the first
# entry returned is the query token itself.
nearest('amino_acid')

['amino_acid',
 'blocker',
 'subjugated',
 'NUMBER~heme~oxygenase',
 'pcr_positive',
 'arginine~l~nitro',
 'agarose_gel',
 'anti_egfr',
 'food~intake',
 'short~time']

### Score ngrams with KL-divergence.

In [27]:
from infgain.metrics import score_ngrams

# Score the dictionary's ngrams under all four flag combinations.
# NOTE(review): judging only by the variable names, the third argument selects
# closed-form (0) vs variational (1) KL and the fourth selects the direction
# (0: ngram -> token, 1: token -> ngram) -- confirm against
# infgain.metrics.score_ngrams and consider named constants.
closed_kl_ngram_token = score_ngrams(dictionary, embed, 0, 0)
closed_kl_token_ngram = score_ngrams(dictionary, embed, 0, 1)

variational_kl_ngram_token = score_ngrams(dictionary, embed, 1, 0)
variational_kl_token_ngram = score_ngrams(dictionary, embed, 1, 1)

In [29]:
import pandas as pd

In [35]:
def firstonly(tuples):
    """Return a list holding element 0 of every item in ``tuples``, in order."""
    return [item[0] for item in tuples]

In [36]:
topn = 20
# One column per scoring variant: the first element of each of the top `topn`
# entries of that ranking.
data = {
    label: firstonly(ranking[:topn])
    for label, ranking in (
        ('Closed KL(ngram, tilda)', closed_kl_ngram_token),
        ('Closed KL(tilda, ngram)', closed_kl_token_ngram),
        ('Variational KL(ngram, tilda)', variational_kl_ngram_token),
        ('Variational KL(tilda, ngram)', variational_kl_token_ngram),
    )
}

In [37]:
# Show the four rankings side by side, one column per scoring variant.
pd.DataFrame(data)

Unnamed: 0,"Closed KL(ngram, tilda)","Closed KL(tilda, ngram)","Variational KL(ngram, tilda)","Variational KL(tilda, ngram)"
0,adp_transport,findings_demonstrate,substantia_nigra,substantia_nigra
1,dax_NUMBER,review_describes,ssa_ps,s1_nuclease
2,NUMBER_aryl,cephalexin_r,bipolar_mood,tick_borne
3,ves_v_NUMBER,NUMBER_aryl,downy_mildew,dc_dg
4,review_describes,allergic_patients,eu3_nps,postural_taping
5,reverse_transcribed,psur_reporting,ethylene_glycol,antiphospholipid_antibodies
6,rapid_increase,also_affected,dinor_pgs,ns5a_ns5a
7,alpha_epsilon_lys,rapid_increase,definitive_hosts,ssa_ps
8,mg_bid,rac_alpha_tocopheryl,pamcl_atropine,fragrant_pear
9,selenium_selenium,dax_NUMBER,mirna_signatures,hud_expressing
