DataReader <- Tokenizer

Model <- TokenEmbedder

text -> sentences -> tokenized sentences -> dictionary -> UNK-replaced sentences -> word2gauss embedding -> scored n-grams

### Load text from example corpus.

In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
from infgain.corpora import load_example_corpus

# Load the example corpus shipped with `infgain`; the slice previews its first 500 characters.
raw_data = load_example_corpus()
raw_data[:500]

'BACKGROUND: Rivaroxaban is currently used to prevent stroke in patients with atrial fibrillation. Measuring coagulation function may help clinicians to understand the effects of this drug and the associated risk of bleeding. METHODS AND RESULTS: Rivaroxaban was given to 136 patients with non-valvular atrial fibrillation. Mean age was 74.5+/-9.0 years (men: 63.2%) and mean CHADS2 score (+/-SD) was 1.8+/-1.2. Prothrombin times (PTs) and plasma soluble fibrin (SF) levels were examined in 84 out of '

### Tokenize into sentences.

In [5]:
from infgain.preprocessing import sent_tokenize

# Split the raw corpus text into sentences and preview the first two.
sentences = sent_tokenize(raw_data)
sentences[:2]

['BACKGROUND: Rivaroxaban is currently used to prevent stroke in patients with atrial fibrillation.',
 'Measuring coagulation function may help clinicians to understand the effects of this drug and the associated risk of bleeding.']

### Test BIOPHRASES

In [10]:
import biophrases

In [35]:
# `sentences_strings` is not assigned by any notebook-level cell visible here (it only exists
# as a local inside `get_scored_ngrams`), so this cell raised NameError on a clean run.
# Build it the same way the 'tfidf' branch of `get_scored_ngrams` does: the tokens of each
# sentence re-joined with single spaces.
# NOTE(review): assumes the original, now-missing definition matched that construction — confirm.
from infgain.preprocessing import word_tokenize

sentences_strings = [' '.join(word_tokenize(sentence)) for sentence in sentences]
tfidf_scored_ngrams = biophrases.tfidf.tf_idf(sentences_strings, n=2)  # `n` determines length of ngrams

In [71]:
# Score bigrams with the 'pagerank' method using only the first 10 tokenized sentences.
# NOTE(review): `sentences_tokenized` is assigned further down, in the
# "Tokenize every sentence into words" section, so on a top-to-bottom run this cell
# executes before that name exists.
pagerank_scored_ngrams = biophrases.graphs.bigrams_from_best(sentences_tokenized[:10], 'pagerank', t=3)

In [72]:
pagerank_scored_ngrams

[(('NUMBER', 'patients'), 0.4402907389516951),
 (('atrial', 'fibrillation'), 0.4158485325475557),
 (('NUMBER', 'years'), 0.3123757729046728),
 (('bleeding', 'events'), 0.2884007115986025),
 (('high', 'brain'), 0.1509092988282213),
 (('associated', 'risk'), 0.12779178859351065),
 (('peak', 'times'), 0.11110170056431382),
 (('times', 'namely'), 0.09116495359563193)]

### Tokenize every sentence into words.

In [16]:
from infgain.preprocessing import word_tokenize

# Word-tokenize every sentence; the slice displays the fourth sentence as a quick check.
sentences_tokenized = [word_tokenize(sentence) for sentence in sentences]
sentences_tokenized[3:4]

[['mean',
  'age',
  'was',
  'NUMBER',
  'years',
  'men',
  'NUMBER',
  'and',
  'mean',
  'chads2',
  'score',
  'sd',
  'was',
  'NUMBER']]

### Make dictionaries out of words.

In [42]:
from infgain.preprocessing import make_dict

# Build the token -> id mapping and its inverse (id -> token) from the tokenized sentences.
# Here `make_dict` is called with its default settings only.
dictionary, reversed_dictionary = make_dict(sentences_tokenized)

In [49]:
# Look up the ids of an underscore-joined entry, a tilde-joined entry, and the two single words.
dictionary['amino_acid'], dictionary['acid~amino'], dictionary['amino'], dictionary['acid']

(3706, 16233, 3705, 1078)

### Replace OOV-words with UNK.

In [58]:
from infgain.preprocessing import replace_unk_list

# Replace out-of-vocabulary words in every tokenized sentence, then keep only results that
# are non-empty and contain more than one whitespace-separated token.
# (The `.split()` call implies `replace_unk_list` yields string-like values — TODO confirm.)
sentences_unked = [replace_unk_list(tokens, dictionary) for tokens in sentences_tokenized]
sentences_unked = [
    unked for unked in sentences_unked
    if len(unked) > 0 and len(unked.split()) > 1
]

### Make dictionaries of tuples -> token_id.

In [9]:
from infgain.preprocessing import make_tupled_dicts

# Derive the two tuple-keyed lookup tables from `dictionary`; they are passed to
# `ngram_id` and `tilda_id` respectively in the cells below.
dict_ngram_id_by_tuple, dict_tilda_id_by_tuple = make_tupled_dicts(dictionary)

In [10]:
from infgain.preprocessing import ngram_id, tilda_id

# TODO: replace these thin wrappers with a cleaner way of binding the lookup table
#       (for example functools.partial).
def ngram_id_wrap(x):
    """Return ``ngram_id(x, dict_ngram_id_by_tuple)``.

    One-argument adapter that binds the notebook-level ``dict_ngram_id_by_tuple``
    table, so the function can be handed to ``iter_pairs`` as a callback.
    """
    return ngram_id(x, dict_ngram_id_by_tuple)

def tilda_id_wrap(x):
    """Return ``tilda_id(x, dict_tilda_id_by_tuple)``.

    One-argument adapter that binds the notebook-level ``dict_tilda_id_by_tuple``
    table, so the function can be handed to ``iter_pairs`` as a callback.
    """
    return tilda_id(x, dict_tilda_id_by_tuple)

# Spot-check both lookups on the id pair (3, 4).
ngram_id((3,4), dict_ngram_id_by_tuple), tilda_id((3,4), dict_tilda_id_by_tuple)

(10, 21033)

In [11]:
# Map a few ids back to their token strings through the inverse dictionary.
reversed_dictionary[20510], reversed_dictionary[10], reversed_dictionary[3], reversed_dictionary[4]

('high~throughput', 'currently_used', 'currently', 'used')

### Try to train word2gauss model.

In [22]:
from word2gauss import GaussianEmbedding, iter_pairs
from word2gauss.words import Vocabulary

# Wrap the token -> id dictionary in the word2gauss Vocabulary type.
vocab_gauss = Vocabulary(dictionary)

# One embedding per dictionary entry; second positional argument is 100,
# with diagonal covariance and the 'KL' energy setting.
embed = GaussianEmbedding(len(dictionary),
                          100,
                          covariance_type='diagonal',
                          energy_type='KL')

In [23]:
# Keep only sentences with more than one whitespace-separated token.
# (Same condition as the last filter in the "Replace OOV-words with UNK" cell.)
sentences_unked = [unked for unked in sentences_unked if len(unked.split()) > 1]

In [24]:
%%time
# Train the embedding on pairs generated from the first 10000 UNK-replaced sentences.
# The two wrapper callbacks give `iter_pairs` access to the n-gram / tilda id lookups.
embed.train(
    iter_pairs(sentences_unked[:10000], vocab_gauss, ngram_id_wrap,
               tilda_id_wrap, batch_size=10, nsamples=2,
               window=5, n=3),
    n_workers=8)

CPU times: user 2min 34s, sys: 19.3 s, total: 2min 53s
Wall time: 2min 37s


In [25]:
def nearest(token, num=10):
    """Return the "word" field of each neighbour the trained embedding reports for ``token``.

    :param token: vocabulary entry to query (wrapped in a 1-tuple for the lookup)
    :param num: passed through as ``num`` to ``embed.nearest_neighbors``
    :return: list of the ``"word"`` values of the returned neighbour records
    """
    neighbours = embed.nearest_neighbors([(token,)], vocab=vocab_gauss, num=num)
    return [neighbour["word"] for neighbour in neighbours]

In [26]:
nearest('amino_acid')

['amino_acid',
 'blocker',
 'subjugated',
 'NUMBER~heme~oxygenase',
 'pcr_positive',
 'arginine~l~nitro',
 'agarose_gel',
 'anti_egfr',
 'food~intake',
 'short~time']

# __SHORT__ version of model training

In [157]:
import time
from word2gauss import GaussianEmbedding, iter_pairs
from word2gauss.words import Vocabulary

import biophrases
from infgain.preprocessing import ngram_id, tilda_id, make_tupled_dicts, replace_unk_list
from infgain.metrics import score_ngrams


def get_scored_ngrams(tokenized_sentences: list, measure: str):
    """
    Score n-grams found in ``tokenized_sentences`` with the requested measure.

    Progress and elapsed time are printed as a side effect.

    :param tokenized_sentences: list of sentences, each an iterable of token strings
    :param measure: 'pagerank', 'tfidf', 'closed_ngram_tilda', 'closed_tilda_ngram',
                    'variational_ngram_tilda', 'variational_tilda_ngram'
    :return: for 'pagerank', a list of ``(space-joined ngram, score)`` tuples;
             for 'tfidf', whatever ``biophrases.tfidf.tf_idf`` returns;
             for the four KL measures, whatever ``score_ngrams`` returns;
             ``-1`` (after printing "Wrong 'measure'") for any other value.
    """
    # Trailing positional arguments passed to `score_ngrams` for each KL-based measure.
    kl_score_args = {
        'closed_ngram_tilda': (0, 0),
        'closed_tilda_ngram': (0, 1),
        'variational_ngram_tilda': (1, 0),
        'variational_tilda_ngram': (1, 1),
    }

    print('Starting... {}'.format(measure))
    t1 = time.time()

    # Strings are compared with `==`: `is` checks object identity and only works by accident
    # for interned literals, so an equal string built at runtime would be rejected.
    if measure == 'pagerank':
        pagerank_scored_ngrams = biophrases.graphs.bigrams_from_best(tokenized_sentences, 'pagerank', t=3)
        # Turn each (token tuple, score) entry into (space-joined string, score).
        pagerank_scored_ngrams = [(' '.join(entry[0]), entry[1]) for entry in pagerank_scored_ngrams]

        t2 = time.time()
        print('it took: {} s'.format(t2 - t1))
        return pagerank_scored_ngrams

    elif measure == 'tfidf':
        sentences_strings = [' '.join(tokens) for tokens in tokenized_sentences]
        tfidf_scored_ngrams = biophrases.tfidf.tf_idf(sentences_strings, n=2)  # `n` determines length of ngrams

        t2 = time.time()
        print('it took: {} s'.format(t2 - t1))
        return tfidf_scored_ngrams

    elif measure in kl_score_args:
        # `make_dict` is not in this cell's import list; import it here so the function
        # does not depend on an earlier cell having been executed.
        from infgain.preprocessing import make_dict

        # Only the token -> id mapping is needed; the inverse mapping is discarded.
        dictionary, _ = make_dict(tokenized_sentences, 3, threshold=1)

        sentences_unked = [replace_unk_list(tokens, dictionary) for tokens in tokenized_sentences]
        # Drop empty results and results with a single whitespace-separated token.
        sentences_unked = [
            unked for unked in sentences_unked
            if len(unked) > 0 and len(unked.split()) > 1
        ]

        vocab_gauss = Vocabulary(dictionary)

        embed = GaussianEmbedding(len(dictionary), 100,
                                  covariance_type='diagonal',
                                  energy_type='KL')

        dict_ngram_id_by_tuple, dict_tilda_id_by_tuple = make_tupled_dicts(dictionary)

        # One-argument adapters binding the lookup tables for `iter_pairs`.
        def ngram_id_wrap(x):
            return ngram_id(x, dict_ngram_id_by_tuple)

        def tilda_id_wrap(x):
            return tilda_id(x, dict_tilda_id_by_tuple)

        # A fresh embedding is trained on every call, i.e. once per requested KL measure.
        embed.train(
            iter_pairs(sentences_unked, vocab_gauss, ngram_id_wrap,
                       tilda_id_wrap, batch_size=10, nsamples=2,
                       window=5, n=3),
            n_workers=8)

        # As before, the reported time covers dictionary building and training,
        # but not the `score_ngrams` call that follows.
        t2 = time.time()
        print('it took: {} s'.format(t2 - t1))
        return score_ngrams(dictionary, embed, *kl_score_args[measure])

    else:
        print("Wrong 'measure'")
        return -1

### Here goes KL-divergence.

In [126]:
from infgain.metrics import score_ngrams

# Score n-grams with the notebook-level `dictionary` and trained `embed`, once for each
# combination of the two trailing 0/1 arguments of `score_ngrams`.
# NOTE(review): judging by the variable names, the first flag selects closed-form (0) vs
# variational (1) KL and the second selects the direction — confirm against `score_ngrams`.
closed_kl_ngram_token = score_ngrams(dictionary, embed, 0, 0)
closed_kl_token_ngram = score_ngrams(dictionary, embed, 0, 1)

variational_kl_ngram_token = score_ngrams(dictionary, embed, 1, 0)
variational_kl_token_ngram = score_ngrams(dictionary, embed, 1, 1)

In [127]:
import pandas as pd

In [158]:
def firstonly(tuples):
    """Return a list holding element ``[0]`` of every item in ``tuples``, in order."""
    return [item[0] for item in tuples]

In [159]:
# Earlier variant kept for reference: it reused the KL scores computed once at notebook level.
# topn = 20
# data = {'Closed KL(ngram, tilda)': firstonly(closed_kl_ngram_token[:topn]),
#         'Closed KL(tilda, ngram)': firstonly(closed_kl_token_ngram[:topn]),
#         'Variational KL(ngram, tilda)': firstonly(variational_kl_ngram_token[:topn]),
#         'Variational KL(tilda, ngram)': firstonly(variational_kl_token_ngram[:topn])}

# Top-`topn` n-grams per measure, computed on the first 1000 tokenized sentences.
# Every KL measure goes through `get_scored_ngrams`, which trains its own embedding per call,
# so the four KL columns come from four separately trained models.
topn = 20
data = {
    column: firstonly(get_scored_ngrams(sentences_tokenized[:1000], measure)[:topn])
    for column, measure in [
        ('TF-IDF', 'tfidf'),
        ('PageRank', 'pagerank'),
        ('Closed KL(ngram, tilda)', 'closed_ngram_tilda'),
        ('Closed KL(tilda, ngram)', 'closed_tilda_ngram'),
        ('Variational KL(ngram, tilda)', 'variational_ngram_tilda'),
        ('Variational KL(tilda, ngram)', 'variational_tilda_ngram'),
    ]
}

Starting... tfidf
it took: 0.09041309356689453 s
Starting... pagerank
it took: 2.8091390132904053 s
Starting... closed_ngram_tilda
it took: 16.634124040603638 s
Starting... closed_tilda_ngram
it took: 16.451080083847046 s
Starting... variational_ngram_tilda
it took: 16.524410009384155 s
Starting... variational_tilda_ngram
it took: 16.13958501815796 s


In [161]:
pd.DataFrame(data)

Unnamed: 0,"Closed KL(ngram, tilda)","Closed KL(tilda, ngram)",PageRank,TF-IDF,"Variational KL(ngram, tilda)","Variational KL(tilda, ngram)"
0,ovarian_venous,ovarian_venous,NUMBER patients,number number,radix_paeoniae,interfering_rna
1,mammalian_cells,mannose_NUMBER_phosphate,cremoris sk110,il number,industrial_interest,radix_paeoniae
2,mimics_inhibitors,mammalian_cells,mir NUMBER,mir number,uplc_ms,repeat_alleles
3,atypical_bacterial,kappa_b,NUMBER respectively,number patients,ras_effector,necrosis_factor
4,oral_epithelium,mitochondrial_proteomics,brucella spp,dif number,atp_hydrolysis,sister_chromatids
5,yielding_journals,immediately_following,lactis ssp,number il,body_collection,myb_transcription
6,increased_number,individual_differences,NUMBER nests,cremoris sk110,signal_transduction,body_collection
7,red_hair,sex_determination,calyx persistence,bodipy dif,cadherins_me1,ras_effector
8,staphylococcus_aureus_mrsa,cy5_NUMBER,radix paeoniae,number mg,oral_epithelium,coagulation_function
9,immediately_following,mimics_inhibitors,bodipy dif,patients number,indirect_immunofluorescence,target_sites
