In [1]:
import collections

import pandas as pd
import nltk
import gensim

import kgglcncr.data_import as data_import
import kgglcncr.preprocessing as preprocessing
import kgglcncr.features as features

In [2]:
# Load the training text via the project-local data_import helper.
training_text = data_import.import_training_text()

In [3]:
# Preview the first rows to sanity-check what was loaded (ID and Text columns).
training_text.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [4]:
# Clean the text with the project's PreprocessingPipeline instead of calling
# preprocessing.remove_citations / preprocessing.lower_case by hand.
# NOTE(review): presumably the steps run in list order — confirm in PreprocessingPipeline.
preprocessor = preprocessing.PreprocessingPipeline(
    [
        preprocessing.remove_citations,
        preprocessing.lower_case,
    ]
)
training_text = preprocessor(training_text)

In [5]:
# Pull out the 'Text' column as the raw corpus (one entry per row).
corpus = training_text.loc[:, 'Text']

In [6]:
# Convert to a plain Python list so later cells can iterate document by document.
corpus = corpus.tolist()

In [7]:
# Tokenize every document with NLTK's word tokenizer.
# NOTE: this rebinds `corpus` from document texts to token lists, so the cell is
# not safe to re-run on its own — re-run from the cell that selects 'Text'.
corpus = list(map(nltk.word_tokenize, corpus))

In [8]:
# Count how often each token occurs across the whole corpus.
# collections.Counter replaces the hand-rolled defaultdict loop: it is a dict
# subclass, so freq[token] and len(freq) behave as before, unseen tokens read
# as 0, and no loop variables leak into the notebook namespace.
freq = collections.Counter(token for doc in corpus for token in doc)

In [9]:
# Number of distinct tokens seen before any frequency pruning.
len(freq)

280156

In [10]:
# Drop tokens that occur only once in the entire corpus (corpus-wide count,
# not per document); document order and token order are preserved.
corpus = [
    [tok for tok in tokens if freq[tok] > 1]
    for tokens in corpus
]

In [11]:
# Build the gensim token <-> integer-id mapping from the pruned, tokenized documents.
dictionary = gensim.corpora.Dictionary(corpus)

In [12]:
# Print the Dictionary summary: unique-token count plus a sample of tokens.
print(dictionary)

Dictionary(172911 unique tokens: ['“biotag”', 'llanos', 'il-7r–mediated', 'known.the', 'reticulocytosis']...)


In [13]:
# Vocabulary size after pruning; compare with len(freq) to see how many tokens were dropped.
len(dictionary)

172911

In [14]:
# Encode each token list as a gensim bag-of-words vector of (token_id, count) pairs.
# NOTE: this rebinds `corpus` again (token lists -> bag-of-words), so the cell
# is not safe to re-run on its own.
corpus = list(map(dictionary.doc2bow, corpus))

In [15]:
# Fit TF-IDF on the bag-of-words corpus via the project helper. Without
# as_array the second return value is a gensim TransformedCorpus.
tfidf_model, tfidf_corpus = features.gensim_tfidf_vectorizer(corpus, dictionary)

In [16]:
# Inspect the returned object: a TransformedCorpus wrapper, not a materialized matrix.
tfidf_corpus

<gensim.interfaces.TransformedCorpus at 0x7f7cc73ee2b0>

In [17]:
# Same vectorizer, but as_array=True returns a dense NumPy array (documents x vocabulary).
# NOTE(review): this also rebinds tfidf_model — presumably refitting it; confirm in features.
# TODO: support a sparse return type. For the recorded shape (3321, 172911) at
# float32 the dense matrix is about 2.3 GB and appears to be mostly zeros.
tfidf_model, tfidf_matrix = features.gensim_tfidf_vectorizer(corpus, dictionary, as_array=True)

In [18]:
# Expect (n_documents, vocabulary_size); the second value should equal len(dictionary).
tfidf_matrix.shape

(3321, 172911)

In [19]:
# Display the TF-IDF matrix (NumPy abbreviates large arrays in the repr).
tfidf_matrix

array([[  1.39139302e-03,   5.95079036e-04,   8.59215762e-03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  1.78633397e-03,   2.27473618e-04,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  1.78633397e-03,   2.27473618e-04,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  6.17478625e-04,   4.16979219e-06,   0.00000000e+00, ...,
          0.00000000e+00,   6.40204595e-03,   0.00000000e+00],
       [  6.02177868e-04,   1.08439133e-04,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  4.10555105e-04,   7.20837415e-05,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00]], dtype=float32)

In [20]:
# Project the corpus into LSI topic space via the project helper; as_array=True
# returns a dense array.
# NOTE(review): the number of topics is not set here, so it comes from the
# helper's default (the recorded output has 100 columns) — confirm in features.
lsi_model, lsi_matrix = features.gensim_lsi_vectorizer(corpus, dictionary, as_array=True)

In [21]:
# Expect (n_documents, n_topics).
lsi_matrix.shape

(3321, 100)

In [22]:
# Display the LSI document-topic matrix (NumPy abbreviates large arrays in the repr).
lsi_matrix

array([[ 0.01375046, -0.01540981,  0.02546145, ..., -0.00277578,
        -0.00228842,  0.00232185],
       [ 0.0183161 , -0.02257949,  0.08638845, ..., -0.0420027 ,
         0.00659801,  0.03181852],
       [ 0.0183161 , -0.02257949,  0.08638845, ..., -0.0420027 ,
         0.00659801,  0.03181852],
       ..., 
       [ 0.00964005, -0.01002534,  0.02171246, ..., -0.06164069,
        -0.01404309,  0.00471707],
       [ 0.02482119, -0.02282849,  0.04993295, ..., -0.11377372,
        -0.04517514, -0.0059363 ],
       [ 0.02428523, -0.02016754,  0.04182753, ..., -0.12960379,
        -0.05666821, -0.0068714 ]], dtype=float32)

In [23]:
# Show the terms with the largest weights in the first LSI topic.
# Assumes lsi_model is a gensim LsiModel, whose print_topic takes the topic
# index (0) and the number of terms to list (50) — TODO confirm in features.
lsi_model.print_topic(0,50)

'0.645*"brca1" + 0.288*"brct" + 0.272*"vus" + 0.205*"variants" + 0.150*"deleterious" + 0.145*"brca2" + 0.133*"neutral" + 0.113*"⇓" + 0.111*"nih-pa" + 0.099*"brca" + 0.095*"manuscript" + 0.088*"vuss" + 0.084*"classifi" + 0.079*"odds" + 0.078*"author" + 0.067*".00" + 0.067*"variant" + 0.065*"phosphopeptide" + 0.062*"e2" + 0.060*"m1775r" + 0.055*"causality" + 0.054*"transcriptional" + 0.054*"classification" + 0.052*"rmce" + 0.052*"ed" + 0.052*"117" + 0.051*"bard1" + 0.050*"cisplatin" + 0.046*"risk" + 0.046*"ovarian" + 0.043*"iarc" + 0.042*"cation" + 0.041*"history" + 0.041*"pathogenic" + 0.040*"cient" + 0.040*"breast" + 0.038*"proteolysis" + 0.037*"pmc" + 0.037*"hdr" + 0.037*"loh" + 0.036*"ssa" + 0.036*"classified" + 0.036*"supplementary" + 0.036*"hr" + 0.036*"missense" + 0.035*"0" + 0.035*"yeast" + 0.035*"likelihood" + 0.035*"unclassified" + 0.034*">"'