In [2]:
import collections

import nltk
import gensim

import kgglcncr.data_import as data_import
import kgglcncr.preprocessing as preprocessing

In [3]:
# Load the training text through the project's data_import helper; the preview
# in the next cell shows the ID and Text columns of the returned frame.
training_text = data_import.import_training_text()

In [4]:
training_text.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung cance...
2,2,Abstract Background Non-small cell lung cance...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [5]:
# Clean the raw text with one pipeline built from the project's
# remove_citations and lower_case steps (listed in that order) instead of
# calling each step by hand.
preprocessor = preprocessing.PreprocessingPipeline(
    [
        preprocessing.remove_citations,
        preprocessing.lower_case,
    ]
)

training_text = preprocessor(training_text)

In [6]:
def bag_of_words(text_data):
    """Tokenize the text data with NLTK's word tokenizer.

    Hands ``nltk.word_tokenize`` and ``text_data`` to
    ``preprocessing.text_data_preprocessor`` and returns whatever that helper
    produces; in this notebook that is a frame whose 'Text' entries are
    lists of tokens.
    """
    tokenizer = nltk.word_tokenize
    return preprocessing.text_data_preprocessor(tokenizer, text_data)

In [7]:
# Tokenized training text. NOTE(review): `test` is easy to mistake for a
# held-out test set; a name such as `training_tokens` would be clearer, but
# later cells reference `test`.
test = bag_of_words(training_text)

In [8]:
test.head()

Unnamed: 0,ID,Text
0,0,"[cyclin-dependent, kinases, cdks, regulate, a,..."
1,1,"[abstract, background, non-small, cell, lung, ..."
2,2,"[abstract, background, non-small, cell, lung, ..."
3,3,"[recent, evidence, has, demonstrated, that, ac..."
4,4,"[oncogenic, mutations, in, the, monomeric, cas..."


In [9]:
# Corpus-wide token frequencies: token -> number of occurrences summed over
# every document's token list.
# Iterate the 'Text' column directly rather than test.loc[i, 'Text'] over
# range(len(test)): .loc is label-based, so the positional counter only works
# while the frame carries a default 0..n-1 index.
freq = collections.defaultdict(int)
for tokens in test['Text']:
    for token in tokens:
        freq[token] += 1

In [10]:
# NOTE(review): this displays the entire mapping (len(freq) is 248762 below);
# consider showing only a sample or the most frequent tokens instead.
freq

defaultdict(int,
            {'nb1269': 1,
             'hnf-1β': 7,
             'd641g': 18,
             'chester': 16,
             'glu/asp': 3,
             'venteicher': 1,
             'sf3b1+/': 5,
             'd5-phosphorylated': 1,
             'nb-': 4,
             '390–': 3,
             'under-wound': 1,
             'insnpg': 65,
             'c-raf-dependence': 1,
             'gly171asp': 4,
             'p654l': 23,
             'scavenging': 3,
             '⩾100': 64,
             'derivativeof': 1,
             'prl-sv40': 24,
             '93175': 1,
             'd1-interacting': 1,
             'lys2797argfsx26': 1,
             'hy-line': 1,
             'uiccstage': 1,
             'cuny': 1,
             'fgfr2–tacc3': 125,
             'kex16bex11': 3,
             'bm': 1277,
             'glu1077': 3,
             '-1h-pyrazolo': 2,
             'osteosarcomas': 50,
             'machinery—particularly': 2,
             'self-availability': 1,
          

In [11]:
len(freq)

248762

In [12]:
def _word_frequency_cutoff(bag_of_words):
    """Drop the tokens of one document that occur at most once in the corpus.

    Relies on the notebook-level ``freq`` mapping (token -> corpus-wide
    count), which must have been built before this function is called.

    Parameters
    ----------
    bag_of_words : iterable
        Tokens of a single document. Inside this body the name shadows the
        notebook's ``bag_of_words`` function.

    Returns
    -------
    list
        The tokens whose corpus-wide count is greater than 1, in their
        original order and with repeats kept.
    """
    # .get() instead of freq[token]: indexing a defaultdict inserts every
    # unseen token with a count of 0, silently growing ``freq`` whenever this
    # filter runs on text that was not part of the counting pass.
    return [token for token in bag_of_words if freq.get(token, 0) > 1]


def word_frequency_cutoff(text_data):
    """Apply the corpus-frequency cutoff to the text data.

    Passes ``_word_frequency_cutoff`` and ``text_data`` to
    ``preprocessing.text_data_preprocessor`` and returns its result.
    """
    return preprocessing.text_data_preprocessor(_word_frequency_cutoff, text_data)

In [13]:
# Token lists with tokens that appear only once in the whole corpus removed.
bow = word_frequency_cutoff(test)

In [14]:
bow.head()

Unnamed: 0,ID,Text
0,0,"[cyclin-dependent, kinases, cdks, regulate, a,..."
1,1,"[abstract, background, non-small, cell, lung, ..."
2,2,"[abstract, background, non-small, cell, lung, ..."
3,3,"[recent, evidence, has, demonstrated, that, ac..."
4,4,"[oncogenic, mutations, in, the, monomeric, cas..."


In [15]:
# Build the token <-> integer id mapping from the filtered token lists.
dictionary = gensim.corpora.Dictionary(documents=bow['Text'])

In [16]:
print(dictionary)

Dictionary(157719 unique tokens: ['us/intergroup', 'd38n', 'fashion18', 'hnf-1β', 'd641g']...)


In [17]:
def count_feature(text_data, dictionary):
    """Convert the text data into bag-of-words count vectors.

    Hands ``dictionary.doc2bow`` and ``text_data`` to
    ``preprocessing.text_data_preprocessor`` and returns its result; in this
    notebook each 'Text' entry becomes a list of ``(token_id, count)`` pairs.
    """
    to_counts = dictionary.doc2bow
    return preprocessing.text_data_preprocessor(to_counts, text_data)

In [18]:
# Count feature vectors: one list of (token_id, count) pairs per document.
fv = count_feature(bow, dictionary)

In [19]:
fv.head()

Unnamed: 0,ID,Text
0,0,"[(0, 2), (1, 3), (2, 14), (3, 1), (4, 2), (5, ..."
1,1,"[(0, 7), (2, 1), (4, 1), (8, 1), (9, 2), (11, ..."
2,2,"[(0, 7), (2, 1), (4, 1), (8, 1), (9, 2), (11, ..."
3,3,"[(0, 1), (2, 4), (5, 1), (6, 1), (8, 1), (11, ..."
4,4,"[(0, 4), (2, 6), (6, 1), (9, 1), (11, 336), (1..."


In [20]:
tfidf = gensim.models.TfidfModel(fv.loc[:,'Text'])

In [21]:
tfidf = tfidf[fv.loc[:,'Text']]

In [22]:
tfidf

<gensim.interfaces.TransformedCorpus at 0x7ff2797c1978>

In [23]:
# Latent Semantic Indexing over the TF-IDF corpus, 100 latent topics.
lsi = gensim.models.LsiModel(
    corpus=tfidf,
    id2word=dictionary,
    num_topics=100,
)

In [24]:
# Represent the TF-IDF corpus in the latent space of the 100-topic model.
corpus_lsi = lsi[tfidf]

In [39]:
# Show the 100 highest-weighted terms of the first topic.
lsi.print_topic(0, topn=100)

'0.654*"brca1" + 0.306*"brct" + 0.258*"vus" + 0.200*"variants" + 0.129*"neutral" + 0.123*"⇓" + 0.119*"deleterious" + 0.118*"nih-pa" + 0.114*"brca2" + 0.110*"−2" + 0.101*"manuscript" + 0.089*"classifi" + 0.083*"author" + 0.072*"−1" + 0.071*"phosphopeptide" + 0.068*"brca" + 0.067*"e2" + 0.066*"m1775r" + 0.063*"variant" + 0.059*"transcriptional" + 0.058*"vuss" + 0.057*"−3" + 0.055*"bard1" + 0.055*"rmce" + 0.055*"ed" + 0.054*"odds" + 0.053*"117" + 0.052*"cisplatin" + 0.052*"classification" + 0.045*"risk" + 0.044*"cation" + 0.043*"iarc" + 0.042*"cient" + 0.042*"proteolysis" + 0.042*"ovarian" + 0.041*"breast" + 0.040*"pathogenic" + 0.040*"pmc" + 0.040*"causality" + 0.039*"hdr" + 0.039*"yeast" + 0.039*"supplementary" + 0.039*"ssa" + 0.037*"hr" + 0.036*"pten" + 0.036*"missense" + 0.036*"loh" + 0.035*"stem" + 0.034*"neighborhood" + 0.034*"−" + 0.033*"peptide" + 0.033*">" + 0.033*"classified" + 0.032*"unclassified" + 0.031*"history" + 0.031*"res" + 0.031*"embryonic" + 0.031*"a1708e" + 0.031*"rin

In [26]:
# Last of the 100 topics (index 99), with the default number of terms.
lsi.print_topic(99)

'-0.270*"epas1" + -0.233*"jak2" + 0.211*"jak1" + -0.206*"hif2α" + 0.196*"sox9" + -0.172*"mkk4" + -0.140*"elf3" + 0.126*"tric" + -0.119*"crenolanib" + 0.109*"ezh2"'

In [27]:
# Smaller LSI model on the same TF-IDF corpus: 10 latent topics.
lsi2 = gensim.models.LsiModel(
    corpus=tfidf,
    id2word=dictionary,
    num_topics=10,
)

In [40]:
# List the topics of the 10-topic model with 30 terms per topic.
lsi2.print_topics(num_words=30)

[(0,
  '-0.654*"brca1" + -0.306*"brct" + -0.258*"vus" + -0.200*"variants" + -0.129*"neutral" + -0.123*"⇓" + -0.119*"deleterious" + -0.118*"nih-pa" + -0.114*"brca2" + -0.110*"−2" + -0.101*"manuscript" + -0.089*"classifi" + -0.083*"author" + -0.072*"−1" + -0.071*"phosphopeptide" + -0.068*"brca" + -0.067*"e2" + -0.066*"m1775r" + -0.063*"variant" + -0.059*"transcriptional" + -0.058*"vuss" + -0.057*"−3" + -0.055*"bard1" + -0.055*"rmce" + -0.055*"ed" + -0.054*"odds" + -0.053*"117" + -0.052*"cisplatin" + -0.052*"classification" + -0.045*"risk"'),
 (1,
  '-0.778*"pten" + -0.208*"phts" + -0.184*"pip3" + -0.150*"phosphatase" + -0.150*"asd/dd" + -0.100*"asp92" + -0.092*"tumor-related" + -0.086*"egfr" + -0.085*"catalytic" + 0.077*"brca1" + -0.076*"yeast" + -0.065*"ptdins" + -0.064*"p-loop" + -0.063*"p110α-caax" + -0.063*"pten-like" + -0.061*"p3" + -0.059*"p4" + -0.056*"loops" + -0.050*"6-pten" + -0.050*"h93r" + -0.050*"ins" + -0.048*"gefitinib" + -0.047*"lipid" + -0.047*"pi3k" + -0.046*"autism" + 

In [30]:
# NOTE(review): interactive help lookup left over from development; it prints
# a long docstring dump, so consider removing this cell from the notebook.
help(gensim.models.LsiModel)

Help on class LsiModel in module gensim.models.lsimodel:

class LsiModel(gensim.interfaces.TransformationABC, gensim.models.basemodel.BaseTopicModel)
 |  Objects of this class allow building and maintaining a model for Latent
 |  Semantic Indexing (also known as Latent Semantic Analysis).
 |  
 |  The main methods are:
 |  
 |  1. constructor, which initializes the projection into latent topics space,
 |  2. the ``[]`` method, which returns representation of any input document in the
 |     latent space,
 |  3. `add_documents()` for incrementally updating the model with new documents.
 |  
 |  The left singular vectors are stored in `lsi.projection.u`, singular values
 |  in `lsi.projection.s`. Right singular vectors can be reconstructed from the output
 |  of `lsi[training_corpus]`, if needed. See also FAQ [2]_.
 |  
 |  Model persistency is achieved via its load/save methods.
 |  
 |  .. [2] https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q4-how-do-you-output-the-u-s-vt-matric

In [31]:
# Represent the TF-IDF corpus in the latent space of the 10-topic model.
corpus_lsi2 = lsi2[tfidf]

In [32]:
# Print the first ten documents as (topic_id, weight) pairs; zip() against
# range(10) stops after the tenth document without pulling an eleventh.
for i, doc in zip(range(10), corpus_lsi2):
    print(doc)

[(0, -0.013810624651677433), (1, -0.014381380845581099), (2, -0.023466855270434073), (3, -0.0032649395288155804), (4, 0.026375481836280371), (5, 0.0097529015772247433), (6, -0.0041502781020606762), (7, -0.0036633208709689231), (8, 0.024745103847704782), (9, 0.0032320827214444635)]
[(0, -0.018763426107994727), (1, -0.021670110775621341), (2, -0.086748705074664631), (3, -0.0022965198652322239), (4, -0.019916594462958966), (5, -0.0010333695079973301), (6, -0.010291101930211049), (7, -0.0033087173866814122), (8, 0.018762380075057643), (9, 0.00092582382661085148)]
[(0, -0.018763426107994727), (1, -0.021670110775621341), (2, -0.086748705074664631), (3, -0.0022965198652322239), (4, -0.019916594462958966), (5, -0.0010333695079973301), (6, -0.010291101930211049), (7, -0.0033087173866814122), (8, 0.018762380075057643), (9, 0.00092582382661085148)]
[(0, -0.02626840437841434), (1, -0.019399086741583669), (2, -0.057293268413171498), (3, -0.0015597029170754328), (4, 0.036643312616081093), (5, 0.0179

In [33]:
len(corpus_lsi2)

3321

In [34]:
# Densify the LSI-projected corpus. The result has shape (num_terms, num_docs),
# i.e. one column per document (see the shape shown below).
# num_terms is read from the model instead of a literal 10 so it cannot drift
# from the num_topics that lsi2 was trained with.
X = gensim.matutils.corpus2dense(corpus_lsi2, num_terms=lsi2.num_topics, num_docs=len(corpus_lsi2))

In [35]:
X.shape

(10, 3321)

In [36]:
X

array([[-0.01381062, -0.01876343, -0.01876343, ..., -0.00933241,
        -0.02392174, -0.02397908],
       [-0.01438138, -0.02167011, -0.02167011, ..., -0.00928216,
        -0.01902175, -0.01797347],
       [-0.02346686, -0.0867487 , -0.0867487 , ..., -0.01932537,
        -0.0404932 , -0.03679732],
       ..., 
       [-0.00366332, -0.00330872, -0.00330872, ..., -0.00203307,
        -0.00257692, -0.00132331],
       [ 0.0247451 ,  0.01876238,  0.01876238, ...,  0.01016099,
         0.03106793,  0.02933426],
       [ 0.00323208,  0.00092582,  0.00092582, ..., -0.00075495,
        -0.00358354, -0.00377633]], dtype=float32)

In [41]:
# Build the (num_docs, num_topics) matrix straight from the corpus instead of
# `X = X.T`: transposing X onto itself flips the orientation back on every
# re-run of this cell, so the result depended on how often it was executed.
X = gensim.matutils.corpus2dense(corpus_lsi2, num_terms=lsi2.num_topics, num_docs=len(corpus_lsi2)).T

In [42]:
X

array([[-0.01381062, -0.01438138, -0.02346686, ..., -0.00366332,
         0.0247451 ,  0.00323208],
       [-0.01876343, -0.02167011, -0.0867487 , ..., -0.00330872,
         0.01876238,  0.00092582],
       [-0.01876343, -0.02167011, -0.0867487 , ..., -0.00330872,
         0.01876238,  0.00092582],
       ..., 
       [-0.00933241, -0.00928216, -0.01932537, ..., -0.00203307,
         0.01016099, -0.00075495],
       [-0.02392174, -0.01902175, -0.0404932 , ..., -0.00257692,
         0.03106793, -0.00358354],
       [-0.02397908, -0.01797347, -0.03679732, ..., -0.00132331,
         0.02933426, -0.00377633]], dtype=float32)