## Tool functions

In [1]:
import os
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.parsing.preprocessing import preprocess_string
from gensim.models import TfidfModel

import numpy as np

In [2]:
def filter_sentences_from_text(text_lines, min_line_len=20, min_sen_len=15):
    fulltext = " ".join([line for line in list(text_lines) if len([l for l in line.split(" ") if len(l) > 0]) > min_line_len])
    sens = filter(lambda sen: len(sen) >= min_sen_len, fulltext.split("."))
    return ". ".join(sens)

In [3]:
def get_texts_from_dir(texts_dir):
    txt_files = os.listdir(texts_dir)
    txt_files = [os.path.join(texts_dir, txt) for txt in txt_files]
    texts = dict()
    for txt_f in list(filter(lambda path: path.endswith(".txt"), txt_files)):
        try:
            text = filter_sentences_from_text(open(txt_f, "r").readlines())
            texts[txt_f] = preprocess_string(text)
        except UnicodeDecodeError:
            print("Utf-8 decode error on %s" % txt_f)
            continue
    return texts


In [4]:
def read_texts(texts_dir):
    txt_files = os.listdir(texts_dir)
    txt_files = [os.path.join(texts_dir, txt) for txt in txt_files]
    for txt_f in list(filter(lambda path: path.endswith(".txt"), txt_files)):
        yield open(txt_f, "r").readlines()

In [5]:
# TODO: fill
texts = read_texts("/data/txt/directory")
text = list(texts)[4]
text

['\ufeffSkip to main content \n',
 ' \n',
 'Thank you for visiting nature.com. You are using a browser version with limited support for CSS. To obtain the best experience, we recommend you use a more up to date browser (or turn off compatibility mode in Internet Explorer). In the meantime, to ensure continued support, we are displaying the site without styles and JavaScript.\n',
 '    1. nature\n',
 '    2. translational psychiatry\n',
 '    3. original article\n',
 '    4. article\n',
 'Menu \n',
 '\n',
 'Search E-alert Submit My Account Login \n',
 '    • PDF\n',
 'Polygenic loading for major depression is associated with specific medical comorbidity\n',
 'Close menu\n',
 'Search\n',
 'Search nature.com  \n',
 'advanced \n',
 'Search \n',
 'Close menuClose menu\n',
 'Menu\n',
 'Translational Psychiatry \n',
 '    • Browse Articles \n',
 '    • Browse Collections \n',
 '    • Focuses \n',
 '    • About the Journal \n',
 '    • Open Access \n',
 '    • About the Editors \n',
 '    • Fo

In [6]:
text_f = filter_sentences_from_text(text)
text_f.split("\n")

['Thank you for visiting nature.  You are using a browser version with limited support for CSS.  To obtain the best experience, we recommend you use a more up to date browser (or turn off compatibility mode in Internet Explorer).  In the meantime, to ensure continued support, we are displaying the site without styles and JavaScript. ',
 " Major depressive disorder frequently co-occurs with medical disorders, raising the possibility of shared genetic liability.  Recent identification of 15 novel genetic loci associated with depression allows direct investigation of this question.  In cohorts of individuals participating in biobanks at two academic medical centers, we calculated polygenic loading for risk loci reported to be associated with depression.  We then examined the association between such loading and 50 groups of clinical diagnoses, or topics, drawn from these patients' electronic health records, determined using a novel application of latent Dirichilet allocation.  Three topic

In [7]:
texts_preproc = get_texts_from_dir("/data/gensim_citations_bkp/merged")

In [8]:
texts = [t for t in texts_preproc.values() if len(t) > 0]
texts_links = texts_preproc.keys()

In [37]:
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = None


def lda_from_texts(corpus, num_topics=20):
    lda = LdaModel(corpus, num_topics=num_topics, alpha='auto', eval_every=5)
    return lda

In [38]:
texts.__len__()

740

In [39]:
def topic_distro_for_text(text):
    return lda.get_document_topics(dictionary.doc2bow(text), minimum_probability=0)

In [40]:
def terms_for_topic(topic_id, top_terms=10):
    topic_top_terms = lda.get_topic_terms(topic_id, topn=top_terms)
    return [dictionary.get(term[0]) for term in topic_top_terms]

### tf-idf integration

Two approaches:

1. use idf weights instead of BoW frequencies
2. filter out given percentile of least important words from Docs' representation
3. combined

### 1. approach

In [None]:
tfidf = TfidfModel(corpus)

In [49]:
def terms_for_doc(doc_id):
    return tfidf[corpus[doc_id]]

In [98]:
def top_terms_idf_for_doc(doc_id, percentile):
    doc_terms_ordered = sorted(terms_for_doc(doc_id), key=lambda term: term[1],  reverse=True)
    return [term[0] for term in doc_terms_ordered[:int(len(doc_terms_ordered)*percentile)]]

In [135]:
def name_terms_in_tuples(doc_corpus):
    return [(dictionary.get(tup[0]), tup[1]) for tup in doc_corpus]

In [149]:
def top_terms_idf_for_doc(doc_id, percentile):
    doc_terms_ordered = sorted(terms_for_doc(doc_id), key=lambda term: term[1], reverse=True)
    return [(doc_terms_ordered[i][0], doc_terms_ordered[i][1]) 
            for i in range(int(len(doc_terms_ordered)*percentile))]

In [63]:
def term_tfidf_for_doc(doc_idx):
    return [(tfidf_term_tuple[0], tfidf_term_tuple[1]) 
            for tfidf_term_tuple in tfidf_corpus[doc_idx]]

In [161]:
tfidf_corpus1 = [top_terms_idf_for_doc(doc_i, 0.5) for doc_i in range(len(texts))]

In [162]:
tfidf_corpus1[0]

[(8, 0.4263888832753634),
 (21, 0.40919533313685524),
 (24, 0.21344281382618024),
 (54, 0.18688720913165416),
 (39, 0.17642169230132967),
 (75, 0.17191612815653237),
 (46, 0.16980827584756597),
 (77, 0.16360395928964705),
 (63, 0.16323908336465176),
 (44, 0.14643919047125745),
 (34, 0.13978815535186268),
 (76, 0.13918626482458266),
 (17, 0.1375865905197532),
 (33, 0.1315216872647399),
 (7, 0.12374860032798168),
 (68, 0.12374860032798168),
 (55, 0.11923367556056216),
 (6, 0.11210965283447827),
 (9, 0.11143296654456712),
 (30, 0.11143296654456712),
 (65, 0.10835576248491846),
 (12, 0.10422361531271052),
 (42, 0.10422361531271052),
 (45, 0.10422361531271052),
 (0, 0.09344360456582708),
 (29, 0.09202816064986685),
 (74, 0.09068926556411135),
 (31, 0.0894190626912025),
 (35, 0.08198741455756116),
 (56, 0.08198741455756116),
 (59, 0.08108682342458094),
 (58, 0.08021783858415205),
 (10, 0.07556100209969478),
 (47, 0.07462656012219744),
 (70, 0.07418489659844453),
 (38, 0.07352449674572176),
 

### 2. approach:

In [171]:
terms_for_doc(0)

[(0, 0.09344360456582708),
 (1, 0.02851096836158827),
 (2, 0.02717207327583276),
 (3, 0.06262975130624464),
 (4, 0.022358188818841874),
 (5, 0.04883172353650697),
 (6, 0.11210965283447827),
 (7, 0.12374860032798168),
 (8, 0.4263888832753634),
 (9, 0.11143296654456712),
 (10, 0.07556100209969478),
 (11, 0.04765787692078198),
 (12, 0.10422361531271052),
 (13, 0.06023140803970311),
 (14, 0.04588830094737287),
 (15, 0.05508952505442333),
 (16, 0.044223567067140424),
 (17, 0.1375865905197532),
 (18, 0.0722542938728129),
 (19, 0.02099324057250092),
 (20, 0.0687932952598766),
 (21, 0.40919533313685524),
 (22, 0.034329537636779185),
 (23, 0.06786237626523418),
 (24, 0.21344281382618024),
 (25, 0.06575722936458926),
 (26, 0.06989407767593134),
 (27, 0.06348375065341606),
 (28, 0.026873113586261408),
 (29, 0.09202816064986685),
 (30, 0.11143296654456712),
 (31, 0.0894190626912025),
 (32, 0.0587525491675709),
 (33, 0.1315216872647399),
 (34, 0.13978815535186268),
 (35, 0.08198741455756116),
 (36,

In [172]:
def top_terms_freqs_for_doc(doc_id, percentile):
    doc_terms_idf = dict(terms_for_doc(doc_id))
    doc_terms_count = corpus[doc_id]
    doc_terms_count_ordered = sorted(doc_terms_count, 
                                     key=lambda term_count: doc_terms_idf[term_count[0]], reverse=True)
    return [(doc_terms_count_ordered[i][0], doc_terms_count_ordered[i][1])
            for i in range(int(len(doc_terms_count_ordered)*percentile))]

In [173]:
tfidf_corpus2 = [top_terms_freqs_for_doc(doc_i, 0.5) for doc_i in range(len(texts))]
tfidf_corpus2[0]

[(8, 8),
 (21, 3),
 (24, 5),
 (54, 2),
 (39, 2),
 (75, 2),
 (46, 2),
 (77, 1),
 (63, 5),
 (44, 1),
 (34, 2),
 (76, 6),
 (17, 2),
 (33, 3),
 (7, 1),
 (68, 1),
 (55, 1),
 (6, 1),
 (9, 2),
 (30, 2),
 (65, 2),
 (12, 1),
 (42, 1),
 (45, 1),
 (0, 1),
 (29, 1),
 (74, 1),
 (31, 1),
 (35, 1),
 (56, 1),
 (59, 1),
 (58, 1),
 (10, 1),
 (47, 2),
 (70, 1),
 (38, 1),
 (53, 2),
 (18, 1),
 (51, 1)]

### LDA computation

In [229]:
lda_num_topics = 400
lda = lda_from_texts(tfidf_corpus1, num_topics=lda_num_topics)


overflow encountered in exp2



### follows visualization part - inherited from other project

In [230]:
# [terms_for_topic(i, top_terms=20) for i in (1, 4, 16, 20)]

In [231]:
topic_distros = np.array([topic_distro_for_text(text) for text in texts])

In [232]:
topic_distros = topic_distros[:, :, 1]

In [233]:
base_topic_docs_distros = np.identity(lda_num_topics)

In [234]:
topic_distros_based = np.append(topic_distros, base_topic_docs_distros) \
    .reshape((len(topic_distros)+lda_num_topics, lda_num_topics))

In [235]:
topic_distros_based.shape

(1140, 400)

# Get a projection of topic according to their relative similarity

Relative similarity is a correlation of documents' belonging to it

# Get a projection of docs so that the docs are close to their major topic

Doc distance to a cluster must be proportional to it's probability of belonging to it

In [236]:
from scipy.spatial.distance import pdist, squareform

dists = squareform(pdist(topic_distros_based, metric="correlation"))
dists

array([[0.        , 0.94021965, 0.91278131, ..., 1.00638188, 1.00638193,
        1.00638188],
       [0.94021965, 0.        , 0.97272928, ..., 1.00847029, 1.00847033,
        1.00847029],
       [0.91278131, 0.97272928, 0.        , ..., 1.00876573, 1.00876599,
        1.00876572],
       ...,
       [1.00638188, 1.00847029, 1.00876573, ..., 0.        , 1.00250627,
        1.00250627],
       [1.00638193, 1.00847033, 1.00876599, ..., 1.00250627, 0.        ,
        1.00250627],
       [1.00638188, 1.00847029, 1.00876572, ..., 1.00250627, 1.00250627,
        0.        ]])

In [237]:
dists.shape

(1140, 1140)

In [238]:
from sklearn import manifold

adist = dists

amax = np.amax(adist)
adist /= amax

mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=6)
results = mds.fit(adist)

coords = results.embedding_


In [185]:
import plotly
plotly.tools.set_credentials_file(username='stmichal', api_key='OXox9Rf8jzEHqUsNPqwn')

In [239]:
from functools import reduce

all_topics_w = [terms_for_topic(i, top_terms=100) for i in range(lda_num_topics)]
all_words = reduce(lambda x, y: set(x) | set(y), all_topics_w)
intersect_words = list(filter(lambda w: sum([w in t_words for t_words in all_topics_w]) > 5, all_words))
unique_topics_w = [[w for w in t_words if w not in intersect_words] for t_words in all_topics_w]

In [243]:
import plotly.plotly as py
import plotly.graph_objs as go

docs_len = len(topic_distros)

# Create a trace
trace_docs = go.Scatter(
    x = coords[:docs_len, 0],
    y = coords[:docs_len, 1],
    mode = 'markers',
    text = list(texts_links)
)
trace_bases = go.Scatter(
    x = coords[docs_len:, 0],
    y = coords[docs_len:, 1],
    mode = 'markers',
    marker = dict(color = 'rgba(255, 0, 122, .2)', size = 30),
    text = ["T %s: %s" % (i, unique_topics_w[i]) for i in range(lda_num_topics)]
)

data = [trace_docs, trace_bases]

label = 'MDS over LDA %s topics. tfidf for top 0.5 terms as frequencies' % lda.num_topics

# Plot and embed in ipython notebook!
layout = dict(title=label,  
              font=dict(size=12),
              showlegend=True,
              width=1000,
              height=1000,
              margin=dict(l=40, r=40, b=85, t=100),
              hovermode='closest',
              plot_bgcolor='rgb(256,256,256)'          
              )
py.iplot(dict(data=data, layout=layout), filename=label)