In [5]:
import os
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.parsing.preprocessing import preprocess_string
from gensim.models import TfidfModel

import numpy as np

In [2]:
def filter_sentences_from_text(text_lines, min_line_len=20, min_sen_len=15):
    fulltext = " ".join([line for line in list(text_lines) if len([l for l in line.split(" ") if len(l) > 0]) > min_line_len])
    sens = filter(lambda sen: len(sen) >= min_sen_len, fulltext.split("."))
    return ". ".join(sens)

In [1]:
def get_texts_from_dir(texts_dir):
    txt_files = os.listdir(texts_dir)
    txt_files = [os.path.join(texts_dir, txt) for txt in txt_files]
    texts = dict()
    for txt_f in list(filter(lambda path: path.endswith(".txt"), txt_files)):
        try:
            text = filter_sentences_from_text(open(txt_f, "r").readlines())
            texts[txt_f] = preprocess_string(text)
        except UnicodeDecodeError:
            print("Utf-8 decode error on %s" % txt_f)
            continue
    return texts


In [7]:
texts_preproc = get_texts_from_dir("/data/gensim_citations_bkp/merged")
texts = texts = [t for t in texts_preproc.values() if len(t) > 0]
texts_links = texts_preproc.keys()

In [8]:
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = TfidfModel(corpus)

In [37]:
lda = LdaModel([[(x[0], x[1]/2) for x in doc_corpus] for doc_corpus in corpus], 
         num_topics=20, alpha='auto', eval_every=5)

In [None]:
lda

In [38]:
def topic_distro_for_text(text):
    return lda.get_document_topics(dictionary.doc2bow(text), minimum_probability=0)

In [24]:
def terms_for_doc(doc_id):
    return [(dictionary.get(tfidf_tuple[0]), tfidf_tuple[1]) for tfidf_tuple in tfidf[corpus[doc_id]]]

In [27]:
def top_terms_for_doc(doc_id, percentile):
    doc_terms_ordered = sorted(terms_for_doc(doc_id), key=lambda term: term[1],  reverse=True)
    return [term[0] for term in doc_terms_ordered[:int(len(doc_terms_ordered)*percentile)]]

In [29]:
top_terms_for_doc(0, percentile=0.5)

['chang',
 'diachron',
 'effect',
 'prototyp',
 'likelihood',
 'verb',
 'noun',
 '\ufeffto',
 'semant',
 'mutabl',
 'independ',
 'word',
 'correl',
 'identifi',
 'causal',
 'synchron',
 'psycholinguist',
 'button',
 'class',
 'factor',
 'signific',
 'complement',
 'mere',
 'navig',
 'adject',
 'explan',
 'varianc',
 'fashion',
 'invers',
 'quantifi',
 'regress',
 'recurr',
 'click',
 'number',
 'token',
 'lexicon',
 'propos',
 'cover',
 'profil']

In [31]:
dictionary.doc2bow(texts[0])

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 8),
 (9, 2),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 2),
 (17, 2),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 3),
 (22, 1),
 (23, 2),
 (24, 5),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 2),
 (31, 1),
 (32, 1),
 (33, 3),
 (34, 2),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 2),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 2),
 (47, 2),
 (48, 1),
 (49, 1),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 2),
 (54, 2),
 (55, 1),
 (56, 1),
 (57, 1),
 (58, 1),
 (59, 1),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 5),
 (64, 1),
 (65, 2),
 (66, 1),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 1),
 (71, 1),
 (72, 2),
 (73, 1),
 (74, 1),
 (75, 2),
 (76, 6),
 (77, 1)]

In [30]:
terms_for_doc(0)

[('adject', 0.09344360456582708),
 ('analysi', 0.02851096836158827),
 ('approach', 0.02717207327583276),
 ('assign', 0.06262975130624464),
 ('base', 0.022358188818841874),
 ('better', 0.04883172353650697),
 ('button', 0.11210965283447827),
 ('causal', 0.12374860032798168),
 ('chang', 0.4263888832753634),
 ('class', 0.11143296654456712),
 ('click', 0.07556100209969478),
 ('cluster', 0.04765787692078198),
 ('complement', 0.10422361531271052),
 ('complet', 0.06023140803970311),
 ('consent', 0.04588830094737287),
 ('contribut', 0.05508952505442333),
 ('cooki', 0.044223567067140424),
 ('correl', 0.1375865905197532),
 ('cover', 0.0722542938728129),
 ('data', 0.02099324057250092),
 ('degre', 0.0687932952598766),
 ('diachron', 0.40919533313685524),
 ('differ', 0.034329537636779185),
 ('distribut', 0.06786237626523418),
 ('effect', 0.21344281382618024),
 ('entir', 0.06575722936458926),
 ('establish', 0.06989407767593134),
 ('examin', 0.06348375065341606),
 ('experi', 0.026873113586261408),
 ('e

In [48]:
def tfidf_to_corpus(tfidf, corpus):
    return [tfidf[corpus[doc_i]] for doc_i in range(len(corpus))]

In [49]:
tfidf_corpus = tfidf_to_corpus(tfidf, corpus)

In [50]:
lda_tfidf = LdaModel(tfidf_corpus, num_topics=20, alpha='auto', eval_every=5)