In [46]:
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from gensim import matutils
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import nltk
from nltk import tokenize
from nltk.corpus import stopwords
import pandas as pd
from gensim import corpora, models, similarities

In [47]:
# Load the pickled article table from the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load
# 'articles.pkl' if it comes from a trusted source.
data_f = pd.read_pickle('articles.pkl')

In [48]:
# Keep only the 'content' column; presumably the raw article text, one
# document per row — TODO confirm against the pickle's schema.
docs = data_f['content']

In [49]:
#sent_tokenize tokenizes text by sentences
#word_tokenize tokenizes text by words

def tokenize_and_normalize(chunks):
    """Turn raw text into lower-cased word tokens without English stopwords.

    The text is split into sentences and each sentence into word tokens.
    Every token is then:

    1. round-tripped through ``encode('latin-1').decode('utf8')``; tokens
       for which this fails are dropped (best-effort clean-up),
    2. lower-cased,
    3. discarded if it is an English stopword or is at most one character
       long.

    Parameters
    ----------
    chunks : str
        Raw text of a single document.

    Returns
    -------
    list
        Normalized tokens, in their original order.
    """
    # The stopword list is loop-invariant: fetch it once and use a set so
    # membership tests are O(1) instead of a linear scan per token.
    english_stopwords = set(stopwords.words('english'))

    output = []
    for sent in tokenize.sent_tokenize(chunks):
        for word in tokenize.word_tokenize(sent):
            try:
                normalized = word.encode('latin-1').decode('utf8').lower()
            except UnicodeError:
                # Token cannot be re-decoded; skip it rather than abort the
                # whole document. Only encoding errors are swallowed.
                continue
            # Filter after lower-casing so capitalised stopwords such as
            # "The" are removed as well.
            if len(normalized) > 1 and normalized not in english_stopwords:
                output.append(normalized)
    return output

In [50]:
def print_features(clf, vocab, n=10):
    """Print the strongest positive and negative features of a linear model.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``coef_``; only its first row
        (``coef_[0]``) is inspected.
    vocab : sequence
        Maps a feature index to its name.
    n : int, optional
        Maximum number of features printed for each sign (default 10).

    Output
    ------
    Two lines, ``positive features: ...`` and ``negative features: ...``.
    Each feature is rendered as ``name/weight`` with the weight shown to two
    decimals, strongest first. Zero-weight features are never printed.
    """
    coef = clf.coef_[0]
    order = np.argsort(coef)  # feature indices, ascending by weight

    # Take the n largest / n smallest weights, then keep only those whose
    # sign matches the line they are printed on.
    top_positive = [j for j in order[::-1][:n] if coef[j] > 0]
    top_negative = [j for j in order[:n] if coef[j] < 0]

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('positive features: %s'
          % ' '.join('%s/%.2f' % (vocab[j], coef[j]) for j in top_positive))
    print('negative features: %s'
          % ' '.join('%s/%.2f' % (vocab[j], coef[j]) for j in top_negative))

In [51]:
# Tokenize and normalize every document; `parsed` is a list holding one
# token list per entry of `docs`.
parsed = [tokenize_and_normalize(s) for s in docs]

In [52]:
# Build the gensim token <-> integer-id mapping from the tokenized documents.
dictionary = corpora.Dictionary(parsed)

In [53]:
# Convert each token list to its bag-of-words representation via `dictionary`.
corpus = [dictionary.doc2bow(text) for text in parsed]

In [54]:
# Fit a TF-IDF weighting model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)

In [55]:
# Apply the fitted TF-IDF model to the bag-of-words corpus.
corpus_tfidf = tfidf[corpus]

In [56]:
# Train a 15-topic LDA model on the TF-IDF corpus. update_every=0 selects
# batch (not online) learning, with 200 passes over the corpus.
# NOTE(review): no seed is passed, so topics may differ between runs —
# consider `random_state` if the installed gensim version supports it.
# NOTE(review): LDA is usually fitted on raw term counts (`corpus`); confirm
# that training on TF-IDF weights is intentional.
lda = LdaModel(corpus_tfidf, id2word=dictionary, num_topics = 15, update_every=0, passes=200)

In [57]:
# Show all 15 topics, 15 highest-weighted words each, as formatted strings.
lda.print_topics(15,15)

[u'0.001*maduro + 0.000*barker + 0.000*venezuela + 0.000*radionova + 0.000*ando + 0.000*lifter + 0.000*kerpelman + 0.000*hou + 0.000*cuccinelli + 0.000*doll + 0.000*usis + 0.000*wellness + 0.000*gang + 0.000*binz + 0.000*froman',
 u'0.002*yard + 0.001*inning + 0.001*touchdown + 0.001*museum + 0.001*exchange + 0.001*debt + 0.001*title + 0.001*pettitte + 0.001*dance + 0.001*clinton + 0.001*score + 0.001*bond + 0.001*pas + 0.001*festival + 0.001*rest',
 u'0.004*iran + 0.003*rouhani + 0.002*iranian + 0.001*israel + 0.001*netanyahu + 0.001*sanction + 0.001*israeli + 0.001*baghdad + 0.001*zarif + 0.001*saudi + 0.001*tehran + 0.001*palestinian + 0.001*holocaust + 0.001*diplomacy + 0.001*geneva',
 u'0.000*ukraine + 0.000*greenwood + 0.000*kurland + 0.000*fallon + 0.000*perez + 0.000*dirndl + 0.000*kidd + 0.000*shale + 0.000*dietzel + 0.000*budinger + 0.000*surrogacy + 0.000*glow + 0.000*gaming + 0.000*mumford + 0.000*bowen',
 u'0.001*coal + 0.000*carbon + 0.000*dioxide + 0.000*thornton + 0.000

In [62]:
# Inspect 10 topics with their top 50 words; formatted=False returns the raw
# (weight, word) pairs instead of formatted strings.
lda.show_topics(num_topics=10,num_words=50,formatted=False)

[[(0.00064255940663292114, u'titan'),
  (0.00041572691152249062, u'steelers'),
  (0.00036958820944246778, u'boko'),
  (0.00036654901086019597, u'haram'),
  (0.00036244623523861813, u'webster'),
  (0.00034285185404297921, u'viking'),
  (0.00033044119156525558, u'islander'),
  (0.00031610139239395099, u'hockey'),
  (0.00031511412480852909, u'charger'),
  (0.00029532138550034082, u'udrih'),
  (0.00028532148388794127, u'izod'),
  (0.0002691675750457099, u'buccaneer'),
  (0.00026880915525230565, u'herzlich'),
  (0.00026071207659174994, u'gossip'),
  (0.00026051797727392919, u'libyan'),
  (0.00025689459451652409, u'burger'),
  (0.0002503564157934849, u'dixon'),
  (0.00024864192721655502, u'canuck'),
  (0.00024862150941381796, u'zorn'),
  (0.00023988332308900903, u'baylor'),
  (0.00022971117141300609, u'penney'),
  (0.00022933802449172315, u'ibrahim'),
  (0.00022826150495917734, u'fry'),
  (0.00022683685863529422, u'fitzpatrick'),
  (0.00022263581094953433, u'vancouver'),
  (0.000220663881639