In [8]:
#ntlk imports
import nltk
from nltk import sent_tokenize,word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

#prettyprint
import pprint 

#genism imports
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import STOPWORDS

stopwords = [] # user defined stopwords

def process_article(text):
    sents = sent_tokenize(text)
    sents = '. '.join([s.strip().replace("\n","") for s in sents])
    words = [word for word in sents.lower().split() 
             if word not in STOPWORDS and word.isalnum() and word not in stopwords]
    wordnet_lemmatizer = WordNetLemmatizer()
    words = [wordnet_lemmatizer.lemmatize(i) for i in words]
    p_stemmer = PorterStemmer()
    words = [p_stemmer.stem(i) for i in words]
    return words

def get_corpus(text):   
    a_list = [process_article(text)]
    dictionary = corpora.Dictionary(a_list)
    return [dictionary.doc2bow(a) for a in a_list]

def get_lda(a_dict,num_topics,num_passes):
    texts = [c for a,b,c in a_dict.values()]
    texts = [process_article(a) for a in texts]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(a) for a in texts]
    lda = LdaModel(corpus,               #list of lists containing tuples (word index, word freq)
                  id2word=dictionary,    #change nums back to words
                  num_topics=num_topics, #need to set num topics
                  passes=num_passes)
    return lda

def print_lda(lda,num_words=8):
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(lda.print_topics(num_words=8)) #create prettyprint obj, 8 words for each topic

def get_topic(new_article):
    from operator import itemgetter
    new_a = get_corpus(new_article)
#     lda.get_document_topics(new_a[0],minimum_probability=0.05,per_word_topics=False)
    return sorted(lda.get_document_topics(new_a[0],
                    minimum_probability=0,per_word_topics=False),key=itemgetter(1),reverse=True)

In [11]:
lda = get_lda(x,2,10)
print_lda(lda)

[   (   0,
        '0.011*"news" + 0.010*"report" + 0.007*"said" + 0.007*"stori" + '
        '0.006*"guard" + 0.006*"tuesday" + 0.006*"ramo" + 0.005*"cb"'),
    (   1,
        '0.010*"said" + 0.010*"king" + 0.009*"trebek" + 0.008*"cb" + '
        '0.007*"gayl" + 0.007*"want" + 0.007*"air" + 0.006*"interview"')]


In [12]:
a = "'A version of this article first appeared in the Reliable Sources newsletter. You can sign up for free right here.   This is an incredibly difficult time for Alex Trebek, his family members, and the extended Jeopardy! family that spans the globe. Trebek showed tremendous courage by recording a candid video message to fans about his stage 4 pancreatic cancer diagnosis. He even managed to work in a joke about being under contract for three more years. Trebek was diagnosed earlier this week, and his video was released on Wednesday afternoon.  In a time that is all about what is keeping us apart, we got tough news today about someone who has always brought America together, literally for decades, CNN\'s Chris Cuomo said Wednesday night. I don\'t care what your race, color, creed, gender, or bank account level, you\'ve watched Jeopardy. Since 1984 Alex Trebek has been the smartest guy in our living rooms, teaching us, but more importantly, bringing us together. Trebek\'s show puts facts first, Cuomo said, and we need him, now mo"
get_topic(a)

[(1, 0.9920418), (0, 0.007958193)]