In [37]:
import nltk.data
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from stemming.porter2 import stem
import wikipedia
from gensim import corpora, models

In [38]:
# Cache for the English stop-word set, so it is built once instead of on
# every call (sentence_to_wordlist is invoked once per sentence).
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


# function to parse sentences to words and remove stopwords from sentence
def sentence_to_wordlist( sentence, remove_stopwords=True ):
    """Convert one sentence into a list of lower-case, letters-only words.

    Parameters
    ----------
    sentence : str
        Raw sentence text. Any HTML markup is stripped before tokenising.
    remove_stopwords : bool, optional
        When true (the default), words found in NLTK's English stop-word
        list are dropped from the result.

    Returns
    -------
    list
        The remaining words, in their original order.
    """
    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parser happens to be installed, and so
    #    BeautifulSoup does not warn about having to guess one.
    text = BeautifulSoup(sentence, "html.parser").get_text()
    #
    # 2. Replace every character that is not an ASCII letter (digits,
    #    punctuation, accented characters) with a space
    text = re.sub("[^a-zA-Z]", " ", text)
    #
    # 3. Convert to lower case and split on whitespace
    words = text.lower().split()
    #
    # 4. Optionally remove stop words (enabled by default)
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    #
    # 5. Return a list of words
    return words

In [39]:
# Split a text into sentences, then turn every sentence into a word list
def text_to_sentences(text, tokenizer, remove_stopwords=True):
    """Tokenise ``text`` into sentences and clean each one into a list of words.

    Parameters
    ----------
    text : str
        The document to split; leading/trailing whitespace is stripped first.
    tokenizer : object
        Any object whose ``tokenize(text)`` method returns the raw sentences.
    remove_stopwords : bool, optional
        Passed through to ``sentence_to_wordlist`` for every sentence.

    Returns
    -------
    list
        One entry per non-empty raw sentence, each entry being whatever
        ``sentence_to_wordlist`` returned for it (a list of words).
    """
    return [
        sentence_to_wordlist(chunk, remove_stopwords)
        for chunk in tokenizer.tokenize(text.strip())
        if len(chunk) > 0  # zero-length raw sentences are skipped
    ]

In [40]:
# Load NLTK's Punkt sentence tokenizer for English; it splits text into
# sentences following English punctuation conventions
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Fetch the plain-text content of the Wikipedia page matching the query 'barack'
# (the recorded output below shows this resolving to the Barack Obama article)
content = wikipedia.page('barack').content

# Split the article into sentences, each reduced to a list of cleaned words
sentences = text_to_sentences(content, tokenizer)
# Call form of print: same output under Python 2, also valid on Python 3,
# and consistent with the print(...) call used when reporting the LDA topics
print(sentences)


[[u'barack', u'hussein', u'obama', u'ii', u'us', u'b', u'r', u'k', u'hu', u'se', u'n', u'b', u'b', u'rahk', u'hoo', u'sayn', u'oh', u'bah', u'born', u'august', u'american', u'politician', u'served', u'th', u'president', u'united', u'states'], [u'first', u'african', u'american', u'served', u'president', u'well', u'first', u'born', u'outside', u'contiguous', u'united', u'states'], [u'previously', u'served', u'u', u'senate', u'representing', u'illinois', u'illinois', u'state', u'senate'], [u'obama', u'born', u'honolulu', u'hawaii', u'two', u'years', u'territory', u'admitted', u'union', u'th', u'state'], [u'grew', u'mostly', u'hawaii', u'also', u'spent', u'one', u'year', u'childhood', u'washington', u'state', u'four', u'years', u'indonesia'], [u'graduating', u'columbia', u'university', u'worked', u'community', u'organizer', u'chicago'], [u'obama', u'enrolled', u'harvard', u'law', u'school', u'first', u'black', u'president', u'harvard', u'law', u'review'], [u'graduation', u'became', u'civil

In [41]:
# Instantiate NLTK's PorterStemmer.
# NOTE(review): the stemming below goes through `stem` (stemming.porter2), not
# this object; p_stemmer is not referenced by the cells visible here - confirm
# whether it is still needed.
p_stemmer = PorterStemmer()

# Stem every token, keeping one list of stems per sentence
documents = []
for sentence in sentences:
    documents.append([stem(word) for word in sentence])

# Dictionary() scans the documents and assigns a unique integer id to each token
dictionary = corpora.Dictionary(documents)

# doc2bow turns each document into its bag-of-words representation using the
# ids assigned above; the resulting list is the corpus the model trains on
corpus = list(map(dictionary.doc2bow, documents))



In [42]:
# Model settings, named once so training and reporting cannot drift apart
NUM_TOPICS = 3         # number of topics the model is asked to find
NUM_PASSES = 20        # passes over the corpus during training
WORDS_PER_TOPIC = 3    # words shown per topic when printing
LDA_RANDOM_STATE = 0   # fixed seed so the printed topics are reproducible

# Construct the LDA model over the bag-of-words corpus. LDA training is
# stochastic, so without a fixed random_state every run prints different topics.
ldamodel = models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=NUM_PASSES,
    random_state=LDA_RANDOM_STATE,
)
print(ldamodel.print_topics(num_topics=NUM_TOPICS, num_words=WORDS_PER_TOPIC))

# print_topics returns one (topic id, formula) pair per topic.
# Within each formula are the WORDS_PER_TOPIC most probable words for that
# topic, each prefixed by its weight.
# Adjusting the model's number of topics and passes is important for getting a good result

[(0, u'0.023*"obama" + 0.008*"u" + 0.008*"elect"'), (1, u'0.038*"obama" + 0.013*"act" + 0.012*"presid"'), (2, u'0.034*"obama" + 0.013*"presid" + 0.009*"state"')]
