In [3]:
import PyPDF2

In [4]:
import nltk
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

In [5]:
import string

In [6]:
import base64

In [7]:
def print_most_common_freq_words(words, n=10):
    fdist = FreqDist(words)
    for word, frequency in fdist.most_common(n):
        print ('{};{}'.format(word, frequency))


In [8]:
def read_book(name):
    file = open(name,'rb')
    book = PyPDF2.PdfFileReader(file)
    return book

In [9]:
def get_text(book, start, end):
    text = ''
    pages = []
    for idx in range(start,end,1):
        page = book.getPage(idx)
        text = page.extractText()
        # text.encode('utf-8')
        pages.append(text)
    return pages

In [10]:
def get_sentences(text):
    return sent_tokenize(text)

In [11]:
def get_words_list(text):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    # tokens = nltk.word_tokenize(text)
    txt = nltk.Text(tokens)
    words = nltk.tokenize.word_tokenize(text)
    return [w.lower() for w in words]

In [12]:
def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    filtered_words = [w for w in words if not w in stop_words]
    filtered_sentence = ' '.join(filtered_words)
    words = nltk.tokenize.word_tokenize(filtered_sentence)
    return words

In [13]:
def remove_punctuations_and_small_words(words, word_len):
    words = [w for w in words if w.isalpha()]
    words = [w for w in words if len(w)>word_len]
    return words

In [14]:
def stem_words(words):
    # stemming of words
    porter = PorterStemmer()
    return [porter.stem(w) for w in words]

In [15]:
def lemmatize_words(words):
    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(w).encode('utf-8') for w in words]

In [16]:
def pos_tagging(text):
    return nltk.pos_tag(text)

In [17]:
def get_pos_text(word_tag_pairs, pos):
    word_fd = [word for (word, tag) in pos_text if tag in pos]
    return set(word_fd)

In [18]:
def get_dict(doc):
    from gensim import corpora
    dictionary = corpora.Dictionary(doc)
    return dictionary

In [19]:
def get_df_matrix(doc_clean, dictionary):
    import gensim
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    return doc_term_matrix

In [20]:
def fit_lda_model(doc_term_matrix, dictionary, topic_count):
    from gensim.models.ldamodel import LdaModel
    ldamodel = LdaModel(doc_term_matrix, num_topics=topic_count, id2word = dictionary, passes=50)
    return ldamodel

In [21]:
def fit_tfidf_model(corpus):
    from gensim.models import TfidfModel
    model = TfidfModel(corpus)  # fit model
    return model

In [22]:
def fit_hdp_model(doc_term_matrix, dictionary):
    from gensim.models import HdpModel
    hdp = HdpModel(doc_term_matrix, dictionary)
    return hdp

In [23]:
book_name = 'Influence the psychology of persuasion.pdf'
topics = {
    'chap1':{'start':10,'end':22},\
    'chap2':{'start':23,'end':52},\
    'chap3':{'start':53,'end':96},\
    'chap4':{'start':97,'end':135},\
    'chap5':{'start':136,'end':166},\
    'chap6':{'start':167,'end':187},\
    'chap7':{'start':188,'end':214}\
}

In [24]:
chaps = {}

book = read_book(book_name)
for k,v in topics.items():

    chaps[k] = get_text(book, v['start'], v['end'])

    chapterwise_topics = {}

    chapterwise_bigrm_topics = {}

    for k,v in chaps.items():

        print k

        text = v[0]

        sentences = get_sentences(text)

        print len(sentences)

        end = len(sentences)

        start = 0

        doc_clean = []

        bigrm_doc = []

        for idx in range(start,end,1):

            sent = sentences[idx]


            words = get_words_list(sent)

            stop_free_words = remove_stopwords(words)


            punct_free_words = remove_punctuations_and_small_words(stop_free_words, 2)


            stemmed = stem_words(punct_free_words)


            lemmatized = lemmatize_words(punct_free_words)


            #Bigrams

            bigrm = list(nltk.bigrams(lemmatized))


            bigrm_doc.append([' '.join((a,b)) for a,b in bigrm])

            

            pos_text = pos_tagging(lemmatized)

            

            tag_fd = [tag for (word, tag) in pos_text]

            

            noun_text = get_pos_text(pos_text, ["NN", "VBG", "VBN"])

            

            doc_clean.append(lemmatized)


        print "Entering to Topic Modeling"

        print "Length of doc_clean list %s" %len(doc_clean)

        #print doc_clean

        dictionary = get_dict(doc_clean)

        doc_term_matrix = get_df_matrix(doc_clean, dictionary)

        

        topic_count = 3

        ldamodel = fit_lda_model(doc_term_matrix, dictionary, topic_count)

        print("Topics unigram")

        

        chapterwise_topics[k] = ldamodel

        print "TFIDF model"

        tfidfmodel = fit_tfidf_model(doc_term_matrix)

        print tfidfmodel

        

        print 50*"@"

        hdp = fit_hdp_model(doc_term_matrix, dictionary)

        print hdp.print_topics(num_topics=3, num_words=3)

        print 50*"@"


    print 50 * "*"

    print len(chapterwise_topics)

    for chap, model in chapterwise_topics.items():

        print chap, model.print_topics(num_topics=3, num_words=3)

        import matplotlib.pyplot as plt

        from wordcloud import WordCloud

        for t in range(model.num_topics):

            wc = WordCloud().fit_words(dict(model.show_topic(t, 200)))

            plt.figure()

            plt.imshow(wc, interpolation='bilinear')

            plt.axis("off")

            plt.title("Topic #" + k+str(t))

            

            plt.savefig(chap+str(t))



chap7
14
Entering to Topic Modeling
Length of doc_clean list 14
Topics unigram
TFIDF model
TfidfModel(num_docs=14, num_nnz=211)
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
[(0, u'0.023*appeal + 0.019*becomes + 0.017*cause'), (1, u'0.046*stand + 0.026*term + 0.022*cessful'), (2, u'0.030*importance + 0.020*result + 0.020*temple')]
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
**************************************************
1
chap7 [(0, u'0.024*"health" + 0.024*"breast" + 0.017*"potential"'), (1, u'0.017*"could" + 0.017*"something" + 0.017*"thought"'), (2, u'0.016*"temple" + 0.016*"never" + 0.016*"become"')]
chap7
14
Entering to Topic Modeling
Length of doc_clean list 14
Topics unigram
TFIDF model
TfidfModel(num_docs=14, num_nnz=211)
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
[(0, u'0.026*play + 0.025*action + 0.021*general'), (1, u'0.027*carry + 0.024*blurred + 0.023*ongoing'), (2, u'0.029*importance + 0.028*spoke + 0.023*intent')]
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@



chap6 [(0, u'0.030*"shock" + 0.017*"level" + 0.017*"first"'), (1, u'0.039*"teacher" + 0.034*"test" + 0.033*"shock"'), (2, u'0.043*"shock" + 0.034*"teacher" + 0.011*"error"')]
chap5 [(0, u'0.039*"friend" + 0.024*"like" + 0.017*"compliance"'), (1, u'0.022*"product" + 0.022*"social" + 0.013*"hostess"'), (2, u'0.038*"party" + 0.024*"tupperware" + 0.024*"sale"')]
chap4 [(0, u'0.028*"executive" + 0.019*"know" + 0.019*"television"'), (1, u'0.028*"correct" + 0.028*"behavior" + 0.019*"material"'), (2, u'0.022*"action" + 0.022*"laughter" + 0.015*"principle"')]
chap7
14
Entering to Topic Modeling
Length of doc_clean list 14
Topics unigram
TFIDF model
TfidfModel(num_docs=14, num_nnz=211)
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
[(0, u'0.029*general + 0.029*cessful + 0.022*opportunitiesseem'), (1, u'0.036*unknown + 0.029*became + 0.026*compelling'), (2, u'0.037*stance + 0.024*phenomenon + 0.021*stirring')]
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
chap6
24
Entering to Topic Model

Topics unigram
TFIDF model
TfidfModel(num_docs=22, num_nnz=223)
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
[(0, u'0.034*afloat + 0.027*way + 0.021*thing'), (1, u'0.034*theater + 0.033*one + 0.033*others'), (2, u'0.035*effective + 0.033*mechanically + 0.028*introduction')]
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
chap3
24
Entering to Topic Modeling
Length of doc_clean list 24
Topics unigram
TFIDF model
TfidfModel(num_docs=24, num_nnz=236)
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
[(0, u'0.027*people + 0.027*weshould + 0.025*willingness'), (1, u'0.030*volunteered + 0.027*done + 0.025*would'), (2, u'0.025*difficult + 0.024*say + 0.020*dated')]
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
chap2
15
Entering to Topic Modeling
Length of doc_clean list 15
Topics unigram
TFIDF model
TfidfModel(num_docs=15, num_nnz=250)
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
[(0, u'0.019*mexico + 0.018*person + 0.017*result'), (1, u'0.025*indebtedness + 0.022*flowing 

In [None]:
from nltk import word_tokenize, pos_tag, ne_chunk
