# [Topic Modeling](https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24)

In [293]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# from nltk.stem.porter import *
from gensim import parsing
from pprint import pprint
import numpy as np
np.random.seed(2018)
import nltk

# reload only imported modules before run
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

'''
pantree bank tags: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
standford source: https://nlp.stanford.edu/software/CRF-NER.shtml
standford online text tree generater: http://nlp.stanford.edu:8080/parser/index.jsp
'''

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'\npantree bank tags: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html\nstandford source: https://nlp.stanford.edu/software/CRF-NER.shtml\nstandford online text tree generater: http://nlp.stanford.edu:8080/parser/index.jsp\n'

### constants

In [294]:
virtual_env = 'VIRTUAL_ENV'
comments_path = 'comments_path'
emoji_path = 'emoji_path'
wordFile_path = 'wordFile_path'

start_date = '01-11-2018' #  09-Sep-2018 
end_date = '30-12-2018' # 01-Oct-2018

### We will perform the following steps:

#### 1. Tokenization: 
Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.

In [295]:
def tokenize(text):
    return gensim.utils.simple_preprocess(text)

#### 2. Remove small words:
Words that have fewer than 3 characters are removed.

In [296]:
def isShortWord(token):
    return len(token) < 3

#### 3. Remove stopwords:
All stopwords are removed.

In [297]:
def isStopWord(token):
    return token in gensim.parsing.preprocessing.STOPWORDS

#### 4. lemmatized +  Stemming:
Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.

Words are stemmed — words are reduced to their root form.

In [298]:
from nltk.corpus import wordnet as wn
from nltk import pos_tag, word_tokenize
from nltk.stem.porter import *
from textblob import TextBlob


def lemmatize_stemming(token):
    stemmer = PorterStemmer() #gensim.parsing.stem_text(tokenize) #
    for word, tag in pos_tag(word_tokenize(token)):
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
        lemma = WordNetLemmatizer().lemmatize(word, wntag) if wntag else word
        return TextBlob(lemma).words[0].singularize()
    return ''

#### 5. Replace Emojis:

In [299]:
# ## get emoji characters file path
# def getEmojis():
#     from dataSource import getEmojis
#     comments_file_path = getDataSourcePathFor(emoji_path)
#     return getEmojis(comments_file_path)#.head()

In [300]:
# def hasEmojicon(token):
    
# def replaceEmojicons(token, emojies):
#     pass
    

## Data Source

In [301]:
def getNounList(sentence='', tokens = []):
    from nltk import word_tokenize, pos_tag
    if len(tokens) > 0:    
        nouns = [token for token, pos in pos_tag(tokens) if pos.startswith('NN')]
        return nouns
    else:
        nouns = [token for token, pos in pos_tag(word_tokenize(sentence)) if pos.startswith('NN')]
        return nouns

In [302]:
# text = 'I Have done reviewing, Will be seeing by them'
# print(preprocess(text))

In [303]:
# key is file storage path
def getDataSourcePathFor(keyForFilePath):
    import json
    import os
    
    config_file_path = os.environ[virtual_env] + '/config.json'

    with open(config_file_path) as f:
        config = json.load(f)
        if keyForFilePath in config:# ['comments_path', 'output_path']
            return config[keyForFilePath] 
    return None

In [304]:
## get list of comments from stored input csv file
from dataSource import getComments, sortedMostNeg, sortedMostPos, sortedMostFreq
def getListOfComments():
    ### This is to get csv rows between given dates
    comments_file_path = getDataSourcePathFor(comments_path)
    commentsList = getComments(comments_file_path, start_date, end_date)['comments'] 
    print('Total number of comments: %s between %s and %s' % (len(commentsList), start_date, end_date))
    return commentsList

### Text processing

In [305]:
import re
def filters(sentence):
#     print('..given comments:', sentence)
    result = []
    #nouns = getNounList(sentence) # fetch only Nouns
    for token in tokenize(sentence):#nouns: ###tokenize(text):
        if not (isStopWord(token) or isShortWord(token)):
            lemmaWord = lemmatize_stemming(token)
            if not isShortWord(lemmaWord):
                result.append("".join(re.findall("[a-zA-Z]+", lemmaWord)).lower())
    return result

def filterWords(tokens):
    return list(filter(lambda each: not (isStopWord(each) or isShortWord(each)), tokens))

In [306]:
def preprocessCommentDocument(document):
    return list(map(lambda sentence: filters(sentence), document))

### Word Cloud

In [307]:
from wordCloud import showWordCloud
def showCloud(topicCollection):
    showWordCloud(topicCollection) 

### Bag of Words on the Data set

In [335]:
def bow(processed_docs):
    dictionary = gensim.corpora.Dictionary(processed_docs)
#     dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    return bow_corpus

### Sentiment

In [327]:
from getSentiment import getSentiment

def getSentFromCommentList(commentList):
    sentimentList = []
    for sentence in commentList:
        sentiment = getSentiment(sentence)
        sentimentList.append(sentiment)
    return sentimentList


def isNegative(sentiment):
    return sentiment > 0.5


def tokenDictWithPosNegSentiment(sentimentList, document):
    sentDict = {}
    size = range(len(document))
    
    for i in size:
        for token in document[i]:
            v = (0, 0, 0) # (neg, pos, freq)
            if token in sentDict:
                v = sentDict[token]
                
            if isNegative(sentimentList[i]):
                v = (v[0]+1, v[1], v[2]+1)
            else:
                v = (v[0], v[1]+1, v[2]+1)
            sentDict[token] = v
    sentDict.pop('', None)
    return sentDict

### Main()

In [398]:
from dbsWordList import DBSWordList
file_path = getDataSourcePathFor(wordFile_path)
trie = DBSWordList(file_path)

..DBSWordList constructor called


In [441]:
def parseToTokens(sentence):
    #str = "Best Bank Ever. Period. I don't know what could expect a lot of time. debit card. None of the other banks in India come close. No charges, high interests, more security, no spam msgs, reliable app, efficient customer service, so many offers, no minimum balance, truly digital with no cash business hence no useless queues n stuff, widely supported debit card, fastest transfer of funds I've ever seen...I just don't know why anyone would give it 1star...those ppl should stick with sbi or worst bank ever,axis bank...banking has never been so easier since I made an account here. Thanks for great 2yrs of service...wish it lasts way longer. 😊😊👍👍"
    proc_doc_exist_words = trie.searchBySentence(sentence) 
    #print(proc_doc_exist_words)
    trieFilterWords = filterWords(proc_doc_exist_words)
    return list(set(trieFilterWords))  #getNounList('', trieFilterWords)

In [442]:
commentsDocument = getListOfComments()#.head(2)
processed_doc = commentsDocument.map(parseToTokens) #preprocessCommentDocument(commentsDocument)
# bow_corpus = bow(processed_doc)
print(processed_doc[0])

Total number of comments: 972 between 01-11-2018 and 30-12-2018
['BEST', 'free', 'app', 'good', 'BANK', 'team', 'best', 'credit', 'bank', 'digibank', 'service', 'happy']


In [443]:
dictionary = gensim.corpora.Dictionary(processed_doc)
# dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_doc]

In [444]:
from gensim import corpora, models
corpus_tfidf = models.TfidfModel(bow_corpus)[bow_corpus]

In [445]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=2)

In [446]:
# print('..corpos is :', processed_doc[0])
# print('..bow_corpus:', bow_corpus[0])
for index, score in sorted(lda_model_tfidf[bow_corpus[0]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

..corpos is : ['BEST', 'free', 'app', 'good', 'BANK', 'team', 'best', 'credit', 'bank', 'digibank', 'service', 'happy']
..bow_corpus: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]

Score: 0.8841338753700256	 
Topic: 0.121*"bank" + 0.051*"account" + 0.037*"app" + 0.033*"slow" + 0.033*"money" + 0.029*"add" + 0.028*"not able to" + 0.027*"login" + 0.025*"not working" + 0.023*"transfer"

Score: 0.029261434450745583	 
Topic: 0.060*"update" + 0.046*"new" + 0.037*"easy" + 0.031*"app" + 0.028*"password" + 0.026*"bank" + 0.025*"version" + 0.024*"issue" + 0.024*"phone" + 0.020*"login"

Score: 0.028961647301912308	 
Topic: 0.073*"experience" + 0.057*"best" + 0.055*"service" + 0.049*"bad" + 0.038*"bank" + 0.034*"customer" + 0.032*"app" + 0.030*"account" + 0.023*"slow" + 0.020*"good"

Score: 0.028831595554947853	 
Topic: 0.164*"app" + 0.063*"App" + 0.036*"dbs" + 0.035*"good" + 0.032*"worst" + 0.025*"great" + 0.024*"login" + 0.024*"facility" + 0.023*"slow" + 0.021*"easy"

Score: 0.0288114249706268

In [447]:
topics = lda_model_tfidf.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.073*"experience" + 0.057*"best" + 0.055*"service" + 0.049*"bad"')
(1, '0.121*"bank" + 0.051*"account" + 0.037*"app" + 0.033*"slow"')
(2, '0.164*"app" + 0.063*"App" + 0.036*"dbs" + 0.035*"good"')
(3, '0.060*"update" + 0.046*"new" + 0.037*"easy" + 0.031*"app"')
(4, '0.136*"good" + 0.100*"app" + 0.061*"debit card" + 0.053*"option"')


In [None]:
l = tokenDictWithPosNegSentiment(sentList, proc_doc_exist_words)


In [None]:
print(sortedMostPos(l))

In [None]:
from showBarGraph import *
# show positive bar graph
showBarCharForSentiment(sortedMostPos(l), pos=True)

In [None]:
# print(sortedMostNeg(l))
# show positive bar graph
showBarCharForSentiment(sortedMostNeg(l), pos=False)

In [None]:
showPiChart(sortedMostFreq(l))

In [None]:
showTempBarChart()