# [Topic Modeling](https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24)

In [1]:

from pprint import pprint
import numpy as np
np.random.seed(2018)
import nltk

# reload only imported modules before run
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

'''
pantree bank tags: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
standford source: https://nlp.stanford.edu/software/CRF-NER.shtml
standford online text tree generater: http://nlp.stanford.edu:8080/parser/index.jsp
'''

'\npantree bank tags: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html\nstandford source: https://nlp.stanford.edu/software/CRF-NER.shtml\nstandford online text tree generater: http://nlp.stanford.edu:8080/parser/index.jsp\n'

### constants

In [2]:
start_date = '01-11-2018' #  09-Sep-2018 
end_date = '30-12-2018' # 01-Oct-2018

### We will perform the following steps:

#### 1. Tokenization: 
Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.

In [3]:
# def tokenize(text):
#     return gensim.utils.simple_preprocess(text)

#### 2. Remove small words:
Words that have fewer than 3 characters are removed.

In [4]:
# def isShortWord(token):
#     return len(token) < 3

#### 3. Remove stopwords:
All stopwords are removed.

In [5]:
# def isStopWord(token):
#     return token in gensim.parsing.preprocessing.STOPWORDS

#### 4. lemmatized +  Stemming:
Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.

Words are stemmed — words are reduced to their root form.

In [6]:
# from nltk.corpus import wordnet as wn
# from nltk import pos_tag, word_tokenize
# from nltk.stem.porter import *
# from textblob import TextBlob


# def lemmatize_stemming(token):
#     stemmer = PorterStemmer() #gensim.parsing.stem_text(tokenize) #
#     for word, tag in pos_tag(word_tokenize(token)):
#         wntag = tag[0].lower()
#         wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
#         lemma = WordNetLemmatizer().lemmatize(word, wntag) if wntag else word
#         return TextBlob(lemma).words[0].singularize()
#     return ''

#### 5. Replace Emojis:

In [7]:
# ## get emoji characters file path
# def getEmojis():
#     from dataSource import getEmojis
#     comments_file_path = getDataSourcePathFor(emoji_path)
#     return getEmojis(comments_file_path)#.head()

In [8]:
# def hasEmojicon(token):
    
# def replaceEmojicons(token, emojies):
#     pass
    

## Data Source

In [9]:
# def getNounList(sentence='', tokens = []):
#     from nltk import word_tokenize, pos_tag
#     if len(tokens) > 0:    
#         nouns = [token for token, pos in pos_tag(tokens) if pos.startswith('NN')]
#         return nouns
#     else:
#         nouns = [token for token, pos in pos_tag(word_tokenize(sentence)) if pos.startswith('NN')]
#         return nouns

In [10]:
# text = 'I Have done reviewing, Will be seeing by them'
# print(preprocess(text))

In [11]:
# key is file storage path
# def getDataSourcePathFor(keyForFilePath):
#     import json
#     import os
    
#     config_file_path = os.environ[virtual_env] + '/config.json'

#     with open(config_file_path) as f:
#         config = json.load(f)
#         if keyForFilePath in config:# ['comments_path', 'output_path']
#             return config[keyForFilePath] 
#     return None

In [12]:
# ## get list of comments from stored input csv file
# import dataSource 

# def getListOfComments():
#     ### This is to get csv rows between given dates
#     comments_file_path = getDataSourcePathFor(comments_path)
#     commentsList = getComments(comments_file_path, start_date, end_date) #['comments'] 
#     commentsList = commentsList.sort_values(by='ratings', ascending=True)['comments'] 
#     print('Total number of comments: %s between %s and %s' % (len(commentsList), start_date, end_date))
#     return commentsList

### Text processing

In [13]:
# import re

# def filterWord(token):
#     if not (isStopWord(token) or isShortWord(token)):
#         lemmaWord = lemmatize_stemming(token)
#         if not isShortWord(lemmaWord):
#             return ("".join(re.findall("[a-zA-Z]+", lemmaWord)).lower())
#     return None 
    
# def filters(sentence):
# #     print('..given comments:', sentence)
#     result = []
#     #nouns = getNounList(sentence) # fetch only Nouns
#     for token in tokenize(sentence):#nouns: ###tokenize(text):
#         result.append(filterWord(token))
#     return result

# def filterWords(tokens):
#     return list(filter(lambda token: filterWord(token), tokens))

In [14]:
# def preprocessCommentDocument(document):
#     return list(map(lambda sentence: filters(sentence), document))

### Word Cloud

In [15]:
from wordCloud import showWordCloud
def showCloud(topicCollection):
    showWordCloud(topicCollection) 

### Bag of Words on the Data set

In [16]:
def bow(processed_docs):
    dictionary = gensim.corpora.Dictionary(processed_docs)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    return bow_corpus

### Sentiment

In [17]:
from getSentiment import getSentiment

def getSentFromCommentList(commentList):
    sentimentList = []
    for sentence in commentList:
        sentiment = getSentiment(sentence)
        sentimentList.append(sentiment)
    return sentimentList


def isNegative(sentiment):
    return sentiment > 0.5


def tokenDictWithPosNegSentiment(sentimentList, document):
    sentDict = {}
    size = range(len(document))
    
    for i in size:
        for token in document[i]:
            v = (0, 0, 0) # (neg, pos, freq)
            if token in sentDict:
                v = sentDict[token]
                
            if isNegative(sentimentList[i]):
                v = (v[0]+1, v[1], v[2]+1)
            else:
                v = (v[0], v[1]+1, v[2]+1)
            sentDict[token] = v
    sentDict.pop('', None)
    return sentDict 

### Main()

In [244]:
import SDataSource
import SConstants
from SWordList import SWordList
from sUtility import SUtility
from sPreprocessor import SPreprocessor


In [245]:
sutility = SUtility()
spreprocessor = SPreprocessor()

# SPreprocessor.resolveDependancy(trieCommon)
dateBetween = [start_date, end_date]
commentsDocument = SDataSource.getListOfComments(dateBetween).head(1000)

# document preprocessing, cleaning, filtering, replacement, spliting into multiple senetnces from one
processed_doc = []
for sentence in commentsDocument:
    for each in spreprocessor.docCleaning(sentence):
        processed_doc.append(each)


Total number of comments: 972 between 01-11-2018 and 30-12-2018


In [265]:
def makeDict(sentence):
    t = spreprocessor.parseToTokens(trieTopic, sentence)
    r = spreprocessor.parseToTokens(trieNReason, sentence)
#     print(sentence)
#     print('topic is:', t)
#     print('reasons are:', r)
#     print()
    sutility.dump(t, r)

In [266]:
wordList_file = SDataSource.getDataSourcePathFor(SConstants.wordFile_path)
trieCommon = SWordList(wordList_file)

topic_file = SDataSource.getDataSourcePathFor(SConstants.topic_path)
trieTopic = SWordList(topic_file)

n_reason_file = SDataSource.getDataSourcePathFor(SConstants.n_reason_path)
trieNReason = SWordList(n_reason_file)

..SWordList constructor called 
..SWordList constructor called 
..SWordList constructor called 


In [267]:
l = ['crashing', 'crashes', 'upi']
print(spreprocessor.filterWords(l))
    

['crash', 'crash', 'upi']


In [268]:
# import nltk
# sno = nltk.stem.SnowballStemmer('english')
# print(sno.stem('upi'))
# print(sno.stem('debitance'))
# print(sno.stem('fairly'))

In [269]:
for each in processed_doc:
    makeDict(each)

In [270]:
sutility.showTopicCounts()
print('---------------------------------')
sutility.showReasonDict()

[('app', 2015), ('account', 600), ('login', 465), ('debit card', 355), ('updat', 244), ('upi', 210), ('version', 100), ('atm', 85), ('customer care', 80), ('kyc', 75), ('balanc', 65), ('customer service', 55), ('debit cards', 40), ('look', 35), (None, 30), ('offer', 30), ('speed', 30), ('secur', 25), ('internet', 20), ('biometr', 20), ('charg', 15), ('signup', 15), ('ifsc', 9), ('international transactions', 5), ('deals and offers', 5), ('another bank', 5), ('back option', 5), ('postal code', 5)]
---------------------------------
topic:  app
[('open', 120), ('not working', 80), ('crash', 45), ('every time', 40), ('bad', 25), ('servic', 25), ('problem', 25), ('chang', 25), ('stop', 20), ('popup', 20), ('reset', 20), ('unstabl', 15), ('frustrat', 15), ('something went wrong', 15), ('buggi', 10), ('screen', 10), ('suck', 10), ('transfer', 10), ('stuck', 10), ('broke', 5), ('forc', 5), ('incorrect', 5), ('deposit', 5), ('fix it', 5), ('worst experience', 5), ('reflect', 3)]

topic:  accoun

In [48]:
sutility.showTopicCounts()
# sentList = commentsDocument.map(getSentiment)

In [105]:
l = list(commentsDocument)
p = list(processed_doc)

length = len(l)

for i in range(length):
    print(l[i])
    print(p[i])
    print()

i am not able to add amount from another upi acoount or also not able to transfer from another banks account it shows me benificary account is inactive or major problem not setup a upi id
i am cant add amount from another upi acoount or also cant transfer from another banks account it shows me benificary account is inactive or major problem not setup a upi id

Now i am facing another problem..i can't login don't know why solve the problem as soon as possible
now i am facing another problem

The lastest update broke the app.
i cant login dont know why solve the problem as soon as possible

Kya yr debit card se funds load nai kar sakte isme...koi kaam ka nai hai ye bank fir
the lastest update broke the app

Sir app is not opening and ifsc of most of the bank don't reflect. it was my favorite app but not now
kya yr debit card se funds load nai kar sakte isme

Hating your Services. Your Technical issues were never ending & Highly absence of Technological advances. Now, required to Stop you

too slow

Great
something went wrong error

Good app. One app for all kind of transaction. Only problem is app is not too fast. Lags a lot.
repeatedly asking for upi registration even after registering for upi

Banking experience is good ,but apps is too much slow
 doesnt allow to skip

Unable to log in. It's showing Not secure network.
 need to register for new upi

Please provide cheque book
 everytime i need to use this app

How to apply dbs personal loan.Show in apps
 ended up having  upi ids

That's good
thank you for stopping collection of data

Everything is good but the only problem is very poorly built UI
what a poor service from dbs

It can't take proper otp
i trans money from dbs to my other bank 

Paper less banking good progress
money got debited from dbs account but deposited to my other bank account 

Why not add any card..
its fraud

When compare with other banking apps dbs is quiet easy to use...
the verification by otp does not work

Not bad
 so whats the point



In [106]:
dump = list(zip(commentsDocument, processed_doc))
length = len(dump)
for i in range(length):
    print(dump[i][0])
    print(dump[i][1])
    print(sentList.iloc[i])
    print()

i am not able to add amount from another upi acoount or also not able to transfer from another banks account it shows me benificary account is inactive or major problem not setup a upi id
i am cant add amount from another upi acoount or also cant transfer from another banks account it shows me benificary account is inactive or major problem not setup a upi id


NameError: name 'sentList' is not defined

In [107]:
dictionary = gensim.corpora.Dictionary(processed_doc)
# dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_doc]

NameError: name 'gensim' is not defined

In [None]:
from gensim import corpora, models
corpus_tfidf = models.TfidfModel(bow_corpus)[bow_corpus]

In [None]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=2)

In [None]:
# indexFor = 5
# print('..COMMENT :', commentsDocument[indexFor])
# print('\n..CORPOS :', processed_doc[indexFor])
# # print('\n..BOW:', bow_corpus[indexFor])
# for index, score in sorted(lda_model_tfidf[bow_corpus[indexFor]], key=lambda tup: -1*tup[1]):
#     print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

In [None]:
topics = lda_model_tfidf.print_topics(num_words=4)
for topic in topics:
    print(topic)

In [None]:
# print(processed_doc.values)

In [None]:
l = tokenDictWithPosNegSentiment(list(sentList), list(processed_doc))


In [None]:
# print(sortedMostPos(l))

In [None]:
from showBarGraph import *
# show positive bar graph
showBarCharForSentiment(sortedMostPos(l), pos=True)

In [None]:
# print(sortedMostNeg(l))
# show positive bar graph
showBarCharForSentiment(sortedMostNeg(l), pos=False)

In [None]:
showPiChart(sortedMostFreq(l))

In [None]:
# showTempBarChart()