# [Topic Modeling](https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24)

In [52]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# from nltk.stem.porter import *
from gensim import parsing

import numpy as np
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(2018)
import nltk
# Download the WordNet corpus into the local NLTK data directory; the
# lemmatisation step further down uses WordNetLemmatizer.
# NOTE(review): pos_tag / word_tokenize are also used later and presumably
# need their own NLTK data packages -- confirm they are installed.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/nawaz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### constants

In [53]:
# Name of the environment variable whose value is the directory that holds
# config.json (read in getDataSourcePathFor).
virtual_env = 'VIRTUAL_ENV'
# Key in config.json looked up to locate the comments input file.
comments_path = 'comments_path'
# Key for the emoji file path; currently referenced only from commented-out code.
emoji_path = 'emoji_path'

### We will perform the following steps:

#### 1. Tokenization: 
Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.

In [54]:
def tokenize(text):
    """Break ``text`` into a list of tokens using gensim's ``simple_preprocess``."""
    tokens = gensim.utils.simple_preprocess(text)
    return tokens

#### 2. Remove small words:
Words that have fewer than 3 characters are removed.

In [55]:
def isShortWord(token, min_length=3):
    """Return True when ``token`` has fewer than ``min_length`` characters.

    Parameters
    ----------
    token : str
        The word to test.
    min_length : int, optional
        Minimum number of characters a word needs in order to be kept.
        Defaults to 3, so one- and two-character words count as short.

    Returns
    -------
    bool
        True if ``len(token) < min_length``, otherwise False.
    """
    return len(token) < min_length

#### 3. Remove stopwords:
All stopwords are removed.

In [56]:
def isStopWord(token):
    """Return True when ``token`` is one of gensim's English stopwords.

    The comparison ignores case. The tokens this notebook feeds in come
    straight from the POS tagger and keep their original capitalisation
    (e.g. a sentence-initial "Very"), while gensim's STOPWORDS entries are
    lower-case, so a case-sensitive lookup would miss them.

    Parameters
    ----------
    token : str
        The word to test.

    Returns
    -------
    bool
    """
    return token.lower() in gensim.parsing.preprocessing.STOPWORDS

#### 4. Lemmatization + Stemming:
Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.

Words are stemmed — words are reduced to their root form.

In [57]:
from nltk.corpus import wordnet as wn
from nltk import pos_tag, word_tokenize
from nltk.stem.porter import *


def lemmatize_stemming(token):
    """Return the WordNet lemma of the first word found in ``token``.

    ``token`` is re-tokenised and POS-tagged with NLTK; only the first
    (word, tag) pair is used, so any further pieces produced by
    ``word_tokenize`` are ignored. Despite the function's name, no stemming
    is applied -- the word is only lemmatised.

    Parameters
    ----------
    token : str
        A single word (or short string) to lemmatise.

    Returns
    -------
    str
        The lemma when the tag maps to a WordNet part of speech
        (adjective, adverb, noun or verb); the word unchanged for any other
        tag; ``''`` when ``token`` yields no words at all.
    """
    tagged = pos_tag(word_tokenize(token))
    if not tagged:
        return ''

    word, tag = tagged[0]

    # Map the first letter of the Penn Treebank tag to WordNet's POS codes.
    # Adjective tags start with 'J' (JJ, JJR, JJS) but WordNet expects 'a',
    # so simply lower-casing the first letter would never match adjectives.
    penn_to_wordnet = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}
    wntag = penn_to_wordnet.get(tag[:1].upper())

    if wntag is None:
        return word
    return WordNetLemmatizer().lemmatize(word, wntag)

#### 5. Replace Emojis:

In [58]:
# ## get emoji characters file path
# def getEmojis():
#     from dataSource import getEmojis
#     comments_file_path = getDataSourcePathFor(emoji_path)
#     return getEmojis(comments_file_path)#.head()

In [59]:
# def hasEmojicon(token):
    
# def replaceEmojicons(token, emojies):
#     pass
    

## Data Source

In [60]:
def getNounList(sentence):
    """Collect the tokens of ``sentence`` whose POS tag starts with ``'NN'``.

    The sentence is tokenised with ``word_tokenize`` and tagged with
    ``pos_tag``; tokens are returned in their original order and casing.
    """
    from nltk import word_tokenize, pos_tag

    tagged_tokens = pos_tag(word_tokenize(sentence))
    noun_tokens = []
    for word, tag in tagged_tokens:
        if tag.startswith('NN'):
            noun_tokens.append(word)
    return noun_tokens

In [61]:
# text = 'I Have done reviewing, Will be seeing by them'
# print(preprocess(text))

In [62]:
# key is file storage path
def getDataSourcePathFor(keyForFilePath):
    """Look up ``keyForFilePath`` in the project's ``config.json``.

    The config file is read from the directory given by the environment
    variable whose name is stored in the module-level ``virtual_env``
    constant.

    Parameters
    ----------
    keyForFilePath : str
        Top-level key in ``config.json`` (e.g. ``'comments_path'``).

    Returns
    -------
    The value stored under ``keyForFilePath`` in the config file.

    Raises
    ------
    KeyError
        If the environment variable is not set, or the key is missing from
        the config file.
    OSError
        If ``config.json`` cannot be opened.
    """
    import json
    import os

    config_file_path = os.path.join(os.environ[virtual_env], 'config.json')

    with open(config_file_path) as f:
        config = json.load(f)
    return config[keyForFilePath]

In [63]:
## get list of comments from stored input csv file
def getListOfComments(start_date='27-09-2018', end_date='02-10-2018'):
    """Load the ``'comments'`` column of the stored comments file for a date window.

    The file location comes from ``getDataSourcePathFor(comments_path)``;
    the rows are selected by the project-local ``dataSource.getComments``.

    Parameters
    ----------
    start_date : str, optional
        Start of the window passed to ``getComments``. Defaults to the
        value previously hard-coded here.
        NOTE(review): the format looks like DD-MM-YYYY -- confirm against
        ``dataSource.getComments``.
    end_date : str, optional
        End of the window passed to ``getComments``; same format as
        ``start_date``.

    Returns
    -------
    The ``'comments'`` entry of whatever ``getComments`` returns.
    """
    from dataSource import getComments

    comments_file_path = getDataSourcePathFor(comments_path)
    return getComments(comments_file_path, start_date, end_date)['comments']

### Text processing

In [100]:
def filters(sentence):
    """Reduce one comment to its list of cleaned, lemmatised noun tokens.

    Nouns are extracted with ``getNounList``; stopwords and short words are
    skipped, the rest are lemmatised, and lemmas that turn out short are
    skipped as well. The input comment and the resulting tokens are printed.
    """
    print('..given comments:', sentence)
    kept = []
    for noun in getNounList(sentence):
        if isStopWord(noun) or isShortWord(noun):
            continue
        lemma = lemmatize_stemming(noun)
        if isShortWord(lemma):
            continue
        kept.append(lemma)
    print('\n..tokens:', kept)
    print('\n\n')
    return kept

In [92]:
def preprocessCommentDocument(document):
    """Apply ``filters`` to every sentence in ``document`` and return the results as a list."""
    return [filters(sentence) for sentence in document]

### Word Cloud

In [None]:
from wordCloud import showWordCloud


def showCloud(topicCollection):
    """Render a word cloud for ``topicCollection``.

    Thin wrapper around the project-local ``wordCloud.showWordCloud``. The
    collection is taken as an argument, matching the call in ``main()``,
    rather than being read from an undefined global.

    Parameters
    ----------
    topicCollection
        The data handed straight to ``showWordCloud``.
    """
    showWordCloud(topicCollection)


# Keep the original (misspelled) name available for any existing callers.
showColud = showCloud

### Bag of Words on the Data set

In [126]:
def bow(processed_docs):
    """Build a gensim dictionary and bag-of-words corpus from tokenised documents.

    The dictionary, the corpus and the token-to-id mapping are printed, and
    the dictionary and corpus are returned so later steps can reuse them.

    Parameters
    ----------
    processed_docs : iterable of list of str
        One list of tokens per document.

    Returns
    -------
    tuple
        ``(dictionary, bow_corpus)`` where ``dictionary`` is the filtered
        ``gensim.corpora.Dictionary`` and ``bow_corpus`` holds one
        ``doc2bow`` result per input document.
    """
    dictionary = gensim.corpora.Dictionary(processed_docs)
    # TODO: tune these thresholds when the document collection is large.
    # no_above=0.5 drops tokens appearing in more than half of the documents;
    # keep_n caps the vocabulary at the 100000 most frequent tokens.
    dictionary.filter_extremes(no_below=0, no_above=0.5, keep_n=100000)
    print(dictionary)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    print(bow_corpus)
    print(dictionary.token2id)
    return dictionary, bow_corpus

### Main()

In [127]:
def main():
    """Run the pipeline on the first two comments: word cloud, preprocessing, bag of words."""
    sample_comments = getListOfComments().head(2)
    showCloud(sample_comments)
    tokenised_comments = preprocessCommentDocument(sample_comments)
    bow(tokenised_comments)

In [128]:
# Run the whole pipeline; its printed output appears below this cell.
main()

..given comments: Very good app and lots of features. Debit card is so easy to use and it's all free. It's monthly spendings tracker features is so awesome. Love it. Overall love this app and it's one of best in market. One thing I wish to have is credit card facilities for the Indian salaried persons.

..tokens: ['app', 'lot', 'feature', 'Debit', 'card', 'spending', 'tracker', 'feature', 'love', 'app', 'market', 'thing', 'credit', 'card', 'facility', 'person']



..given comments: Worst banking app. After tried for various times I am unable to log in to my account. God knows wht wll happen to my hard earned money😠😠🤯

..tokens: ['Worst', 'banking', 'app', 'time', 'account', 'God', 'happen', 'money😠😠\U0001f92f']



Dictionary(19 unique tokens: ['account', 'feature', 'credit', 'tracker', 'God']...)
[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)], [(12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)]]
{'account': 14, 'feature':