In [13]:
# https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21
import pandas as pd
from joblib import load, dump 
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
import nltk
import pyLDAvis.gensim
nltk.download('wordnet')

from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

from spacy.lang.en import English
import gensim

parser = English()
stopwords_set = set(stopwords.words('english'))
ps = PorterStemmer()

[nltk_data] Downloading package wordnet to /Users/qh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
def create_word_freq_table(input_df):
    bow_transformer = CountVectorizer(analyzer=clean_review).fit(input_df['review_body'])
    transformed_input = bow_transformer.transform(input_df['review_body'])
    count_vect_df = pd.DataFrame(transformed_input.todense(), columns=bow_transformer.get_feature_names())
    return count_vect_df

def clean_review(sentence):
    no_punc = [c for c in sentence if c not in string.punctuation]
    no_punc = ''.join(no_punc)
    no_stopwords = [w.lower() for w in no_punc.split() if (w not in stopwords_set) and (len(re.search('^\s*[0-9]*', w)[0]) == 0)]    
    stemmed_words = [ps.stem(w) for w in no_stopwords]
    return stemmed_words

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
            lda_tokens.append(token.lower_)
    return lda_tokens

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in stopwords_set]
    tokens = [token for token in tokens if (len(re.search('^\s*[0-9]*', token)[0]) == 0)]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

  no_stopwords = [w.lower() for w in no_punc.split() if (w not in stopwords_set) and (len(re.search('^\s*[0-9]*', w)[0]) == 0)]
  tokens = [token for token in tokens if (len(re.search('^\s*[0-9]*', token)[0]) == 0)]


In [15]:
from gensim import corpora
import random
def create_application_dictionary(df, application, with_nouns=False, with_nouns_adj=False):
    topic_modeling_df = df[(df['classification'] == 'informative') & (df['application'] == application)]
    text_data = []
    for review in topic_modeling_df['review_body']:
        cleaned_review = review
        if with_nouns:
            cleaned_review = nouns(cleaned_review)
        if with_nouns_adj:
            cleaned_review = nouns_adj(cleaned_review)
        tokens = prepare_text_for_lda(cleaned_review)
        text_data.append(tokens)
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    lda_model = topic_modeling(dictionary, corpus)
    return [pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, sort_topics=False), lda_model, dictionary]

In [16]:
def topic_modeling(dictionary, corpus, num_topics=5, passes=100):
    lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics = num_topics, id2word=dictionary, passes=passes)
    topics = lda_model.print_topics(num_words=4)
    for topic in topics:
        print(topic)
    return lda_model
    

In [17]:
json_reviews_by_sent = load('json_review_dataframe_by_sent')

In [18]:
lda_display, lda_model, dictionary = create_application_dictionary(json_reviews_by_sent, 'reddit', with_nouns_adj=True)
pyLDAvis.display(lda_display)

(0, '0.087*"comment" + 0.087*"something" + 0.033*"screen" + 0.033*"second"')
(1, '0.098*"post" + 0.051*"picture" + 0.051*"autoplay" + 0.028*"connection"')
(2, '0.042*"problem" + 0.042*"android" + 0.042*"photo" + 0.042*"continuous"')
(3, '0.064*"music" + 0.044*"issue" + 0.044*"reason" + 0.024*"refresh"')
(4, '0.122*"video" + 0.063*"quality" + 0.043*"screen" + 0.024*"black"')


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
# pyLDAvis.display(create_application_dictionary(json_reviews_by_sent, 'tiktok'))

In [19]:
text = json_reviews_by_sent['review_body'][0]
# tokenized = word_tokenize(text)
other_texts = [w for w in text.split(' ')]
print(other_texts)
other_corpus = [dictionary.doc2bow(word_tokenize(text)) for text in other_texts]
vector = lda_model[other_corpus]

['I', 'really', 'like', 'this', 'app', 'but', 'alot', 'of', 'things', 'could', 'be', 'fixed.']
