# Topic Models



In [2]:
# libraries that we need
import pandas as pd
import nltk
import csv
import re # regular expression python
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [3]:
# LOAD THE INPUT FILE DH_CollectingData2022_review.tsv 
input_file = 'phrases_dataset.tsv'

## Exploring the dataset.


In [4]:
# opening the file using pandas and storing it into a dataframe 
dataset = pd.read_csv(input_file, delimiter="\t", quoting=csv.QUOTE_NONE, header=None)
dataset.columns =['sentence', 'score']
dataset.head()

Unnamed: 0,sentence,score
0,"For Nik, he only wants to silence the cacophon...",0.0
1,"""I can play this two ways",0.0
2,"Mild, because it isn't conclusive, and doesn't...",-1.0
3,You can also get some more information about t...,0.0
4,"Soon, Hero, who has never had friends, is thru...",0.0


Create a data series that contains all the sentences in the dataset and remmove all characters that are not alphanumeric or underscore.

In [38]:
extract_sentence = dataset['sentence'].tolist() # converting the pandas series to a python list
# Removing all not alphanumeric charecters using the regular expression method 
# from https://stackoverflow.com/questions/875968/how-to-remove-symbols-from-a-string-with-python
extract_sentence = [re.sub(r'[^\w]', ' ', sent) for sent in extract_sentence]
print(extract_sentence)

['For Nik  he only wants to silence the cacophony of sounds and colors he sees and for Fiona  it is the hope of a new life in Costa Rica', ' I can play this two ways', 'Mild  because it isn t conclusive  and doesn t give us the information we need to fully appreciate the story that Lloyd is telling', 'You can also get some more information about the books and writing in my exclusive interview with author Belinda Crawford at  https   goo', 'Soon  Hero  who has never had friends  is thrust in a school where she quickly has to decide who are her friends and who aren t', 'I did like Steven  or Stephen  I listened to the book', 'The plot is quick moving and the action is violent', 'Loved everything about this book', 'What happened to the sorceress to make her hate people so  What is her purpose in locking up the girl  Does that somehow ensure her possession  or the girl s purity  If so  why is purity important to her ', 'Great  quick read', 'Although there isn t character development  as is

## TF-IDF
Define the TF-IDF vectorizer to count each word in the whole corpus. It generates raw data where the first column represents the position in the matrix of a word in "sentences" and the second column represents the frequency of the given word. 

In [39]:
#convert to string 
whole_corpus = [" ".join(extract_sentence)]

In [40]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
sentence_tfidf = tfidf_vectorizer.fit_transform(whole_corpus)
print(sentence_tfidf)


  (0, 719)	0.0068940929142466715
  (0, 209)	0.0068940929142466715
  (0, 1097)	0.0068940929142466715
  (0, 737)	0.0068940929142466715
  (0, 787)	0.0068940929142466715
  (0, 130)	0.0068940929142466715
  (0, 411)	0.0068940929142466715
  (0, 1046)	0.0068940929142466715
  (0, 131)	0.0068940929142466715
  (0, 409)	0.0068940929142466715
  (0, 1321)	0.0068940929142466715
  (0, 1386)	0.0068940929142466715
  (0, 1119)	0.0068940929142466715
  (0, 1304)	0.0068940929142466715
  (0, 353)	0.0068940929142466715
  (0, 319)	0.0068940929142466715
  (0, 1447)	0.0068940929142466715
  (0, 1525)	0.0068940929142466715
  (0, 740)	0.0068940929142466715
  (0, 750)	0.0068940929142466715
  (0, 11)	0.0068940929142466715
  (0, 872)	0.0068940929142466715
  (0, 1440)	0.0068940929142466715
  (0, 1441)	0.0068940929142466715
  (0, 1132)	0.0068940929142466715
  :	:
  (0, 811)	0.0068940929142466715
  (0, 1293)	0.24818734491288016
  (0, 73)	0.0068940929142466715
  (0, 548)	0.03447046457123336
  (0, 923)	0.013788185828493343

Given the raw data we use the function get_feature_names to generate the matrix that represents the TF-IDF (the originality of each word) of each word of the corpus for each sentence(There are 390 sentences and 1543 words). For example the first row and column "100" means that the word "100" has TF-IDF of zero. The sum of each column represents the TF-IDF that each word appears in the whole corpus. 

In [41]:
sentence_tfidf_matrix = pd.DataFrame(sentence_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names())
print(sentence_tfidf_matrix)

        100        12        14        15      1920        20      2013  \
0  0.006894  0.006894  0.013788  0.006894  0.006894  0.006894  0.006894   

         50       600       666  ...     wrote        xd      yeah      year  \
0  0.020682  0.006894  0.013788  ...  0.006894  0.013788  0.006894  0.020682   

      years    yelled   yelling     young      zero     zusak  
0  0.013788  0.013788  0.006894  0.006894  0.006894  0.006894  

[1 rows x 1543 columns]


Extract the top 20 TF-IDF keywords per sentnece

In [42]:
# convert the concatenated dataframe into a dictionary
target_dict_tf_idf = sentence_tfidf_matrix.to_dict('index')
# store the top 20 keywords and print them
data_repr_tf_idf = {}
for sentence_id, target_words in target_dict_tf_idf.items():
    list_targets = [(k, v) for k, v in target_words.items()]
    list_targets_sorted = sorted(list_targets, key=lambda x: x[1], reverse=True)
    data_repr_tf_idf[sentence_id] = list_targets_sorted[0:20]

print(data_repr_tf_idf)

{0: [('book', 0.5308451543969938), ('read', 0.3240223669695936), ('story', 0.24818734491288016), ('just', 0.17924641577041345), ('characters', 0.15856413702767344), ('good', 0.13788185828493343), ('like', 0.13098776537068677), ('really', 0.13098776537068677), ('great', 0.11030548662794674), ('love', 0.11030548662794674), ('novel', 0.11030548662794674), ('character', 0.10341139371370008), ('reading', 0.0965173007994534), ('think', 0.0965173007994534), ('did', 0.08272911497096005), ('world', 0.08272911497096005), ('doesn', 0.07583502205671339), ('interesting', 0.07583502205671339), ('liked', 0.07583502205671339), ('little', 0.07583502205671339)]}


In [43]:
dataframe_keywords = pd.DataFrame.from_dict(data_repr_tf_idf,orient='index', columns=[f'keyword{i}' for i in range(1,21)])  

In [44]:
dataframe_keywords

Unnamed: 0,keyword1,keyword2,keyword3,keyword4,keyword5,keyword6,keyword7,keyword8,keyword9,keyword10,keyword11,keyword12,keyword13,keyword14,keyword15,keyword16,keyword17,keyword18,keyword19,keyword20
0,"(book, 0.5308451543969938)","(read, 0.3240223669695936)","(story, 0.24818734491288016)","(just, 0.17924641577041345)","(characters, 0.15856413702767344)","(good, 0.13788185828493343)","(like, 0.13098776537068677)","(really, 0.13098776537068677)","(great, 0.11030548662794674)","(love, 0.11030548662794674)","(novel, 0.11030548662794674)","(character, 0.10341139371370008)","(reading, 0.0965173007994534)","(think, 0.0965173007994534)","(did, 0.08272911497096005)","(world, 0.08272911497096005)","(doesn, 0.07583502205671339)","(interesting, 0.07583502205671339)","(liked, 0.07583502205671339)","(little, 0.07583502205671339)"


## Topic Modelling



In [45]:
!pip install pyLDAvis

!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
     --------------------------------------- 13.9/13.9 MB 11.7 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [46]:
import re
import nltk
nltk.download('stopwords')
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk import everygrams
from nltk.util import ngrams
import spacy
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Raya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
# Preprocessing step that tokenizes all the sentences
# from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [48]:
extract_sentence_positive = dataset.loc[dataset['score']==1]['sentence'].tolist()
# Removing all not alphanumeric charecters using the regular expression method 
# from https://stackoverflow.com/questions/875968/how-to-remove-symbols-from-a-string-with-python
extract_sentence_positive = [re.sub(r'[^\w]', ' ', sent) for sent in extract_sentence_positive]
extract_sentence_positive = list(sent_to_words(extract_sentence_positive))
print(extract_sentence_positive)

[['did', 'like', 'steven', 'or', 'stephen', 'listened', 'to', 'the', 'book'], ['the', 'plot', 'is', 'quick', 'moving', 'and', 'the', 'action', 'is', 'violent'], ['loved', 'everything', 'about', 'this', 'book'], ['great', 'quick', 'read'], ['although', 'there', 'isn', 'character', 'development', 'as', 'is', 'the', 'case', 'with', 'most', 'mystery', 'novels', 'yet', 'the', 'reader', 'comes', 'out', 'enlightened', 'about', 'several', 'notions', 'and', 'ideas', 'in', 'life'], ['liked', 'the', 'ending'], ['believe', 'barnes', 'delivers', 'on', 'that', 'promise', 'in', 'this', 'book', 'the', 'garden', 'of', 'stones', 'was', 'well', 'worth', 'the', 'read'], ['and', 'if', 'you', 'finish', 'pig', 'island', 'and', 'really', 'like', 'the', 'ending', 'read', 'hanging', 'hill'], ['the', 'narrator', 'is', 'good', 'and', 'the', 'audiobook', 'is', 'only', 'about', 'seven', 'hours'], ['great', 'story', 'of', 'girl', 'friendship', 'with', 'her', 'dog'], ['have', 'read', 'all', 'her', 'books', 'and', 'ad

In [49]:
# Build the bigram models for positive 
#code from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#13viewthetopicsinldamodel
bigram = gensim.models.Phrases(extract_sentence_positive, min_count=5, threshold=100) 


bigram_mod = gensim.models.phrases.Phraser(bigram)


print(bigram_mod[extract_sentence_positive[0]])


['did', 'like', 'steven', 'or', 'stephen', 'listened', 'to', 'the', 'book']


In [50]:
#code from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#13viewthetopicsinldamodel
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out



In [51]:
#code from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#13viewthetopicsinldamodel
stop_words = stopwords.words('english')
data_words_nostops_pos = remove_stopwords(extract_sentence_positive)
data_words_bigrams = make_bigrams(data_words_nostops_pos)
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized_pos = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_words_bigrams[:5])
print(data_lemmatized_pos[:5])

[['like', 'steven', 'stephen', 'listened', 'book'], ['plot', 'quick', 'moving', 'action', 'violent'], ['loved', 'everything', 'book'], ['great', 'quick', 'read'], ['although', 'character', 'development', 'case', 'mystery', 'novels', 'yet', 'reader', 'comes', 'enlightened', 'several', 'notions', 'ideas', 'life']]
[['listen', 'book'], ['plot', 'quick', 'move', 'action', 'violent'], ['love', 'book'], ['great', 'quick', 'read'], ['character', 'development', 'case', 'mystery', 'novel', 'reader', 'come', 'enlighten', 'several', 'notion', 'idea', 'life']]


In [52]:
id2word_pos = corpora.Dictionary(data_lemmatized_pos)
texts_pos = data_lemmatized_pos
corpus_pos = [id2word_pos.doc2bow(text) for text in texts_pos]


In [53]:
lda_model_pos = gensim.models.ldamodel.LdaModel(
   corpus=corpus_pos, id2word=id2word_pos,num_topics=5,  random_state=42, #num_topics is 5 because this is the most amount with non-overlapping topics 
   update_every=1, chunksize=20, passes=10, alpha='auto', per_word_topics=True
)
pprint(lda_model_pos.print_topics())
doc_lda = lda_model_pos[corpus_pos]

[(0,
  '0.045*"love" + 0.039*"story" + 0.024*"make" + 0.020*"voice" + 0.016*"time" '
  '+ 0.013*"meet" + 0.013*"actually" + 0.013*"girl" + 0.012*"begin" + '
  '0.010*"ride"'),
 (1,
  '0.061*"read" + 0.040*"book" + 0.025*"fun" + 0.022*"great" + 0.015*"look" + '
  '0.014*"thriller" + 0.012*"hero" + 0.012*"many" + 0.011*"go" + 0.011*"try"'),
 (2,
  '0.054*"character" + 0.021*"like" + 0.019*"get" + 0.018*"way" + '
  '0.016*"writer" + 0.016*"star" + 0.013*"fact" + 0.011*"next" + 0.011*"thick" '
  '+ 0.011*"use"'),
 (3,
  '0.050*"book" + 0.048*"well" + 0.029*"read" + 0.025*"really" + 0.023*"think" '
  '+ 0.022*"enjoy" + 0.018*"much" + 0.017*"develop" + 0.016*"mystery" + '
  '0.016*"reader"'),
 (4,
  '0.034*"good" + 0.030*"take" + 0.025*"novel" + 0.017*"keep" + 0.016*"notice" '
  '+ 0.014*"new" + 0.014*"interesting" + 0.013*"first" + 0.013*"dark" + '
  '0.013*"audiobook"')]


In [54]:
#Compute Perplexity
print('\nPerplexity: ', lda_model_pos.log_perplexity(corpus_pos))  # a measure of how good the topic model is. lower the better.

# Compute Coherence Score to judge how good a topic model is
coherence_model_lda_pos = CoherenceModel(model=lda_model_pos, texts=data_lemmatized_pos, dictionary=id2word_pos, coherence='c_v')
coherence_lda_pos = coherence_model_lda_pos.get_coherence()
print('\nCoherence Score: ', coherence_lda_pos)


Perplexity:  -7.006104014728797

Coherence Score:  0.5663075204304808


In [55]:
pyLDAvis.enable_notebook()
vis_pos = pyLDAvis.gensim_models.prepare(lda_model_pos, corpus_pos, id2word_pos)
vis_pos

## Topic Modelling Negative Sentiments  

In [56]:
extract_sentence_neg = dataset.loc[dataset['score']==-1]['sentence'].tolist()
# Removing all not alphanumeric charecters using the regular expression method 
# from https://stackoverflow.com/questions/875968/how-to-remove-symbols-from-a-string-with-python
extract_sentence_neg = [re.sub(r'[^\w]', ' ', sent) for sent in extract_sentence_neg]
extract_sentence_neg = list(sent_to_words(extract_sentence_neg))
print(extract_sentence_neg)

[['mild', 'because', 'it', 'isn', 'conclusive', 'and', 'doesn', 'give', 'us', 'the', 'information', 'we', 'need', 'to', 'fully', 'appreciate', 'the', 'story', 'that', 'lloyd', 'is', 'telling'], ['going', 'in', 'really', 'liked', 'it', 'but', 'unfortunately', 'left', 'me', 'bored', 'and', 'infuriated', 'at', 'the', 'end', 'which', 'think', 'is', 'really', 'hard', 'to', 'do', 'this', 'is', 'the', 'first', 'book', 'finished', 'only', 'because', 'wanted', 'it', 'over', 'with'], ['not', 'giving', 'this', 'stars', 'because', 'the', 'big', 'reveal', 'kind', 'of', 'disappointed', 'me', 'in', 'that', 'it', 'classic', 'case', 'of', 'possible', 'but', 'not', 'really', 'probable'], ['eh', 'hate', 'how', 'the', 'author', 'made', 'duke', 'from', 'nice', 'guy', 'to', 'complete', 'dick', 'wad', 'and', 'trevor', 'into', 'mr'], ['guess', 'didn', 'track', 'this', 'on', 'goodreads', 'when', 'first', 'read', 'this', 'so', 'll', 'just', 'rate', 'it', 'stars', 'but', 'have', 'to', 'reread', 'it'], ['but', 'i

In [57]:
#code from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#13viewthetopicsinldamodel
#build trigram models for negative sentiments 
trigram = gensim.models.Phrases(extract_sentence_neg, threshold=100)

# Faster way to get a sentence clubbed as a trigram
trigram_mod = gensim.models.phrases.Phraser(trigram)
print(trigram_mod[bigram_mod[extract_sentence_positive[0]]])

['did', 'like', 'steven', 'or', 'stephen', 'listened', 'to', 'the', 'book']


In [58]:
# Define the trigram function
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [59]:
data_words_nostops_neg = remove_stopwords(extract_sentence_neg)
data_words_trigrams = make_trigrams(data_words_nostops_neg)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized_neg = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_words_trigrams[:5])
print(data_lemmatized_neg[:5])

[['mild', 'conclusive', 'give', 'us', 'information', 'need', 'fully', 'appreciate', 'story', 'lloyd', 'telling'], ['going', 'really', 'liked', 'unfortunately', 'left', 'bored', 'infuriated', 'end', 'think', 'really', 'hard', 'first', 'book', 'finished', 'wanted'], ['giving', 'stars', 'big', 'reveal', 'kind', 'disappointed', 'classic', 'case', 'possible', 'really', 'probable'], ['eh', 'hate', 'author', 'made', 'duke', 'nice', 'guy', 'complete', 'dick', 'wad', 'trevor', 'mr'], ['guess', 'track', 'goodreads', 'first', 'read', 'rate', 'stars', 'reread']]
[['mild', 'conclusive', 'give', 'information', 'need', 'fully', 'appreciate', 'story', 'lloyd', 'tell'], ['go', 'really', 'like', 'unfortunately', 'leave', 'bored', 'infuriated', 'end', 'think', 'really', 'hard', 'first', 'book', 'finish', 'want'], ['give', 'star', 'big', 'reveal', 'kind', 'disappoint', 'classic', 'case', 'possible', 'really', 'probable'], ['hate', 'author', 'make', 'nice', 'guy', 'trevor'], ['guess', 'track', 'goodread', 

In [60]:
id2word_neg = corpora.Dictionary(data_words_nostops_neg)
texts_neg = data_words_nostops_neg
corpus_neg = [id2word_neg.doc2bow(text) for text in texts_neg]

In [61]:
lda_model_neg = gensim.models.ldamodel.LdaModel(
   corpus=corpus_neg, id2word=id2word_neg,num_topics=3,  random_state=42, #num_topics is 5 because this is the most amount with non-overlapping topics 
   update_every=1, chunksize=20, passes=10, alpha='auto', per_word_topics=True
)
pprint(lda_model_neg.print_topics())
doc_lda = lda_model_neg[corpus_neg]

[(0,
  '0.013*"made" + 0.012*"characters" + 0.011*"felt" + 0.010*"get" + '
  '0.009*"time" + 0.009*"cheesier" + 0.009*"good" + 0.008*"something" + '
  '0.008*"like" + 0.007*"every"'),
 (1,
  '0.019*"book" + 0.011*"disappointed" + 0.009*"feel" + 0.009*"like" + '
  '0.008*"pages" + 0.008*"though" + 0.008*"stars" + 0.008*"scene" + '
  '0.008*"even" + 0.007*"got"'),
 (2,
  '0.036*"book" + 0.034*"first" + 0.030*"part" + 0.030*"little" + '
  '0.029*"writing" + 0.029*"sequel" + 0.029*"unfortunately" + 0.028*"certain" '
  '+ 0.028*"flaw" + 0.028*"ignored"')]


In [62]:
#Compute Perplexity
print('\nPerplexity: ', lda_model_neg.log_perplexity(corpus_neg))  # a measure of how good the model is. lower the better.

# Compute Coherence Score to judge how good the topic model is 
coherence_model_lda_neg = CoherenceModel(model=lda_model_neg, texts=data_words_nostops_neg, dictionary=id2word_neg, coherence='c_v')
coherence_lda_neg = coherence_model_lda_neg.get_coherence()
print('\nCoherence Score: ', coherence_lda_neg)


Perplexity:  -6.676255399802769

Coherence Score:  0.5382328175723966


In [63]:
pyLDAvis.enable_notebook()
vis_neg = pyLDAvis.gensim_models.prepare(lda_model_neg, corpus_neg, id2word_neg)
vis_neg