In [None]:
import numpy as np, pandas as pd
import re, random, os, string

import matplotlib.pyplot as plt
%matplotlib inline

from pprint import pprint
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
#Task1 - Read the .csv file using Pandas. Take a look at the top few records.

topicReviews = pd.read_csv("sample_data/K8 Reviews v0.2.csv")
topicReviews.head()

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...


In [None]:
#Task2 - Normalize casings for the review text and extract the text into a list for easier manipulation.

lowerCaseReviews = [rev.lower() for rev in topicReviews.review.values]

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
#Task3 - Tokenize the reviews using NLTKs word_tokenize function.

tokenOfReviews = [word_tokenize(lowerRev) for lowerRev in lowerCaseReviews]
tokenOfReviews[0]

['good', 'but', 'need', 'updates', 'and', 'improvements']

In [None]:
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#Task4 - Perform parts-of-speech tagging on each sentence using the NLTK POS tagger

taggedReviews = [nltk.pos_tag(revTag) for revTag in tokenOfReviews]
taggedReviews[0]

[('good', 'JJ'),
 ('but', 'CC'),
 ('need', 'VBP'),
 ('updates', 'NNS'),
 ('and', 'CC'),
 ('improvements', 'NNS')]

In [None]:
#Task5 - 1 - Find out all the POS tags that correspond to nouns

nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [None]:
#Task5 - 2 - Limit the data to only terms with these tags.

nounInReviews=[]
for sent in taggedReviews:
    nounInReviews.append([token for token in sent if re.search("NN.*", token[1])])

In [None]:
#Task6 - Lemmentize - Different forms of the terms need to be treated as one.

lemmatizr = WordNetLemmatizer()
lemmentizedReviews = []
for sent in nounInReviews:
    lemmentizedReviews.append([lemmatizr.lemmatize(word[0]) for word in sent])

In [None]:
#Task7 - Remove stopwords and punctuation (if there are any). 

from string import punctuation
from nltk.corpus import stopwords

englishStopWords = stopwords.words("english")

In [None]:
stopWordsUpdated = englishStopWords + list(punctuation) + ["..."] + [".."]
removedStopWrdsFromReview = []
for sent in lemmentizedReviews:
    removedStopWrdsFromReview.append([term for term in sent if term not in stopWordsUpdated])

removedStopWrdsFromReview[0]

['update', 'improvement']

In [None]:
#Task 8 - Create a topic model using LDA on the cleaned up data with 12 topics.
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.models import ldamodel

In [None]:
idToWord = corpora.Dictionary(removedStopWrdsFromReview)
texts = removedStopWrdsFromReview
corpus = [idToWord.doc2bow(text) for text in texts]

In [None]:
ldaModel = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=idToWord,
                                           num_topics=12, 
                                           random_state=42,
                                           passes=10,
                                           per_word_topics=True)

In [None]:
#Task 8 - 1- Print out the top terms for each topic.
pprint(ldaModel.print_topics())

[(0,
  '0.155*"mobile" + 0.035*"screen" + 0.030*"call" + 0.027*"video" + '
  '0.026*"option" + 0.024*"feature" + 0.018*"music" + 0.017*"app" + '
  '0.017*"cast" + 0.015*"speed"'),
 (1,
  '0.051*"delivery" + 0.039*"superb" + 0.038*"glass" + 0.037*"h" + '
  '0.026*"device" + 0.023*"everything" + 0.021*"super" + 0.020*"gorilla" + '
  '0.018*"cost" + 0.018*"ok"'),
 (2,
  '0.140*"note" + 0.085*"lenovo" + 0.073*"k8" + 0.023*"phone" + 0.017*"system" '
  '+ 0.016*"model" + 0.013*"device" + 0.010*"version" + 0.009*"k4" + '
  '0.008*"power"'),
 (3,
  '0.161*"problem" + 0.086*"battery" + 0.084*"...." + 0.079*"performance" + '
  '0.077*"heating" + 0.062*"phone" + 0.035*"....." + 0.032*"camera" + '
  '0.030*"issue" + 0.014*"backup"'),
 (4,
  '0.160*"battery" + 0.047*"charger" + 0.041*"hour" + 0.041*"phone" + '
  '0.033*"backup" + 0.030*"charge" + 0.030*"day" + 0.029*"heat" + 0.023*"hai" '
  '+ 0.022*"charging"'),
 (5,
  '0.099*"price" + 0.087*"money" + 0.052*"value" + 0.049*"handset" + '
  '0.043*"

In [None]:
# Task8 - 2- What is the coherence of the model with the c_v metric?

coherenceModelLda = CoherenceModel(model=ldaModel, texts=removedStopWrdsFromReview, dictionary=idToWord, coherence='c_v')
coherenceLda = coherenceModelLda.get_coherence()
print('Coherence Score is: ', coherenceLda)

Coherence Score is:  0.5571936650478105


In [None]:
#Task9 - Determine which of the topics can be combined.

print('Topic 2 and 5 talk about pricing')

print('Topic 4, 6 and 10 talk about battery related issues')

print('Topic 3 and 11 probably talk about performance')

Topic 2 and 5 talk about pricing
Topic 4, 6 and 10 talk about battery related issues
Topic 3 and 11 probably talk about performance


In [None]:
#Task10 - Create topic model using LDA with what you think is the optimal number of topics
ldaModel9 = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=idToWord,
                                           num_topics=9, 
                                           random_state=42,
                                           passes=10,
                                           per_word_topics=True)

In [None]:
#Task10 - What is the coherence of the model?
coherenceModel9Lda = CoherenceModel(model=ldaModel9, texts=removedStopWrdsFromReview, dictionary=idToWord, coherence='c_v')
coherenceLdaM9 = coherenceModel9Lda.get_coherence()
print(' Coherence Score for 9 topics: ', coherenceLdaM9)

 Coherence Score for 9 topics:  0.5403311642904353


In [None]:
#Task11 - The business should  be able to interpret the topics. Name each of the identified topics.

topcLst = ldaModel9.show_topics(formatted=False)
WordsInTopics = [(topc[0], [wrd[0] for wrd in topc[1]]) for topc in topcLst]

In [None]:
#Task11 - Create a table with the topic name and the top 10 terms in each to present to the  business.

for topic,words in WordsInTopics:
    print(f'Topic {topic} has words :: {words}')

Topic 0 has words :: ['mobile', 'screen', 'feature', 'call', 'option', 'video', 'music', 'app', 'apps', 'cast']
Topic 1 has words :: ['delivery', 'return', 'glass', 'h', 'amazon', 'sim', 'policy', 'super', 'gorilla', 'card']
Topic 2 has words :: ['phone', 'note', 'lenovo', 'k8', 'time', 'service', 'issue', 'problem', 'network', 'day']
Topic 3 has words :: ['problem', 'battery', 'issue', 'phone', 'heating', 'performance', 'camera', 'update', 'drain', 'backup']
Topic 4 has words :: ['battery', 'phone', 'charger', 'hour', 'backup', 'charge', 'heat', 'charging', 'turbo', 'hr']
Topic 5 has words :: ['product', 'money', 'waste', 'value', 'handset', 'price', 'amazon', 'lenovo', '....', 'plz']
Topic 6 has words :: ['speaker', 'superb', 'sound', 'display', '.....', 'dolby', '......', 'set', 'atmos', 'work']
Topic 7 has words :: ['phone', 'camera', 'price', 'quality', 'feature', 'range', 'battery', 'performance', 'mode', 'processor']
Topic 8 has words :: ['camera', 'quality', '....', 'battery', 