In [8]:
# Data manipulation
import pandas as pd
import numpy as np

# Options for pandas
pd.options.display.max_columns = None
pd.options.display.max_rows = None

pd.options.display.max_colwidth=-1

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim import corpora, models
import gensim
import nltk
nltk.download("stopwords")


%autoreload 2

# Visualizations
import plotly.express as px

import matplotlib.pyplot as plt
import seaborn as sns
import re


import gensim
from gensim import corpora

# libraries for visualization
import pyLDAvis
import pyLDAvis.gensim

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sudeeshswaroopsahu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

UsageError: Line magic function `%autoreload` not found.


In [9]:
df=pd.read_csv("oneplusreview.csv" ,encoding='utf-8')

In [10]:
df.head()

Unnamed: 0,reviews
0,"This phone has great performance, good camera and ambiance. Charging is fast as per the Oneplus standards set in earlier phones. What is concerning me is the heating issue which crops up off and on. The Battery drains quickly so we have to keep it at 60Hz to longer life. Is there anyone else feeling the heating issues?\n"
1,Detailed review --Design - 4/5+Reduced notch+Premium glass back-Camera bumpPerformance - 5/5+Latest and greatest hardware+Smart optimisationsOS - 4.5/5+Smooth Android experience - Oxygen OSDisplay - 4.5/5+Butter smooth 90 Hz Amoled display - treat to the eyeBattery -3.5/5+Fastest charging ever seen-Less capacityCamera -4/5+Overall good output with all 3 cameras+ Amazing macro mode- Misses details and messes up with Colors sometimesAudio - 4/5+Clear and loud - quality output\n
2,Awesome experience..best in class camera... oxygen os..love\n
3,No earphone jack\n
4,"A One plus user.Oneplus One -> Oneplus 3T -> Oneplus 7T.I don't like other android phones due it's UI, love stock android.Camara is awesomeFacing problem with notifications, don't know what at top.I was expecting great battery but its average.As i moved from 3T to 7T, I had to buy wireless headphones and was worried about this. But Bluetooth connection is also good/stable.\n"


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12803 entries, 0 to 12802
Data columns (total 1 columns):
reviews    12803 non-null object
dtypes: object(1)
memory usage: 100.1+ KB


In [12]:
df['reviews'][0]

'This phone has great performance, good camera and ambiance. Charging is fast as per the Oneplus standards set in earlier phones. What is concerning me is the heating issue which crops up off and on. The Battery drains quickly so we have to keep it at 60Hz to longer life. Is there anyone else feeling the heating issues?\n'

Tokenization

In [13]:
tokenizer = RegexpTokenizer(r'\w+')

In [14]:
doc_1 = df.reviews[0]

In [15]:
tokens = tokenizer.tokenize(doc_1.lower())


In [16]:
print('{} characters in string vs {} words in a list'.format(len(doc_1),                                                             len(tokens)))
print(tokens[:10])

321 characters in string vs 58 words in a list
['this', 'phone', 'has', 'great', 'performance', 'good', 'camera', 'and', 'ambiance', 'charging']


Stop Words

In [17]:
nltk_stpwd = stopwords.words('english')

In [18]:
print(len(set(nltk_stpwd)))
print(nltk_stpwd[:10])

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [19]:
stopped_tokens = [token for token in tokens if not token in nltk_stpwd]
print(stopped_tokens[:10])

['phone', 'great', 'performance', 'good', 'camera', 'ambiance', 'charging', 'fast', 'per', 'oneplus']


Stemming

In [20]:
sb_stemmer = SnowballStemmer('english')

In [21]:
stemmed_tokens = [sb_stemmer.stem(token) for token in stopped_tokens]
print(stemmed_tokens)


['phone', 'great', 'perform', 'good', 'camera', 'ambianc', 'charg', 'fast', 'per', 'oneplus', 'standard', 'set', 'earlier', 'phone', 'concern', 'heat', 'issu', 'crop', 'batteri', 'drain', 'quick', 'keep', '60hz', 'longer', 'life', 'anyon', 'els', 'feel', 'heat', 'issu']


In [25]:
num_reviews = df.shape[0]
doc_set = [df.reviews[i] for i in range(num_reviews)]

In [26]:
texts = []

In [29]:
for doc in doc_set:
    tokens = tokenizer.tokenize(doc.lower())
    stopped_tokens = [token for token in tokens if not token in nltk_stpwd]
    stemmed_tokens = [sb_stemmer.stem(token) for token in stopped_tokens]
    texts.append(stemmed_tokens)
    
print(texts[1])

['detail', 'review', 'design', '4', '5', 'reduc', 'notch', 'premium', 'glass', 'back', 'camera', 'bumpperform', '5', '5', 'latest', 'greatest', 'hardwar', 'smart', 'optimisationso', '4', '5', '5', 'smooth', 'android', 'experi', 'oxygen', 'osdisplay', '4', '5', '5', 'butter', 'smooth', '90', 'hz', 'amol', 'display', 'treat', 'eyebatteri', '3', '5', '5', 'fastest', 'charg', 'ever', 'seen', 'less', 'capacitycamera', '4', '5', 'overal', 'good', 'output', '3', 'camera', 'amaz', 'macro', 'mode', 'miss', 'detail', 'mess', 'color', 'sometimesaudio', '4', '5', 'clear', 'loud', 'qualiti', 'output']


Create a dictionary using corpora

In [30]:
texts_dict = corpora.Dictionary(texts)
texts_dict.save('elec_review.dict') 
print(texts_dict)

Dictionary(11998 unique tokens: ['60hz', 'ambianc', 'anyon', 'batteri', 'camera']...)


In [32]:
import operator
print("IDs 1 through 10: {}".format(sorted(texts_dict.token2id.items(), key=operator.itemgetter(1), reverse = False)[:10]))


IDs 1 through 10: [('60hz', 0), ('ambianc', 1), ('anyon', 2), ('batteri', 3), ('camera', 4), ('charg', 5), ('concern', 6), ('crop', 7), ('drain', 8), ('earlier', 9)]


In [34]:
print(df.reviews.str.contains("complaint").value_counts()) 
print(df.reviews.str.contains("lot").value_counts())


False    12612
True     191  
Name: reviews, dtype: int64
False    11754
True     1049 
Name: reviews, dtype: int64


In [35]:
texts_dict.filter_extremes(no_below=30, no_above=0.15) # inlace filter
print(texts_dict)
print("top terms:")
print(sorted(texts_dict.token2id.items(), key=operator.itemgetter(1), reverse = False)[:10])

Dictionary(2269 unique tokens: ['60hz', 'anyon', 'concern', 'crop', 'drain']...)
top terms:
[('60hz', 0), ('anyon', 1), ('concern', 2), ('crop', 3), ('drain', 4), ('earlier', 5), ('els', 6), ('feel', 7), ('heat', 8), ('issu', 9)]


Creating Bag of Words 

In [36]:
corpus = [texts_dict.doc2bow(text) for text in texts]
len(corpus)


25606

In [37]:
%%time 
# Matrix Market format https://radimrehurek.com/gensim/corpora/mmcorpus.html, why exactly? I don't know
gensim.corpora.MmCorpus.serialize('oneplusreviews.mm', corpus)


CPU times: user 1.79 s, sys: 187 ms, total: 1.98 s
Wall time: 2.2 s


Training an LDA model

In [42]:
lda_model = gensim.models.LdaModel(corpus,alpha='auto', num_topics=5,id2word=texts_dict, passes=20)#Choosing the number of topics based on various categories of electronics on Amazon
lda_model.show_topics(num_topics=5,num_words=5)
raw_query = 'one plus mobiles'
query_words = raw_query.split()
query = []
for word in query_words:
    # ad-hoc reuse steps from above
    q_tokens = tokenizer.tokenize(word.lower())
    q_stopped_tokens = [word for word in q_tokens if not word in nltk_stpwd]
    q_stemmed_tokens = [sb_stemmer.stem(word) for word in q_stopped_tokens]
    query.append(q_stemmed_tokens[0])
    
print(query)# Words in query will be converted to ids and frequencies  
id2word = gensim.corpora.Dictionary()
_ = id2word.merge_with(texts_dict) # garbage# Convert this document into (word, frequency) pairs
query = id2word.doc2bow(query)
print(query)#Create a sorted list
sorted_list = list(sorted(lda_model[query], key=lambda x: x[1]))
sorted_list#Assessing least related topics
lda_model.print_topic(sorted_list[0][0]) #least related#Assessing most related topics
lda_model.print_topic(sorted_list[-1][0]) #most related

[(0,
  '0.014*"10" + 0.013*"mode" + 0.011*"video" + 0.010*"review" + 0.010*"pro"'),
 (1,
  '0.017*"amaz" + 0.017*"smooth" + 0.013*"game" + 0.012*"love" + 0.011*"speed"'),
 (2,
  '0.014*"go" + 0.012*"iphon" + 0.012*"pro" + 0.012*"year" + 0.010*"samsung"'),
 (3,
  '0.011*"1" + 0.009*"fingerprint" + 0.009*"2" + 0.009*"6" + 0.008*"devic"'),
 (4,
  '0.024*"issu" + 0.016*"amazon" + 0.016*"call" + 0.014*"problem" + 0.012*"mobil"')]

['one', 'plus', 'mobil']
[(277, 1)]


[(0, 0.06031103),
 (4, 0.07687535),
 (3, 0.08080111),
 (2, 0.09454638),
 (1, 0.68746614)]

'0.014*"10" + 0.013*"mode" + 0.011*"video" + 0.010*"review" + 0.010*"pro" + 0.009*"take" + 0.008*"realli" + 0.008*"even" + 0.008*"light" + 0.008*"night"'

'0.017*"amaz" + 0.017*"smooth" + 0.013*"game" + 0.012*"love" + 0.011*"speed" + 0.011*"mobil" + 0.010*"jack" + 0.010*"overal" + 0.010*"experi" + 0.010*"super"'

Above are the top 5 words associated with 1 topic. The float next to each word is the weight showing how much the given word influences this specific topic. We can interpret that here the topic might be close to oneplus phone category