<a href="https://colab.research.google.com/github/shivani1912/Healthcare_NLP/blob/master/TopicModeling_on_Healthcare_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Objective: Find the products and reasons behind negative reviews

In [None]:
import json
import pandas as pd
%matplotlib inline
import tqdm
from tqdm.notebook import tqdm
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
# The root logger is configured at ERROR level, so only ERROR and more severe
# records are emitted; each record is prefixed with a timestamp and its level name.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Silence DeprecationWarning messages so they do not clutter cell output;
# other warning categories are not affected by this filter.
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
# Load the review dump: the file holds one JSON object per line.
reviews = []
with open('Health_and_Personal_Care_5.json', encoding="UTF-8") as fp:
    for line in fp:
        # A blank or whitespace-only line (for example a trailing newline at the
        # end of the file) is not valid JSON and would make json.loads raise.
        if not line.strip():
            continue
        rev = json.loads(line)
        reviews.append(rev)

# One row per parsed review; the columns come from the keys of the JSON objects.
review = pd.DataFrame(reviews)

In [None]:
# Preview the first rows to check that the file parsed into the expected columns.
review.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,ALC5GH8CAMAI7,159985130X,AnnN,"[1, 1]",This is a great little gadget to have around. ...,5.0,Handy little gadget,1294185600,"01 5, 2011"
1,AHKSURW85PJUE,159985130X,"AZ buyer ""AZ buyer""","[1, 1]",I would recommend this for a travel magnifier ...,4.0,Small & may need to encourage battery,1329523200,"02 18, 2012"
2,A38RMU1Y5TDP9,159985130X,"Bob Tobias ""Robert Tobias""","[75, 77]",What I liked was the quality of the lens and t...,4.0,Very good but not great,1275955200,"06 8, 2010"
3,A1XZUG7DFXXOS4,159985130X,Cat lover,"[56, 60]",Love the Great point light pocket magnifier! ...,4.0,great addition to your purse,1202428800,"02 8, 2008"
4,A1MS3M7M7AM13X,159985130X,Cricketoes,"[1, 1]",This is very nice. You pull out on the magnifi...,5.0,Very nice and convenient.,1313452800,"08 16, 2011"


In [None]:
# A rating ('overall') of exactly 1 is what this notebook treats as a negative review.
is_one_star = review['overall'] == 1
negative_reviews = review[is_one_star]

In [None]:
# Number of negative reviews per product (asin), held in a one-column frame
# whose only column is named 'count' and whose index is the asin.
negative_review_count = negative_reviews.groupby('asin').size().to_frame(name='count')

In [None]:
# Keep only the most-complained-about products: the top 0.1% of products ranked
# by negative-review count (the fraction is truncated to a whole number of rows).
n_top_products = int(len(negative_review_count)*0.001)
products_by_count = negative_review_count.sort_values(by=['count'], ascending=False)
top_negative_reviews = products_by_count.head(n_top_products)

In [None]:
# Display the selected products together with their negative-review counts.
top_negative_reviews

Unnamed: 0_level_0,count
asin,Unnamed: 1_level_1
B0026HDURA,48
B00B5H5BGA,42
B001ADL1SG,42
B009VUZJTM,41
B000GP1O4U,37
B002BH4VY6,33
B003KVO0GQ,31


In [None]:
# Restrict the negative reviews to the top products. The right-hand frame is
# keyed by 'asin' through its index; the inner join also attaches each product's
# 'count' to every one of its reviews.
tp_negative_reviews = negative_reviews.merge(top_negative_reviews, how='inner', on='asin')


# Review Text: Data Cleaning, Tokenization & Stop words removal

In [None]:
#NLTK Stop words
# English stop-word list; remove_stopwords() filters tokens against it.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be available
# locally (e.g. via nltk.download('stopwords')) - confirm for a fresh environment.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

    

In [None]:
import re

# Convert to list
data = tp_negative_reviews.reviewText.values.tolist()

# The patterns below are raw strings: in a normal string literal '\S' and '\s'
# are invalid escape sequences, which Python reports as a DeprecationWarning
# (a SyntaxWarning from 3.12). The compiled patterns are unchanged.

# Remove Emails (any whitespace-free run containing '@', plus one trailing space)
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Collapse every run of whitespace (new lines, tabs, repeated spaces) into one space
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("'", "", sent) for sent in data]


In [None]:
# Number of review texts that go into the pipeline.
len(data)

274

In [None]:
#tokenizing & removing punctuations
def sent_to_words(sentences):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Yields one list of tokens per element of ``sentences``, in input order.
    Every element is converted with ``str()`` first. ``simple_preprocess``
    lowercases the text and keeps only alphabetic tokens, which is what drops
    punctuation and digits; ``deacc=True`` additionally strips accent marks
    from the tokens.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

# Materialise the generator: one token list per review.
data_words = list(sent_to_words(data))



In [None]:
# Sanity check: tokenization should keep one entry per review.
len(data_words)

274

# Review Text: Lemmatization & Stop Words removal

In [None]:
#remove stopwords
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is converted with ``str()`` and tokenized again with
    ``simple_preprocess``, so a document may be a raw string or a list of
    tokens. Tokens found in the module-level ``stop_words`` are removed.

    Returns a list with one list of remaining tokens per input document.
    """
    # Build the lookup once per call: membership tests against the original
    # list scan it linearly for every single token.
    stop_set = frozenset(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

In [None]:
import spacy
# Load the small English pipeline once; lemmatization() below reuses this object.
# The "tagger" and "parser" components are disabled at load time.
# NOTE(review): the component names and the '-PRON-' lemma visible in the topic
# output suggest spaCy 2.x lookup-style lemmas - confirm the intended spaCy
# version before upgrading, as lemmas may differ.
nlp = spacy.load('en_core_web_sm', disable=["tagger", "parser"])
def lemmatization(texts):
    """Return the lemma of every token of every document.

    Each element of ``texts`` is an iterable of string tokens. The tokens are
    joined with single spaces, run through the module-level ``nlp`` pipeline,
    and the ``lemma_`` of each resulting spaCy token is collected.

    Returns a list with one list of lemmas per input document.
    See https://spacy.io/api/annotation
    """
    return [
        [tok.lemma_ for tok in nlp(" ".join(words))]
        for words in texts
    ]

In [None]:
# Run the two preprocessing stages in order: stop-word removal, then lemmatization.
data_words_nostops = remove_stopwords(data_words)
data_lemmatized = lemmatization(data_words_nostops)

# Spot-check a few processed documents (index 1 up to, not including, 5).
print(data_lemmatized[1:5])

[['open', 'box', 'amazon', 'quickly', 'double', 'take', 'tiny', 'plastic', 'container', 'really', 'call', 'physician', 'care', 'office', '\ufeff1', 'aid', 'kit', 'people', 'really', 'twenty', 'five', 'people', 'long', 'back', 'website', 'look', 'numb', 'similarly', 'bemuse', 'occasionally', 'hilarious', 'review', 'people', 'like', 'find', 'little', 'plastic', 'kit', 'little', 'good', 'lack', 'every', 'regard', 'short', 'overpriced', 'kit', 'may', 'find', 'somewhat', 'useful', 'car', 'truck', 'especially', 'little', 'kid', 'prone', 'get', 'small', 'easily', 'clean', 'boo', 'boo', 'beyond', 'product', 'offer', 'virtually', 'nothing', 'get', 'count', 'little', 'alcohol', 'pad', 'small', 'bandage', 'couple', 'packet', 'cream', 'gauze', 'pad', 'inexplicably', 'deal', 'mountain', 'surgical', 'supply', 'get', 'tiny', 'little', 'tweezer', 'scissor', 'wait', 'ten', 'pair', 'rubber', 'glove', 'can', 'not', 'even', 'imagine', 'sort', 'injury', 'problem', 'people', 'may', 'consider', 'kit', 'one',

In [None]:
# Sanity check: lemmatization should keep one entry per review.
len(data_lemmatized)

274

# Topic Modeling

In [None]:
# Token <-> integer id mapping built from the lemmatized documents
id2word = corpora.Dictionary(data_lemmatized)

# The corpus is built from the same lemmatized documents
texts = data_lemmatized

# Bag-of-words form: every document becomes a list of (token_id, count) pairs
corpus = list(map(id2word.doc2bow, texts))

# Show the bag-of-words of the first document
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 2), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 2), (19, 3), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 2), (46, 1)]]


In [None]:
# Build LDA model
# Hyper-parameters are collected in one place so they are easy to read and tune.
lda_params = dict(
    corpus=corpus,            # bag-of-words documents to train on
    id2word=id2word,          # id -> token mapping used to label topic keywords
    num_topics=7,             # number of topics to extract
    random_state=100,         # fixed seed so repeated runs give the same topics
    update_every=1,           # model update after every chunk
    chunksize=100,            # documents per training chunk
    passes=10,                # full passes over the corpus
    alpha='auto',             # learn the document-topic prior from the data
    per_word_topics=True,     # also compute per-word topic assignments
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_params)

In [None]:
# Print the top keywords of each topic (the model was built with num_topics=7)
pprint(lda_model.print_topics())
# Apply the trained model to the whole corpus.
doc_lda = lda_model[corpus]

[(0,
  '0.042*"weight" + 0.041*"take" + 0.021*"lose" + 0.021*"pill" + 0.020*"loss" '
  '+ 0.017*"eat" + 0.016*"week" + 0.014*"appetite" + 0.013*"diet" + '
  '0.011*"pound"'),
 (1,
  '0.014*"cream" + 0.010*"elbow" + 0.010*"help" + 0.009*"group" + '
  '0.008*"tendon" + 0.007*"oil" + 0.006*"prevent" + 0.006*"snake" + '
  '0.006*"muscle" + 0.006*"money"'),
 (2,
  '0.025*"not" + 0.017*"do" + 0.014*"get" + 0.012*"use" + 0.012*"work" + '
  '0.011*"like" + 0.010*"would" + 0.009*"go" + 0.009*"buy" + 0.009*"thing"'),
 (3,
  '0.058*"dry" + 0.051*"bar" + 0.029*"clothe" + 0.019*"load" + 0.019*"sheet" + '
  '0.016*"bounce" + 0.011*"use" + 0.010*"softener" + 0.010*"stick" + '
  '0.009*"white"'),
 (4,
  '0.042*"kit" + 0.032*"aid" + 0.023*"\ufeff1" + 0.013*"bandage" + '
  '0.012*"people" + 0.012*"would" + 0.010*"plastic" + 0.009*"glove" + '
  '0.009*"pad" + 0.008*"one"'),
 (5,
  '0.026*"product" + 0.017*"-PRON-" + 0.017*"not" + 0.014*"one" + '
  '0.014*"review" + 0.013*"use" + 0.012*"be" + 0.011*"work"

In [None]:
# Visualize the topics
# The prepared visualisation is left as the last expression so that the cell
# displays it.
# NOTE(review): newer pyLDAvis releases appear to expose this helper as
# pyLDAvis.gensim_models - confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Themes identified through topic modeling:
* Diet pills are perceived to be ineffective
* Quality of medical instruments is not up to the mark
* Laundry detergent is ineffective
