## Bring in Dataframe - This dataframe contains all Bazar Voice and Brandwatch (2018/2019/2020) and has SA and Product Collection Classification Applied

In [1]:
# load libraries
import pandas as pd
import nltk # https://www.nltk.org/ ; nltk helps with tokenization, stopwords dictionary, lemmatization
import re # https://docs.python.org/3/library/re.html ; re is used for regular expressions
import numpy as np

All = pd.read_csv("All_Clean.csv", header = 0, engine='python')
All.head()

Unnamed: 0,Brand,Date,Product_category,Review_clean,Review_original,Sentiment,Source
0,Listerine,1/1/2018,tartar control mouthwash collection,listerine could ever use worstera bestbreath,RT @KeithOlbermann But...all the Listerine the...,1,twitter
1,Listerine,1/1/2018,zero alcohol-free mouthwash collection,recommended dental hygienist burn mouth tastes...,Recommended by my dental hygienist. Doesn't bu...,1,review
2,Listerine,1/1/2018,floss products,similar example think red shoe sole louboutin ...,The most similar example I can think of is the...,1,reddit
3,Listerine,1/1/2018,zero alcohol-free mouthwash collection,gargle really warm salt water hit enough burn ...,@Zellyanks Gargle with really warm salt water....,1,twitter
4,Listerine,1/1/2018,sensitivity,lice hate smell tea tree oil lice shampoo easi...,Lice hate the smell of tea tree oil. The lice ...,1,forum


In [2]:
All.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248918 entries, 0 to 248917
Data columns (total 7 columns):
Brand               248918 non-null object
Date                248918 non-null object
Product_category    82887 non-null object
Review_clean        248917 non-null object
Review_original     248918 non-null object
Sentiment           248918 non-null int64
Source              248875 non-null object
dtypes: int64(1), object(6)
memory usage: 13.3+ MB


In [3]:
# needed to ensure type string
All.Review_clean = All.Review_clean.astype(str)
All.Review_original = All.Review_original.astype(str)

In [4]:
All['Source'].value_counts()

twitter          129993
review            51355
forum             18309
tumblr            14134
news              12924
reddit            12684
listerine.com      8591
blog                885
Name: Source, dtype: int64

## Explore splitting out df into only reviews (i.e. no twitter, reddit, etc.)

In [5]:
# subset out tweets, as these are quite different than product revews. Product classification may not be well
# represented. Additionally, content will not lend itself well to usefull LDA and aspect tagging
actual_reviews = ['review', 'listerine.com'] 
Reviews = All[All.Source.isin(actual_reviews)]
Other = All[~All.Source.isin(actual_reviews)]

# Remove retweets (RT) in the Twitter df
# ID Retweet Reviews so df does not contain these duplicates. 
Other_noRT = Other[~Other.Review_original.str.contains("RT", na=False)]

In [6]:
# Check the size of the documents we cut out. ~ 189k
Other.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 188972 entries, 0 to 248915
Data columns (total 7 columns):
Brand               188972 non-null object
Date                188972 non-null object
Product_category    69479 non-null object
Review_clean        188972 non-null object
Review_original     188972 non-null object
Sentiment           188972 non-null int64
Source              188929 non-null object
dtypes: int64(1), object(6)
memory usage: 11.5+ MB


In [7]:
# Of those, 140k are unique
Other_noRT.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 139509 entries, 2 to 248915
Data columns (total 7 columns):
Brand               139509 non-null object
Date                139509 non-null object
Product_category    50367 non-null object
Review_clean        139509 non-null object
Review_original     139509 non-null object
Sentiment           139509 non-null int64
Source              139466 non-null object
dtypes: int64(1), object(6)
memory usage: 8.5+ MB


In [8]:
# of those 50k are about Listerine
Other_noRT["Brand"].value_counts()

Listerine          50367
Oral-B             25264
Colgate            22605
Crest              16670
Sensodyne           7941
Philips             6309
ACT                 3479
Toms                3276
GUM                  766
Parodontax           501
Dr. Fresh            497
Therabreath          451
Natural Dentist      382
CloSYS               331
Nature's Answer      165
Eco-Dent             142
Cepacol              131
Desert Essence        98
Corsodyl              94
Mountain Falls        28
Jason Natural         12
Name: Brand, dtype: int64

In [9]:
# and this is what we lose per product collection on unique tweets
Other_noRT["Product_category"].value_counts()

antiseptic mouthwash collection           11482
on-the-go oral care products               9717
nightly reset                              7945
sensitivity                                5869
zero alcohol-free mouthwash collection     5471
tartar control mouthwash collection        3153
floss products                             2329
kids mouthwash collection                  2273
fluoride toothpaste collection              796
total care mouthwash collection             636
naturals mouthwash collection               599
teeth whitening mouthwash collection         97
Name: Product_category, dtype: int64

In [10]:
# Check out the Review df, start with 60k reviews
Reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59946 entries, 1 to 248917
Data columns (total 7 columns):
Brand               59946 non-null object
Date                59946 non-null object
Product_category    13408 non-null object
Review_clean        59946 non-null object
Review_original     59946 non-null object
Sentiment           59946 non-null int64
Source              59946 non-null object
dtypes: int64(1), object(6)
memory usage: 3.7+ MB


In [11]:
# check how reviews are distributed across page type
Reviews['Source'].value_counts()

review           51355
listerine.com     8591
Name: Source, dtype: int64

In [12]:
# of the reviews, 13k are for Listerine
Reviews['Brand'].value_counts()

Oral-B             24085
Listerine          13408
Crest               9198
Sensodyne           4548
Toms                3076
Therabreath         1832
Philips             1419
Parodontax          1134
Colgate              907
ACT                  205
GUM                   40
Dr. Fresh             33
Mountain Falls        20
Desert Essence        18
Jason Natural          7
Natural Dentist        6
CloSYS                 5
Cepacol                5
Name: Brand, dtype: int64

In [13]:
#Check the value count for each type of Product Category
#This is what we will use going forward
Reviews["Product_category"].value_counts()

on-the-go oral care products              4596
sensitivity                               1736
floss products                            1650
nightly reset                             1537
zero alcohol-free mouthwash collection    1359
antiseptic mouthwash collection           1087
kids mouthwash collection                  383
ultraclean tartar control mouthwash        343
tartar control mouthwash collection        244
total care mouthwash collection            205
fluoride toothpaste collection             172
naturals mouthwash collection               59
teeth whitening mouthwash collection        35
fluoride defenseÃƒÂ¢?Ã‚Â¢                    2
Name: Product_category, dtype: int64

### Conclusion - not enough data to support analysis. Will retain all going forward, minus retweets. 

In [14]:
All_noRT = All[~All.Review_original.str.contains("RT", na=False)]
All_noRT["Product_category"].value_counts()

on-the-go oral care products              14309
antiseptic mouthwash collection           12560
nightly reset                              9481
sensitivity                                7605
zero alcohol-free mouthwash collection     6830
floss products                             3976
tartar control mouthwash collection        3397
kids mouthwash collection                  2654
fluoride toothpaste collection              968
total care mouthwash collection             841
naturals mouthwash collection               658
ultraclean tartar control mouthwash         343
teeth whitening mouthwash collection        132
fluoride defenseÃƒÂ¢?Ã‚Â¢                     2
Name: Product_category, dtype: int64

# ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

## ///////////////////////////////////////////////////////////////////////////////////////////

# pyLDAvis - LDA and Visualization

### Select Listerine Product Collection: "On-the-go oral care products (OTG)"

In [15]:
# subset the On the Go Product Category
All_OTG = All_noRT[All_noRT.Product_category == "on-the-go oral care products"]

In [16]:
# Select Date Range of Interest - 1st Quarter 2020
OTG_1Q_20 = All_OTG[(All_OTG['Date'] > '12/31/2019') & (All_OTG['Date'] < '4/1/2020')]

In [17]:
# subset positive and negative reviews. 
#positive
OTG_1Q_20_POS = OTG_1Q_20[OTG_1Q_20.Sentiment == 1]
#negative
OTG_1Q_20_NEG = OTG_1Q_20[OTG_1Q_20.Sentiment == 0]

In [18]:
OTG_1Q_20['Sentiment'].value_counts()

1    3699
0     560
Name: Sentiment, dtype: int64

### Develop LDA model and visualization

In [32]:
from __future__ import print_function

In [33]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [34]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

### LDA for Positive Reviews 

In [35]:
# !!!! Run Stopword Code Below Under Next Section in order to build the "new_stopwords_list" !!!!

# both max and min_df can be tuned 
# max_df: ignore terms that have a document frequency strictly higher than the given threshold
# min_df: ignore terms that have a document frequency strictly lower than the given threshold

tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = new_stopwords_list,
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 15)
dtm_tf = tf_vectorizer.fit_transform(OTG_1Q_20_POS['Review_original'])
print(dtm_tf.shape)

(3699, 496)


In [36]:
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(OTG_1Q_20_POS['Review_original'])
print(dtm_tfidf.shape)



(3699, 496)


Fit Latent Dirichlet Allocation models

In [37]:
# n_components is parameter to select number of topics to identify

# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=4, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=0)
lda_tfidf.fit(dtm_tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=4, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [38]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### LDA for Negative Reviews 

In [39]:
# both max and min_df can be tuned 
# max_df: ignore terms that have a document frequency strictly higher than the given threshold
# min_df: ignore terms that have a document frequency strictly lower than the given threshold

tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = new_stopwords_list,
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 15)
dtm_tf = tf_vectorizer.fit_transform(OTG_1Q_20_NEG['Review_original'])
print(dtm_tf.shape)

(560, 45)


In [40]:
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(OTG_1Q_20_NEG['Review_original'])
print(dtm_tfidf.shape)

(560, 45)




Fit Latent Dirichlet Allocation models

In [41]:
# n_components is parameter to select number of topics to identify

# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=3, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=3, random_state=0)
lda_tfidf.fit(dtm_tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=3, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [42]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


# /////////////////////////////////////////////////////////////////////////////////////////////////

# Standard LDA Method

# Nightly Reset Collection

In [45]:
# subset the desired Product Category. 
All_NR = All_noRT[All_noRT.Product_category == "nightly reset"]

In [46]:
# Select Date Range of Interest
All_NR = All_NR[(All_NR['Date'] > '4/01/2020') & (All_NR['Date'] < '6/30/2020')]

In [47]:
All_NR.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2415 entries, 131641 to 192290
Data columns (total 7 columns):
Brand               2415 non-null object
Date                2415 non-null object
Product_category    2415 non-null object
Review_clean        2415 non-null object
Review_original     2415 non-null object
Sentiment           2415 non-null int64
Source              2415 non-null object
dtypes: int64(1), object(6)
memory usage: 150.9+ KB


### Remove additional stopwords

In [48]:
# check word frequency
from collections import Counter
results = Counter()
All_NR["Review_clean"].str.split(' ').apply(results.update)
print(results)



In [49]:
#Remove stopwords
# download the stopwords dictionary
from nltk.corpus import stopwords
nltk.download('stopwords')

# save the list of stopwords in stop_words
stop_words = set(stopwords.words("english"))

# add extra words to the stopwords dictionary - "Listerine"
new_stopwords = ['listerine', 'mouthwash', 'used', 'use', 'using', 'products', 'johnson', 'like', 'bottle']
new_stopwords_list = stop_words.union(new_stopwords)

All_NR["Clean_noStopWords"] = All_NR["Review_clean"].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in new_stopwords_list))
All_NR.head(2)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DeTriumph's\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Brand,Date,Product_category,Review_clean,Review_original,Sentiment,Source,Clean_noStopWords
131641,Listerine,4/1/2018,nightly reset,quote originally posted lotus maiden aaw sweet...,Quote: Originally Posted by Lotus_Maiden ^ aaw...,1,forum,quote originally posted lotus maiden aaw sweet...
131648,Listerine,4/1/2018,nightly reset,gargle balls listerine minty fresh mouth wash,do you gargle t1's balls with listerine or oth...,1,reddit,gargle balls minty fresh mouth wash


### Split out positive and negative reviews

In [50]:
#positive
All_NR_pos = All_NR[All_NR.Sentiment == 1]
#negative
All_NR_neg = All_NR[All_NR.Sentiment == 0]
##Check the value count for each Sentiment for this Product Category
All_NR["Sentiment"].value_counts()

1    2052
0     363
Name: Sentiment, dtype: int64

### Bag of Words Transformation - Nightly Reset POS

In [51]:
# Load the library with the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# Helper function
def plot_10_most_common_words(count_data, count_vectorizer):
    import matplotlib.pyplot as plt
    words = count_vectorizer.get_feature_names()
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts+=t.toarray()[0]
    
    count_dict = (zip(words, total_counts))
    count_dict = sorted(count_dict, key=lambda x:x[1], reverse=True)[0:10]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words)) 
    
    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    sns.barplot(x_pos, counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90) 
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()
    
# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(All_NR_pos['Clean_noStopWords'])

# Visualise the 10 most common words
#plot_10_most_common_words(count_data, count_vectorizer)

### LDA model training and results visualization - Nightly Reset POS

In [52]:
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
# Tweak the two parameters below
number_topics = 3
number_words = 5

# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda_model = lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

Topics found via LDA:

Topic #0:
teeth mouth day night good

Topic #1:
day really time said reed

Topic #2:
mouth breath bad alcohol morning


### Bag of Words Transformation - Nightly Reset NEG

In [53]:
# Load the library with the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# Helper function
def plot_10_most_common_words(count_data, count_vectorizer):
    import matplotlib.pyplot as plt
    words = count_vectorizer.get_feature_names()
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts+=t.toarray()[0]
    
    count_dict = (zip(words, total_counts))
    count_dict = sorted(count_dict, key=lambda x:x[1], reverse=True)[0:10]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words)) 
    
    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    sns.barplot(x_pos, counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90) 
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()
    
# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(All_NR_neg['Clean_noStopWords'])

# Visualise the 10 most common words
#plot_10_most_common_words(count_data, count_vectorizer)

### LDA model training and results visualization - Nightly Reset NEG

In [54]:
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
# Tweak the two parameters below
number_topics = 4
number_words = 5

# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda_model = lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

Topics found via LDA:

Topic #0:
calories term day bed total

Topic #1:
good bad wash breath really

Topic #2:
alcohol shampoo mouth really going

Topic #3:
night mouth day water week


# /////////////////////////////////////////////////////////////////////////////////////////////////////////////////

# N-gram analysis for effect or topic of interest on review sentiment

## Focus on COVID related documents

### Check how POS taggings are attached to words in sentences


In [56]:
# find out the POS (Part of Speech) for each word in each sentence
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# selecting date range based on "post Covid", o/a Mar20
All_Covid = All_noRT[(All_noRT['Date'] > '3/01/2020') & (All_noRT['Date'] < '6/30/2020')]
# only interested in Listerine products for this analysis
All_Covid = All_Covid[(All_Covid['Brand'] == 'Listerine')]
All_Covid["pos"] = All_Covid['Review_clean'].apply(lambda x: pos_tag(word_tokenize(x)))
All_Covid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25213 entries, 102538 to 192322
Data columns (total 8 columns):
Brand               25213 non-null object
Date                25213 non-null object
Product_category    25213 non-null object
Review_clean        25213 non-null object
Review_original     25213 non-null object
Sentiment           25213 non-null int64
Source              25211 non-null object
pos                 25213 non-null object
dtypes: int64(1), object(7)
memory usage: 1.7+ MB


In [57]:
# download the stopwords dictionary
from nltk.corpus import stopwords
nltk.download('stopwords')

# save the list of stopwords in stop_words
stop_words = set(stopwords.words("english"))

# remove the words that exist in the stopwords dictionary from tweets
All_Covid["no_stop_words"] = All_Covid["Review_clean"].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stop_words))
All_Covid.info()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DeTriumph's\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<class 'pandas.core.frame.DataFrame'>
Int64Index: 25213 entries, 102538 to 192322
Data columns (total 9 columns):
Brand               25213 non-null object
Date                25213 non-null object
Product_category    25213 non-null object
Review_clean        25213 non-null object
Review_original     25213 non-null object
Sentiment           25213 non-null int64
Source              25211 non-null object
pos                 25213 non-null object
no_stop_words       25213 non-null object
dtypes: int64(1), object(8)
memory usage: 1.9+ MB


##### Lemmatization while including POS tagging


In [58]:
# code taken from here: https://stackoverflow.com/questions/51267166/lemmatization-pandas-python
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

lemmatizer = nltk.stem.WordNetLemmatizer()
wordnet_lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')


def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)



# Lemmatizing using POS (Part of Speech) tagging
All_Covid['lemmatized_sentences'] = All_Covid['no_stop_words'].apply(lambda x: lemmatize_sentence(x))
All_Covid.head(3)

Unnamed: 0,Brand,Date,Product_category,Review_clean,Review_original,Sentiment,Source,pos,no_stop_words,lemmatized_sentences
102538,Listerine,3/1/2018,zero alcohol-free mouthwash collection,got straight c one semester partying hard bro ...,Got straight C's one semester. Was partying to...,1,reddit,"[(got, VBD), (straight, JJ), (c, VB), (one, CD...",got straight c one semester partying hard bro ...,get straight c one semester party hard bro pri...
102539,Listerine,3/1/2018,nightly reset,way look like ross listerine guy yes,@JesseBWatters NO WAY do you look like Ross! L...,1,twitter,"[(way, NN), (look, VBP), (like, IN), (ross, NN...",way look like ross listerine guy yes,way look like ross listerine guy yes
102540,Listerine,3/1/2018,on-the-go oral care products,listerine uses fear bad breath sell product,@UNFMAR3023ÃƒÆ’Ã†â€™ÃƒÂ¢Ã¢â€šÂ¬Ã…Â¡ÃƒÆ’Ã¢â‚¬Å¡...,1,twitter,"[(listerine, NN), (uses, VBZ), (fear, VBP), (b...",listerine uses fear bad breath sell product,listerine use fear bad breath sell product


#### Frequent words
Find out the most frequent words that show up in the text data. Then, identify possible topics from these words.

In [59]:
# get the most frequent words
from collections import Counter
results = Counter()
All_Covid['lemmatized_sentences'].str.split(' ').apply(results.update)
print(results)



Here in the case of Covid, we're looking for key words to select to zero in on documents that have to do with the virus: virus, corona, covid, etc. 

Then, create a column for each of these words and in each column, populate it with a 3-gram which contains the respective word in the middle.

In [60]:
# create function that find n-grams that have the word of interest at index 2
from nltk.util import ngrams

def ngram_filter(doc, word, n):
    tokens = doc.split()
    all_ngrams = ngrams(tokens, n)
    filtered_ngrams = [x for x in all_ngrams if word in x[2]]  # [2] means that the n-grams that have the given word at index 2 (position 3) will be returned
    return filtered_ngrams

In [61]:
covid = All_Covid[All_Covid['Review_clean'].str.contains('covid|virus|corona|coronavirus|plague')]
covid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 591 entries, 103084 to 192285
Data columns (total 10 columns):
Brand                   591 non-null object
Date                    591 non-null object
Product_category        591 non-null object
Review_clean            591 non-null object
Review_original         591 non-null object
Sentiment               591 non-null int64
Source                  591 non-null object
pos                     591 non-null object
no_stop_words           591 non-null object
lemmatized_sentences    591 non-null object
dtypes: int64(1), object(9)
memory usage: 50.8+ KB


In [62]:
# create a column for each topic. Use the function defined above to find out 5-grams that contain the topic word in the middle. 
# we're using the original text for this, not the clean version.
# transform the data to str so that Vader can be applied to it

covid['covid'] = covid['Review_clean'].apply(lambda row: (ngram_filter(row,'covid', 3))).astype(str)
print(covid['covid'])

103084                                 []
103139                                 []
103361                                 []
103461                                 []
104050                                 []
                       ...               
184950                                 []
186583                                 []
187483    [('mild', 'headache', 'covid')]
192225                                 []
192285                                 []
Name: covid, Length: 591, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [63]:
covid['virus'] = covid['Review_clean'].apply(lambda row: (ngram_filter(row,'virus', 3))).astype(str)
print(covid['virus'])

103084                  [('action', 'bacteria', 'viruses')]
103139    [('many', 'realize', 'coronavirus'), ('coronav...
103361           [('practically', 'immune', 'coronavirus')]
103461                                                   []
104050                    [('certainly', 'works', 'virus')]
                                ...                        
184950                                                   []
186583                  [('squid', 'parasites', 'viruses')]
187483                                                   []
192225                      [('kills', 'germs', 'viruses')]
192285                [('nothing', 'worry', 'coronavirus')]
Name: virus, Length: 591, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [64]:
covid['corona'] = covid['Review_clean'].apply(lambda row: (ngram_filter(row,'corona', 3))).astype(str)
print(covid['corona'])

103084                                                   []
103139    [('many', 'realize', 'coronavirus'), ('people'...
103361           [('practically', 'immune', 'coronavirus')]
103461                       [('rye', 'whiskey', 'corona')]
104050                                                   []
                                ...                        
184950                                                   []
186583                                                   []
187483                                                   []
192225                                                   []
192285                [('nothing', 'worry', 'coronavirus')]
Name: corona, Length: 591, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


#### Sentimen Analysis on n-grams

In [65]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

For each topic column, create new columns in which you store the positive/ negative/ neutral sentiments determined by Vader.

Notice that the rows that don't have n-grams for a particular topic get a neutral value of 1. When you summarise the data, remove those instances.

In [66]:
#data_listerine['taste_sent'] = data_listerine['taste'].apply(analyzer.polarity_scores)
#covid
covid['covid_neg'] = [analyzer.polarity_scores(x)['neg'] for x in covid['covid']]
covid['covid_pos'] = [analyzer.polarity_scores(x)['pos'] for x in covid['covid']]
covid['covid_neutral'] = [analyzer.polarity_scores(x)['neu'] for x in covid['covid']]

#virus
covid['virus_neg'] = [analyzer.polarity_scores(x)['neg'] for x in covid['virus']]
covid['virus_pos'] = [analyzer.polarity_scores(x)['pos'] for x in covid['virus']]
covid['virus_neutral'] = [analyzer.polarity_scores(x)['neu'] for x in covid['virus']]
#corona
covid['corona_neg'] = [analyzer.polarity_scores(x)['neg'] for x in covid['corona']]
covid['corona_pos'] = [analyzer.polarity_scores(x)['pos'] for x in covid['corona']]
covid['corona_neutral'] = [analyzer.polarity_scores(x)['neu'] for x in covid['corona']]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

Sentiments on the entire review.

In [67]:
covid['Review_neg'] = [analyzer.polarity_scores(x)['neg'] for x in covid['Review_clean']]
covid['Review_pos'] = [analyzer.polarity_scores(x)['pos'] for x in covid['Review_clean']]
covid['Review_neu'] = [analyzer.polarity_scores(x)['neu'] for x in covid['Review_clean']]
covid.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Brand,Date,Product_category,Review_clean,Review_original,Sentiment,Source,pos,no_stop_words,lemmatized_sentences,...,covid_neutral,virus_neg,virus_pos,virus_neutral,corona_neg,corona_pos,corona_neutral,Review_neg,Review_pos,Review_neu
103084,Listerine,3/1/2019,total care mouthwash collection,brush away infection deep tooth necrotizing ti...,You can't brush away infection that is deep in...,1,reddit,"[(brush, VB), (away, RP), (infection, NN), (de...",brush away infection deep tooth necrotizing ti...,brush away infection deep tooth necrotizing ti...,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.229,0.068,0.702
103139,Listerine,3/1/2020,kids mouthwash collection,many realize coronavirus family viruses wuhan ...,@Gurdur Many did not realize coronavirus is a ...,1,twitter,"[(many, JJ), (realize, VBP), (coronavirus, NNS...",many realize coronavirus family viruses wuhan ...,many realize coronavirus family virus wuhan st...,...,1.0,0.173,0.0,0.827,0.223,0.0,0.777,0.347,0.08,0.573
103361,Listerine,3/1/2020,nightly reset,ding dong ding harder practically immune coron...,DING DONG DING.. I AM HARDER THAN YOU?. I AM P...,1,tumblr,"[(ding, VBG), (dong, JJ), (ding, VBG), (harder...",ding dong ding harder practically immune coron...,ding dong ding hard practically immune coronav...,...,1.0,0.0,0.524,0.476,0.0,0.524,0.476,0.0,0.31,0.69
103461,Listerine,3/10/2018,antiseptic mouthwash collection,strangely hungover however taste mouth like ch...,Strangely not hungover However the taste in my...,1,tumblr,"[(strangely, RB), (hungover, NN), (however, RB...",strangely hungover however taste mouth like ch...,strangely hungover however taste mouth like ch...,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.315,0.133,0.552
104050,Listerine,3/10/2020,sensitivity,commercial povidone mouthwash formulations use...,@AESCLEPIUS0 Commercial povidone mouthwash for...,1,twitter,"[(commercial, JJ), (povidone, NN), (mouthwash,...",commercial povidone mouthwash formulations use...,commercial povidone mouthwash formulation use ...,...,1.0,0.0,0.545,0.455,0.0,0.0,1.0,0.0,0.175,0.825


In [191]:
covid.to_csv('covid.csv')


In [68]:
# compute the average pos/neg/neu effect of each of the covid related n-gram categories we identified
covid['ave_pos'] = (covid['covid_pos'] + covid['virus_pos'] + covid['corona_pos'])/3
covid['ave_neg'] = (covid['covid_neg'] + covid['virus_neg'] + covid['corona_neg'])/3
covid['ave_neu'] = (covid['covid_neutral'] + covid['virus_neutral'] + covid['corona_neutral'])/3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [69]:
# determine how covid n-grams contributed to neg/pos/neutral reviwe sentiment
covid_sentiment = {sum(covid['ave_pos'])/len(covid), sum(covid['ave_neg'])/len(covid), sum(covid['ave_neu'])/len(covid)}
covid_sentiment

{0.045343485617597275, 0.08539311900733228, 0.8692639593908625}

In [70]:
# determine overall sentiment for reviews that contained "covid|virus|corona"
review_sentiment = {sum(covid['Review_pos'])/len(covid), sum(covid['Review_neg'])/len(covid), sum(covid['Review_neu'])/len(covid)}
review_sentiment

{0.13315905245346868, 0.14626903553299467, 0.720571912013536}

In [72]:
# Create a df that has all of the n-grams and counts. This is exported and used to create a visualization of these tri-grams
ngram_count = [covid['corona'].value_counts()] + [covid['virus'].value_counts()] + [covid['covid'].value_counts()]

In [73]:
ngram_count

[[]                                                                               409
 [('listerine', 'kills', 'coronavirus')]                                            6
 [('product', 'kill', 'coronavirus')]                                               3
 [('reduce', 'spread', 'coronavirus'), ('could', 'inactivate', 'coronavirus')]      3
 [('infection', 'new', 'coronavirus'), ('tested', 'strains', 'coronavirus')]        3
                                                                                 ... 
 [('mild', 'panic', 'corona')]                                                      1
 [('respiratory', 'kills', 'corona')]                                               1
 [('would', 'kill', 'coronavirus')]                                                 1
 [('throat', 'kill', 'corona')]                                                     1
 [('tested', 'strains', 'coronavirus')]                                             1
 Name: corona, Length: 158, dtype: int64,
 []         

In [74]:
ngram_df = pd.DataFrame(ngram_count).transpose()

In [75]:
ngram_df

Unnamed: 0,corona,virus,covid
[],409.0,192.0,405.0
"[('listerine', 'kills', 'coronavirus')]",6.0,6.0,
"[('product', 'kill', 'coronavirus')]",3.0,3.0,
"[('reduce', 'spread', 'coronavirus'), ('could', 'inactivate', 'coronavirus')]",3.0,,
"[('infection', 'new', 'coronavirus'), ('tested', 'strains', 'coronavirus')]",3.0,3.0,
...,...,...,...
"[('adverse', 'effect', 'covid'), ('posts', 'killing', 'covid')]",,,1.0
"[('quaratineandchill', 'coronavirus', 'covid')]",,,1.0
"[('may', 'activity', 'covid')]",,,1.0
"[('impact', 'ongoing', 'covid')]",,,1.0


In [199]:
ngram_df.to_csv('ngram_count.csv')