In [None]:
# from IPython.core.display import HTML
# HTML("""
# <style>
# .output_png {
#     display: table-cell;
#     text-align: center;
#     vertical-align: middle;
# }
# </style>
# """)

In [8]:
# Install a filter that ignores DeprecationWarning messages raised by later cells.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# NLP Introduction Notebook

In this notebook we'll go through a brief introduction to Natural Language Processing using Python.

## Python libraries for NLP

There are a few main libraries for NLP in Python: Wordcloud, spaCy and NLTK. We'll first install them in our environment in the following cell. Note that for spaCy we also download English language models; these will help us with Part-Of-Speech tagging, Named Entity Recognition, lemmatization, and more, which we'll explain further in this notebook.

Note: If the following cell is still not installing the libraries in your environment, you can run them from the terminal (without the '!').

You can read more about them in the following links:
- Spacy: https://spacy.io/api/doc/
- NLTK: https://www.nltk.org/
- Wordcloud: https://amueller.github.io/word_cloud/
- Yellowbrick: https://www.scikit-yb.org/en/latest/

Important: if there's any trouble opening this notebook, use: 'conda install -c conda-forge jupyter_contrib_nbextensions '

In [None]:
# Uncomment the following lines for installing the Spacy and NLTK libraries.
# ! pip install pandas
# ! pip install wordcloud
# ! pip install nltk
# ! pip install spacy
# ! pip install pyldavis
# ! pip install gensim
# ! pip install yellowbrick
# ! pip install vaderSentiment
# ! python -m spacy download en_core_web_sm
# ! python -m spacy download en_core_web_md
# ! python -m spacy download en_core_web_lg

Now we can import the libraries. We'll also be using pandas for data importation and manipulation, and matplotlib for visualization.

In [9]:
# Standard library
import random
import re
import time

# Data wrangling and visualization
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud
from yellowbrick.text import FreqDistVisualizer

# Text mining
from gensim import models, corpora
from gensim.models import LdaModel, CoherenceModel
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pyLDAvis.gensim
import spacy

from nltk.cluster import KMeansClusterer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer

# nltk.download('punkt') # Uncomment this line if using for the first time the NLTK library
# nltk.download('stopwords') # Uncomment this line if using for the first time the NLTK library

In this notebook we'll be working with the following dataset, which contains information about reviews on Amazon products: https://www.kaggle.com/datafiniti/consumer-reviews-of-amazon-products.

We will start by importing the '1429_1.csv' file which is the smallest of the three files downloaded from the Kaggle link above.

In [10]:
# Load the raw reviews file.
# low_memory=False makes pandas parse the file in a single pass and infer one dtype
# per column; the default chunked parsing appears to be what triggers the
# mixed-dtype warning emitted when this file is loaded.
df = pd.read_csv("../data/archive/1429_1.csv", encoding="utf-8", low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34660 entries, 0 to 34659
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    34660 non-null  object 
 1   name                  27900 non-null  object 
 2   asins                 34658 non-null  object 
 3   brand                 34660 non-null  object 
 4   categories            34660 non-null  object 
 5   keys                  34660 non-null  object 
 6   manufacturer          34660 non-null  object 
 7   reviews.date          34621 non-null  object 
 8   reviews.dateAdded     24039 non-null  object 
 9   reviews.dateSeen      34660 non-null  object 
 10  reviews.didPurchase   1 non-null      object 
 11  reviews.doRecommend   34066 non-null  object 
 12  reviews.id            1 non-null      float64
 13  reviews.numHelpful    34131 non-null  float64
 14  reviews.rating        34627 non-null  float64
 15  reviews.sourceURLs 

We can observe that the number of columns allows for multiple types of analysis of sales of Amazon products. For the sake of this notebook we'll focus on those that contain text information.

In [12]:
# Narrow the frame to the product descriptors and the review fields used below.
kept_columns = [
    "name",
    "brand",
    "categories",
    "manufacturer",
    "reviews.date",
    "reviews.rating",
    "reviews.text",
    "reviews.title",
]
df = df[kept_columns]

The dataset covers many products; we can see the most reviewed ones in the following cell.

In [13]:
# Count review texts per product name and show the five products with the most reviews.
(
    df[["name", "reviews.text"]]
    .groupby(["name"])
    .count()
    .sort_values(["reviews.text"], ascending=False)
    .head()
)

Unnamed: 0_level_0,reviews.text
name,Unnamed: 1_level_1
"Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes Special Offers, Magenta",10966
"Echo (White),,,\r\nEcho (White),,,",3309
"Amazon Kindle Paperwhite - eBook reader - 4 GB - 6 monochrome Paperwhite - touchscreen - Wi-Fi - black,,,",3176
"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi, 16 GB - Includes Special Offers, Magenta",2814
"Amazon Fire Tv,,,\r\nAmazon Fire Tv,,,",2527


We'll focus on only the rating, text and title fields.

In [14]:
# df_most_reviews = df[df["name"] == "Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes Special Offers, Magenta"]
# Keep the rating and the two free-text review fields, under shorter column names.
short_names = {"reviews.rating": "rating", "reviews.text": "text", "reviews.title": "title"}
df = df[list(short_names)].rename(columns=short_names)

Dropping NA values in any of the text or title fields

In [15]:
# Drop every row where either 'text' or 'title' is missing (how='any' over that subset).
df = df.dropna(axis=0, how='any', subset=['text', 'title'])

In case of wanting to run the notebook faster, optionally we can grab a sample with a smaller amount of rows.

In [None]:
# Optional speed-up: keep a random sample of 5000 rows.
# random_state=1 fixes the seed so the same rows are drawn on every run.
df = df.sample(n=5000, random_state=1)

The count of reviews grouped by rating:

In [None]:
# Number of non-null review texts for each rating value.
df[["rating", "text"]].groupby(["rating"]).count()

## Text Distribution

We can visualize the distribution of text lengths. To do so, we first need to compute the length of each of the analyzed texts (the code below measures length in characters).

In [None]:
# Length of each title and review body, measured in characters of its string form.
# Vectorized replacement for a row-by-row iterrows() loop: astype(str) mirrors the
# str(...) coercion of the loop, so non-string values are measured the same way.
df['title_count'] = df['title'].astype(str).str.len()
df['text_count'] = df['text'].astype(str).str.len()

In [None]:
# Preview the first rows, now including the title_count and text_count columns.
df.head()

Before moving on into more specific analysis, first we will set every text to lowercase.

In [None]:
# Lowercase both free-text columns in turn.
for column in ("text", "title"):
    df[column] = df[column].str.lower()

In the next plot, we can see that most reviews have a `title_count` of roughly 10 to 30 (note that `title_count` measures characters, not words).

In [None]:
# Apply seaborn's theme with a 12x8 default figure size, then draw one histogram of
# title_count per rating value (col="rating" gives one panel per rating).
# NOTE(review): displot is a figure-level function; its size is presumably driven by
# height/aspect rather than figure.figsize — confirm.
sns.set(rc={'figure.figsize':(12,8)})
sns.displot(df, x="title_count", col="rating")

In [None]:
# Histogram of title_count restricted to the reviews rated 5.0.
sns.set(rc={'figure.figsize': (12, 8)})
five_star_reviews = df[df['rating'] == 5.0]
sns.displot(five_star_reviews, x="title_count")

In the next plot, we can visualize the distribution for the body texts on the reviews.

In [None]:
# One histogram of text_count per rating value.
sns.displot(df, x="text_count", col="rating")

In [None]:
# Histogram of text_count restricted to the reviews rated 5.0.
five_star_reviews = df[df['rating'] == 5.0]
sns.displot(five_star_reviews, x="text_count")

In the plot above we can observe that some reviews are very long; we can confirm that these outlier reviews exist by drawing a box plot.

In [None]:
# Box plot of text_count for each rating value, to expose unusually long reviews.
sns.boxplot(x="rating", y="text_count", data=df)

In [None]:
# List the reviews whose text_count exceeds 5000.
df[df['text_count'] > 5000]

## Wordclouds

A wordcloud may give insight into which words are mentioned the most in a set of texts, as shown in the following example wordcloud:
<p></p>
<div>
<img src="wordcloud_example.jpg" width="500"/>
</div>

For the titles of each review we can obtain the wordcloud as shown below:

In [None]:
# Concatenate every review title into one whitespace-separated string.
title_text = " ".join(df["title"])
# The message reports words, so count whitespace-separated tokens;
# len(title_text) would report the number of characters instead.
print ("Hay {} palabras en titulos.".format(len(title_text.split())))

# Generate a word cloud image:
wordcloud = WordCloud().generate(title_text)

# Display the generated image:
plt.figure(figsize=(12,8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("WordCloud Titles")
plt.show()

More specifically, we could observe what the wordcloud would be like for each of the ratings in the reviews:

In [None]:
rating = 1.0
# Concatenate the titles of the reviews with the chosen rating into one string.
title_text = " ".join(df.loc[df["rating"] == rating, "title"])
# The message reports words, so count whitespace-separated tokens;
# len(title_text) would report the number of characters instead.
print ("Hay {} palabras en titulos.".format(len(title_text.split())))

# Generate a word cloud image:
wordcloud = WordCloud().generate(title_text)

# Display the generated image:
plt.figure(figsize=(12,8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("WordCloud Titles")
plt.show()

Following the same process, we can apply the same function to the reviews body texts.

In [None]:
# Concatenate every review body into one whitespace-separated string.
body_text = " ".join(df["text"])
# The message reports words, so count whitespace-separated tokens;
# len(body_text) would report the number of characters instead.
print ("Hay {} palabras en textos.".format(len(body_text.split())))

# Generate a word cloud image:
wordcloud = WordCloud().generate(body_text)

# Display the generated image:
plt.figure(figsize=(12,8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("WordCloud texts")
plt.show()

We can perform the same rating-specific approach as with the titles.

In [None]:
rating = 5.0
# Concatenate the bodies of the reviews with the chosen rating into one string.
body_text = " ".join(df.loc[df["rating"] == rating, "text"])
# The message reports words, so count whitespace-separated tokens;
# len(body_text) would report the number of characters instead.
print ("Hay {} palabras en textos.".format(len(body_text.split())))

# Generate a word cloud image:
wordcloud = WordCloud().generate(body_text)

# Display the generated image:
plt.figure(figsize=(12,8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("WordCloud texts")
plt.show()

## Most frequent words

Now we will introduce a couple of useful functions. 

We will begin by assigning the stopwords information to the `sw` variable.

In [None]:
# English stopword list from NLTK's stopwords corpus; used below to filter tokens.
sw = stopwords.words("english")

Using NLTK FreqDist

In [None]:
rating = 5.0
# Use the titles of one rating, or every title when rating is None.
titles = df["title"] if rating is None else df.loc[df["rating"] == rating, "title"]
words = nltk.word_tokenize(" ".join(titles))
# Set membership keeps the stopword test O(1) per token.
stop_set = set(sw)
# re.match anchors at the start, so this keeps tokens that begin with a letter or digit.
words = [word for word in words if word not in stop_set and re.match('([a-zA-Z0-9]+)', word)]
freqdist = nltk.FreqDist(words)
plt.figure(figsize=(12,8))
plt.title("Frecuencia de palabras en Title")
# Plot the 50 most frequent remaining tokens.
freqdist.plot(50)

Using Yellowbrick FreqDistVisualizer

In [None]:
# Load the text data: titles of one rating, or every title when rating is None.
rating = 1.0
corpus = df["title"] if rating is None else df.loc[df["rating"] == rating, "title"]

vectorizer = CountVectorizer(stop_words='english')
docs = vectorizer.fit_transform(corpus)
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name only on versions that lack it.
if hasattr(vectorizer, "get_feature_names_out"):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

visualizer = FreqDistVisualizer(
    features=features, size=(1080, 720)
)
visualizer.fit(docs)
visualizer.show()

For the text in the reviews

Using NLTK FreqDist

In [None]:
rating = 5.0
# Use the review bodies of one rating, or every body when rating is None.
texts = df["text"] if rating is None else df.loc[df["rating"] == rating, "text"]
words = nltk.word_tokenize(" ".join(texts))
# Set membership keeps the stopword test O(1) per token.
stop_set = set(sw)
# re.match anchors at the start, so this keeps tokens that begin with a letter or digit.
words = [word for word in words if word not in stop_set and re.match('([a-zA-Z0-9]+)', word)]
freqdist = nltk.FreqDist(words)
plt.figure(figsize=(12,8))
plt.title("Frecuencia de palabras en Text")
# Plot the 50 most frequent remaining tokens.
freqdist.plot(50)

Using Yellowbrick FreqDistVisualizer

In [None]:
# Load the text data: review bodies of one rating, or every body when rating is None.
rating = 5.0
corpus = df["text"] if rating is None else df.loc[df["rating"] == rating, "text"]

vectorizer = CountVectorizer(stop_words='english')
docs = vectorizer.fit_transform(corpus)
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name only on versions that lack it.
if hasattr(vectorizer, "get_feature_names_out"):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

visualizer = FreqDistVisualizer(
    features=features, size=(1080, 720)
)
visualizer.fit(docs)
visualizer.show()

## Sentiment Analysis

Ratings of 1.0, 2.0, 3.0 may be considered bad.
Ratings of 4.0, 5.0 may be considered good.

We begin by initializing the sentiment analyzer.

In [None]:
# Shared sentiment analyzer instance used by sentiment_analyzer_scores().
# NOTE(review): NLTK's VADER analyzer presumably needs the 'vader_lexicon' resource
# to be downloaded first — confirm.
analyser = SentimentIntensityAnalyzer()

In [None]:
def sentiment_analyzer_scores(sentence, show = False):
    """Return the polarity scores the module-level ``analyser`` assigns to ``sentence``.

    Args:
        sentence: Text handed to ``analyser.polarity_scores``.
        show: When true, also print the sentence (left-aligned and padded with '-'
            to 40 characters) followed by the scores.

    Returns:
        Whatever ``analyser.polarity_scores`` returns for the sentence.
    """
    polarity = analyser.polarity_scores(sentence)
    if not show:
        return polarity
    print(f"{sentence:-<40} {polarity!s}")
    return polarity

In [None]:
# Score one example sentence, printing it next to its scores (show=True),
# then display the type of the returned object.
score = sentiment_analyzer_scores("I think the movie was incredibly bad!!:(", True)
type(score)

In [None]:
# Score every review body once, then spread the four score components into columns.
sentiment_scores = [sentiment_analyzer_scores(text) for text in df['text']]

df['sentiment_neg'] = [score['neg'] for score in sentiment_scores]
df['sentiment_neu'] = [score['neu'] for score in sentiment_scores]
df['sentiment_pos'] = [score['pos'] for score in sentiment_scores]
df['sentiment_comp'] = [score['compound'] for score in sentiment_scores]

In [None]:
# Preview the first rows, now including the four sentiment_* columns.
df.head()

In [None]:
# Box plot of the compound sentiment score for each rating value.
sns.boxplot(data=df, x='rating', y='sentiment_comp')

## Tokenization

Tokenization is the process of demarcating and possibly classifying sections of a string of input characters. The resulting tokens are then passed on to some other form of processing. The process can be considered a sub-task of parsing input.

In [None]:
# Tokenize one example sentence with NLTK's word tokenizer and print the token list.
example_text = "The incredible tale of a man who formed an unlikely bond with an octopus ! ?"
tokens = nltk.word_tokenize(example_text)
print(tokens)

In [None]:
# Tokenize every review body; each row of tokens_text holds that review's token list.
lista_tokens = [nltk.word_tokenize(sentence) for sentence in df['text']]

df['tokens_text'] = lista_tokens

In [None]:
# Preview the token lists of the first reviews.
df['tokens_text'].head()

## Text Cleaning

Stemming and lemmatization are used to clean a dataset by either cutting the words down to a root form (stem), or by replacing them with the equivalent word that would be found in a dictionary (lemma).

Normalizing the format is important when dealing with alphabetical texts. There are further techniques that allow the cleaning to preserve, e.g., emojis.

In [None]:
# Normalize the review bodies in one chain: trim surrounding whitespace, lowercase,
# apply NFKD Unicode normalization, then drop every non-ASCII character by
# encoding to ASCII with errors ignored and decoding back to str.
# (The strip step was previously written twice; once is enough.)
df['text'] = (
    df['text']
    .str.strip()
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)

### Stemming

In [None]:
# Snowball stemmer configured for English; used to stem the review tokens below.
stemmer = SnowballStemmer(language='english')

In [None]:
# Stem every token of every review, drop empty stems, and join the rest with spaces.
lista_text_stem = [
    " ".join(stem for stem in map(stemmer.stem, tokens) if len(stem) > 0)
    for tokens in df['tokens_text']
]

df['text_stem'] = lista_text_stem

In [None]:
# Preview the stemmed text of the first reviews.
df['text_stem'].head()

### Lemmatization

In [None]:
# Load the spaCy 'en_core_web_md' pipeline; later cells use it for lemmas, POS tags and entities.
nlp = spacy.load('en_core_web_md')

In [None]:
demo_text = "The red fox jumps through the wall and hides beneath the tree."

# One line per token: its text, lemma, part-of-speech tag and entity type,
# each left-aligned in a fixed-width column.
for token in nlp(demo_text):
    columns = (
        f"{'token: ' + str(token):<15}",
        f"{'lemma: ' + token.lemma_:<15}",
        f"{'pos: ' + token.pos_:<10}",
        f"{'ent: ' + token.ent_type_:<10}",
    )
    print(" - ".join(columns))

In [None]:
demo_text = "Michael Jordan talked with Nike about a new deal worth $1B USD! #money #basketball"

# One line per token: its text, lemma, part-of-speech tag and entity type,
# each left-aligned in a fixed-width column.
for token in nlp(demo_text):
    columns = (
        f"{'token: ' + str(token):<15}",
        f"{'lemma: ' + token.lemma_:<15}",
        f"{'pos: ' + token.pos_:<10}",
        f"{'ent: ' + token.ent_type_:<10}",
    )
    print(" - ".join(columns))

In [None]:
# Lemmatize every review body with spaCy (NER and parser disabled) and keep only
# lemmas that are non-empty, not the '-PRON-' placeholder, not stopwords, and
# whose original token starts with a letter.
stop_set = set(sw)  # set membership keeps the stopword test O(1) per token
starts_with_letter = re.compile('([a-zA-Z]+)')

lista_text_lemma = []
# Iterate the pipe lazily instead of materializing every parsed document first.
for sentence in nlp.pipe(df['text'], disable=['ner', 'parser']):
    lemmas = [
        token.lemma_
        for token in sentence
        if len(token.lemma_) > 0
        and token.lemma_ != '-PRON-'
        and token.lemma_ not in stop_set
        and starts_with_letter.match(token.text) is not None
    ]
    lista_text_lemma.append(" ".join(lemmas))

df['text_lemma'] = lista_text_lemma

In [None]:
# Preview the lemmatized text of the first reviews.
df['text_lemma'].head()

In [None]:
# Persist the frame (without the index) so the cleaned columns can be reused.
df.to_csv("../data/results.csv", index=False)

## Topic Modeling

What are the main topics of the reviews?
Quick LDA and pyLDAvis

In [None]:
# Token lists of the documents used for the most recent train_lda() call.
# Kept at module level because later cells pass it to CoherenceModel.
tokenized = []
def train_lda(data, num_topics=10, random_state=None):
    """Train a gensim LDA model on the ``text_lemma`` column of ``data``.

    The lemmatized texts are tokenized with the module-level spaCy pipeline
    ``nlp``; a dictionary and a bag-of-words corpus are built from those tokens
    and an ``LdaModel`` is fitted with 2 passes and a chunk size of 300.

    Side effect: the module-level ``tokenized`` list is emptied and refilled in
    place with the token lists of ``data``, so repeated calls no longer
    accumulate duplicates and other cells keep seeing the same list object.

    Args:
        data: DataFrame with a ``text_lemma`` column of strings.
        num_topics: Number of topics to fit.
        random_state: Optional seed forwarded to ``LdaModel`` for reproducible
            topics; ``None`` keeps the previous unseeded behaviour.

    Returns:
        Tuple ``(dictionary, corpus, lda)``.
    """
    chunksize = 300
    t1 = time.time()
    tokenized.clear()
    # Iterate the pipe lazily; read from the `data` argument rather than a global frame.
    for doc in nlp.pipe(data['text_lemma'], disable=['ner', 'parser']):
        tokenized.append([token.text for token in doc])
    dictionary = corpora.Dictionary(tokenized)
    corpus = [dictionary.doc2bow(doc) for doc in tokenized]
    # low alpha means each document is only represented by a small number of topics, and vice versa
    # low eta means each topic is only represented by a small number of words, and vice versa
    lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                   alpha=1e-2, eta=0.5e-2, chunksize=chunksize, minimum_probability=0.0, passes=2,
                   random_state=random_state)
    t2 = time.time()
    print("Time to train LDA model on ", len(data), "articles: ", (t2-t1)/60, "min")
    return dictionary,corpus,lda

In [None]:
# Fit a 6-topic LDA model on the reviews; keep the dictionary and corpus for later cells.
dictionary,corpus,lda = train_lda(df, num_topics=6)

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda, texts=tokenized, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda, texts=tokenized, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Sample document used to inspect the model. It is defined here explicitly because
# `text` is otherwise only assigned by a later cell, which breaks a top-to-bottom run.
text = df['text_lemma'].iloc[0]
text

In [None]:
# Topic distribution of one sample document. The sample is (re)defined here so the
# cell does not depend on a `text` variable leaked from a later loop.
text = df['text_lemma'].iloc[0]
bow = dictionary.doc2bow(text.split())
t = lda.get_document_topics(bow)
t

In [None]:
# Assign each review its highest-scoring LDA topic.
# The dictionary was built from the lemmatized texts, so inference uses text_lemma
# too; raw review text would mostly map to tokens the dictionary does not contain.
list_topics = []
list_topic_scores = []
for text in df['text_lemma']:
    bow = dictionary.doc2bow(text.split())
    t = lda.get_document_topics(bow)
    # Each entry of t is a (topic_id, score) pair; take the best pair in one pass.
    topic, topic_score = max(t, key=lambda pair: pair[1])
    list_topics.append(topic)
    list_topic_scores.append(topic_score)
df['topicLDA'] = list_topics
df['topicLDA_score'] = list_topic_scores

In [None]:
# Persist the frame again (without the index), now including the topicLDA columns.
df.to_csv("../data/results.csv", index=False)

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
vis

## Clustering

K-Means with silhouette score and more.

In [None]:
# Replace missing lemma strings with a blank so the vectorizer receives only text.
df['text_lemma'] = df['text_lemma'].fillna(' ')
# binary=True records presence/absence of each term rather than its count.
encode = CountVectorizer(binary=True)
freqs = encode.fit_transform(df['text_lemma'])
# Densify once and split into one 1-D vector per document, instead of converting
# the sparse matrix row by row.
vect = list(freqs.toarray())

In [None]:
# 5-cluster k-means with Euclidean distance.
# NLTK only uses `rng` when it is truthy, so rng=0 silently fell back to an unseeded
# generator; pass a seeded random.Random instance to make the clustering reproducible.
model = KMeansClusterer(5, nltk.cluster.util.euclidean_distance, avoid_empty_clusters=True, rng=random.Random(0))

In [None]:
# Cluster the document vectors; trace=True asks the clusterer to report its progress.
# NOTE(review): the second positional argument (True) is presumably assign_clusters,
# making `clusters` one label per vector — confirm.
clusters = model.cluster(vect, True, trace=True)

In [None]:
# Summarize the clustering instead of dumping every document vector and label,
# which floods the cell output.
print("Clustered:", len(vect), "vectors")
print("As:", clusters[:20], "...")
print("Means:", model.means())

In [None]:
# Store each review's cluster label (the column name is spelled 'clusters_eucliean')
# and persist the frame without the index.
df['clusters_eucliean'] = clusters 
df.to_csv("../data/results.csv", index=False)

In [None]:
from sklearn.cluster import KMeans
from yellowbrick.cluster import SilhouetteVisualizer
import numpy

# Bag-of-words counts of the lemmatized reviews; X_bow is reused by the elbow plot.
bow = CountVectorizer()
X_bow = bow.fit_transform(df.text_lemma)

# Silhouette plot for a 5-cluster k-means with a fixed seed.
visualizer = SilhouetteVisualizer(KMeans(n_clusters=5, random_state=0))
visualizer.fit(X_bow)
# show() replaces the deprecated poof() and matches the other Yellowbrick cells.
visualizer.show()

In [None]:
from yellowbrick.cluster import KElbowVisualizer

# Silhouette score of k-means for every k from 3 to 20, on the bag-of-words matrix.
visualizer = KElbowVisualizer(KMeans(random_state=0), metric='silhouette', k=range(3, 21, 1))
visualizer.fit(X_bow)
# show() replaces the deprecated poof() and matches the other Yellowbrick cells.
visualizer.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

from yellowbrick.text import TSNEVisualizer

# Load the data and create document vectors
tfidf = TfidfVectorizer()

X = tfidf.fit_transform(df.text)
# Colour the points by the k-means labels. 'clusters_eucliean' is the only
# cluster-label column this notebook creates; 'clusters_nltk_cos' is never
# defined, so reading it raised AttributeError.
y = df['clusters_eucliean']

# Create the visualizer and draw the vectors
tsne = TSNEVisualizer()
tsne.fit(X, y)
tsne.show()