In [None]:
import re,string,unicodedata
from string import punctuation

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from scipy import sparse
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from wordcloud import WordCloud,STOPWORDS
from bs4 import BeautifulSoup
import keras
from keras.layers import Dense,LSTM
from keras.models import Sequential

In [None]:
# Load the musical-instrument reviews from a path relative to the notebook.
df = pd.read_csv("../input/amazon-music-reviews/Musical_instruments_reviews.csv")

In [None]:
# Preview the first rows of the raw data.
df.head()

Let's get the exact column names, so that we avoid mismatches when referring to them by string:

In [None]:
# List the column labels exactly as they are spelled in the file.
df.columns

Checking for missing ("Not a Number", NaN) values is always good practice before any data manipulation:

In [None]:
# Count the missing values in each column.
df.isna().sum()

The `reviewText` column has 27 NaN values. Let's replace them with an empty string "" using the *fillna()* method:

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# attribute: the in-place form works on an intermediate Series, which pandas
# warns about and which leaves `df` untouched under copy-on-write.
df["reviewText"] = df["reviewText"].fillna("")

Now, as we are only interested in predicting a user's rating from their review, we can delete all the other columns from our data:

In [None]:
# Remove, one by one, every column that is not used to predict the rating.
for column in ('reviewerID', 'asin', 'reviewerName', 'helpful', 'unixReviewTime', 'reviewTime'):
    del df[column]

This is our new data:

In [None]:
# Preview the data after dropping the unused columns.
df.head()

In [None]:
# Bucket the star rating into three classes: >= 4 is "good", exactly 3 is
# "neutral", everything else is "bad".
df["quality"] = [
    "good" if stars >= 4 else "neutral" if stars == 3 else "bad"
    for stars in df["overall"]
]
# Integer encoding of the same classes (good=2, neutral=1, bad=0).
df["strQuality"] = df["quality"].map({"good": 2, "neutral": 1, "bad": 0})

In [None]:
# Preview the data with the new "quality" and "strQuality" columns.
df.head()

As the "summary" column can add useful information about the overall review, we will simply merge it with the "reviewText" column.

In [None]:
# Concatenate review body and summary into a single "text" column.
# A missing summary is treated as an empty string: otherwise the concatenation
# yields NaN for that row and the later string processing of "text" fails.
df['text'] = df['reviewText'] + ' ' + df['summary'].fillna("")
del df['reviewText']
del df['summary']

Now let's get some statistics: how many users rated their purchase 5.0? How many 1.0? We can find out simply by running *value_counts()* on the desired column:

In [None]:
# Number of reviews per star rating.
df.overall.value_counts()

In [None]:
# Share of each star rating, as a percentage of all reviews.
rating_counts = df.overall.value_counts()
n_reviews = len(df.overall)
for stars, count in rating_counts.items():
    print(f"Percentage of {stars} stars : {(count*100/n_reviews):.2f}")

We can see that most users rated their purchase with 5 stars (68%) and only a few with 1 star (2%).

To perform NLP, it is useful to remove unnecessary words and symbols that only overload our model without adding meaningful information (this is the case for punctuation and for some connecting words). These words are called *STOPWORDS*, and NLTK ships a pre-built list of them for each language. So first we get that list and store it in our variable "stop". Next, we build a similar list, this time with the punctuation characters. Finally, we update our "stop" set with this punctuation. We build this "stop" set because we need to remove its entries from our reviews in order to clean them up before training!

In [None]:
# Punctuation characters are treated as stop words too, so build the set from
# the English stop-word list plus every character of string.punctuation.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')).union(punctuation)

Take a look at our "stop" list so far:

In [None]:
# Display the current stop-word set (English stop words plus punctuation).
stop

At this point you may be asking whether some of those words could be RELEVANT to understanding the user's review! Well, it depends on how you are going to analyse the data. If you want to use an LSTM, we may lose important information if we just delete the "binding" words. As we will also train an LSTM model, I'll create a list called "rem" with the words to take back out of my "stop" list, because they could be meaningful for the LSTM model.

In [None]:
# Negations (and "but") that are later taken back out of the stop-word set.
# NOTE(review): "should" is listed where "shouldn" might have been intended - confirm.
rem = [
    "aren't", "aren", "but",
    "couldn", "couldn't",
    "don", "don't",
    "didn", "didn't",
    "doesn", "doesn't",
    "wouldn", "wouldn't",
    "won", "won't",
    "weren", "weren't",
    "wasn", "wasn't",
    "should", "shouldn't",
    "needn", "needn't",
    "mustn", "mustn't",
    "mightn", "mightn't",
    "isn", "isn't",
    "haven", "haven't",
    "hasn", "hasn't",
    "hadn", "hadn't",
    "not", "no",
]

In [None]:
def get_simple_pos(tag):
    """Map a POS tag (as returned by ``pos_tag``) to a WordNet POS constant.

    Only the first letter of ``tag`` is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag, including
    an empty one, falls back to noun.
    """
    # Built on each call so the function only needs `wordnet` at call time.
    by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return by_initial.get(tag[:1], wordnet.NOUN)

We'll *LEMMATIZE*!!! What??!! Yes: lemmatization is the process of reducing the inflected forms of a word to a common base form (the lemma), so that they are treated as the same word. For instance, "rocks" becomes "rock" and "knowing" becomes "know". This is a way to shrink our vocabulary.

In [None]:
# Single shared lemmatizer instance, used by lemmatize_words below.
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """Remove stop words from ``text`` and lemmatize the remaining tokens.

    ``text`` is split on whitespace. A token whose lower-cased form is in the
    global ``stop`` set is dropped; every other token is POS-tagged on its own,
    lemmatized with the matching WordNet POS and lower-cased.

    Returns the surviving lemmas joined by single spaces ("" if none survive).
    """
    kept = []
    for token in text.split():
        if token.lower() in stop:
            continue
        # The token is tagged in isolation, i.e. without its sentence context.
        tag = pos_tag([token])[0][1]
        lemma = lemmatizer.lemmatize(token, get_simple_pos(tag))
        kept.append(lemma.lower())
    return " ".join(kept)

Lets check it with a small example. I create a list "z" with 3 sentences. 

In [None]:
# Three toy sentences used to demonstrate lemmatization and vectorization.
z = [
    "I don't knowing know know, but don't care",
    "I would like you know",
    "Don't care care cares",
]

So, when we call "lemmatize_words(each)", we first drop the "stop" words and then build our new lemmatized phrases:

In [None]:
# Show the cleaned, lemmatized version of every toy sentence.
for each in z:
    print("our r : ", lemmatize_words(each))

You can see that in our 1st sentence, "knowing" and "know" are treated as the same word - "know". The 3rd one shows that "care" and "cares" are both seen as "care".

If you want to take a deeper look at what these reviews look like, uncomment the 2 cells below (for the "good" and "neutral" ratings respectively).

In [None]:
# for i in range(100):
#     if df.loc[i,"quality"] == "good":
#         print(i,"\n", df.loc[i,"text"])

In [None]:
# for i in range(100):
#     if df.loc[i,"quality"] == "neutral":
#         print(i,"\n", df.loc[i,"text"])

Let's apply it to our "text" column (it can take some time, as the dataset is not that small):

In [None]:
# Replace every review by its stop-word-free, lemmatized version.
df["text"] = df["text"].apply(lemmatize_words)

Check the data now:

In [None]:
# Preview the lemmatized "text" column.
df.head()

Now let's collect the reviews of each possible rating class ("good", "neutral" and "bad"). (This will be useful for the word clouds that come next):

In [None]:
# One Series of review texts per rating class.
good = df.loc[df["quality"] == "good", "text"]
neutral = df.loc[df["quality"] == "neutral", "text"]
bad = df.loc[df["quality"] == "bad", "text"]
# Number of reviews in each class.
good.shape, bad.shape, neutral.shape


#  Plotting WordClouds: 

In [None]:
# Subplot slot -> [class label, reviews of that class]. The frequency plots in
# the following cells iterate over `qual` as well.
qual = {0 : ["neutral",neutral], 1 : ["bad", bad], 2 : ["good",good]}

# One word cloud per class, side by side.
fig, axes = plt.subplots(1, 3, figsize=(20,30))
for i, ax in enumerate(axes):
    label, reviews = qual[i]
    wc = WordCloud(min_font_size = 3,  max_words = 3000 , width = 1600 , height = 800).generate(" ".join(reviews))
    ax.imshow(wc,interpolation = 'bilinear')
    # Label the axes object itself instead of relying on pyplot's current axes.
    ax.set_xlabel(label)

A word cloud is a cool and interesting way to visualize word frequencies in a dataset. But let's be more precise and look at histograms, because from a word cloud we cannot tell exactly "how much" more or less frequent a word is, either within one class or compared to the others.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

from yellowbrick.text import FreqDistVisualizer
from yellowbrick.datasets import load_hobbies

# Top-10 single-word frequencies for each rating class.
for i in range(3):
    label, corpus = qual[i]
    fig = plt.figure(figsize=(15,3))
    vectorizer = CountVectorizer()
    docs       = vectorizer.fit_transform(corpus)
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() (available since 1.0) replaces it.
    features   = list(vectorizer.get_feature_names_out())

    # The title is passed as a plain string (it used to be wrapped in a list).
    visualizer = FreqDistVisualizer(features=features, orient='v',n=10, title="Frequency of 10 words for : " + label)
    visualizer.fit(docs)
    visualizer.show()

What if we consider "words" made of 2 to 3 consecutive words (n-grams)? This can be achieved by changing a parameter of the CountVectorizer class (further explanation can be found in the next steps):

In [None]:
# Top-10 frequencies of 2- and 3-word n-grams for each rating class.
for i in range(3):
    label, corpus = qual[i]
    fig = plt.figure(figsize=(15,3))
    # The integer min_df=0 was dropped: recent scikit-learn rejects it, and the
    # default (min_df=1) selects exactly the same vocabulary.
    vectorizer = CountVectorizer(binary=False,ngram_range=(2,3)) ### We changed this parameter!!
    docs       = vectorizer.fit_transform(corpus)
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() (available since 1.0) replaces it.
    features   = list(vectorizer.get_feature_names_out())

    # The title is passed as a plain string (it used to be wrapped in a list).
    visualizer = FreqDistVisualizer(features=features, orient='v',n=10, title="Frequency of 10 words for : " + label)
    visualizer.fit(docs)
    visualizer.show()

We can observe that "work well" is not a good n-gram for distinguishing between the ratings, as it appears with high frequency in all 3 classes, especially because our data is very UNBALANCED. Could we penalize it when training our model?

# CountVectorizer and TF-IDF:

Now we're going to transform our sentences into a matrix!!! Yes!! This is how we are going to treat those complex words expressed by humans: with NUMBERS!! For that we use the "CountVectorizer" class!

Take a look at our small example from our "z" list of 3 sentences that we created previously:

In [None]:
#### Remember our "z" list:
#### z = ["I don't knowing know know, but don't care","I would like you know","Don't care care cares"]
cvz=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(2,3))
#cv=CountVectorizer(ngram_range=(2,3))

cv_testz=cvz.fit_transform(z)
cvz.get_feature_names() ### Take a look at what this method does:

You can play with the *ngram_range* parameter and see what happens!! Roughly, it tells the vectorizer to consider only n-grams made of 2 to 3 consecutive words!! Next, let's see what that represents in terms of numbers and matrices, and how the ML model will see our data:

In [None]:
# Dense view of the count matrix: one row per sentence of `z`, one column per n-gram.
cv_testz.toarray()

So, for our 1st sentence (the first row of our matrix), it says that the first n-gram in our bag, "but don", appears in the sentence 1 time - this is the first "1" of the row. Next, it says that the second n-gram, "but don care", also appears once. The third n-gram, "care care", doesn't appear at all: "care" appears, but NOT "care care". And so on...

What if we want to emphasize part of the vocabulary? Let's say that the combination "you know" is important. To highlight it, we can increase its weight. To do so, let's first look up the column index of "you know" in our count matrix:

In [None]:
# Column index of the n-gram "you know" in the count matrix.
cvz.vocabulary_["you know"]

Ok, so now lets say to our matrix that we want a doubled weight to its value. So lets multiply its value by 2:

In [None]:
# Look the column up instead of hard-coding its index, so the cell stays
# correct if the toy sentences or the vectorizer settings change.
you_know_idx = cvz.vocabulary_["you know"]

a = cv_testz.toarray()
# Double the weight of "you know" in every sentence at once.
a[:, you_know_idx] *= 2
display(cv_testz.toarray())
display(a)

Ok, we can observe that the last element of the 2nd row changed to 2. The others didn't, because multiplying any number by ZERO is still ZERO nowadays... -,-

What about TfidfVectorizer???

In [None]:
#z = ["I don't know know know, but don't care","I like you know","Don't care care care"]
tvt=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(2,3))
#transformed reviews
tvt_test=tvt.fit_transform(z)

tvt.get_feature_names()

So far, we don't see any difference between CountVectorizer and TfidfVectorizer. But what about our matrix?

In [None]:
# Dense view of the TF-IDF matrix built from the three sentences of `z`.
tvt_test.toarray()


Quite different!! This is because TF-IDF multiplies the frequency of each term in a document by its inverse document frequency (IDF) and then normalizes each row. It's a way to normalize our data! So, the more documents a "word" appears in, the lower its IDF, and so the lower its score. Conversely, words that are specific to only a few documents get a higher score.

Now that we have prepared our data, it's time to TRAIN our models!!! YEAHHH

# Logistic Regression!

First, lets split our data!

In [None]:
# Hold out 20% of the reviews for testing; the seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df["text"],
    df["quality"],
    test_size=0.2,
    random_state=0,
)

In [None]:
# Bag-of-words counts over 2- and 3-word n-grams. The vocabulary is learned on
# the training reviews only and then applied to the test reviews.
# The integer min_df=0 was dropped: recent scikit-learn rejects it, and the
# default (min_df=1) selects exactly the same vocabulary.
cv=CountVectorizer(binary=False,ngram_range=(2,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(x_train)
#transformed test reviews
cv_test_reviews=cv.transform(x_test)
print('cv_train:',cv_train_reviews.shape)
print('cv_test:',cv_test_reviews.shape)

# TF-IDF weights over the same n-gram range, fitted the same way.
tv=TfidfVectorizer(use_idf=True,ngram_range=(2,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(x_train)
#transformed test reviews
tv_test_reviews=tv.transform(x_test)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)


In [None]:
# Hyper-parameters shared by both models.
lr_params = dict(penalty='l2', max_iter=500, C=1, random_state=0)

# Fit one estimator per feature set. Re-fitting a single estimator makes both
# names point at the same (last-fitted, TF-IDF) model, so the bag-of-words
# predictions would silently come from weights trained on TF-IDF features.
lr_bow = LogisticRegression(**lr_params).fit(cv_train_reviews, y_train)
print(lr_bow)

lr_tfidf = LogisticRegression(**lr_params).fit(tv_train_reviews, y_train)
print(lr_tfidf)

# `lr` used to end up as the TF-IDF-fitted model; keep the name available.
lr = lr_tfidf

#Predicting
lr_bow_predict = lr_bow.predict(cv_test_reviews)

lr_tfidf_predict = lr_tfidf.predict(tv_test_reviews)

#Accuracy score
lr_bow_score=accuracy_score(y_test,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)

lr_tfidf_score=accuracy_score(y_test,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

We got a score of 89%.

In [None]:
#report
lr_bow_report=classification_report(y_test,lr_bow_predict,target_names=['good','neutral','bad'])
print(lr_bow_report)


lr_tfidf_report=classification_report(y_test,lr_tfidf_predict,target_names=['good','neutral','bad'])
print(lr_tfidf_report)

In [None]:
# Per-class precision/recall/F1 of both logistic-regression models.
# The classes are passed through `labels`, so every row is computed for the
# class it is named after. `target_names` alone only renames the rows of the
# *sorted* labels (bad, good, neutral) and therefore mislabelled all three.
report_labels = ['good','neutral','bad']

lr_bow_report=classification_report(y_test,lr_bow_predict,labels=report_labels)
print(lr_bow_report)


lr_tfidf_report=classification_report(y_test,lr_tfidf_predict,labels=report_labels)
print(lr_tfidf_report)

Before moving on to the LSTM model, let's see what happens if we change the weight of some n-grams: "sound like", "planet waves", "they re" and "sound good".

In [None]:
# Column index of the n-gram "sound good" in the bag-of-words matrix.
cv.vocabulary_["sound good"]

In [None]:
def tuning(pen, inc):
    """Re-weight a few hand-picked n-gram columns of the bag-of-words matrices.

    The counts of "work well" are floor-divided by ``pen`` (penalized); the
    counts of the other n-grams in ``voc`` are multiplied by ``inc``.

    Reads the notebook globals ``cv`` (fitted vectorizer), ``cv_train_reviews``
    and ``cv_test_reviews``; none of them is modified. The matrices stay sparse
    throughout, so the full n-gram matrix is never converted to a dense array,
    and the dense globals ``tr`` / ``te`` are no longer created.

    Parameters
    ----------
    pen : number
        Divisor applied to the "work well" counts.
    inc : number
        Multiplier applied to the counts of the other selected n-grams.

    Returns
    -------
    (tr_sm, te_sm)
        Re-weighted train and test matrices in CSR format.

    Raises
    ------
    KeyError
        If one of the selected n-grams is not in ``cv.vocabulary_``.
    """
    # Local import: the function works even if `sparse` is not imported above.
    from scipy import sparse

    voc = ["planet waves", "sound like", "work well", "sound good", "they re"]
    penalized = "work well"

    def _reweight(counts):
        """Return a re-weighted CSR copy of ``counts``."""
        # CSC stores the non-zero values of a column contiguously, so a whole
        # column can be rescaled through one slice of `.data`.
        weighted = sparse.csc_matrix(counts, copy=True)
        for term in voc:
            idx = cv.vocabulary_[term]
            lo, hi = weighted.indptr[idx], weighted.indptr[idx + 1]
            column = weighted.data[lo:hi]
            # Slice assignment casts the result back to the matrix dtype.
            weighted.data[lo:hi] = column // pen if term == penalized else column * inc
        # Floor division can turn a stored count into 0; drop those entries.
        weighted.eliminate_zeros()
        return weighted.tocsr()

    return _reweight(cv_train_reviews), _reweight(cv_test_reviews)

In [None]:
# `tr_sm` only exists once tuning() has returned it, so build it here first
# (pen=2, inc=2 is the first configuration of the search loop that follows).
tr_sm, te_sm = tuning(2, 2)
display(tr_sm)
display(cv_train_reviews)

In [None]:
# Accuracy of the bag-of-words model for every (inc, pen) re-weighting tried.
results = {}

for inc in range(2,23,10):      # inc takes the values 2, 12, 22
    for pen in range(2,3):      # pen takes the single value 2
        tr_sm, te_sm = tuning(pen,inc)
        # Dedicated names: the baseline results (lr_bow, lr_bow_predict,
        # lr_bow_score) are no longer overwritten, so the earlier report cells
        # can still be re-run after this search.
        tuned_lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=0) ## We are already penalizing!

        tuned_lr.fit(tr_sm,y_train)
        print(tuned_lr)

        tuned_predict=tuned_lr.predict(te_sm)

        #Accuracy score
        tuned_score=accuracy_score(y_test,tuned_predict)
        print("lr_bow_score :",round(tuned_score,6))
        mod = "Model: increase " + str(inc) + ", penalize in " + str(pen)
        results[mod] = round(tuned_score,6)

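This didn't improve our performance much (only about 0.002). As we have a very unbalanced dataset, re-weighting a handful of n-grams is probably not enough to change the predictions much.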

In [None]:
# Accuracy obtained for each (increase, penalize) configuration.
results

# LSTM

As we said previously, I'll remove certain words (those in the "rem" list) from "stop". To do that, we'll reload the raw data and then discard each of those words from "stop":

In [None]:
# Reload the raw reviews and repeat the preparation steps, so that the text can
# be lemmatized again with the reduced stop-word set.
df = pd.read_csv("../input/amazon-music-reviews/Musical_instruments_reviews.csv")
# Assign the filled column back: fillna(inplace=True) on the attribute works on
# an intermediate Series and leaves `df` untouched under copy-on-write.
df["reviewText"] = df["reviewText"].fillna("")
del df['reviewerID']
del df['asin']
del df['reviewerName']
del df['helpful']
del df['unixReviewTime']
del df['reviewTime']

df["quality"] = df.loc[:,"overall"].apply(lambda x : "good" if x >= 4 else ("neutral" if x==3 else "bad" ))
df["strQuality"] = df.loc[:,"quality"].apply(lambda x : 2 if x == "good" else (1 if x== "neutral" else 0 ))

# A missing summary is treated as an empty string: otherwise the concatenation
# yields NaN for that row and the later string processing of "text" fails.
df['text'] = df['reviewText'] + ' ' + df['summary'].fillna("")
del df['reviewText']
del df['summary']

In [None]:
# Preview the reloaded data (text not lemmatized yet).
df.head()

In [None]:
# Take every word listed in `rem` out of the stop-word set; words that are not
# in the set are ignored.
stop.difference_update(rem)
stop ## check the new list to see if it's smaller:

In [None]:
# Lemmatize again, this time with the reduced stop-word set.
df["text"] = df["text"].apply(lemmatize_words)

In [None]:
def final(X_data_full, inc=22, pen=2):
    """Vectorize the texts, re-weight selected n-grams and return dense TF-IDF.

    Steps: count the 1000 most frequent 2- and 3-word n-grams, floor-divide the
    "work well" counts by ``pen``, multiply the counts of the other n-grams in
    ``voc`` by ``inc``, then apply a TF-IDF transform.

    Parameters
    ----------
    X_data_full : iterable of str
        The documents to vectorize.
    inc : number, default 22
        Multiplier for the boosted n-grams. It used to be read from a global
        ``inc``; 22 is the value that global holds after the tuning loop.
    pen : number, default 2
        Divisor for the "work well" counts.

    Returns
    -------
    numpy.ndarray
        Dense TF-IDF matrix with one row per document.
    """
    # Local import: the function works even if `sparse` is not imported above.
    from scipy import sparse

    # The integer min_df=0 was dropped: recent scikit-learn rejects it, and the
    # default (min_df=1) selects exactly the same vocabulary.
    cv = CountVectorizer(max_features=1000, ngram_range =(2,3))
    full = cv.fit_transform(X_data_full).toarray()
    print("our full: ", full)

    voc = ["planet waves", "sound like", "work well", "sound good", "they re"] ## work well we're going to penalize

    for each in voc:
        idx = cv.vocabulary_.get(each)
        if idx is None:
            # With max_features an n-gram may be missing. Skip only that one
            # instead of silently abandoning the rest of the re-weighting.
            print(f"skipping {each!r}: not among the vectorizer's features")
            continue
        if each == "work well": #### PENALIZING
            full[:, idx] = full[:, idx] // pen
        else: ##### INCREASING THE WEIGHT
            full[:, idx] = full[:, idx] * inc
    full_sm = sparse.csr_matrix(full)

    tfidf = TfidfTransformer()
    X_data_full_tfidf = tfidf.fit_transform(full_sm).toarray()

    return X_data_full_tfidf
    

In [None]:
# Feature matrix for the LSTM section: the output of final() on the lemmatized reviews.
x = final(df.text)

Note: for our y data we take the "strQuality" column, because the LSTM model is trained with a sparse categorical cross-entropy loss. So we need our target as integer class ids.

In [None]:
# Same 80/20 split and seed as before, now with the integer class ids as target.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    df["strQuality"],
    test_size=0.2,
    random_state=0,
)

In [None]:
# Alias of the training features; the model is fitted on `XX` below.
XX = x_train

In [None]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

# Size of each embedding vector and number of rows in the embedding table.
embedding_size=32
max_words=5000

# Embedding -> two stacked bidirectional LSTMs -> dense classifier head.
model = Sequential()
# input_length=1000 equals max_features=1000 used by final().
# NOTE(review): the rows of x_train are TF-IDF weights, not integer token ids -
# confirm that this is the input the Embedding layer is meant to receive.
model.add(Embedding(max_words, embedding_size, input_length=1000 )) # was: x_train.shape[0]
# The first LSTM returns the full sequence so that the second LSTM can consume it.
model.add(Bidirectional(LSTM(16, return_sequences = True)))
model.add(Bidirectional(LSTM(16)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
# Three output units: one per class id in "strQuality" (0, 1, 2).
model.add(Dense(3,activation='softmax'))

print(model.summary())

In [None]:
# The sparse loss takes the integer class ids in y_train directly.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# One-hot version of the labels; only its shape is printed, fit() does not use it.
y_train_dummies = pd.get_dummies(y_train).values
print('shape label tensor: ', y_train_dummies.shape)

# Train the model on the training split.
model.fit(XX, y_train, epochs=2, batch_size=32)


In [None]:
# display(XX.shape)
# display(XX[:int(len(XX)/5),:].shape)

In [None]:
# converting categorical var in y_train to numerical var
y_test_dummies = pd.get_dummies(y_test).values
print('Shape of Label tensor: ', y_test_dummies.shape)

#model = load_model('../output/MusicalInstrumentReviews_correct.h5')
scores = model.evaluate(XX[:int(len(XX)/4)+1,:], y_test)

LSTM_accuracy = scores[1]*100

print('Test accuracy: ', scores[1]*100, '%')

Test accuracy: 88.89%



# Conclusions 

1. NLP is not a trivial task!!
2. When using an LSTM, we should take a careful look at the words we want to remove from our data, or we can lose important and meaningful information.
3. We still have a lot of room for improvement, especially regarding the bag of words to be used. Playing with the ngram_range is definitely an important key to doing so.