In [None]:
import re
import pickle
import pandas as pd
import numpy as np
from datetime import date

import nltk
from nltk.corpus import stopwords

import spacy
from spacy.lang.en import English

import gensim
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


### Pre-requisite Utilities

In [None]:
# Load the spaCy English pipeline; lemmatization() below calls it through the global `nlp`.
# NOTE(review): 'en' is a shortcut name - newer spaCy releases may require the full
# package name (e.g. 'en_core_web_sm'); confirm against the installed version.
nlp = spacy.load('en')

In [None]:
# Fetch the NLTK stop-word corpus. The recorded output below shows the download
# failing (returns False); presumably the next cell then relies on a copy that is
# already cached locally - TODO confirm.
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [None]:
# Start from NLTK's English stop words, then append the project-specific list
# (one entry per line in Data/stop-words-dict.txt).
stop_words = nltk.corpus.stopwords.words('english')
# Use a context manager so the file handle is closed even if reading fails.
with open('Data/stop-words-dict.txt', 'r') as stop_words_file:
    stop_words_dict = stop_words_file.read().splitlines()
stop_words.extend(stop_words_dict)

In [None]:
# Run date; used to date-stamp the output file names built further down.
today = date.today()

In [None]:
# Date-stamped base name for this run's processed data (only referenced by a
# commented-out to_csv call later in the notebook).
testing_name = "testing-data-" + str(today)
# Run-date variants of the artifact names, kept disabled:
#vectorizer_name = "vectorizer-" + str(today)
#dictionary_name = "dictionary-" + str(today) + ".txt"
# Active names are pinned to the artifacts whose file names carry 2020-05-17,
# so the notebook loads those files regardless of the run date.
vectorizer_name = "vectorizer-" + "2020-05-17"
dictionary_name = "dictionary-" + "2020-05-17" + ".txt"

## Retrieve the saved Vectorizer Model and Dictionary

In [None]:
# Load the pickled, already-fitted vectorizer.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the handle even when unpickling raises.
with open("Data/" + vectorizer_name, "rb") as vectorizer_model:
    vectorizer = pickle.load(vectorizer_model)

In [None]:
# Load the saved vocabulary, one feature name per line; the keyword-extraction
# cell indexes this list with the vectorizer's column indices.
# The context manager closes the handle even when reading raises.
with open("Data/" + dictionary_name, "r") as dictionary_model:
    feature_names = dictionary_model.read().splitlines()

In [None]:
# Review export to process (file name without the .csv extension).
# Exactly one assignment should be active; the others are kept as alternatives.
# NOTE(review): the trailing numbers are presumably review counts - confirm.
#data_file = "B078L78Y8Q-reviews" #1000
#data_file = "B06Y5FYDPP-reviews" #305
#data_file = "B01F9980XK-reviews" #981
data_file = "B079L5PW79-reviews" #727
#data_file = "B07JXZ9KDQ-reviews_Furniture_Table" #TestingFurniture

In [None]:
# Read the selected review export; the first CSV column becomes the DataFrame index.
data_raw = pd.read_csv("Data/" + data_file + ".csv", index_col=0)

In [None]:
data_raw.head()

Unnamed: 0,review_author,review_header,review_helpful,review_posted_date,review_rating,review_text
0,Divya rekha reddy,I like it but size was too big,,26 February 2020,3.0 out of 5 stars,It was too big
1,Amazon Customer,Awesome,,26 February 2020,5.0 out of 5 stars,Good product
2,Kanchan Sharma,The best,,26 February 2020,5.0 out of 5 stars,Ohh my god what a ring\nIts durable its gorgeo...
3,Sijin T Thankachan,NOT SATISFIED,,21 February 2020,1.0 out of 5 stars,DELIVERED LARGER SIZE AND NOT IN GOOD QUALITY
4,Sudip D.,Nice product.,,19 February 2020,4.0 out of 5 stars,"Good, but, size is too short."


In [None]:
# Keep only the leading token of the rating text (e.g. "3.0" from "3.0 out of 5 stars") ...
rating_token = data_raw['review_rating'].str.split(' ').str[0]
# ... and store it as an integer rating.
data_raw['review_rating'] = pd.to_numeric(rating_token).astype(int)
# Reviews without a body get an empty string instead of NaN.
data_raw['review_text'] = data_raw['review_text'].fillna("")

In [None]:
# Work on a copy so data_raw keeps the original columns for the final export.
data = data_raw.copy()

In [None]:
# Lower-case both text columns.
# Missing values are replaced with "" first: str(NaN) would otherwise inject the
# literal token "nan" into the review text (review_header is not null-filled earlier).
data['review_header'] = data['review_header'].fillna("").astype(str).str.lower()
data['review_text'] = data['review_text'].fillna("").astype(str).str.lower()

In [None]:
# One string per review: header, a single space, then the review body.
full_review = (data['review_header'] + " " + data['review_text']).tolist()

In [None]:
# 'keywords' starts out as the combined header + body text and is refined in later cells.
data['keywords'] = full_review

In [None]:
# Only the rating and the combined 'keywords' text are needed from here on.
data = data.drop(
    columns=[
        'review_author',
        'review_header',
        'review_helpful',
        'review_posted_date',
        'review_text',
    ]
)

In [None]:
# Ordered (pattern, replacement, flags) substitutions applied to every review.
# The order matters: each step works on the output of the previous one.
cleanup_steps = [
    (r"\S*@\S*\s?", " ", 0),      # tokens containing '@' (e-mail addresses)
    (r"\s+", " ", 0),             # newlines / whitespace runs -> one space
    (r"\'", "", 0),               # single quotes
    (r"\W", " ", 0),              # every remaining non-word character
    (r"\s+\S\s+", " ", 0),        # a lone character surrounded by whitespace
    (r"^[a-zA-Z]\s+", "", 0),     # a lone letter at the very start
    (r"\s+", " ", re.I),          # collapse whitespace runs again
]

review_text = []
for raw_review in data.keywords.values.tolist():
    cleaned = raw_review
    for pattern, replacement, flags in cleanup_steps:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    review_text.append(cleaned)

In [None]:
#review_text

In [None]:
# Spelling-correction table: the index holds the misspelled word/phrase and the
# 'correction' column its replacement (looked up in the correction cell below).
spell_dict = pd.read_csv('Data/spell-dict.csv', index_col=0, header=0)

In [None]:
# Given a list of words and a number n, return a list of n-grams.
#Short version
#def getNGrams(wordlist, n):
#    return [wordlist[i:i+n] for i in range(len(wordlist)-(n-1))]

#Long version
def getNGrams(wordlist, n):
    """Return every run of ``n`` consecutive words as a space-joined string.

    wordlist: sequence of word strings.
    n: number of words per n-gram.
    Returns a list of strings in input order; empty when ``wordlist`` has
    fewer than ``n`` words.
    """
    window_count = len(wordlist) - (n - 1)
    return [" ".join(wordlist[start:start + n]) for start in range(window_count)]

In [None]:
# Build the lookup once instead of hitting the DataFrame index for every n-gram.
# Rows without a correction are skipped.
spell_corrections = spell_dict['correction'].dropna().astype(str).to_dict()


def correct_spelling(line, corrections, ngram_sizes=(1, 3, 2)):
    """Apply the spelling corrections to one cleaned review string.

    line: space-separated review text.
    corrections: dict mapping a misspelled word/phrase to its replacement.
    ngram_sizes: n-gram lengths to check, in order (single words first, then
        three-word and two-word phrases).

    Returns the corrected string. Only whole whitespace-delimited tokens are
    replaced; a plain str.replace would also rewrite matching substrings inside
    unrelated words.
    """
    corrected_line = line
    for size in ngram_sizes:
        # The n-grams are taken from the text as corrected so far.
        for gram in getNGrams(corrected_line.split(" "), size):
            if gram and gram in corrections:
                whole_token = r"(?<!\S)" + re.escape(gram) + r"(?!\S)"
                # A callable replacement keeps backslashes in the correction literal.
                corrected_line = re.sub(
                    whole_token,
                    lambda _match, fix=corrections[gram]: fix,
                    corrected_line,
                )
    return corrected_line


lines = [correct_spelling(line, spell_corrections) for line in review_text]
review_text = lines

In [None]:
# Remove single characters from the start
#review_text = [re.sub(r"^\s", "", line) for line in lines]

In [None]:
# Replace 'keywords' with the cleaned, spell-corrected review text.
data['keywords'] = review_text

In [None]:
#data.to_csv('Data/data-processed.csv')

## Remove stopwords

In [None]:
len(stop_words)

503

In [None]:
#neg_words = {'hadn', "shouldn't", 'wasn', "wasn't", "doesn't", 'wouldn', 'didn', 'needn', 'against', 'mightn', 'not', "isn't", "don't", "mustn't", 'don', 'weren', "shan't", 'haven', "won't", "didn't", 'shouldn', "wouldn't", 'aren', 'mustn', "hadn't", 'hasn', 'no', "aren't", "needn't", "haven't", "couldn't", 'couldn', 'nor', 'ain', "mightn't", 'doesn', 'isn', "hasn't", 'won', "weren't"}

In [None]:
#stop_words = stop_words.difference(neg_words)

In [None]:
#data.keywords

In [None]:
# Tokenize each review with gensim and drop stop words.
# Membership is tested against a set built once, rather than scanning the
# stop-word list for every token; the result is the same.
stop_word_set = set(stop_words)
review_text = [
    [word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stop_word_set]
    for doc in data.keywords
]

In [None]:
#review_text

In [None]:
def lemmatization(texts, excluded_pos=('ADV', 'NUM', 'SCONJ'), nlp_model=None):
    """Lemmatize tokenized reviews and drop tokens with unwanted POS tags.

    POS tag reference: https://spacy.io/api/annotation

    texts: iterable of token lists, one list per review.
    excluded_pos: POS tags (``token.pos_`` values) whose tokens are discarded.
    nlp_model: callable that turns a string into an iterable of tokens exposing
        ``lemma_`` and ``pos_``; defaults to the notebook-level ``nlp`` pipeline.

    Returns a list with one space-joined lemma string per input review (an
    empty string when no token survives).
    """
    pipeline = nlp if nlp_model is None else nlp_model
    excluded = frozenset(excluded_pos)
    rows = []
    for sent in texts:
        # The pipeline works on text, so the tokens are re-joined first.
        doc = pipeline(" ".join(sent))
        rows.append(" ".join(token.lemma_ for token in doc if token.pos_ not in excluded))
    return rows

In [None]:
# Turn the stop-word-filtered token lists into one lemma string per review.
review_text = lemmatization(review_text)

In [None]:
#review_text

In [None]:
# 'keywords' now holds the lemmatized, stop-word-free text of each review.
data['keywords'] = review_text

In [None]:
# Number of keywords per review.
# Whitespace split (no separator argument) makes an empty keyword string count
# as 0; "".split(" ") returns [''] and would report 1.
data['keywords_count'] = data['keywords'].str.split().str.len()

In [None]:
data.head()

Unnamed: 0,review_rating,keywords,keywords_count
0,3,size,1
1,5,awesome good,2
2,5,good ohh ring durable gorgeous unique buy,7
3,1,deliver large size good quality,5
4,4,nice good size short,4


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 726 entries, 0 to 725
Data columns (total 3 columns):
review_rating     726 non-null int32
keywords          726 non-null object
keywords_count    726 non-null int64
dtypes: int32(1), int64(1), object(1)
memory usage: 19.9+ KB


In [None]:
#data.to_csv("Data/" + testing_name + ".csv")

In [None]:
# Lemmatized keyword strings that are fed to the saved vectorizer.
data_for_topics = data['keywords']

# Test the Vectorizer Model

In [None]:
# Plain Python list so each review can be transformed on its own below.
rows = data_for_topics.to_list()

In [None]:
#rows

In [None]:
# For every review keep the names of its (up to) 8 highest-weighted vocabulary
# features, or the marker "ShortReview" when no feature matches.
final_topics = []
for review in rows:
    review_vector = vectorizer.transform([review])
    # Column indices ordered from highest to lowest weight.
    ranked = review_vector.indices[np.argsort(review_vector.data)][::-1]
    if ranked.size == 0:
        final_topics.append("ShortReview")
        continue
    top_names = [feature_names[idx] for idx in ranked[0:8]]
    final_topics.append(", ".join(top_names))

In [None]:
# Replace the lemma text with the comma-separated top keywords per review.
data['keywords'] = final_topics

In [None]:
# Attach the keywords to the untouched raw frame as well; this relies on `data`
# being a row-for-row copy of data_raw (only columns were dropped from it).
data_raw['keywords'] = final_topics

In [None]:
#data_raw.to_csv("Data/" + "testing-" + str(today) + ".csv")

## Assigning Categories

In [None]:
# Keyword -> category table: the index holds the keyword and the 'category'
# column its label (looked up in the mapping cell below).
category = pd.read_csv('Data/keyword-category.csv', index_col=0, header=0)

In [None]:
# Translate each review's keywords into category labels.
# "Domain" labels are dropped, unmapped keywords are ignored (except the
# "ShortReview" marker), and a review without any label falls back to "ShortReview".
categories = []
for topic_line in final_topics:
    matched = []
    for keyword in topic_line.split(', '):
        if keyword not in category.index:
            if keyword == "ShortReview":
                matched.append("ShortReview")
            continue
        label = category.loc[keyword, 'category']
        if label != "Domain":
            matched.append(label)

    if not matched:
        matched.append("ShortReview")

    categories.append(", ".join(matched))

In [None]:
#print(categories)

In [None]:
# Count vectorizer with default settings, used to count category labels per review.
# NOTE(review): the default tokenizer presumably lower-cases labels and splits
# multi-word ones - confirm this is intended for the category names.
cat_vectorizer = CountVectorizer()

In [None]:
# Fit on this run's category strings and get one sparse count row per review.
cat_vectorized = cat_vectorizer.fit_transform(categories)

In [None]:
#cat_vectorizer.vocabulary_

In [None]:
# Resolve the vocabulary once instead of once per selected category.
# get_feature_names_out() replaces get_feature_names() in newer scikit-learn;
# use whichever the installed version provides.
_get_cat_names = (
    getattr(cat_vectorizer, "get_feature_names_out", None)
    or cat_vectorizer.get_feature_names
)
cat_feature_names = list(_get_cat_names())

# Keep the (up to) two most frequent categories of every review.
row_cats = []
for ii in range(cat_vectorized.shape[0]):
    row = cat_vectorized.getrow(ii)
    # Column indices ordered from highest to lowest count.
    sorted_cats = np.flip(row.indices[np.argsort(row.data)]).tolist()
    row_cat = [cat_feature_names[jj] for jj in sorted_cats[0:2]]
    row_cats.append(", ".join(row_cat))

In [None]:
#row_cats

In [None]:
# Attach the top categories and export the annotated raw reviews,
# using a file name stamped with the run date.
data_raw['categories'] = row_cats
output_path = "Data/" + "testing-" + str(today) + ".csv"
data_raw.to_csv(output_path)