In [None]:
import re
import pickle
import pandas as pd
import numpy as np
from datetime import date

import nltk
from nltk.corpus import stopwords

import spacy
from spacy.lang.en import English

import gensim
from gensim import corpora

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
today = date.today()

In [None]:
# Date-stamped names for the artifacts this notebook writes under Data/.
# vectorizer_name and dictionary_name are used by the "Store the Model" /
# "Store the Dictionary" cells.
# NOTE(review): training_name is only referenced from a commented-out to_csv call.
training_name = "training-data-" + str(today)
vectorizer_name = "vectorizer-" + str(today)
dictionary_name = "dictionary-" + str(today) + ".txt"

In [None]:
# Load spaCy's English pipeline through the 'en' shortcut link; `nlp` is used
# as a global by lemmatization() below.
# NOTE(review): shortcut links such as 'en' are not supported by spaCy v3+, where the
# full package name (e.g. 'en_core_web_sm') is required — confirm the pinned version.
nlp = spacy.load('en')

In [None]:
# Per-product review exports: file stem (without ".csv"), trailing number = review count.
# NOTE(review): "B079L5PW79-reviews" is annotated "Used for testing" below but is still
# part of train_files — confirm whether it should be held out. The label train_file4
# also appears twice in this legend.
#train_file1 = "B078L78Y8Q-reviews" #1000
#train_file2 = "B06Y5FYDPP-reviews" #305
#train_file3 = "B01F9980XK-reviews" #981
#train_file4 = "B079L5PW79-reviews" #727 Used for testing
#train_file4 = "B01E60WY7G-reviews" #248
#train_file5 = "B01MXQE7YF-reviews" #262
#train_file6 = "B06W9GL9RF-1-reviews" #490
#train_file7 = "B06WWRKRR3-reviews" #278
#train_file8 = "B06ZZ631BF-reviews" #74
#train_file9 = "B07FB1XZGL-reviews" #297
#train_file10 = "B07L71XXBR-reviews" #194
#train_file11 = "B07RXFLZPL-reviews" #321
#train_file12 = "B07S3H1BPS-reviews" #164

# File stems actually loaded from Data/ by the next cell.
train_files = ["B078L78Y8Q-reviews", "B06Y5FYDPP-reviews", "B01F9980XK-reviews",
               "B079L5PW79-reviews", "B01E60WY7G-reviews", "B01MXQE7YF-reviews",
               "B06W9GL9RF-1-reviews", "B06WWRKRR3-reviews", "B06ZZ631BF-reviews",
               "B07FB1XZGL-reviews", "B07L71XXBR-reviews", "B07RXFLZPL-reviews",
               "B07S3H1BPS-reviews"]

In [None]:
# Read every per-product CSV (first column is the index) and stack them in
# train_files order. A single pd.concat replaces the former DataFrame.append
# loop: append copied the accumulated frame on every iteration and no longer
# exists in pandas 2.x.
data_raw = pd.concat(
    [pd.read_csv("Data/" + name + ".csv", index_col=0) for name in train_files]
)

In [None]:
# Replace the per-file indices with one continuous 0..n-1 index.
data_raw = data_raw.reset_index(drop=True)

In [None]:
data_raw.shape

(5351, 6)

In [None]:
#pd.set_option('display.max_colwidth', 10)

In [None]:
data_raw.tail()

Unnamed: 0,review_author,review_header,review_helpful,review_posted_date,review_rating,review_text
5346,Umadevi.R,Excelent,2 people found this helpful,20 July 2019,5.0 out of 5 stars,So nice good quality.go for it👍
5347,Ajish,Value for money,2 people found this helpful,18 July 2019,5.0 out of 5 stars,Beautiful earrings in affordable price
5348,Ritu bansal,Silver,21 people found this helpful,18 July 2019,5.0 out of 5 stars,Very pretty.perfect packing.Quality of earings...
5349,Tamilarasi,👌,3 people found this helpful,8 July 2019,4.0 out of 5 stars,Super..
5350,PAYEL BHATTACHARYA,Awesome jhumkas,40 people found this helpful,24 June 2019,5.0 out of 5 stars,Just love the jhumkas💓well packed. Delivery on...


In [None]:
# Keep only the leading number of strings such as "5.0 out of 5 stars".
# Vectorised string ops replace the row-wise apply over the whole frame, and
# astype(str) keeps this cell re-runnable after the column has already been
# converted to integers by the following cell.
data_raw['review_rating'] = data_raw['review_rating'].astype(str).str.split(' ').str[0]

In [None]:
data_raw['review_rating'] = pd.to_numeric(data_raw['review_rating']).astype(int)

In [None]:
# Reviews without a body get an empty string instead of NaN.
data_raw['review_text'] = data_raw['review_text'].fillna("")

In [None]:
data = data_raw.copy()

In [None]:
# Lower-case the header and body. Missing values are blanked first: str(NaN)
# is "nan", which would otherwise leak into the combined keyword text as a
# literal word for every review that has no header.
data['review_header'] = data['review_header'].fillna("").map(lambda text: str(text).lower())
data['review_text'] = data['review_text'].fillna("").map(lambda text: str(text).lower())

In [None]:
# One string per review: header and body separated by a single space.
full_review = [
    header + " " + body
    for header, body in zip(data['review_header'], data['review_text'])
]

In [None]:
data['keywords'] = full_review

In [None]:
# From here on only the rating and the combined 'keywords' text are used.
unused_columns = ['review_author', 'review_header', 'review_helpful', 'review_posted_date', 'review_text']
data = data.drop(columns=unused_columns)

In [None]:
# Ordered clean-up passes applied to every review: (compiled pattern, replacement).
# The order matters: punctuation is turned into spaces before single-character
# tokens are removed, and whitespace is collapsed last.
_CLEANUP_STEPS = [
    (re.compile(r"\S*@\S*\s?"), " "),    # e-mail addresses (any token containing '@')
    (re.compile(r"\s+"), " "),           # new lines / tabs / repeated whitespace
    (re.compile(r"\'"), ""),             # distracting single quotes
    (re.compile(r"\W"), " "),            # every remaining special character
    # Single-character tokens. The look-ahead leaves the trailing whitespace
    # unconsumed, so consecutive single characters ("x a b c y") are all
    # removed; the former r"\s+\S\s+" swallowed the shared space and therefore
    # skipped every second one.
    (re.compile(r"\s+\S(?=\s)"), ""),
    (re.compile(r"^[a-zA-Z]\s+"), ""),   # single letter at the very start
    (re.compile(r"\s+"), " "),           # collapse multiple spaces into one
]


def clean_review(text):
    """Return `text` after running it through every pass in _CLEANUP_STEPS, in order."""
    for pattern, replacement in _CLEANUP_STEPS:
        text = pattern.sub(replacement, text)
    return text


# Convert the keywords column to a list and clean each review.
review_text = [clean_review(text) for text in data.keywords.values.tolist()]

In [None]:
#review_text

In [None]:
spell_dict = pd.read_csv('Data/spell-dict.csv', index_col=0, header=0)

In [None]:
def getNGrams(wordlist, n):
    """Return the n-grams of `wordlist` as space-joined strings.

    wordlist -- sequence of word strings
    n        -- number of consecutive words per gram

    The grams are returned in order of their starting position. When `n` is
    larger than len(wordlist) the result is an empty list.
    """
    window_count = len(wordlist) - n + 1
    return [" ".join(wordlist[start:start + n]) for start in range(window_count)]

In [None]:
def _replace_whole_tokens(text, phrase, replacement):
    """Replace `phrase` in `text` only where it is delimited by whitespace or the string ends.

    A plain str.replace is a substring replace and would also rewrite the
    inside of longer words that merely contain the misspelling. The
    replacement is passed through a function so that backslashes in it are
    inserted literally.
    """
    pattern = r"(?<!\S)" + re.escape(phrase) + r"(?!\S)"
    return re.sub(pattern, lambda _match: replacement, text)


# Apply the spell dictionary to single words first, then to trigrams, then to
# bigrams. The grams of each pass are taken from the text as it stands at the
# start of that pass.
lines = []
for line in review_text:
    row = line
    for gram_size in (1, 3, 2):
        for grams in getNGrams(row.split(" "), gram_size):
            if grams in spell_dict.index:
                corrected = spell_dict.loc[grams, 'correction']
                row = _replace_whole_tokens(row, grams, corrected)
    lines.append(row)
review_text = lines

In [None]:
# Remove single characters from the start
#review_text = [re.sub(r"^\s", "", line) for line in lines]

In [None]:
data['keywords'] = review_text

In [None]:
#data.to_csv('Data/data-processed.csv')

## Remove stopwords

In [None]:
# Fetch the NLTK stop-word corpus. download() returns False when the fetch
# fails (e.g. no network), in which case the following cell only works if the
# corpus is already available locally.
nltk.download('stopwords')
#nltk.download('wordnet')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [None]:
# Build the stop-word list in one cell so that re-running it always starts
# from the NLTK list instead of extending an already extended list.
stop_words = nltk.corpus.stopwords.words('english')

# Project-specific stop words, one per line. The context manager closes the
# file, which the previous bare open(...).read() never did.
with open('Data/stop-words-dict.txt', 'r') as stop_words_file:
    stop_words_dict = stop_words_file.read().splitlines()

stop_words.extend(stop_words_dict)

In [None]:
len(stop_words)

503

In [None]:
#neg_words = {'hadn', "shouldn't", 'wasn', "wasn't", "doesn't", 'wouldn', 'didn', 'needn', 'against', 'mightn', 'not', "isn't", "don't", "mustn't", 'don', 'weren', "shan't", 'haven', "won't", "didn't", 'shouldn', "wouldn't", 'aren', 'mustn', "hadn't", 'hasn', 'no', "aren't", "needn't", "haven't", "couldn't", 'couldn', 'nor', 'ain', "mightn't", 'doesn', 'isn', "hasn't", 'won', "weren't"}

In [None]:
#stop_words = stop_words.difference(neg_words)

In [None]:
#data.keywords

In [None]:
# Tokenise each review with gensim and drop stop words. The membership test
# runs once per token, so it uses a set; stop_words itself stays a list
# because it is also passed to the vectorizers further down.
stop_word_lookup = set(stop_words)
review_text = [[word for word in gensim.utils.simple_preprocess(str(doc))
                if word not in stop_word_lookup] for doc in data.keywords]

In [None]:
#review_text

In [None]:
def lemmatization(texts):
    """Lemmatise tokenised reviews with the global spaCy pipeline `nlp`.

    Each item of `texts` is a sequence of tokens. The tokens are joined with
    single spaces, run through `nlp`, and reduced to the lemmas of the tokens
    whose `pos_` is not ADV, NUM or SCONJ.

    Returns a list with one space-joined lemma string per input item; an item
    with no surviving token yields an empty string.

    Annotation reference: https://spacy.io/api/annotation
    """
    excluded_pos = ['ADV', 'NUM', 'SCONJ']
    return [
        " ".join(token.lemma_
                 for token in nlp(" ".join(tokens))
                 if token.pos_ not in excluded_pos)
        for tokens in texts
    ]

In [None]:
review_lamm = lemmatization(review_text)
#review_text = lemmatization(review_text)

In [None]:
review_text = review_lamm

In [None]:
#review_text

In [None]:
data['keywords'] = review_text

In [None]:
# Number of keywords left per review. Whitespace splitting yields an empty
# list for an empty string, so reviews with no surviving keyword count as 0;
# "".split(" ") is [''] and used to report them as 1.
data['keywords_count'] = data['keywords'].str.split().str.len()

In [None]:
data.head()

Unnamed: 0,review_rating,keywords,keywords_count
0,1,break bad,2
1,1,bad refrain,2
2,1,broken find broken,3
3,1,cheap damage show picture neckpiece quality ch...,13
4,4,good good,2


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5351 entries, 0 to 5350
Data columns (total 3 columns):
review_rating     5351 non-null int32
keywords          5351 non-null object
keywords_count    5351 non-null int64
dtypes: int32(1), int64(1), object(1)
memory usage: 104.6+ KB


In [None]:
data['keywords_count'].describe()

count    5351.000000
mean        4.441787
std         3.994099
min         1.000000
25%         2.000000
50%         3.000000
75%         6.000000
max        43.000000
Name: keywords_count, dtype: float64

In [None]:
#data.to_csv("Data/" + training_name + ".csv")

In [None]:
data_for_topics = data['keywords']

## CountVectorizer

In [None]:
# Bag-of-words counts over the lemmatised reviews.
count_vectorizer = CountVectorizer(min_df=3,                   # keep terms that occur in at least 3 documents
                             stop_words=stop_words,            # remove stop words
                             token_pattern='[a-zA-Z0-9]{3,}',  # a token is a run of 3 or more alphanumerics
                             ngram_range=(1,1))                # unigrams only

In [None]:
count_vectorized = count_vectorizer.fit_transform(data_for_topics)

In [None]:
# Get feature names, ordered by their column index in the document-term matrix.
# Read from vocabulary_ (term -> column index) because get_feature_names()
# no longer exists in scikit-learn 1.2+; this form works on old and new releases.
cv_feature_names = sorted(count_vectorizer.vocabulary_, key=count_vectorizer.vocabulary_.get)
print(len(cv_feature_names))
#print(cv_feature_names)

429


## TfidfVectorizer

In [None]:
# TF-IDF weights over the lemmatised reviews, with the same settings as the count vectorizer.
tfidf_vectorizer = TfidfVectorizer(min_df=3,                   # keep terms that occur in at least 3 documents
                             stop_words=stop_words,            # remove stop words
                             token_pattern='[a-zA-Z0-9]{3,}',  # a token is a run of 3 or more alphanumerics
                             ngram_range=(1,1))                # unigrams only

In [None]:
tfidf_vectorized = tfidf_vectorizer.fit_transform(data_for_topics)

In [None]:
# Get feature names, ordered by their column index in the document-term matrix.
# Read from vocabulary_ (term -> column index) because get_feature_names()
# no longer exists in scikit-learn 1.2+; this form works on old and new releases.
tfidf_feature_names = sorted(tfidf_vectorizer.vocabulary_, key=tfidf_vectorizer.vocabulary_.get)
print(len(tfidf_feature_names))
#print(tfidf_feature_names)

429


In [None]:
#vectorizer = count_vectorizer
#data_vectorized = count_vectorized
#feature_names = cv_feature_names

In [None]:
vectorizer = tfidf_vectorizer
data_vectorized = tfidf_vectorized
feature_names = tfidf_feature_names

In [None]:
# (documents, vocabulary size). Only the last expression of a cell is shown,
# so the former leading len(...get_feature_names()) line had no visible effect
# and its count is the second element of this shape.
tfidf_vectorized.shape

(5351, 429)

# Store the Model

In [None]:
# Pickle the fitted vectorizer to Data/<vectorizer_name>. The context manager
# closes the file even if pickling raises, instead of keeping the handle open
# across three cells.
with open("Data/" + vectorizer_name, "wb") as vectorizer_model:
    pickle.dump(vectorizer, vectorizer_model)

# Store the Dictionary

In [None]:
# Write the vocabulary, one term per line, in column-index order so that line
# i is the name of feature i. The context manager guarantees the file is
# flushed and closed.
with open("Data/" + dictionary_name, "w") as dictionary_model:
    for word in feature_names:
        dictionary_model.write(str(word) + "\n")

## Open the Vectorizer Model and Dictionary to read

In [None]:
# Reload the vectorizer written above.
# NOTE: pickle.load can execute arbitrary code from the file — only load
# pickles produced by this notebook or another trusted source.
with open("Data/" + vectorizer_name, "rb") as vectorizer_model:
    vectorizer = pickle.load(vectorizer_model)

In [None]:
# Reload the vocabulary: one term per line, line i naming feature i.
with open("Data/" + dictionary_name, "r") as dictionary_model:
    feature_names = dictionary_model.read().splitlines()

# Training Data Results

In [None]:
rows = data_for_topics.to_list()

In [None]:
#rows

In [None]:
# For every processed review, list its (up to) eight highest-weighted
# vocabulary terms, strongest first. A review with no in-vocabulary term is
# labelled "ShortReview".
final_topics = []
for review in rows:
    weights = vectorizer.transform([review])
    # Column indices of the non-zero terms, sorted by descending weight.
    ranked = weights.indices[np.argsort(weights.data)][::-1]
    if ranked.size == 0:
        topic = "ShortReview"
    else:
        topic = ", ".join([feature_names[idx] for idx in ranked[:8]])
    final_topics.append(topic)

In [None]:
data['keywords'] = final_topics

In [None]:
data_raw['keywords'] = final_topics

In [None]:
#data_raw.to_csv("Data/" + "training-" + str(today) + ".csv")

## Assigning Categories

In [None]:
category = pd.read_csv('Data/keyword-category.csv', index_col=0, header=0)

In [None]:
category.category.unique()

array(['Manufacturing', 'Perception', 'Price', 'Good', 'Delivery',
       'Domain', 'Service', 'Missing'], dtype=object)

In [None]:
# Translate each review's topic keywords into their categories.
# Keywords mapped to "Domain" are dropped, unmapped keywords are ignored
# (except the "ShortReview" marker), and a review left without any category
# is labelled "ShortReview".
categories = []
for topic_line in final_topics:
    mapped = []
    for keyword in topic_line.split(', '):
        if keyword not in category.index:
            if keyword == "ShortReview":
                mapped.append("ShortReview")
            continue
        label = category.loc[keyword, 'category']
        if label != "Domain":
            mapped.append(label)

    if not mapped:
        mapped.append("ShortReview")

    categories.append(", ".join(mapped))

In [None]:
#categories

In [None]:
cat_vectorizer = CountVectorizer()

In [None]:
cat_vectorized = cat_vectorizer.fit_transform(categories)

In [None]:
#cat_vectorizer.vocabulary_

In [None]:
# Column index -> category token. Built once: the loop used to call
# get_feature_names() for every row, rebuilding the whole list each time, and
# that method no longer exists in scikit-learn 1.2+. vocabulary_ maps
# token -> column index on every release.
cat_feature_names = sorted(cat_vectorizer.vocabulary_, key=cat_vectorizer.vocabulary_.get)

# Keep the two most frequent categories of each review, most frequent first.
row_cats = []
for ii in range(cat_vectorized.shape[0]):
    row = cat_vectorized.getrow(ii)
    sorted_cats = np.flip(row.indices[np.argsort(row.data)]).tolist()
    row_cat = [cat_feature_names[jj] for jj in sorted_cats[0:2]]
    row_cats.append(", ".join(row_cat))

In [None]:
#row_cats

In [None]:
# Attach the (at most two) categories per review to the raw reviews and export
# the labelled set. The file name carries today's date, so a second run on the
# same day writes to the same path.
data_raw['categories'] = row_cats
data_raw.to_csv("Data/" + "training-" + str(today) + ".csv")