# Supervised Feature Engineering of Reviews for Cellphone and Accessories category on Amazon

In [None]:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder

import model_evaluation_utils as meu
np.set_printoptions(precision=2, linewidth=80)
from nltk.tokenize.toktok import ToktokTokenizer
import warnings
from sklearn.linear_model import  SGDClassifier
from gensim.models.fasttext import FastText
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
import spacy
import matplotlib.pyplot as plt
import numpy as np
import utils
import keras
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE
import model_evaluation_utils as meu

# Silence every warning raised by the libraries imported above.
warnings.filterwarnings("ignore")
# spaCy pipeline named 'en_vecs', loaded with the parse/tag/entity flags off;
# later cells only read the `.vector` of the objects it returns.
nlp = spacy.load('en_vecs', parse=False, tag=False, entity=False)

# Compact numpy printing: 2 decimals, 80-character lines.
np.set_printoptions(precision=2, linewidth=80)


# Passed as `workers` to the gensim Word2Vec and FastText constructors below.
NWORKERS=16

# Passed as the embedding `size` for FastText and as `num_features` to the
# averaged word-vector helper below.
NUMFEATURES=100


Using TensorFlow backend.


# Load normalized data from processed file


In [None]:
# Load the object stored as 'processed_partB' through the project's utils helper.
# NOTE(review): presumably a pandas DataFrame (`.head(20)` is called on it below) — confirm.
dfdb = utils.readFromDisk('processed_partB')


# Sample processed data loaded

In [None]:
# Report how many rows were loaded, then display the first 20 of them.
print('Total Rows on processed dataset: {}'.format(len(dfdb)))
print('Sample of processed dataset. Notice the column named Clean_Review')
dfdb.head(20)


# Split train and test data for both normalized and raw


In [None]:
# Load the parallel collections saved by the preprocessing step.
reviews = utils.readFromDisk('reviews')
reviews_raw = utils.readFromDisk('reviews_raw')
sentiments = utils.readFromDisk('sentiments')
reviews_tokens = utils.readFromDisk('reviews_tokens')

# Positional 75% / 25% split: the first `cutoff` entries are train, the rest test.
cutoff = round(len(dfdb) * 0.75)

# build train and test datasets
train_reviews, test_reviews = reviews[:cutoff], reviews[cutoff:]
train_reviews_raw, test_reviews_raw = reviews_raw[:cutoff], reviews_raw[cutoff:]
train_reviews_tokens, test_reviews_tokens = reviews_tokens[:cutoff], reviews_tokens[cutoff:]

# Labels are cast to int after slicing.
train_sentiments = sentiments[:cutoff].astype('int')
test_sentiments = sentiments[cutoff:].astype('int')


In [None]:
# Save each split under a name identical to its variable name (train first, then test).
for split_name, split_data in [
    ('train_reviews', train_reviews),
    ('train_reviews_raw', train_reviews_raw),
    ('train_reviews_tokens', train_reviews_tokens),
    ('train_sentiments', train_sentiments),
    ('test_reviews', test_reviews),
    ('test_reviews_raw', test_reviews_raw),
    ('test_reviews_tokens', test_reviews_tokens),
    ('test_sentiments', test_sentiments),
]:
    utils.writeToDisk(split_data, split_name)


# Train and test dataset sizes

In [None]:
# Show how many reviews ended up in each split.
print('Total Rows on train dataset: {}'.format(len(train_reviews)))
print('Total Rows on test dataset: {}'.format(len(test_reviews)))


# Feature Engineering using BOW

In [None]:
# Bag-of-words term counts over unigrams and bigrams.
# The vocabulary is learned from the train reviews only ...
cv = CountVectorizer(
    ngram_range=(1, 2),
    binary=False,
    min_df=0.0,
    max_df=1.0,
)
cv_train_features = cv.fit_transform(train_reviews)
# ... and then applied unchanged to the test reviews.
cv_test_features = cv.transform(test_reviews)

# SVM Model Training, Prediction, Performance with BOW


In [None]:
# Linear SVM (hinge loss) fitted with stochastic gradient descent; this one
# estimator object is passed to every train/predict cell below.
# A fixed random_state makes the SGD data shuffling repeatable, so the metrics
# reported for each feature set are reproducible across runs.
svm = SGDClassifier(loss='hinge', max_iter=100, random_state=42)

In [None]:
# Fit the SVM on the BOW train features and predict the test set.
svm_bow_predictions = meu.train_predict_model(
    classifier=svm,
    train_features=cv_train_features,
    train_labels=train_sentiments,
    test_features=cv_test_features,
    test_labels=test_sentiments,
)
print(
    'BOW model:> Train features shape:', cv_train_features.shape,
    ' Test features shape:', cv_test_features.shape,
)
# Performance report against the held-out labels.
meu.display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=svm_bow_predictions,
    classes=[1, 0],
)


# Feature Engineering using NGRAM

In [None]:
# Bigram-only term counts (ngram_range=(2, 2)).
# The vocabulary is learned from the train reviews only ...
cvn = CountVectorizer(
    ngram_range=(2, 2),
    binary=False,
    min_df=0.0,
    max_df=1.0,
)
cvn_train_features = cvn.fit_transform(train_reviews)
# ... and then applied unchanged to the test reviews.
cvn_test_features = cvn.transform(test_reviews)

# SVM Model Training, Prediction, Performance with NGRAM


In [None]:
# Fit the SVM on the bigram train features and predict the test set.
svm_ngram_predictions = meu.train_predict_model(
    classifier=svm,
    train_features=cvn_train_features,
    train_labels=train_sentiments,
    test_features=cvn_test_features,
    test_labels=test_sentiments,
)
print(
    'NGRAM model:> Train features shape:', cvn_train_features.shape,
    ' Test features shape:', cvn_test_features.shape,
)
# Performance report against the held-out labels.
meu.display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=svm_ngram_predictions,
    classes=[1, 0],
)


# Feature Engineering using TFIDF

In [None]:
# TF-IDF weights over unigrams and bigrams with sublinear term-frequency scaling.
# Vocabulary and IDF statistics come from the train reviews only.
tv = TfidfVectorizer(
    ngram_range=(1, 2),
    use_idf=True,
    sublinear_tf=True,
    min_df=0.0,
    max_df=1.0,
)
tv_train_features = tv.fit_transform(train_reviews)
tv_test_features = tv.transform(test_reviews)



# SVM Model Training, Prediction,Performance with TFIDF

In [None]:
# Fit the SVM on the TF-IDF train features and predict the test set.
svm_tfidf_predictions = meu.train_predict_model(
    classifier=svm,
    train_features=tv_train_features,
    train_labels=train_sentiments,
    test_features=tv_test_features,
    test_labels=test_sentiments,
)
print(
    'TFIDF model:> Train features shape:', tv_train_features.shape,
    ' Test features shape:', tv_test_features.shape,
)
# Performance report against the held-out labels.
meu.display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=svm_tfidf_predictions,
    classes=[1, 0],
)



# Prediction class label encoding

In [None]:
le = LabelEncoder()
num_classes = 2

# tokenize train reviews & encode train labels
tn = ToktokTokenizer()
tokenized_train = [tn.tokenize(text) for text in train_reviews]
# The encoder learns its label -> integer mapping from the training labels only.
y_tr = le.fit_transform(train_sentiments)
train_sentiments_encoded = keras.utils.to_categorical(y_tr, num_classes)
utils.writeToDisk(tokenized_train, 'tokenized_train')
utils.writeToDisk(train_sentiments_encoded, 'train_sentiments_encoded')


# tokenize test reviews & encode test labels
tokenized_test = [tn.tokenize(text) for text in test_reviews]
# Reuse the train-fitted mapping (transform, not fit_transform) so train and
# test share one encoding and the encoder saved below is the train-fitted one.
# transform raises ValueError if the test set contains a label unseen in train.
y_ts = le.transform(test_sentiments)
test_sentiments_encoded = keras.utils.to_categorical(y_ts, num_classes)
# Persist the *test* tokens under 'tokenized_test' (the train tokens were
# previously written here by mistake).
utils.writeToDisk(tokenized_test, 'tokenized_test')
utils.writeToDisk(test_sentiments_encoded, 'test_sentiments_encoded')

utils.writeToDisk(le, 'label_encoder')

# Feature Engineering with word embeddings (Word2Vec/Gensim)

In [None]:
# build word2vec model using gensim
# The vector size is set explicitly to NUMFEATURES (as the FastText cell does)
# so the embedding width always matches the `num_features` handed to
# averaged_word2vec_vectorizer, instead of relying on gensim's default.
w2v_model = gensim.models.Word2Vec(tokenized_train, size=NUMFEATURES, workers=NWORKERS)

In [None]:
def averaged_word2vec_vectorizer(corpus, model, num_features):
    """Turn each tokenized document into the mean of its in-vocabulary word vectors.

    Args:
        corpus: iterable of token lists, one per document.
        model: trained embedding model exposing ``model.wv`` with an
            ``index2word`` list and ``wv[word]`` lookup (gensim 3.x
            Word2Vec / FastText are used in this notebook).
        num_features: width of the word vectors; must match the model's size.

    Returns:
        A float64 array of shape ``(len(corpus), num_features)``. A document
        with no in-vocabulary token yields an all-zero row; an empty corpus
        yields an array of shape ``(0, num_features)``.
    """
    # Look words up through model.wv, the same object that supplies the
    # vocabulary, rather than the deprecated model[word] indexing.
    keyed_vectors = model.wv
    vocabulary = set(keyed_vectors.index2word)

    def average_word_vectors(words):
        """Average the vectors of the known words in one document."""
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0
        for word in words:
            if word in vocabulary:
                nwords += 1
                feature_vector += keyed_vectors[word]
        # Leave the zero vector untouched when nothing matched (avoids 0/0).
        if nwords:
            feature_vector /= nwords
        return feature_vector

    features = [average_word_vectors(tokenized_sentence)
                for tokenized_sentence in corpus]
    if not features:
        # Keep the result 2-D so downstream code sees a consistent shape.
        return np.zeros((0, num_features), dtype="float64")
    return np.array(features)

In [None]:
# Averaged Word2Vec document vectors for the train and test token lists.
avg_wv_train_features = averaged_word2vec_vectorizer(
    corpus=tokenized_train,
    model=w2v_model,
    num_features=NUMFEATURES,
)
avg_wv_test_features = averaged_word2vec_vectorizer(
    corpus=tokenized_test,
    model=w2v_model,
    num_features=NUMFEATURES,
)

# SVM Model Training, Prediction, Performance with Word2Vec


In [None]:
# Fit the SVM on the averaged Word2Vec features and predict the test set.
svm_wv_predictions = meu.train_predict_model(
    classifier=svm,
    train_features=avg_wv_train_features,
    train_labels=train_sentiments,
    test_features=avg_wv_test_features,
    test_labels=test_sentiments,
)
print(
    'Word2Vec model:> Train features shape:', avg_wv_train_features.shape,
    ' Test features shape:', avg_wv_test_features.shape,
)
# Performance report against the held-out labels.
meu.display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=svm_wv_predictions,
    classes=[1, 0],
)


# SVM Model Training, Prediction, Performance with GLoVe


In [None]:
# feature engineering with GloVe model
# Every raw review goes through the spaCy pipeline; the `.vector` of each
# result becomes one feature row. Both feature matrices are saved to disk.
train_nlp = list(map(nlp, train_reviews_raw))
train_glove_features = np.array([doc.vector for doc in train_nlp])
utils.writeToDisk(train_glove_features, 'train_glove_features')

test_nlp = list(map(nlp, test_reviews_raw))
test_glove_features = np.array([doc.vector for doc in test_nlp])
utils.writeToDisk(test_glove_features, 'test_glove_features')


In [None]:
# Fit the SVM on the GloVe document vectors and predict the test set.
svm_glove_predictions = meu.train_predict_model(
    classifier=svm,
    train_features=train_glove_features,
    train_labels=train_sentiments,
    test_features=test_glove_features,
    test_labels=test_sentiments,
)
print(
    'Glove model:> Train features shape:', train_glove_features.shape,
    ' Test features shape:', test_glove_features.shape,
)
# Performance report against the held-out labels.
meu.display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=svm_glove_predictions,
    classes=[1, 0],
)



# SVM Model Training, Prediction, Performance with FastText (Reduced dataset due to memory constraints)

In [None]:
# Train FastText on the tokenized train reviews with NUMFEATURES-wide vectors.
# Only `size` and `workers` are set; every other option (including `sg`, which
# selects skip-gram (1) or CBOW (0)) keeps its library default.
ft_model = FastText(tokenized_train, size=NUMFEATURES, workers=NWORKERS)

# generate averaged word vector features from the FastText model
train_ft_features = averaged_word2vec_vectorizer(
    corpus=tokenized_train,
    model=ft_model,
    num_features=NUMFEATURES,
)
test_ft_features = averaged_word2vec_vectorizer(
    corpus=tokenized_test,
    model=ft_model,
    num_features=NUMFEATURES,
)


In [None]:
# Fit the SVM on the averaged FastText features and predict the test set.
svm_ft_predictions = meu.train_predict_model(
    classifier=svm,
    train_features=train_ft_features,
    train_labels=train_sentiments,
    test_features=test_ft_features,
    test_labels=test_sentiments,
)
print(
    'FastText:> Train features shape:', train_ft_features.shape,
    ' Test features shape:', test_ft_features.shape,
)
# Performance report against the held-out labels.
meu.display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=svm_ft_predictions,
    classes=[1, 0],
)





# ROC curves for SVM applied to various feature engineering methods: BOW, NGRAM, TFIDF, Word2Vec, GloVe, FastText

In [None]:
# Start from a clean figure 1, the figure metricsAndROC draws on.
# (Clearing figure 0 instead created an extra empty figure that plt.show()
# would also display.)
plt.figure(1).clf()

# Palette shared by the ROC curves and the metrics bar chart, indexed by position.
color = ['blue', 'orange', 'red', 'green', 'coral',
             'grey', 'indigo', 'gold', 'lime', 'olive',
             'pink', 'navy', 'magenta', 'yellow', 'tomato',
             'turquoise', 'yellowgreen', 'maroon', 'lightblue']

# One metrics list per feature set; each is filled by the ROC helper in the
# order F1, precision, accuracy, recall.
mbow=[]
mngram=[]
mtfidf=[]
mw2v=[]
mglove=[]
mft=[]

def metricsAndROC(pred,metricsArray,rocTitle,colorIndex):
    """Record test-set metrics for ``pred`` and draw its ROC curve on figure 1.

    Args:
        pred: predictions for the test set, compared against the notebook-level
            ``test_sentiments``.
        metricsArray: list extended in place with, in this order, F1,
            precision, accuracy and recall.
        rocTitle: legend text. If it contains a ``str.format`` placeholder it
            receives the AUC; otherwise the AUC is appended automatically.
        colorIndex: index into the notebook-level ``color`` palette.
    """
    # NOTE(review): the callers in this notebook appear to pass hard class
    # predictions rather than decision scores, so the curve has very few
    # operating points — confirm this is intended.
    fpr, tpr, _thresholds = metrics.roc_curve(test_sentiments, pred)
    auc = metrics.roc_auc_score(test_sentiments, pred)

    for scorer in (metrics.f1_score, metrics.precision_score,
                   metrics.accuracy_score, metrics.recall_score):
        metricsArray.append(scorer(test_sentiments, pred))

    # Formatting a title without a placeholder returns it unchanged, which
    # silently dropped the AUC from the legend; append it in that case.
    label = rocTitle.format(auc)
    if label == rocTitle:
        label = '{} (AUC = {:.3f})'.format(rocTitle, auc)

    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    plt.plot(fpr, tpr, color=color[colorIndex], label=label)


def metricsAndROC0(pred,metricsArray,rocTitle,colorIndex):
    """Variant of metricsAndROC that plots on the current figure.

    Appends F1, precision, accuracy and recall (in that order, computed against
    the notebook-level ``test_sentiments``) to ``metricsArray`` and plots the
    ROC curve labelled with ``rocTitle`` verbatim, coloured ``color[colorIndex]``.
    No figure is selected and no diagonal reference line is drawn.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(test_sentiments, pred)
    # The AUC is still evaluated, as before, although its value is not used here.
    metrics.roc_auc_score(test_sentiments, pred)
    for scorer in (metrics.f1_score, metrics.precision_score,
                   metrics.accuracy_score, metrics.recall_score):
        metricsArray.append(scorer(test_sentiments, pred))
    plt.plot(false_pos_rate, true_pos_rate, label=rocTitle, color=color[colorIndex])

# One entry per feature set: (predictions, metrics list to fill, legend title).
# The position in this list is also the palette index used for the curve.
roc_inputs = [
    (svm_bow_predictions, mbow, 'SVM on BOW'),
    (svm_ngram_predictions, mngram, 'SVM on NGRAM'),
    (svm_tfidf_predictions, mtfidf, 'SVM on TFIDF'),
    (svm_wv_predictions, mw2v, 'SVM on W2Vec(Gensim)'),
    (svm_glove_predictions, mglove, 'SVM on Glove'),
    (svm_ft_predictions, mft, 'SVM on FastText'),
]
for roc_color_index, (roc_pred, roc_metrics, roc_title) in enumerate(roc_inputs):
    metricsAndROC(roc_pred, roc_metrics, roc_title, roc_color_index)

# Decorate and render the combined ROC figure.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

# Metrics comparison for SVM applied to various feature engineering methods: BOW, NGRAM, TFIDF, Word2Vec, GloVe, FastText

In [None]:
# Grouped bar chart: one group per metric, one bar per feature-engineering method.
n_groups = 4
index = np.arange(n_groups)
bar_width = .1

# (legend label, metric values) in left-to-right bar order within each group.
# Each list holds F1, precision, accuracy, recall as filled by the ROC cell.
bar_series = [
    ('BOW', mbow),
    ('NGRAM', mngram),
    ('TFIDF', mtfidf),
    ('W2V', mw2v),
    ('Glove', mglove),
    ('FastText', mft),
]

# Bar k of every group is shifted right by k bar widths and takes palette colour k.
for offset, (series_label, series_values) in enumerate(bar_series):
    plt.bar(index + offset * bar_width, series_values, bar_width,
            color=color[offset], label=series_label)

pltLabels=['F1','PRECISION','ACCURACY','RECALL']
plt.xlabel('group', fontweight='bold')
# Put each tick under the centre of its group: bars are centred at
# index + k*bar_width for k = 0..len-1, so the middle is (len-1)/2 widths in.
# (The previous `r + bar_width` placed the tick under the second bar.)
plt.xticks(index + bar_width * (len(bar_series) - 1) / 2, pltLabels)

# Create legend & Show graphic
plt.legend(frameon=False,ncol=3, loc='lower left')
plt.show()



# Similar words and visualization of word embeddings

In [None]:

# For each probe term, keep just the words (not the similarity scores) of its
# five nearest neighbours in the Word2Vec space.
similar_words = {
    search_term: [
        neighbour
        for neighbour, _similarity in w2v_model.wv.most_similar([search_term], topn=5)
    ]
    for search_term in ['battery', 'screen', 'time', 'camera', 'mobile', 'app', 'price']
}
similar_words


In [None]:
# visualize embeddings by Word2Vec using Gensim

# Flatten {term: [neighbours]} into one list: each term followed by its neighbours.
words = [
    word
    for term, neighbours in similar_words.items()
    for word in [term] + neighbours
]
wvs = w2v_model.wv[words]

# Project the word vectors to 2-D with t-SNE (fixed seed for a stable layout).
np.set_printoptions(suppress=True)
tsne = TSNE(n_components=2, perplexity=2, n_iter=10000, random_state=0)
T = tsne.fit_transform(wvs)
labels = words

# Scatter the projected points and write each word next to its point.
plt.figure(figsize=(14, 8))
plt.scatter(T[:, 0], T[:, 1], c='orange', edgecolors='r')
for label, (x, y) in zip(labels, T):
    plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')

