In [1]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import re
import gensim
from gensim.models import KeyedVectors
import numpy as np
from collections import Counter
from numpy import dot
from numpy.linalg import norm
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases
from nltk import sent_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import gensim.downloader as api
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, make_scorer
from preprocessing import *

In [2]:
w2v_model = Word2Vec.load("w2vPreTrained")

In [3]:
dataset = pd.read_csv("amazon_alexa.tsv", sep = "\t", encoding = "utf-8")
print(dataset.shape)
dataset.dropna(inplace = True)
print(dataset.shape)
dataset.drop(dataset[dataset.rating == 3].index, inplace=True)
print(dataset.shape)
dataset.drop_duplicates(subset = "verified_reviews", inplace = True)
print(dataset.shape)

(3150, 5)
(3150, 5)
(2998, 5)
(2196, 5)


In [4]:
print(dataset["feedback"].value_counts())

1    1990
0     206
Name: feedback, dtype: int64


In [5]:
X = np.array(dataset["verified_reviews"].values).reshape(-1, 1)
y = list(dataset["feedback"].values)

In [6]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state = 0)

X, y = undersampler.fit_resample(X, y)


print('Resampled dataset shape %s' % Counter(y))

Resampled dataset shape Counter({1: 412, 0: 206})


In [7]:
X_temp = []

for rev in X:
  X_temp.append(rev[0])

In [8]:
new_text, new_sent_tok = tokenize_list_of_text(X_temp, custom_stopwords, False, 2)

total number of types extracted is: 1798


In [9]:
cleaned_reviews = frequency_cleaning(new_sent_tok, 2)

In [10]:
bigrams = Phrases(cleaned_reviews, scoring="npmi", threshold=0.60) #estrae le collocazioni tramite PMI

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(bigrams[cleaned_reviews], y, test_size=0.20, random_state=10)

In [13]:
negative_reviews = []
for rev, label in zip(X_train, Y_train):
    if label == 0:
        negative_reviews.append(rev)
    
generated_reviews = generate_samples(negative_reviews, int(len(negative_reviews)/2), w2v_model)

# run this only one time
X_train.extend(generated_reviews)
Y_train.extend([0 for x in generated_reviews])

['joke']
-
['laugh']
--------------------------
['price', 'product', 'nice', 'quality', 'nice', 'feature', 'definitely', 'reason', 'give', 'think', 'may', 'buyer', 'error', 'first', 'ignore', 'product', 'plug', 'time', 'work', 'really', 'unlike', 'not_a', 'stand', 'device', 'also', 'speaker', 'not_very', 'loud', 'buy', 'bluetooth', 'speaker', 'sure', 'lot', 'figure', 'use', 'kind', 'seem', 'like', 'device', 'plus', 'set', 'awful', 'would', 'definitely', 'money', 'buy', 'one', 'actually']
-
['cost', 'product', 'decent', 'quality', 'decent', 'feature', 'definitely', 'reason', 'present', 'imagine', 'may', 'buyer', 'error', 'start', 'ignore', 'product', 'plug', 'sentence', 'run', 'truly', 'different', 'not_a', 'base', 'device', 'also', 'speaker', 'not_very', 'loudly', 'purchase', 'bluetooth', 'speaker', 'sure', 'plenty', 'figure', 'utilize', 'kind', 'seem', 'wish', 'device', 'plus', 'adjust', 'terrible', 'would', 'definitely', 'money', 'purchase', 'one', 'really']
-------------------------

In [14]:
print('Train set is %s' % Counter(Y_train))
print('Test set is %s' % Counter(Y_test))

Train set is Counter({1: 329, 0: 241})
Test set is Counter({1: 83, 0: 41})


In [15]:
t = Tokenizer(lower = False)
t.fit_on_texts(X_train)
X_train_encoded = t.texts_to_sequences(X_train)
max_length = len(max(bigrams[cleaned_reviews], key = len))
Xtrain = pad_sequences(X_train_encoded, maxlen=max_length, padding='post')

In [16]:
X_test_encoded = t.texts_to_sequences(X_test)
Xtest = pad_sequences(X_test_encoded, maxlen=max_length, padding='post')

In [17]:
vocab_size = len(t.word_index) + 1

In [18]:
def get_weight_matrix(embedding, vocab):
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, w2v_model.vector_size))
    # step vocab, store vectors using the Tokenizer's integer mapping
    not_found = 0
    not_found_list = []
    for word, i in vocab.items():
        try:
            vector = embedding[word]
            weight_matrix[i] = vector
        except KeyError:
            weight_matrix[i] = np.zeros((1, w2v_model.vector_size))
            not_found+=1
            not_found_list.append(word)
            continue

    print(not_found_list)
    print(not_found)
    return weight_matrix

In [19]:
embedding_weights = get_weight_matrix(w2v_model.wv, t.word_index)

['play_music', 'not_a', 'differ', 'sound_quality', 'not_the', 'send_back', 'still_learn', 'listen_music', 'light_bulb', "'ve", 'even_though', "'re", 'uncertain', 'answer_question', 'play_radio', 'homescreen', 'unmake', 'look_forward', 'not_very', 'unretentive', 'not_that', 'video_chat', 'alarm_clock', 'forbid', 'not_this', 'make_life', "'ll", 'not_i', 'unaware', 'trailer', 'certified', 'not_to', 'excitement', 'not_it']
34


In [20]:
from keras.layers import Flatten

In [22]:
from keras.utils import to_categorical
Y_train_hot = to_categorical(Y_train)

In [23]:
print(Y_train[0], Y_train_hot[0])
print(Y_train[1], Y_train_hot[1])

1 [0. 1.]
0 [1. 0.]


In [24]:
Y_test_hot = to_categorical(Y_test)

In [65]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.layers import Embedding, Conv1D, MaxPooling1D

# Define the early stopping criteria
early_stopping = EarlyStopping(monitor='binary_accuracy', patience=25)

model = Sequential()
model.add(Embedding(vocab_size, w2v_model.vector_size, input_length=max_length, trainable = True, weights = [embedding_weights]))
model.add(Dropout(0.2))
model.add(Conv1D(filters=16, kernel_size=10, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(2, activation='sigmoid'))
print(model.summary())


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 100, 100)          69900     
                                                                 
 dropout_15 (Dropout)        (None, 100, 100)          0         
                                                                 
 conv1d_3 (Conv1D)           (None, 91, 16)            16016     
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 45, 16)           0         
 1D)                                                             
                                                                 
 flatten_3 (Flatten)         (None, 720)               0         
                                                                 
 dropout_16 (Dropout)        (None, 720)               0         
                                                      

In [66]:
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
# fit network
# better to fit multiple times
model.fit(Xtrain, Y_train_hot, epochs=50, verbose=2, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/50
15/15 - 3s - loss: 0.7042 - binary_accuracy: 0.6129 - val_loss: 0.7071 - val_binary_accuracy: 0.4079 - 3s/epoch - 198ms/step
Epoch 2/50
15/15 - 0s - loss: 0.6320 - binary_accuracy: 0.6743 - val_loss: 0.8403 - val_binary_accuracy: 0.3377 - 433ms/epoch - 29ms/step
Epoch 3/50
15/15 - 0s - loss: 0.5553 - binary_accuracy: 0.7127 - val_loss: 0.7485 - val_binary_accuracy: 0.4649 - 426ms/epoch - 28ms/step
Epoch 4/50
15/15 - 0s - loss: 0.4195 - binary_accuracy: 0.7895 - val_loss: 0.6334 - val_binary_accuracy: 0.6009 - 436ms/epoch - 29ms/step
Epoch 5/50
15/15 - 0s - loss: 0.3065 - binary_accuracy: 0.8640 - val_loss: 0.4078 - val_binary_accuracy: 0.8333 - 435ms/epoch - 29ms/step
Epoch 6/50
15/15 - 0s - loss: 0.2349 - binary_accuracy: 0.9035 - val_loss: 0.4670 - val_binary_accuracy: 0.8026 - 421ms/epoch - 28ms/step
Epoch 7/50
15/15 - 0s - loss: 0.1856 - binary_accuracy: 0.9101 - val_loss: 0.4690 - val_binary_accuracy: 0.8158 - 444ms/epoch - 30ms/step
Epoch 8/50
15/15 - 0s - loss: 0.1509

<keras.callbacks.History at 0x1b39ba4f430>

In [67]:
predictions = model.predict(Xtest)



In [68]:
predictions.shape

(124, 2)

In [69]:
predictions = np.round(predictions)

In [70]:
from sklearn.metrics import classification_report
print(classification_report(Y_test_hot, predictions))

              precision    recall  f1-score   support

           0       0.82      0.76      0.78        41
           1       0.86      0.92      0.89        83

   micro avg       0.85      0.86      0.86       124
   macro avg       0.84      0.84      0.84       124
weighted avg       0.85      0.86      0.85       124
 samples avg       0.85      0.86      0.86       124



In [71]:
x = ["Bad user experience, slow communication and the sound is not strong"]
review, temp = tokenize_list_of_text(x, custom_stopwords, False, 2)
print(temp)
seq_review = t.texts_to_sequences(temp)
padded_review = pad_sequences(seq_review, maxlen=max_length, padding='post')
preds = np.round(model.predict(padded_review))
preds

total number of types extracted is: 7
[['bad', 'user', 'experience', 'slow', 'communication', 'sound', 'weak']]


array([[1., 0.]], dtype=float32)

In [72]:
model.predict(padded_review)



array([[9.9986237e-01, 1.5549839e-04]], dtype=float32)

In [73]:
# [1, 0] is equal to negative label