In [2]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Conv1D, MaxPooling1D, Flatten
import pandas as pd
from preprocessing import *
from gensim.models import KeyedVectors
from gensim.models.phrases import Phrases
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import numpy as np

In [3]:
glove_twitter_25 = []

In [4]:
# print("loading pre trained embeddings, this can take some minutes...")
# glove_twitter_25 = KeyedVectors.load_word2vec_format('glove-twitter-25.txt', binary=False)
# print("loading complete.")

In [5]:
dataset = pd.read_csv("amazon_alexa.tsv", sep = "\t", encoding = "utf-8")
print(dataset.shape)
dataset.dropna(inplace = True)
print(dataset.shape)
dataset.drop(dataset[dataset.rating == 3].index, inplace=True)
print(dataset.shape)
dataset.drop_duplicates(subset = "verified_reviews", inplace = True)
print(dataset.shape)

(3150, 5)
(3150, 5)
(2998, 5)
(2196, 5)


In [6]:
X = np.array(dataset["verified_reviews"].values).reshape(-1, 1)
y = list(dataset["feedback"].values)

In [7]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state = 0)

X, y = undersampler.fit_resample(X, y)


print('Resampled dataset shape %s' % Counter(y))

Resampled dataset shape Counter({1: 412, 0: 206})


In [8]:
X_temp = []

for rev in X:
  X_temp.append(rev[0])

In [32]:
new_text, new_sent_tok = tokenize_list_of_text(X_temp, custom_stopwords, False, 2)

total number of types extracted is: 1798


In [33]:
# negative_reviews = list(dataset[dataset["feedback"] == 0]["verified_reviews"].values)
# print(len(negative_reviews))
# negative_text, negative_tok = tokenize_list_of_text(negative_reviews, custom_stopwords, False, pos_list)

In [34]:
# negative_artifical = generate_samples(negative_tok, 100, pre_trained_model = glove_twitter_25)
# new_sent_tok.extend(negative_artifical)
# len(negative_artifical)

In [35]:
cleaned_reviews = frequency_cleaning(new_sent_tok, 2)
# negative_artificial_cleaned = cleaned_reviews[-len(negative_artifical):] # estraggo le recensioni artificiali per poi aggiungerle esclusivamente al train
# del cleaned_reviews[-len(negative_artifical):] # le elimino dalle recensioni pulite

In [36]:
bigrams = Phrases(cleaned_reviews, scoring="npmi", threshold=0.60) #estrae le collocazioni tramite PMI
bigrams[cleaned_reviews][2]

['little', 'feature']

In [37]:
X_train, X_test, Y_train, Y_test = train_test_split(bigrams[cleaned_reviews], y, test_size=0.20, random_state=10)
# X_train.extend(negative_artificial_cleaned)
# Y_train.extend([0 for x in range(len(negative_artificial_cleaned))])

In [38]:
t = Tokenizer()
t.fit_on_texts(X_train)
X_train_encoded = t.texts_to_sequences(X_train)
max_length = len(max(bigrams[cleaned_reviews], key = len))
Xtrain = pad_sequences(X_train_encoded, maxlen=max_length, padding='post')

In [39]:
X_test_encoded = t.texts_to_sequences(X_test)
Xtest = pad_sequences(X_test_encoded, maxlen=max_length, padding='post')

In [40]:
vocab_size = len(t.word_index) + 1
vocab_size

618

In [41]:
from imblearn.under_sampling import TomekLinks

undersampler_nn = TomekLinks(sampling_strategy="all") # use tomeLinks to remove ambiguous data through nearest neighbours


# Fit and transform the X and y data
# X_train_resampled, y_train_resampled = undersampler_nn.fit_resample(Xtrain, Y_train)
X_train_resampled, y_train_resampled = Xtrain, Y_train


print('Resampled dataset shape %s' % Counter(y_train_resampled))

Resampled dataset shape Counter({1: 329, 0: 165})


In [42]:
from keras.utils import to_categorical
Y_train_hot = to_categorical(y_train_resampled)
Y_test_hot = to_categorical(Y_test)

In [43]:
from keras.callbacks import EarlyStopping

# Define the early stopping criteria
early_stopping = EarlyStopping(monitor='binary_accuracy', patience=25)

model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length, trainable = True))
model.add(Dropout(0.2))
model.add(Conv1D(filters=16, kernel_size=10, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(2, activation='sigmoid'))
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 100)          61800     
                                                                 
 dropout_10 (Dropout)        (None, 100, 100)          0         
                                                                 
 conv1d_2 (Conv1D)           (None, 91, 16)            16016     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 45, 16)           0         
 1D)                                                             
                                                                 
 flatten_2 (Flatten)         (None, 720)               0         
                                                                 
 dropout_11 (Dropout)        (None, 720)               0         
                                                      

In [56]:
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
# fit network
model.fit(X_train_resampled, Y_train_hot, epochs=200, verbose=2, validation_split=0.2, callbacks = [early_stopping])

Epoch 1/200
13/13 - 2s - loss: 0.0162 - binary_accuracy: 0.9873 - val_loss: 2.2864 - val_binary_accuracy: 0.8081 - 2s/epoch - 138ms/step
Epoch 2/200
13/13 - 0s - loss: 0.0145 - binary_accuracy: 0.9899 - val_loss: 2.4763 - val_binary_accuracy: 0.7980 - 299ms/epoch - 23ms/step
Epoch 3/200
13/13 - 0s - loss: 0.0374 - binary_accuracy: 0.9848 - val_loss: 2.4035 - val_binary_accuracy: 0.8081 - 286ms/epoch - 22ms/step
Epoch 4/200
13/13 - 0s - loss: 0.0192 - binary_accuracy: 0.9873 - val_loss: 1.8731 - val_binary_accuracy: 0.8283 - 280ms/epoch - 22ms/step
Epoch 5/200
13/13 - 0s - loss: 0.0161 - binary_accuracy: 0.9873 - val_loss: 1.8035 - val_binary_accuracy: 0.8333 - 276ms/epoch - 21ms/step
Epoch 6/200
13/13 - 0s - loss: 0.0157 - binary_accuracy: 0.9861 - val_loss: 1.9120 - val_binary_accuracy: 0.8283 - 277ms/epoch - 21ms/step
Epoch 7/200
13/13 - 0s - loss: 0.0160 - binary_accuracy: 0.9873 - val_loss: 2.0681 - val_binary_accuracy: 0.8283 - 269ms/epoch - 21ms/step
Epoch 8/200
13/13 - 0s - loss

<keras.callbacks.History at 0x282c248fb20>

In [57]:
predictions = model.predict(Xtest)



In [58]:
predictions = np.round(predictions)

In [59]:
from sklearn.metrics import classification_report
print(classification_report(Y_test_hot, predictions))

              precision    recall  f1-score   support

           0       0.71      0.71      0.71        41
           1       0.86      0.86      0.86        83

   micro avg       0.81      0.81      0.81       124
   macro avg       0.78      0.78      0.78       124
weighted avg       0.81      0.81      0.81       124
 samples avg       0.81      0.81      0.81       124



In [89]:
import gensim
print(gensim.__version__)

4.2.0
